In [1]:
import os
import shutil
import glob
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
from itertools import permutations
from multiprocessing import Pool

In [19]:
# Root directory that holds one sub-directory of differential m6A peak files per comparison.
m6a_dir = "/data5/galaxy/project/methyl_m6a/data/diff_m6a_peak/macs2_bdgdiff/combination/"
# Each entry is "<tissue_1>_vs_<tissue_2>" and doubles as the sub-directory name under m6a_dir.
subdir_list = [
    "heart_vs_brain", "kid_vs_brain", "lung_vs_brain",
    "heart_vs_kid", "lung_vs_heart", "lung_vs_kid",
]
for sub_dir in subdir_list:
    print(sub_dir)
    # Promoters whose methylation logFC passes the up / down cut-off.
    df_methyl_up, df_methyl_down = class_methyl_run(sub_dir)
    # Per-gene mean m6A scores from the cond1 ("up") and cond2 ("down") peak files.
    df_m6a_up, df_m6a_down = class_m6a_by_diffPeak(os.path.join(m6a_dir, sub_dir))
    # Right join followed by dropna(): only rows without any missing value
    # (i.e. names found on both sides) survive.
    df_methylup_m6a_down = pd.merge(df_methyl_up, df_m6a_down, on="name", how="right").dropna()
    df_methyldown_m6a_up = pd.merge(df_methyl_down, df_m6a_up, on="name", how="right").dropna()
    # Share of methylation-up (resp. -down) rows that also show the opposite m6A change.
    ratio_1 = df_methylup_m6a_down.shape[0] / df_methyl_up.shape[0]
    ratio_2 = df_methyldown_m6a_up.shape[0] / df_methyl_down.shape[0]
    print(ratio_1, ratio_2)

heart_vs_brain
0.21523865994663505 0.49022869022869026
kid_vs_brain
0.25338753387533874 0.5491580662683324
lung_vs_brain
0.16970998925886144 0.5576227390180879
heart_vs_kid
0.27649208282582216 0.339367904265112
lung_vs_heart
0.19715224534501644 0.3877344145970603
lung_vs_kid
0.11399491094147583 0.39775606225117627


In [18]:
def class_methyl_run(sub_dir):
    """Return the de-duplicated (up, down) methylation tables for one comparison.

    Parameters
    ----------
    sub_dir : str
        Comparison label, passed unchanged to ``foldchange_of_methylation``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(up, down)`` as produced by ``class_methyl``, each with exact
        duplicate rows removed.
    """
    fold_changes = foldchange_of_methylation(sub_dir)
    up_part, down_part = class_methyl(fold_changes)
    # Only rows identical in every column are dropped here.
    up_unique = up_part.drop_duplicates()
    down_unique = down_part.drop_duplicates()
    return up_unique, down_unique
# Sample IDs used in map_dict below: E081 -- brain; E083 -- heart; E086 -- kid; E088 -- lung.
# Expected sub_dir values: ["heart_vs_brain", "kid_vs_brain", "lung_vs_brain", "heart_vs_kid", "lung_vs_heart", "lung_vs_kid"]
def foldchange_of_methylation(sub_dir, data_dir="/data5/galaxy/project/methyl_m6a/data/roimethstat/"):
    """Compute the per-promoter log2 fold change of methylation between two tissues.

    The comparison label is split on ``"_"``; its first and last tokens are
    taken as ``tissue_1`` and ``tissue_2``. For each tissue the file
    ``promoter_<ID>.bed`` is read from ``data_dir``, the part of column 4
    before the first ``":"`` is used as the join key ``name``, and

        logFC = log2(tissue_1 + 0.01) - log2(tissue_2 + 0.01)

    is computed on the rows present in both files.

    Parameters
    ----------
    sub_dir : str
        Comparison label such as ``"heart_vs_brain"``.
    data_dir : str, optional
        Directory containing the ``promoter_<ID>.bed`` files. Defaults to the
        project location the notebook has always used.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``name`` and ``logFC``.

    Raises
    ------
    KeyError
        If either tissue has no entry in the tissue-to-ID map. (Previously
        the message was only printed and the function then crashed with an
        unrelated ``UnboundLocalError``.)
    ValueError
        If both tissues are the same, which would make the two value columns
        collide in the merge.

    Notes
    -----
    The function changes the process working directory to ``data_dir``. This
    side effect is kept because other notebook cells may rely on it.
    """
    map_dict = {"brain": "E081", "heart": "E083", "kid": "E086", "lung": "E088"}
    tokens = sub_dir.split("_")
    tissue_1, tissue_2 = tokens[0], tokens[-1]

    # Fail fast with a clear message instead of printing and continuing.
    if tissue_1 not in map_dict or tissue_2 not in map_dict:
        raise KeyError("%s %s didn't have according DNA methylation data!" % (tissue_1, tissue_2))
    if tissue_1 == tissue_2:
        raise ValueError("cannot compare tissue %r with itself (sub_dir=%r)" % (tissue_1, sub_dir))

    os.chdir(data_dir)

    def _read_promoters(tissue):
        # Load one promoter file; the 5th column is named after the tissue.
        frame = pd.read_table(
            "promoter_%s.bed" % map_dict[tissue],
            header=None,
            names=["chr", "s", "e", "n", tissue, "strand"],
        )
        frame["name"] = frame["n"].str.split(":").str[0]
        return frame

    # Inner join on name; rows with any missing value are discarded.
    df_methyl = _read_promoters(tissue_1).merge(_read_promoters(tissue_2), on="name").dropna()
    # The 0.01 pseudocount keeps log2 finite for zero values.
    log_fc = (
        np.log2(df_methyl[tissue_1].astype(float) + 0.01)
        - np.log2(df_methyl[tissue_2].astype(float) + 0.01)
    )
    return df_methyl.assign(logFC=log_fc)[["name", "logFC"]]

def class_methyl(df, threshold=0.585):
    """Split a fold-change table into up- and down-regulated rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a numeric ``logFC`` column.
    threshold : float, optional
        Non-negative cut-off on ``logFC``. The default 0.585 is the value
        the notebook has always used (numerically about log2(1.5);
        presumably a 1.5-fold change -- confirm with the analysis design).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_up, df_down)`` where ``df_up`` holds rows with
        ``logFC > threshold`` and ``df_down`` rows with
        ``logFC < -threshold``. Rows with a missing ``logFC`` appear in
        neither.

    Raises
    ------
    ValueError
        If ``threshold`` is negative.
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative, got %r" % (threshold,))
    log_fc = df["logFC"]
    # For a non-negative threshold, "sign test AND |logFC| > threshold"
    # reduces to a single comparison on each side.
    df_up = df[log_fc > threshold]
    df_down = df[log_fc < -threshold]
    return df_up, df_down

In [8]:
def class_m6a_by_diffPeak(work_dir):
    """Load the two annotated differential-peak files and average scores per name.

    Changes the working directory to ``work_dir`` and reads
    ``diff_peak_c3.0_cond1_anno.txt`` and ``diff_peak_c3.0_cond2_anno.txt``
    as header-less, tab-separated two-column tables (``name``, ``m6a_score``).

    Parameters
    ----------
    work_dir : str
        Directory containing both annotation files.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_m6a_up, df_m6a_down)`` built from the cond1 and cond2 files
        respectively, one row per ``name`` holding the mean ``m6a_score``.
    """
    os.chdir(work_dir)
    per_condition = []
    for file_name in ("diff_peak_c3.0_cond1_anno.txt", "diff_peak_c3.0_cond2_anno.txt"):
        peaks = pd.read_table(file_name, sep="\t", header=None, names=["name", "m6a_score"])
        # Collapse multiple peaks of the same name into their mean score.
        averaged = peaks.groupby(["name"]).mean().reset_index()
        per_condition.append(averaged.drop_duplicates())
    df_m6a_up, df_m6a_down = per_condition
    return df_m6a_up, df_m6a_down

In [4]:
def boxplot(df_up_coor, df_up_unchange, df_down_coor, df_down_unchange, sub_dir):
    """Save a box plot of methylation logFC for four groups and print group sizes.

    Parameters
    ----------
    df_up_coor, df_up_unchange, df_down_coor, df_down_unchange : pandas.DataFrame
        Tables with a ``logFC`` column; each becomes one box, labelled
        ``up``, ``up_unchanged``, ``down`` and ``down_unchanged``.
    sub_dir : str
        Used as the plot title and as the stem of the output PDF name.

    Side effects
    ------------
    Writes ``<sub_dir>.pdf`` into the hard-coded ``analysis_result_3``
    directory, closes the current figure, and prints the number of
    non-missing values per group in plotting order.
    """
    groups = {
        "up": df_up_coor,
        "up_unchanged": df_up_unchange,
        "down": df_down_coor,
        "down_unchanged": df_down_unchange,
    }
    # The Series are aligned on their index; groups of different length are padded with NaN.
    df_up_down = pd.DataFrame({label: frame["logFC"] for label, frame in groups.items()})
    plot_order = ["down", "down_unchanged", "up", "up_unchanged"]
    ax = sns.boxplot(data=df_up_down, order=plot_order, saturation=0.5, showfliers=False)
    ax.set_ylabel("log2fc of methylation level", size=14, color="black", alpha=1)
    plt.title(sub_dir)
    plt.savefig("/data5/galaxy/project/methyl_m6a/analysis_result_3/%s.pdf" % sub_dir)
    plt.close()
    # One count per box, in the same left-to-right order as the plot.
    print(*[len(df_up_down[label].dropna()) for label in plot_order])