#### preprocess data

In [1]:
import os
import sys
import shutil
import glob
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
from itertools import permutations
from multiprocessing import Pool

In [110]:
# Driver cell: for every tissue pair, classify genes by differential m6A peaks
# and by differential expression, then print what fraction of the up- and
# down-regulated genes falls into each m6A class.
#
# Input locations. expre_file is read by get_expre(); m6a_dir and expre_dir are
# joined with the tissue-pair name below.
expre_file = "/data5/galaxy/project/expression/stringtie/Total-RPKM.txt"
m6a_dir = "/data5/galaxy/project/methyl_m6a/data/diff_m6a_peak/macs2_bdgdiff/combination/"
expre_dir = "/data5/galaxy/project/expression/DESeq2/"
# Methylation-level data only exists for these tissue pairs.
subdir_list = ["heart_vs_brain", "kid_vs_brain", "lung_vs_brain", "heart_vs_kid", "lung_vs_heart", "lung_vs_kid"]
# NOTE(review): `names` is not used anywhere else in this cell.
names = get_genes_by_CpG()
for sub_dir in subdir_list:
    print("##################################\n%s" % sub_dir)
    # class_m6a_by_diffPeak returns (up, common, down) ...
    df_m6a_up, df_m6a_common, df_m6a_down = class_m6a_by_diffPeak(os.path.join(m6a_dir, sub_dir))
    # ... whereas filter_intersect takes and returns (up, down, common); the
    # argument and unpacking order below accounts for that difference.
    # NOTE(review): class_m6a_by_diffPeak already calls filter_intersect
    # internally, so this second pass looks redundant - confirm before removing.
    df_m6a_up, df_m6a_down, df_m6a_common = filter_intersect(df_m6a_up, df_m6a_down, df_m6a_common)
    # One table holding every gene that has an m6A class, used to annotate the
    # differentially expressed genes below.
    df_m6a = pd.concat([df_m6a_up, df_m6a_common, df_m6a_down]).dropna()
    ###
    print("############ expression ##########")
    df_gene_up, df_gene_down = class_gene_by_diffexpre(os.path.join(expre_dir, "%s.csv" % sub_dir))
    # Left merge followed by dropna keeps only the differentially expressed
    # genes that also appear in df_m6a.
    df_gene_up = df_gene_up.merge(df_m6a, on="name", how="left").dropna()
    df_gene_down = df_gene_down.merge(df_m6a, on="name", how="left").dropna()
#     print(len(df_gene_up.merge(df_gene_down, on="name", how="left").dropna()))
#     print(df_gene_up.head())
    # NOTE(review): the df_m6a_* names are module-level here; helpers defined in
    # other cells may read them by name, so do not rename them without checking.
    statistic_proportion(df_m6a_up, df_m6a_common, df_m6a_down, df_gene_up, df_gene_down)

##################################
heart_vs_brain
5245 4832
1844 1475
2030 1908
########### methylation ##########
up
m6a up	m6a common	m6a down
0.609626	0.224599	0.165775
down
m6a up	m6a common	m6a down
0.615658	0.224199	0.160142
##################################
kid_vs_brain
5050 4321
1486 1222
2444 2318
########### methylation ##########
up
m6a up	m6a common	m6a down
0.471483	0.387833	0.140684
down
m6a up	m6a common	m6a down
0.570423	0.285211	0.144366
##################################
lung_vs_brain
6335 5680
1346 1039
1587 1504
########### methylation ##########
up
m6a up	m6a common	m6a down
0.652246	0.229617	0.118136
down
m6a up	m6a common	m6a down
0.665370	0.202335	0.132296
##################################
heart_vs_kid
2411 2247
2546 2107
3660 3484
########### methylation ##########
up
m6a up	m6a common	m6a down
0.332308	0.427692	0.240000
down
m6a up	m6a common	m6a down
0.367164	0.435821	0.197015
##################################
lung_vs_heart
3276 2781
1408 1245
4006 3799
##

In [109]:
def get_genes_by_CpG(in_bed="/data5/galaxy/project/CpG_m6a_motif/fasta_seq/high_CpG.bed"):
    """Return the set of names listed in a CpG BED file.

    Parameters
    ----------
    in_bed : str, optional
        Path to a headerless, tab-separated BED-like file. It defaults to the
        high-CpG file this notebook has always read, so calling the function
        with no argument behaves exactly as before.

    Returns
    -------
    set
        The distinct values found in the fourth column (index 3) of ``in_bed``.
    """
    df = pd.read_table(in_bed, sep="\t", header=None)
    # No score cutoff is applied: every row contributes its name.
    return set(df.iloc[:, 3])

In [87]:
def statistic_proportion(df_m6a_up, df_m6a_common, df_m6a_down, df_gene_up, df_gene_down):
    """Print the m6A-class proportions for up- and down-regulated genes.

    Parameters
    ----------
    df_m6a_up, df_m6a_common, df_m6a_down : pandas.DataFrame
        Frames with a ``name`` column, one per m6A class.
    df_gene_up, df_gene_down : pandas.DataFrame
        Frames with a ``name`` column holding the up- / down-regulated genes.
    """
    # Hand the m6A frames on explicitly so the report reflects the arguments of
    # this call rather than whatever same-named globals happen to exist.
    print("up")
    each_type_expre(df_gene_up, df_m6a_up, df_m6a_common, df_m6a_down)
    print("down")
    each_type_expre(df_gene_down, df_m6a_up, df_m6a_common, df_m6a_down)


def each_type_expre(df, df_m6a_up=None, df_m6a_common=None, df_m6a_down=None):
    """Print the fraction of distinct names in ``df`` found in each m6A class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``name`` column; duplicates are counted once.
    df_m6a_up, df_m6a_common, df_m6a_down : pandas.DataFrame, optional
        Frames with a ``name`` column. When one is omitted, the module-level
        variable of the same name is used, which keeps the legacy one-argument
        call ``each_type_expre(df)`` working.

    Raises
    ------
    NameError
        If a frame is omitted and no module-level variable of that name exists.

    Notes
    -----
    Prints a header line followed by the three proportions (up, common, down).
    When ``df`` has no names the proportions are printed as ``nan`` instead of
    raising ``ZeroDivisionError``.
    """
    supplied = (
        ("df_m6a_up", df_m6a_up),
        ("df_m6a_common", df_m6a_common),
        ("df_m6a_down", df_m6a_down),
    )
    class_frames = []
    for global_name, frame in supplied:
        if frame is None:
            # Legacy path: fall back to the notebook-level variable.
            try:
                frame = globals()[global_name]
            except KeyError:
                raise NameError("name %r is not defined" % global_name) from None
        class_frames.append(frame)

    gene_names = df["name"].drop_duplicates()
    total = len(gene_names)
    counts = [int(gene_names.isin(frame["name"]).sum()) for frame in class_frames]
    if total:
        proportions = tuple(count / total for count in counts)
    else:
        proportions = (float("nan"),) * 3
    print("m6a up\tm6a common\tm6a down")
    print("%f\t%f\t%f" % proportions)

In [43]:
# 81--brain; 83--heart; 86--kid; 88--lung;
# ["heart_vs_brain", "kid_vs_brain", "lung_vs_brain", "heart_vs_kid", "lung_vs_heart", "lung_vs_kid"]
def foldchange_of_methylation(sub_dir, data_dir="/data5/galaxy/project/methyl_m6a/data/roimethstat/"):
    """Split genes by the log2 fold change of promoter methylation.

    Parameters
    ----------
    sub_dir : str
        Tissue-pair label such as ``"heart_vs_brain"``; the first and last
        ``_``-separated tokens are taken as tissue 1 and tissue 2.
    data_dir : str, optional
        Directory holding the ``promoter_filtered_<code>.bed`` files. Defaults
        to the location this notebook has always used.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(df_methyl_up, df_methyl_down)``, each with columns ``name`` and
        ``logFC``, holding the rows with ``logFC > 1`` and ``logFC < -1``.

    Raises
    ------
    ValueError
        If either tissue has no entry in the tissue-to-code table. Previously
        a message was printed and the function then failed with an unrelated
        ``UnboundLocalError``.

    Notes
    -----
    As before, the process working directory is changed to ``data_dir``.
    """
    map_dict = {"brain": "E081", "heart": "E083", "kid": "E086", "lung": "E088"}
    tokens = sub_dir.split("_")
    tissue_1, tissue_2 = tokens[0], tokens[-1]

    # Validate before touching the filesystem so a bad label fails fast.
    unknown = [tissue for tissue in (tissue_1, tissue_2) if tissue not in map_dict]
    if unknown:
        raise ValueError(
            "%s has no corresponding DNA methylation data (known tissues: %s)"
            % (", ".join(unknown), ", ".join(sorted(map_dict)))
        )

    os.chdir(data_dir)

    def _load_promoter(tissue):
        # Read one tissue's promoter table; the value column is named after the tissue.
        frame = pd.read_table(
            "promoter_filtered_%s.bed" % map_dict[tissue],
            header=None,
            names=["chr", "s", "e", "n", tissue, "strand"],
        )
        # The merge key is the part of column "n" before the first ":".
        frame["name"] = frame["n"].str.split(":").str[0]
        return frame

    df_methyl = _load_promoter(tissue_1).merge(_load_promoter(tissue_2), on="name").dropna()
    # The small pseudo-count keeps log2 finite when a methylation value is 0.
    df_methyl["logFC"] = (
        np.log2(df_methyl[tissue_1].astype(float) + 0.0001)
        - np.log2(df_methyl[tissue_2].astype(float) + 0.0001)
    )
    df_methyl = df_methyl[["name", "logFC"]].sort_values(["logFC"], ascending=False)
    df_methyl_up = df_methyl[df_methyl["logFC"] > 1]
    df_methyl_down = df_methyl[df_methyl["logFC"] < -1]
    return df_methyl_up, df_methyl_down

In [105]:
# def class_and_merge(work_dir, df_methylation):
#     df_m6a_up, df_m6a_down = class_m6a_by_diffPeak(work_dir)
#     df_up, df_down, df_remain = merge_m6a_and_methyl(df_m6a_up, df_m6a_down, df_methylation)
#     return df_up, df_down, df_remain


def class_m6a_by_diffPeak(work_dir):
    """Load the differential m6A peak annotations of one tissue pair.

    Parameters
    ----------
    work_dir : str
        Directory named ``<tissue1>_..._<tissue2>`` containing the three
        ``diff_peak_c3.0_{cond1,common,cond2}_anno.txt`` files. A trailing
        path separator is tolerated.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame, pandas.DataFrame)
        ``(df_m6a_up, df_m6a_common, df_m6a_down)`` built from the cond1,
        common and cond2 files respectively. Each has one row per ``name``
        with the mean ``m6a_score``; names shared between classes are removed
        by ``filter_intersect`` and the rest are passed through
        ``filter_by_exp`` for both tissues.

    Notes
    -----
    Changes the process working directory to ``work_dir``.
    """
    os.chdir(work_dir)
    df_total_exp = get_expre()

    per_gene = []
    for condition in ("cond1", "common", "cond2"):
        peaks = pd.read_table(
            "diff_peak_c3.0_%s_anno.txt" % condition,
            sep="\t",
            header=None,
            names=["name", "m6a_score"],
        )
        # How the m6a score is computed: the mean over all peaks of a gene.
        per_gene.append(peaks.groupby(["name"]).mean().reset_index())
    df_m6a_up, df_m6a_common, df_m6a_down = per_gene

    # filter_intersect takes and returns (up, down, common), in that order.
    df_m6a_up, df_m6a_down, df_m6a_common = filter_intersect(df_m6a_up, df_m6a_down, df_m6a_common)

    # Filter by expression. normpath strips a trailing separator, which would
    # otherwise make the last path component (and both tissue names) empty.
    sub_dir = os.path.basename(os.path.normpath(work_dir))
    tokens = sub_dir.split("_")
    tissue_1, tissue_2 = tokens[0], tokens[-1]
    df_m6a_up = filter_by_exp(df_total_exp, df_m6a_up, tissue_1, tissue_2)
    df_m6a_down = filter_by_exp(df_total_exp, df_m6a_down, tissue_1, tissue_2)
    df_m6a_common = filter_by_exp(df_total_exp, df_m6a_common, tissue_1, tissue_2)
    return df_m6a_up, df_m6a_common, df_m6a_down

def filter_by_exp(df_total_exp, df, tissue_1, tissue_2):
    """Keep the rows of ``df`` whose gene is expressed in both tissues.

    Parameters
    ----------
    df_total_exp : pandas.DataFrame
        Expression table with a ``name`` column and one column per tissue.
    df : pandas.DataFrame
        Frame with a ``name`` column to be filtered.
    tissue_1, tissue_2 : str
        Column names in ``df_total_exp`` for the two tissues being compared.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``name`` occurs in ``df_total_exp`` with a
        non-zero value in both tissue columns.

    Notes
    -----
    Prints the row count before and after filtering. If a name occurs several
    times in ``df_total_exp`` it is kept when at least one of its rows is
    non-zero in both tissues; the earlier per-name loop raised ``ValueError``
    on such duplicates.
    """
    expressed_in_both = (df_total_exp[tissue_1] != 0.0) & (df_total_exp[tissue_2] != 0.0)
    clean_names = df_total_exp.loc[expressed_in_both, "name"]
    df_clean = df[df["name"].isin(clean_names)]
    print(len(df), len(df_clean))
    return df_clean

def get_expre():
    """Load the expression table named by the module-level ``expre_file``.

    The columns are renamed to a fixed gene/tissue header and a ``name``
    column is added holding the part of ``gene`` before the first ".".
    """
    header = ["gene", "brain", "heart", "kid", "liver", "pla", "lung", "mus", "sto"]
    expression = pd.read_table(expre_file, sep="\t")
    expression.columns = header
    gene_ids = expression["gene"].str.split(".").str[0]
    expression["name"] = gene_ids
    return expression
    
    
def filter_intersect(df_up, df_down, df_remain):
    """Drop from each frame every row whose ``name`` also occurs in another frame.

    Returns the filtered frames in the same order as the arguments:
    ``(df_up, df_down, df_remain)``.

    NOTE: another definition of ``filter_intersect`` with the same behaviour
    appears later in this file.
    """
    frames = (df_up, df_down, df_remain)
    name_sets = [set(frame["name"]) for frame in frames]

    exclusive = []
    for position, frame in enumerate(frames):
        # Names that belong to this frame and to neither of the other two.
        others = [names for idx, names in enumerate(name_sets) if idx != position]
        own_only = name_sets[position].difference(*others)
        exclusive.append(frame[frame["name"].isin(own_only)])

    only_up, only_down, only_remain = exclusive
    return only_up, only_down, only_remain



def merge_m6a_and_methyl(df_m6a, df_methyl_up, df_methyl_down):
    """Attach m6A rows to the up- and down-methylated gene tables.

    Each methylation table is right-merged with ``df_m6a`` on ``name``; rows
    containing missing values and duplicated rows are then removed.

    Returns ``(df_up, df_down)``.
    """
    def _annotate(df_methyl):
        joined = df_m6a.merge(df_methyl, on="name", how="right")
        complete = joined.dropna()
        return complete.drop_duplicates()

    return _annotate(df_methyl_up), _annotate(df_methyl_down)

In [82]:
def format_gene_name(df):
    """Replace the ``Unnamed: 0`` column with a ``name`` column, in place.

    ``name`` holds the text of ``Unnamed: 0`` before the first ".". The frame
    passed in is modified and also returned.
    """
    raw_ids = df["Unnamed: 0"]
    df["name"] = raw_ids.str.split(".").str[0]
    df.drop(columns="Unnamed: 0", inplace=True)
    return df

def class_gene_by_diffexpre(DESeq2_file):
    """Split a DESeq2 result table into up- and down-regulated genes.

    Parameters
    ----------
    DESeq2_file : str
        Path to a CSV file with ``log2FoldChange`` and ``padj`` columns.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(df_up, df_down)``: rows with ``|log2FoldChange| > 1`` and
        ``padj < 0.05``, split by the sign of ``log2FoldChange`` and passed
        through ``format_gene_name``.
    """
    df = pd.read_csv(DESeq2_file)
    significant = df[(df["log2FoldChange"].abs() > 1) & (df["padj"] < 0.05)]
    # Slice first, then copy: format_gene_name mutates its argument, so each
    # half must be an independent frame rather than a slice of a copy.
    df_up = significant[significant["log2FoldChange"] > 0].copy()
    df_down = significant[significant["log2FoldChange"] < 0].copy()
    return format_gene_name(df_up), format_gene_name(df_down)

In [10]:
## 2.we find the Spearman rho between a row of expression data and each row of methylation data, and we keep the row of methylation data that produces the most-negative rho value
def filter_each_row_by_expression(df, df_gene_up, df_gene_down):
    """Keep rows whose ``logFC`` sign is opposite to the gene's expression change.

    A row of ``df`` is kept when ``logFC > 0`` and its ``name`` is listed in
    ``df_gene_down``, or when ``logFC < 0`` and its ``name`` is listed in
    ``df_gene_up``. The positive-``logFC`` rows come first in the result, and
    rows with missing values are dropped.
    """
    positive_and_down = (df["logFC"] > 0) & df["name"].isin(df_gene_down["name"])
    negative_and_up = (df["logFC"] < 0) & df["name"].isin(df_gene_up["name"])
    selected = [df[positive_and_down], df[negative_and_up]]
    return pd.concat(selected).dropna()

In [7]:
def intersect_m6a_expre_methyl(df_type, df_list, df_methyl):
    """Join two frames and a methylation table on ``name``.

    ``df_list[0]`` is left-merged with ``df_list[1]`` and then with
    ``df_methyl``; rows with any missing value are dropped, so only names
    present in all three inputs remain. ``df_type`` is currently unused.
    """
    base, extra = df_list[0], df_list[1]
    combined = base.merge(extra, on="name", how="left")
    combined = combined.merge(df_methyl, on="name", how="left")
    return combined.dropna()

In [89]:
def filter_intersect(df_up, df_down, df_remain):
    """Restrict each frame to the names that occur in that frame only.

    Returns ``(df_up, df_down, df_remain)`` with every row removed whose
    ``name`` is also present in one of the other two frames.
    """
    up_names = set(df_up["name"])
    down_names = set(df_down["name"])
    remain_names = set(df_remain["name"])

    only_up = up_names.difference(down_names, remain_names)
    only_down = down_names.difference(up_names, remain_names)
    only_remain = remain_names.difference(up_names, down_names)

    return (
        select_uniq(only_up, df_up),
        select_uniq(only_down, df_down),
        select_uniq(only_remain, df_remain),
    )


def select_uniq(query_list, df):
    """Return the rows of ``df`` whose ``name`` is contained in ``query_list``."""
    wanted = df["name"].isin(query_list)
    return df[wanted]

In [46]:
def boxplot(df_up, df_down, df_remain, sub_dir):
    """Draw and save a box plot of ``logFC`` for the up, down and remain groups.

    The three ``logFC`` columns are combined into one frame, plotted in the
    order down / up / remain without outliers, and written to a PDF named
    after ``sub_dir``. The number of non-missing values and the median of each
    group are then printed, in that same order.
    """
    group_order = ["down", "up", "remain"]
    fc_by_group = pd.DataFrame({
        "up": df_up["logFC"],
        "down": df_down["logFC"],
        "remain": df_remain["logFC"],
    })

    axes = sns.boxplot(data=fc_by_group, order=group_order, saturation=0.5, showfliers=False)
    axes.set_ylabel("log2fc of methylation level", size = 14, color="black", alpha=1)
    plt.title(sub_dir)
    plt.savefig("/data5/galaxy/project/methyl_m6a/analysis_result_6/%s.pdf" % sub_dir)
    plt.close()

    # The summary lines use the same group order as the plot.
    print(*[len(fc_by_group[group].dropna()) for group in group_order])
    print(*[fc_by_group[group].median() for group in group_order])

In [2]:
# def prepare_plot_data(sub_dir, df_up, df_down, df_remain):
#     result_file = "/data5/galaxy/project/methyl_m6a/data/plot_data/%s.txt" % sub_dir
#     df_up["m6a_type"] = "up"
#     df_down["m6a_type"] = "down"
#     df_remain["m6a_type"] = "remain"
#     df = pd.concat([df_up, df_down, df_remain], sort=True)
#     df.to_csv(result_file, sep="\t", index=False)