#### preprocess data

In [6]:
import os
import sys
import shutil
import glob
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
from itertools import permutations
from multiprocessing import Pool

In [28]:
def class_gene_by_diffexpre(DESeq2_file):
    """Load a DESeq2 result table and attach a cleaned gene ``name`` column.

    Parameters
    ----------
    DESeq2_file : str
        Path to a DESeq2 CSV export, read with ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The complete, unfiltered table as returned by ``format_gene_name``.

    Notes
    -----
    An earlier revision kept only rows with ``|log2FoldChange| > 1`` and
    ``padj < 0.05`` and returned separate up/down frames. That split is
    disabled here, so a single frame is returned.
    """
    deseq_table = pd.read_csv(DESeq2_file)
    named_table = format_gene_name(deseq_table)
    return named_table

def format_gene_name(df):
    """Return a copy of ``df`` with ``"Unnamed: 0"`` replaced by a ``name`` column.

    ``name`` holds the text of ``"Unnamed: 0"`` before the first ``"."``.

    The input frame is left untouched. The previous implementation added and
    deleted columns on the caller's object, which made a second call on the
    same frame fail with ``KeyError`` and raised SettingWithCopy warnings when
    a filtered slice was passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a string column called ``"Unnamed: 0"``.

    Returns
    -------
    pandas.DataFrame
        New frame without ``"Unnamed: 0"`` and with ``name`` as the last
        column (the same column order the in-place version produced).

    Raises
    ------
    KeyError
        If ``"Unnamed: 0"`` is not a column of ``df``.
    """
    gene_ids = df["Unnamed: 0"].str.split(".").str[0]
    # drop() and assign() both return new frames, so the caller's df is not modified.
    return df.drop(columns=["Unnamed: 0"]).assign(name=gene_ids)

In [29]:
expre_dir = "/data5/galaxy/project/expression/DESeq2/"

subdir_list = [
    "heart_vs_brain",
    "kid_vs_brain",
    "lung_vs_brain",
    "heart_vs_kid",
    "lung_vs_heart",
    "lung_vs_kid",
]
# Gene IDs to look up in every comparison; the list does not change between
# iterations, so it is built once before the loop.
query_list = ["ENSG00000136689", "ENSG00000129152"]
for sub_dir in subdir_list:
    print(sub_dir)
    # One DESeq2 CSV per tissue comparison, named after the comparison.
    deseq_csv = os.path.join(expre_dir, "%s.csv" % sub_dir)
    df = class_gene_by_diffexpre(deseq_csv)
    is_queried = df["name"].isin(query_list)
    df_res = df[is_queried]
    print(df_res)
            

heart_vs_brain
Empty DataFrame
Columns: [baseMean, log2FoldChange, lfcSE, stat, pvalue, padj, name]
Index: []
kid_vs_brain
       baseMean  log2FoldChange     lfcSE      stat    pvalue      padj  \
9048  63.106825        3.596844  0.941578  3.820019  0.000133  0.000474   

                 name  
9048  ENSG00000136689  
lung_vs_brain
       baseMean  log2FoldChange     lfcSE      stat   pvalue      padj  \
8826  63.106825        3.956212  0.994914  3.976437  0.00007  0.000255   

                 name  
8826  ENSG00000136689  
heart_vs_kid
Empty DataFrame
Columns: [baseMean, log2FoldChange, lfcSE, stat, pvalue, padj, name]
Index: []
lung_vs_heart
Empty DataFrame
Columns: [baseMean, log2FoldChange, lfcSE, stat, pvalue, padj, name]
Index: []
lung_vs_kid
Empty DataFrame
Columns: [baseMean, log2FoldChange, lfcSE, stat, pvalue, padj, name]
Index: []


In [11]:
m6a_dir = "/data5/galaxy/project/methyl_m6a/data/diff_m6a_peak/macs2_bdgdiff/combination/"
expre_dir = "/data5/galaxy/project/expression/DESeq2/"
# Methylation level data only exist for these tissue comparisons.
subdir_list = ["heart_vs_brain", "kid_vs_brain", "lung_vs_brain", "heart_vs_kid", "lung_vs_heart", "lung_vs_kid"]
for sub_dir in subdir_list:
    print("##################################\n%s" % sub_dir)
    # m6A peaks: load the three classes, keep genes unique to one class, pool them.
    df_m6a_up, df_m6a_common, df_m6a_down = class_m6a_by_diffPeak(os.path.join(m6a_dir, sub_dir))
    df_m6a_up, df_m6a_down, df_m6a_common = filter_intersect(df_m6a_up, df_m6a_down, df_m6a_common)
    df_m6a = pd.concat([df_m6a_up, df_m6a_common, df_m6a_down]).dropna()
    # Expression: class_gene_by_diffexpre returns ONE unfiltered DESeq2 frame, so
    # unpacking it into two names raised ValueError. The significance filter
    # (|log2FoldChange| > 1, padj < 0.05) and the up/down split are done here.
    df_gene = class_gene_by_diffexpre(os.path.join(expre_dir, "%s.csv" % sub_dir))
    df_gene = df_gene[(df_gene["log2FoldChange"].abs() > 1) & (df_gene["padj"] < 0.05)]
    df_gene_up = df_gene[df_gene["log2FoldChange"] > 0]
    df_gene_down = df_gene[df_gene["log2FoldChange"] < 0]
    # Left merge followed by dropna keeps only genes that also carry an m6A score.
    df_gene_up = df_gene_up.merge(df_m6a, on="name", how="left").dropna()
    df_gene_down = df_gene_down.merge(df_m6a, on="name", how="left").dropna()
    # DNA methylation: genes whose promoter log2 fold change is above 1 / below -1.
    df_methyl_up, df_methyl_down = foldchange_of_methylation(sub_dir)
    df_up, df_down = merge_m6a_and_methyl(df_m6a, df_methyl_up, df_methyl_down)
    # Report using the methylation classes alone.
    # NOTE(review): statistic_proportion is not defined in the visible part of
    # this notebook; it must already exist in the kernel.
    print("############ methylation without expression ##########")
    statistic_proportion(df_m6a_up, df_m6a_common, df_m6a_down, df_up, df_down)
    # Report again after keeping only genes whose expression moves opposite to methylation.
    print("########### methylation with expression ##############")
    df_up = filter_each_row_by_expression(df_up, df_gene_up, df_gene_down)
    df_down = filter_each_row_by_expression(df_down, df_gene_up, df_gene_down)
    statistic_proportion(df_m6a_up, df_m6a_common, df_m6a_down, df_up, df_down)

##################################
heart_vs_brain
############ methylation without expression ##########
up
m6a up	m6a common	m6a down
0.615727	0.212166	0.172107
down
m6a up	m6a common	m6a down
0.643243	0.194595	0.162162
########### methylation with expression ##############
up
m6a up	m6a common	m6a down
0.085271	0.279070	0.635659
down
m6a up	m6a common	m6a down
0.946154	0.038462	0.015385
##################################
kid_vs_brain
############ methylation without expression ##########
up
m6a up	m6a common	m6a down
0.515055	0.345483	0.139461
down
m6a up	m6a common	m6a down
0.607895	0.247368	0.144737
########### methylation with expression ##############
up
m6a up	m6a common	m6a down
0.021739	0.195652	0.782609
down
m6a up	m6a common	m6a down
0.992248	0.007752	0.000000
##################################
lung_vs_brain
############ methylation without expression ##########
up
m6a up	m6a common	m6a down
0.659058	0.206847	0.134094
down
m6a up	m6a common	m6a down
0.700297	0.166172	0.13353

In [12]:
# 81--brain; 83--heart; 86--kid; 88--lung;
# ["heart_vs_brain", "kid_vs_brain", "lung_vs_brain", "heart_vs_kid", "lung_vs_heart", "lung_vs_kid"]
def foldchange_of_methylation(sub_dir, data_dir="/data5/galaxy/project/methyl_m6a/data/roimethstat/"):
    """Split genes by the promoter-methylation log2 fold change between two tissues.

    The two tissues are the first and last ``"_"``-separated tokens of
    ``sub_dir`` (``"heart_vs_brain"`` -> ``heart`` versus ``brain``). For each
    tissue the file ``promoter_filtered_<ID>.bed`` is read from ``data_dir``,
    the two tables are joined on gene name, and
    ``logFC = log2(tissue_1 + 0.0001) - log2(tissue_2 + 0.0001)`` is computed.

    Parameters
    ----------
    sub_dir : str
        Comparison label such as ``"lung_vs_kid"``.
    data_dir : str, optional
        Directory holding the ``promoter_filtered_*.bed`` files. Files are
        opened by explicit path; the process working directory is no longer
        changed, which the previous ``os.chdir`` call did as a side effect.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(df_methyl_up, df_methyl_down)`` with columns ``name`` and ``logFC``,
        sorted by descending ``logFC``; rows with ``logFC > 1`` and
        ``logFC < -1`` respectively.

    Raises
    ------
    ValueError
        If either tissue has no entry in the tissue -> ID map. The old code
        only printed a message and then crashed on an unbound local variable.
    """
    map_dict = {"brain": "E081", "heart": "E083", "kid": "E086", "lung": "E088"}
    tokens = sub_dir.split("_")
    tissue_1, tissue_2 = tokens[0], tokens[-1]
    try:
        num_1, num_2 = map_dict[tissue_1], map_dict[tissue_2]
    except KeyError as err:
        raise ValueError(
            "%s %s didn't have corresponding DNA methylation data!" % (tissue_1, tissue_2)
        ) from err
    df_1 = _read_promoter_methylation(data_dir, num_1, tissue_1)
    df_2 = _read_promoter_methylation(data_dir, num_2, tissue_2)
    # Inner join on gene name; any row with a missing value is discarded.
    df_methyl = df_1.merge(df_2, on="name").dropna()
    # The small pseudocount keeps log2 finite when a methylation value is 0.
    df_methyl["logFC"] = (
        np.log2(df_methyl[tissue_1].astype(float) + 0.0001)
        - np.log2(df_methyl[tissue_2].astype(float) + 0.0001)
    )
    df_methyl = df_methyl[["name", "logFC"]].sort_values(["logFC"], ascending=False)
    df_methyl_up = df_methyl[df_methyl["logFC"] > 1]
    df_methyl_down = df_methyl[df_methyl["logFC"] < -1]
    return df_methyl_up, df_methyl_down


def _read_promoter_methylation(data_dir, sample_id, tissue):
    """Read one ``promoter_filtered_<sample_id>.bed`` table and add a gene ``name`` column."""
    bed_path = os.path.join(data_dir, "promoter_filtered_%s.bed" % sample_id)
    table = pd.read_csv(
        bed_path, sep="\t", header=None, names=["chr", "s", "e", "n", tissue, "strand"]
    )
    # Gene name is the part of column "n" before the first ":".
    table["name"] = table["n"].str.split(":").str[0]
    return table

In [11]:
# def class_and_merge(work_dir, df_methylation):
#     df_m6a_up, df_m6a_down = class_m6a_by_diffPeak(work_dir)
#     df_up, df_down, df_remain = merge_m6a_and_methyl(df_m6a_up, df_m6a_down, df_methylation)
#     return df_up, df_down, df_remain
    
def class_m6a_by_diffPeak(work_dir):
    """Load the three annotated differential m6A peak tables of one comparison.

    Reads ``diff_peak_c3.0_cond1_anno.txt`` (up), ``diff_peak_c3.0_common_anno.txt``
    (common) and ``diff_peak_c3.0_cond2_anno.txt`` (down) from ``work_dir``.
    Each file is a header-less, tab-separated table of ``name`` and
    ``m6a_score``. The m6a score of a gene is computed as the mean score over
    all of its rows.

    Files are opened via ``os.path.join(work_dir, ...)``. The previous
    ``os.chdir(work_dir)`` changed the working directory of the whole process,
    so a later call with a relative ``work_dir`` resolved against the wrong
    directory.

    Parameters
    ----------
    work_dir : str
        Directory containing the three annotation files.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame, pandas.DataFrame)
        ``(df_m6a_up, df_m6a_common, df_m6a_down)``, each with one row per
        gene and the columns ``name`` and ``m6a_score``.
    """
    df_m6a_up = _mean_m6a_score_per_gene(os.path.join(work_dir, "diff_peak_c3.0_cond1_anno.txt"))
    df_m6a_common = _mean_m6a_score_per_gene(os.path.join(work_dir, "diff_peak_c3.0_common_anno.txt"))
    df_m6a_down = _mean_m6a_score_per_gene(os.path.join(work_dir, "diff_peak_c3.0_cond2_anno.txt"))
    return df_m6a_up, df_m6a_common, df_m6a_down


def _mean_m6a_score_per_gene(anno_file):
    """Read one peak annotation file and average ``m6a_score`` per gene ``name``."""
    peaks = pd.read_csv(anno_file, sep="\t", header=None, names=["name", "m6a_score"])
    return peaks.groupby(["name"]).mean().reset_index()
    
def merge_m6a_and_methyl(df_m6a, df_methyl_up, df_methyl_down):
    """Attach m6A columns to the up- and down-methylated gene tables.

    Each methylation table is right-joined with ``df_m6a`` on ``name``; rows
    containing any missing value (for example genes without an m6A entry) and
    exact duplicate rows are then removed.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(df_up, df_down)`` built from ``df_methyl_up`` and ``df_methyl_down``.
    """
    def _attach_m6a(df_methyl):
        joined = df_m6a.merge(df_methyl, on="name", how="right")
        complete_rows = joined.dropna()
        return complete_rows.drop_duplicates()

    return _attach_m6a(df_methyl_up), _attach_m6a(df_methyl_down)

In [10]:
# def format_gene_name(df):
#     df["name"] = df["Unnamed: 0"].str.split(".").str[0]
# #     df = df.set_index(["name"])
#     del df["Unnamed: 0"]
#     return df

# def class_gene_by_diffexpre(DESeq2_file):
#     df = pd.read_csv(DESeq2_file)
#     df = df[(abs(df.log2FoldChange) > 1) & (df.padj < 0.05)]
#     df_up, df_down = df.copy()[df.log2FoldChange > 0], df.copy()[df.log2FoldChange < 0]
# #     print(len(df_up), len(df_down))
#     return format_gene_name(df_up), format_gene_name(df_down)

In [9]:
def filter_each_row_by_expression(df, df_gene_up, df_gene_down):
    """Keep rows whose methylation change is opposite to the expression change.

    A row is kept when either
      * ``logFC > 0`` and its ``name`` appears in ``df_gene_down.name``, or
      * ``logFC < 0`` and its ``name`` appears in ``df_gene_up.name``.

    The ``logFC > 0`` matches come first in the result, followed by the
    ``logFC < 0`` matches; rows containing any missing value are dropped.

    NOTE(review): the comment that used to head this function described a
    different procedure ("find the Spearman rho between a row of expression
    data and each row of methylation data, and keep the row of methylation
    data that produces the most-negative rho value"). No correlation is
    computed here; confirm which behaviour is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least ``name`` and ``logFC`` columns.
    df_gene_up, df_gene_down : pandas.DataFrame
        Tables whose ``name`` column lists up- / down-regulated genes.

    Returns
    -------
    pandas.DataFrame
        The selected rows of ``df``.
    """
    gained = df.logFC > 0
    lost = df.logFC < 0
    expressed_lower = df["name"].isin(df_gene_down.name)
    expressed_higher = df["name"].isin(df_gene_up.name)
    gained_and_lower = df[gained & expressed_lower]
    lost_and_higher = df[lost & expressed_higher]
    return pd.concat([gained_and_lower, lost_and_higher]).dropna()

In [8]:
def intersect_m6a_expre_methyl(df_type, df_list, df_methyl):
    """Join two tables from ``df_list`` with ``df_methyl`` on ``name``.

    ``df_list[0]`` is left-joined with ``df_list[1]`` and then with
    ``df_methyl``; rows that end up with any missing value are dropped, so
    only names present in all three tables survive.

    Parameters
    ----------
    df_type : str
        Currently unused. A disabled branch used ``"up"`` / ``"down"`` to
        split the result into coordinated (``|logFC| > 1``) and unchanged
        (``|logFC| < 1``) rows.
    df_list : sequence of pandas.DataFrame
        Only the first two elements are read.
    df_methyl : pandas.DataFrame
        Table sharing the ``name`` key.

    Returns
    -------
    pandas.DataFrame
        The joined table.
    """
    first, second = df_list[0], df_list[1]
    joined = first.merge(second, on="name", how="left")
    joined = joined.merge(df_methyl, on="name", how="left")
    return joined.dropna()

In [7]:
def filter_intersect(df_up, df_down, df_remain):
    """Restrict each table to gene names that occur in that table only.

    A name found in more than one of the three inputs is removed from all of
    them.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame, pandas.DataFrame)
        ``(df_up, df_down, df_remain)``, each filtered through ``select_uniq``.
    """
    names_up = set(df_up["name"])
    names_down = set(df_down["name"])
    names_remain = set(df_remain["name"])
    # A name is exclusive to a table when neither of the other two contains it.
    only_up = names_up - (names_down | names_remain)
    only_down = names_down - (names_up | names_remain)
    only_remain = names_remain - (names_up | names_down)
    return (
        select_uniq(only_up, df_up),
        select_uniq(only_down, df_down),
        select_uniq(only_remain, df_remain),
    )
    
def select_uniq(query_list, df):
    """Return the rows of ``df`` whose ``name`` is contained in ``query_list``."""
    keep = df["name"].isin(query_list)
    return df.loc[keep]