In [4]:
import os
import sys
# sys.setdefaultencoding("utf-8")
import shutil
import glob
import pandas as pd
import numpy as np
from scipy import stats
import subprocess
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
from multiprocessing import Pool
from collections import Counter

In [36]:
# Per-tissue test of whether lincRNAs found in the tissue's m6a BED overlap
# enhancer regions more often than lincRNAs that are not.
# NOTE: this cell calls read_bed, process_single_tissue, stat_intersect_num and
# fisher, whose definitions appear in later cells of this file; those cells
# must be executed before this one on a fresh kernel.
os.chdir("/data5/galaxy/project/lncRNA_analysis/enhancer")
data_dir = "/data3/xs/tissue_m6a/paper_data/enhancer/111_genome/8tissue_markers_2/enhancers/"
# Every file matching "*7_Enh.bed" is processed as one tissue's enhancer set
# (data_dir already ends with "/", so the pattern contains "//").
bed_list = glob.glob("%s/*7_Enh.bed" % data_dir)
# lincRNA BED, loaded once and passed to process_single_tissue for every tissue.
lnc_bed = "/data3/xs/tissue_m6a/2018.1/GRCh38_segment/intergenic_lincRNA.bed"
df_lnc = read_bed(lnc_bed)
total_gene_dir = "/data5/galaxy/project/promoter_TF_enrich/data/total_gene/gene_bed"
# m6a_gene_dir is also read as a global inside process_single_tissue.
m6a_gene_dir = "/data5/galaxy/project/promoter_TF_enrich/data/m6a_gene/gene_bed"
# 2x2 contingency table handed to fisher() for each tissue:
#
#                 m6a-lincRNA   unm6a-lincRNA
# enhancer             a              b
# non_enhancer         c              d
for enhancer in bed_list:
    # Tissue key = lower-cased file-name prefix before the first "_"; it
    # selects "<tissue>.bed" inside total_gene_dir.
    tissue = os.path.basename(enhancer).split("_")[0].lower()
    print(tissue)
    gene_bed = "%s/%s.bed" % (total_gene_dir, tissue)
    df_gene = read_bed(gene_bed)
    # Names of the df_lnc entries found in gene_bed, split by whether they
    # also appear in the tissue's BED under m6a_gene_dir.
    m6a_names, unm6a_names = process_single_tissue(gene_bed, df_lnc)
    df_m6a, df_unm6a = df_gene[df_gene["name"].isin(m6a_names)], df_gene[df_gene["name"].isin(unm6a_names)]
    # a, b: rows that intersect the enhancer BED, as counted by stat_intersect_num.
    a = stat_intersect_num(df_m6a, enhancer)
    b = stat_intersect_num(df_unm6a, enhancer)
    # c, d: the remaining (non-intersecting) rows.
    # NOTE(review): stat_intersect_num counts distinct lines, so this
    # subtraction assumes the rows of df_m6a / df_unm6a are unique - confirm.
    c, d = len(df_m6a) - a, len(df_unm6a) - b
    print(a, b, c, d)
    # fisher() prints the odds ratio and the p-value for this tissue.
    fisher(a, b, c, d)

liver
305 686 180 1133
2.7985503725299643 3.865525529628486e-23
heart
358 952 143 1376
3.6184991479109128 5.426890224773739e-36
placenta
438 520 252 942
3.1486263736263735 5.745688587399145e-34
brain
260 651 576 2505
1.736911162314388 4.4937427463108555e-10
muscle
380 476 275 829
2.406569900687548 1.761948954894609e-19
stomach
220 494 196 1013
2.301702057341155 1.4582434724534782e-13
kidney
431 425 536 1262
2.3877172958735735 3.0062356081848302e-24
lung
275 646 196 934
2.0285745877298287 2.454264734157084e-11


In [8]:
def fisher(a, b, c, d):
    """Run Fisher's exact test on the 2x2 table [[a, b], [c, d]] and print the result.

    Parameters
    ----------
    a, b : int
        Counts in the first row of the contingency table.
    c, d : int
        Counts in the second row of the contingency table.

    Returns
    -------
    None
        The odds ratio and the p-value are printed on one line, separated by
        a space; nothing is returned.
    """
    contingency = [[a, b], [c, d]]
    odds, p_val = stats.fisher_exact(contingency)
    print(odds, p_val)

In [28]:
def process_single_tissue(gene_bed, df_lnc):
    """Split the ``df_lnc`` entries found in ``gene_bed`` by presence in the matching m6a BED.

    The m6a BED is looked up in the module-level ``m6a_gene_dir`` under the
    same file name as ``gene_bed``. Both BED files are loaded with
    ``read_bed``.

    Parameters
    ----------
    gene_bed : str
        Path to the BED file of the tissue.
    df_lnc : pandas.DataFrame
        Frame with a ``name`` column, as returned by ``read_bed``.

    Returns
    -------
    tuple of (list, list)
        ``(m6a_names, unm6a_names)``: names of the ``df_lnc`` entries that are
        present in ``gene_bed`` and, respectively, present in / absent from
        the m6a BED. Row order follows the merged frame and duplicates
        produced by the merges are kept.
    """
    # The m6a BED shares its file name with the gene BED.
    m6a_bed = os.path.join(m6a_gene_dir, os.path.basename(gene_bed))
    df_gene, df_m6a = read_bed(gene_bed), read_bed(m6a_bed)

    # Right merge + dropna keeps only df_lnc names that also occur in the
    # gene BED (and drops any row with a missing value in any column).
    df_one = df_gene.merge(df_lnc, on="name", how="right").dropna()
    # Left merge + dropna keeps only the rows whose name is in the m6a BED.
    df_two = df_one.merge(df_m6a, on="name", how="left").dropna()

    m6a_names = df_two["name"].tolist()
    # One vectorised membership mask replaces the former per-element
    # `x not in df_two["name"].tolist()` scan, which was quadratic.
    in_m6a = df_one["name"].isin(df_two["name"])
    unm6a_names = df_one.loc[~in_m6a, "name"].tolist()
    return m6a_names, unm6a_names

In [34]:
def stat_intersect_num(df, enhancer_bed):
    """Count the distinct rows of ``df`` that ``bedtools intersect`` reports against ``enhancer_bed``.

    ``df`` is serialised as tab-separated text (no header, no index) and fed
    to ``bedtools intersect -a stdin -b <enhancer_bed> -wa``. Identical output
    lines are counted once.

    Parameters
    ----------
    df : pandas.DataFrame
        BED-like frame; its columns are written out in order.
    enhancer_bed : str
        Path of the BED file passed to ``-b``.

    Returns
    -------
    int
        Number of distinct lines emitted by bedtools; 0 for an empty frame.

    Raises
    ------
    FileNotFoundError
        If the ``bedtools`` executable cannot be found.
    subprocess.CalledProcessError
        If bedtools exits with a non-zero status. The previous shell pipeline
        hid such failures and returned 0.
    """
    # Nothing to intersect; also avoids piping an empty stream to bedtools.
    if df.empty:
        return 0

    # to_csv keeps every field intact (the former to_string + split() approach
    # broke fields containing whitespace). na_rep="NaN" preserves how missing
    # values were rendered before.
    bed_text = df.to_csv(sep="\t", header=False, index=False, na_rep="NaN")

    # Argument list instead of a shell string: the path needs no quoting and
    # the exit status checked is bedtools' own, not that of a trailing `wc`.
    completed = subprocess.run(
        ["bedtools", "intersect", "-a", "stdin", "-b", enhancer_bed, "-wa"],
        input=bed_text.encode("utf-8"),
        stdout=subprocess.PIPE,
        check=True,
    )

    # Equivalent of `sort | uniq | wc -l`.
    hit_lines = completed.stdout.decode("utf-8").splitlines()
    return len(set(hit_lines))

In [7]:
def read_bed(in_bed):
    """Load a whitespace-delimited, header-less 6-column BED file.

    Parameters
    ----------
    in_bed : str or file-like
        Path (or open handle) of the BED file.

    Returns
    -------
    pandas.DataFrame
        Columns ``chr, start, end, name, s, strand``. ``name`` is read as a
        string and truncated at the first "." (e.g. "ID.5" becomes "ID").
    """
    columns = ["chr", "start", "end", "name", "s", "strand"]
    # Raw string: "\s" inside a normal string literal is an invalid escape
    # sequence (a warning today, slated to become an error).
    # dtype: force `name` to str so the .str accessor below also works when
    # every name in the file looks numeric.
    df = pd.read_table(in_bed, sep=r"\s+", header=None, names=columns, dtype={"name": str})
    # Keep only the part of the name before the first ".".
    df["name"] = df["name"].str.split(".").str[0]
    return df