In [74]:
import os
import re
import glob
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import subprocess
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [84]:
# Reference genome FASTA handed to `bedtools getfasta`.
reference_genome = "/data/database/GRCh38/GENCODE/GRCh38.primary_assembly.genome.fa"

# Directory holding one BED file per transcription factor
# (presumably peaks lifted over from hg19 to GRCh38, judging by the path -- confirm).
tf_dir = "/data5/galaxy/project/what/TF_narrowPeak/total_TF_narrowPeak/03_hg19_to_GRCh38"
cebpz, smad2, stat3 = (
    "%s/%s.bed" % (tf_dir, tf_name) for tf_name in ("CEBPZ", "SMAD2", "STAT3")
)
# Only these TFs are processed; `stat3` is defined but not included here.
tf_list = [cebpz, smad2]

# High- and low-CpG region BED files; they are concatenated into one file before use.
high_bed = "/data5/galaxy/project/CpG_m6a_motif/fasta_seq/high_CpG.bed"
low_bed = "/data5/galaxy/project/CpG_m6a_motif/fasta_seq/low_CpG.bed"

# Tab-separated output table (tf, type, value) written by the driver cell.
result_file = "/data5/galaxy/project/tf_analysis/tf_and_PromoterCpG/result.bed"

In [89]:
# Driver cell: merge the CpG region files, score every TF in `tf_list`, and write the table.
# NOTE: this cell calls `process_sinal_tf`, which is defined in a later cell of this
# notebook; on a fresh kernel the definition cells must be run before this one.
# `CpG_bed` is also read as a global by `get_intersect_bed`.
CpG_bed = "/data5/galaxy/shell_dir/2018_3_17/jupyter_shell/total_CpG.bed"
# Concatenate the high/low CpG BED files, sort by column 1 then numerically by column 2,
# and drop duplicate lines. The exit status of the shell pipeline is not checked.
os.system("cat %s %s | sort -k1,1 -k2,2n | uniq > %s" % (high_bed, low_bed, CpG_bed))
# One long-format DataFrame (tf, type, value) per transcription factor.
df_list = []
for tf in tf_list:
    df = process_sinal_tf(tf)
    df_list.append(df)
# Stack all TFs and write a headerless, index-free, tab-separated file.
df_result = pd.concat(df_list)
df_result.to_csv(result_file, sep="\t", header=False, index=False)
# Cleanup of the merged CpG file is disabled, so the file is left on disk.
# os.remove(CpG_bed)

0

CEBPZ.bed
positive number: 20098		negative number: 3640
>0.35 fraction: 0.652204
SMAD2.bed
positive number: 16745		negative number: 6993
>0.35 fraction: 0.697104


In [85]:
def process_sinal_tf(tf):
    """Score CpG regions for a single TF BED file and print summary counts.

    Computes CpG density scores for the "positive" and "negative" region sets
    returned by ``enrich_CpG_in_promoter``, prints the size of each set and the
    fraction of positive scores above 0.35, and returns the scores as a
    long-format DataFrame.

    Args:
        tf: Path to the transcription factor BED file.

    Returns:
        The DataFrame built by ``format_dataframe`` with columns
        ``tf``, ``type`` and ``value``.
    """
    print(os.path.basename(tf))
    # Only the scores are needed here; the FASTA titles are discarded.
    p_score_list = enrich_CpG_in_promoter(tf, "positive")[1]
    n_score_list = enrich_CpG_in_promoter(tf, "negative")[1]
    df = format_dataframe(tf, p_score_list, n_score_list)
    print("positive number: %d\t\tnegative number: %d" % (len(p_score_list), len(n_score_list)))
    sig_count = sum(1 for x in p_score_list if x > 0.35)
    # With no positive regions the fraction is undefined; report NaN instead of
    # raising ZeroDivisionError so the remaining TFs are still processed.
    if p_score_list:
        fraction = sig_count / len(p_score_list)
    else:
        fraction = float("nan")
    print(">0.35 fraction: %f" % fraction)
    return df

In [81]:
def format_dataframe(tf, p_score_list, n_score_list):
    """Build a long-format table of positive and negative scores for one TF.

    Args:
        tf: Path to the TF BED file; the TF name is the basename up to the first ".".
        p_score_list: Scores labelled "positive".
        n_score_list: Scores labelled "negative".

    Returns:
        DataFrame with columns ``tf``, ``type``, ``value``; positive rows come
        first, followed by negative rows. Each part keeps its own 0-based index
        (the index is not reset after concatenation).
    """
    tf_name = os.path.basename(tf).split(".")[0]

    def _labelled_frame(scores, label):
        # One row per score, tagged with the TF name and the given label.
        n_rows = len(scores)
        return pd.DataFrame(
            {"tf": [tf_name] * n_rows, "type": [label] * n_rows, "value": scores}
        )

    return pd.concat(
        [
            _labelled_frame(p_score_list, "positive"),
            _labelled_frame(n_score_list, "negative"),
        ]
    )

In [86]:
def enrich_CpG_in_promoter(tf, data_type):
    """Fetch sequences for the selected CpG regions and score each one.

    The regions are chosen by ``get_intersect_bed`` and their sequences are
    retrieved by ``get_sequence_from_bed``. The returned text is treated as
    alternating title / sequence lines; a trailing unpaired line is ignored.

    Args:
        tf: Path to the TF BED file.
        data_type: Selector forwarded to ``get_intersect_bed``
            ("positive" or "negative").

    Returns:
        Tuple ``(title_list, score_list)`` of equal length, where each score is
        ``calculate_CpG_density`` of the matching sequence line.
    """
    region_bed = get_intersect_bed(tf, data_type)
    fasta_lines = str(get_sequence_from_bed(region_bed)).split("\n")
    title_list = []
    score_list = []
    # Even-indexed lines are titles, odd-indexed lines are their sequences.
    for title, seq in zip(fasta_lines[0::2], fasta_lines[1::2]):
        title_list.append(title)
        score_list.append(calculate_CpG_density(seq))
    return title_list, score_list

In [88]:
def get_intersect_bed(tf, data_type):
    """Select CpG regions that do ("positive") or do not ("negative") overlap a TF.

    Runs ``bedtools intersect`` with the module-level ``CpG_bed`` file as ``-a``
    and ``tf`` as ``-b``, then de-duplicates the reported lines with
    ``sort | uniq``. The "negative" mode adds ``-v`` to the same command.

    Args:
        tf: Path to the TF BED file.
        data_type: Either "positive" or "negative".

    Returns:
        The selected BED lines as one string, with the fields of every line
        re-joined by single tabs and lines separated by "\\n". Empty string when
        the pipeline produces no output.

    Raises:
        ValueError: If ``data_type`` is neither "positive" nor "negative"
            (previously this surfaced as an UnboundLocalError on ``command``).
    """
    if data_type == "positive":
        command = "bedtools intersect -a %s -b %s -wa -F 0.5 | sort | uniq" % (CpG_bed, tf)
    elif data_type == "negative":
        command = "bedtools intersect -a %s -b %s -wa -v -F 0.5 | sort | uniq" % (CpG_bed, tf)
    else:
        raise ValueError(
            "data_type must be 'positive' or 'negative', got %r" % (data_type,)
        )
    # NOTE(review): the pipeline's exit status is not checked, so a failing
    # bedtools call yields an empty result rather than an error.
    sub_p = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
    bed_str = sub_p.communicate()[0].decode("utf-8").strip()
    # Normalise any run of whitespace between fields to a single tab.
    final_str = "\n".join("\t".join(line.split()) for line in bed_str.split("\n"))
    return final_str

def get_sequence_from_bed(in_bed):
    """Return the ``bedtools getfasta`` output for BED text supplied on stdin.

    Args:
        in_bed: BED records as a single string; it is UTF-8 encoded and piped
            to the command's standard input.

    Returns:
        The command's standard output decoded as UTF-8. The sequences are read
        from the module-level ``reference_genome`` FASTA.
    """
    getfasta_cmd = "bedtools getfasta -fi %s -bed stdin" % (reference_genome)
    # `input=` makes run() open a stdin pipe and feed it the BED text.
    completed = subprocess.run(
        getfasta_cmd,
        shell=True,
        input=in_bed.encode("utf-8"),
        stdout=subprocess.PIPE,
    )
    return completed.stdout.decode("utf-8")

def calculate_CpG_density(sequence):
    """Compute a length-normalised CpG score for a nucleotide sequence.

    The score is ``cg / (((c + g) / 2) ** 2 / length)``, where ``cg`` is the
    number of non-overlapping "cg" dinucleotides, ``c`` and ``g`` are the base
    counts, and ``length`` is ``len(sequence)``. Matching is case-insensitive.

    Args:
        sequence: Nucleotide sequence string.

    Returns:
        The score as a float. Returns 0.0 when the sequence is empty or holds
        no C or G, where the ratio is undefined and no CpG can occur
        (previously this raised ZeroDivisionError).
    """
    seq = sequence.lower()
    cg_num = seq.count("cg")
    gc_total = seq.count("c") + seq.count("g")
    if gc_total == 0:
        # Covers the empty sequence too: no C/G means a zero denominator.
        return 0.0
    expected = math.pow(gc_total / 2.0, 2) / (len(seq) * 1.0)
    return cg_num / expected