In [1]:
import glob
import gzip
import os
import shutil
from multiprocessing import Pool

import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.stats.multitest import fdrcorrection

# All relative paths used by the Fisher-test cells below (ip_count/, input_count/,
# Fisher_results/) are resolved against this working directory.
os.chdir("/home/galaxy/project/alleleSpecific_analysis/results/ASE_analysis/readCount/")

In [24]:
def generate_totalNum_dict(
    map_file="/home/galaxy/project/alleleSpecific_analysis/data/bam_wasp/total_Input_vs_IP.txt",
    summary_file="/home/galaxy/project/alleleSpecific_analysis/data/bam_wasp/stat/AllSamples.bam_stat.txt",
):
    """Pair every IP sample with its Input sample and both per-sample totals.

    Parameters
    ----------
    map_file : str
        Tab-separated table with a header row containing the columns ``IP``
        and ``Input``; each row pairs one IP sample with one Input sample.
    summary_file : str
        Header-less tab-separated table; the first column is a sample name and
        the second column is the number recorded for that sample.

    Returns
    -------
    dict
        ``{ip_sample: [ip_num, input_sample, input_num]}`` for every sample of
        ``summary_file`` that appears in the ``IP`` column of ``map_file``.

    Raises
    ------
    KeyError
        If the Input sample paired with an IP sample has no row in
        ``summary_file``.
    """
    df_map = pd.read_table(map_file)
    map_dict = dict(zip(df_map['IP'], df_map['Input']))

    df_stat = pd.read_table(summary_file, header=None)
    number_dict = dict(zip(df_stat.iloc[:, 0].tolist(), df_stat.iloc[:, 1].tolist()))

    totalNum_dict = {}
    for ip_sample, ip_num in number_dict.items():
        # Samples that are not listed as an IP (e.g. the Input samples themselves) are skipped.
        if ip_sample not in map_dict:
            continue
        input_sample = map_dict[ip_sample]
        if input_sample not in number_dict:
            raise KeyError(
                "Input sample %r (paired with IP sample %r) has no entry in %s"
                % (input_sample, ip_sample, summary_file)
            )
        # ip_sample: [ip_num, input_sample, input_num]
        totalNum_dict[ip_sample] = [ip_num, input_sample, number_dict[input_sample]]
    return totalNum_dict

In [31]:
# Module-level lookup {ip_sample: [ip_num, input_sample, input_num]}; read by
# preprocess_df() and by the driver loop that decides which samples to process.
totalNum_dict = generate_totalNum_dict()

def preprocess_df(ip_prefix):
    """Load, filter and depth-normalise the IP and Input read-count tables.

    The paired Input sample and both per-sample totals are looked up in the
    module-level ``totalNum_dict``. The count tables are read from
    ``ip_count/<ip_prefix>.readcounts.txt`` and
    ``input_count/<input_prefix>.readcounts.txt`` relative to the current
    working directory.

    Parameters
    ----------
    ip_prefix : str
        Key of ``totalNum_dict`` identifying the IP sample.

    Returns
    -------
    (DataFrame, DataFrame)
        ``(df_ip, df_input)``. Each keeps the columns contig, position,
        refAllele, altAllele, refCount, altCount, restricted to rows where both
        counts are >= 5, plus ``refRPKM_<treat>`` / ``altRPKM_<treat>`` columns
        (``treat`` is ``ip`` or ``input``) computed as ``count / total * 1e9``.
        Despite the column names, no feature length enters the computation here.
    """
    min_count = 5          # minimum reads required on BOTH alleles
    scale = 1000000000     # scaling factor applied after dividing by the sample total
    keep_cols = ['contig', 'position', 'refAllele', 'altAllele', 'refCount', 'altCount']

    ip_total, input_prefix, input_total = totalNum_dict[ip_prefix]
    sources = [
        ('ip', "ip_count/%s.readcounts.txt" % ip_prefix, ip_total),
        ('input', "input_count/%s.readcounts.txt" % input_prefix, input_total),
    ]

    df_results = []
    for treat, count_file, total_num in sources:
        df = pd.read_table(count_file)
        df_sub = df[keep_cols]
        covered = (df_sub['refCount'] >= min_count) & (df_sub['altCount'] >= min_count)
        # .copy() gives an independent frame, so the column assignments below do
        # not write into a slice of `df` (avoids SettingWithCopyWarning).
        df_sub = df_sub[covered].copy()
        df_sub['refRPKM_%s' % treat] = (df_sub['refCount'] / total_num) * scale
        df_sub['altRPKM_%s' % treat] = (df_sub['altCount'] / total_num) * scale
        df_results.append(df_sub)
    return df_results[0], df_results[1]

In [41]:
######################## Fisher exact test
# Directory (relative to the current working directory) that receives one
# <ip_prefix>.txt result table per sample.
result_dir = "Fisher_results"
# os.makedirs avoids spawning a shell and raises if the directory cannot be created.
os.makedirs(result_dir, exist_ok=True)
#
def fisher_exact_test_each(ip_prefix):
    """Run a per-SNP Fisher exact test (IP vs Input, ref vs alt) for one sample.

    The IP and Input tables returned by ``preprocess_df(ip_prefix)`` are joined
    on (contig, position, refAllele, altAllele); only SNPs present in both
    tables survive. For each SNP the 2x2 table
    ``[[ref_ip, alt_ip], [ref_input, alt_input]]`` is built from the rounded
    normalised values and passed to ``scipy.stats.fisher_exact``; p-values are
    then adjusted with ``fdrcorrection``.

    Parameters
    ----------
    ip_prefix : str
        IP sample identifier (a key of ``totalNum_dict``).

    Returns
    -------
    None
        Writes ``<result_dir>/<ip_prefix>.txt`` (tab-separated) with the
        columns contig, position, refAllele, altAllele, refRPKM_ratio,
        altRPKM_ratio, foldchange, pvalue, oddsratio, qvalue.
    """
    key_cols = ['contig', 'position', 'refAllele', 'altAllele']
    table_cols = ['refRPKM_ip', 'altRPKM_ip', 'refRPKM_input', 'altRPKM_input']

    result_file = os.path.join(result_dir, ip_prefix+".txt")
    df_ip, df_input = preprocess_df(ip_prefix)
    # Left join followed by dropna removes every SNP that has no Input row.
    df = df_ip.merge(df_input, on=key_cols, how='left').dropna(how="any")
    df['refRPKM_ratio'] = df['refRPKM_ip'] / df['refRPKM_input']
    df['altRPKM_ratio'] = df['altRPKM_ip'] / df['altRPKM_input']
    # foldchange > 1 means the alt allele is relatively enriched in IP over Input.
    df['foldchange'] = df['altRPKM_ratio'] / df['refRPKM_ratio']

    # NOTE(review): the contingency tables are built from depth-scaled values
    # (count / total * 1e9), not from raw read counts. Fisher's exact test is
    # defined on counts, so the scaling changes the effective sample size and
    # hence the p-values -- confirm this is intended.
    # Rounding once for the whole frame replaces the per-row iterrows() lookup.
    tables = df[table_cols].round().astype('int64')
    pvalue_list, odds_list = [], []
    for a, b, c, d in tables.itertuples(index=False, name=None):
        oddsratio, pvalue = stats.fisher_exact([[a, b], [c, d]])
        pvalue_list.append(pvalue)
        odds_list.append(oddsratio)

    qvalue_list = list(fdrcorrection(pvalue_list)[1])
    df['pvalue'], df['oddsratio'], df['qvalue'] = pvalue_list, odds_list, qvalue_list
    df_sub = df[key_cols + ['refRPKM_ratio', 'altRPKM_ratio', 'foldchange', 'pvalue', 'oddsratio', 'qvalue']]
    df_sub.to_csv(result_file, sep="\t", index=False)

# Samples that already have a result file are skipped, so the cell can be re-run
# after an interruption. splitext keeps sample names that contain a dot intact.
exist_file = [os.path.splitext(os.path.basename(x))[0] for x in glob.glob("Fisher_results/*.txt")]
prefix_list = totalNum_dict.keys()
done = set(exist_file)
undo_list = [x for x in prefix_list if x not in done]

pool = Pool()
pending = [(ip_prefix, pool.apply_async(fisher_exact_test_each, (ip_prefix, ))) for ip_prefix in undo_list]
pool.close()
pool.join()
# apply_async stores a worker's exception inside its AsyncResult; unless .get()
# is called, a failed sample disappears silently. Report failures and keep going.
for ip_prefix, task in pending:
    try:
        task.get()
    except Exception as err:
        print("fisher_exact_test_each failed for %s: %r" % (ip_prefix, err))
# for sample in ['CRR055549', 'CRR042296', 'CRR055561', 'CRR055551', 'CRR042316', 'CRR042282', 'CRR055563', 'mus_IP_4', 'CRR042306', 'CRR055559']:
#     print(sample)
#     fisher_exact_test_each(sample)

In [1]:
# os.chdir("/home/galaxy/project/alleleSpecific_analysis/results/ASE_analysis/readCount/Fisher_test/")
# result_dir = "sig01"
# os.system("mkdir -p %s" % result_dir)

# file_list = glob.glob("*.txt")
# for x in file_list:
#     df = pd.read_table(x)
#     # df_sub = df[(df['pvalue']<0.05) & (df['qvalue']<0.05)]
#     df_sub = df[df['qvalue']<0.1]
#     # df_sub = df_sub[(df_sub['foldchange']>1.5) | (df_sub['foldchange']<0.67)]
#     df_sub.to_csv(os.path.join(result_dir, x), sep="\t", index=False)


# os.chdir("/home/galaxy/project/alleleSpecific_analysis/results/ASE_analysis/readCount/Fisher_test/sig/filter/")
# result_dir = "bed"
# os.system("mkdir -p %s" % result_dir)
# file_list = glob.glob("*.txt")
# for x in file_list:
#     df = pd.read_table(x)
#     df['mark'] = np.where(df['foldchange']>1, "alt", "ref")
#     df['end'] = df['position'] + 1
#     df_sub = df[['contig','position','end', 'mark']]
#     df_sub.to_csv(os.path.join(result_dir, x.replace(".txt",".bed")), sep="\t", header=False, index=False)
    
#
os.chdir("/home/galaxy/project/alleleSpecific_analysis/results/ASE_analysis/readCount/Fisher_test/sig/bed/")
# Header-less two-column table: first column is the current bed file stem,
# second column is the stem it is renamed to.
map_file = "../../map.txt"
df = pd.read_table(map_file, sep="\t", header=None)
map_dict = dict(zip(df.iloc[:,0], df.iloc[:,1]))
print(map_dict['CRR073020'])
for crr, tissue in map_dict.items():
    src, dst = "%s.bed" % crr, "%s.bed" % tissue
    # os.replace overwrites an existing target like `mv`, but without passing the
    # names through a shell. Missing sources (e.g. on a re-run) are skipped.
    if os.path.exists(src):
        os.replace(src, dst)
    else:
        print("skip: %s not found" % src)

Lung-4-4


In [4]:
# Merge all samples that come from the same individual.
os.chdir("/home/galaxy/project/alleleSpecific_analysis/results/ASE_analysis/readCount/Fisher_test/sig/")
result_dir = "../by_ind/"
# os.makedirs avoids spawning a shell and raises if the directory cannot be created.
os.makedirs(result_dir, exist_ok=True)

# Maps each individual to the sample prefixes merged for that individual.
# Each prefix names a "<prefix>.txt" table read from the current working directory.
ind_dict = {"individual_1": ['brain_IP_1', 'heart_IP_1', 'liver_IP_1'],  
            "individual_2": ['brain_IP_2', 'heart_IP_2', 'pla_IP_2', 'kid_IP_2', 'liver_IP_2'],  
            "individual_3": ['brain_IP_3', 'heart_IP_3', 'liver_IP_3', 'kid_IP_3'], 
            "individual_4": ['sto_IP_4', 'mus_IP_4', 'lung_IP_4', 'kid_IP_4', 'pla_IP_4'], 
            "individual_5": ['sto_IP_5', 'mus_IP_5', 'lung_IP_5'], 
            "individual_6": ['pla_IP_6'],
            "individual_7": ['CRR073020', 'CRR042306', 'CRR042296', 'CRR042318', 'CRR042310', 'CRR042300', 'CRR042280', 'CRR042312', 'CRR055561', 'CRR055551', 'CRR055563', 'CRR042302', 'CRR042294', 'CRR042290'], 
            "individual_8": ['CRR042308', 'CRR042286', 'CRR042282', 'CRR042284', 'CRR042298', 'CRR042316', 'CRR042314', 'CRR055557', 'CRR055555', 'CRR042288', 'CRR055559', 'CRR042320', 'CRR042292'], 
            "individual_9": ['CRR073018', 'CRR055537', 'CRR055536', 'CRR055539', 'CRR055533'],
            "individual_10": ['CRR055549', 'CRR055547', 'CRR055545', 'CRR055543', 'CRR055541'],
            "individual_11": ['CRR055525', 'CRR055527', 'CRR042278', 'CRR055529', 'CRR055531', 'CRR042304'], 
            "individual_12": ['CRR055553'],
            "individual_13": ['CRR073016'],
           }
# Individuals to process (a live view of the dict keys).
ind_list = ind_dict.keys()

# Build one table per individual: the SNP key columns plus one
# "effect_<sample>" column (that sample's foldchange) per sample.
key_cols = ["contig", "position", "refAllele", "altAllele"]
for ind, sample_list in ind_dict.items():
    df = None
    for sample in sample_list:
        df_i = pd.read_table('%s.txt' % sample, sep="\t")
        # foldchange = alt_allele / ref_allele
        # Only the key and effect columns are carried into the merge. Merging the
        # full tables repeatedly makes pandas generate colliding "_x"/"_y"
        # suffixes from the fourth sample on, which recent pandas rejects.
        df_i = df_i[key_cols].assign(**{"effect_%s" % sample: df_i['foldchange']})
        df = df_i if df is None else df.merge(df_i, on=key_cols, how="outer")
    if len(sample_list) >= 2:
        # Keep SNPs measured in at least two samples: the key columns are always
        # present, so the threshold is len(key_cols) + 2 non-missing values.
        df = df.dropna(thresh=len(key_cols) + 2)
    df.to_csv(os.path.join(result_dir, ind+".txt"), sep="\t", index=False)

In [5]:
os.chdir("/home/galaxy/project/alleleSpecific_analysis/results/ASE_analysis/readCount/Fisher_test/by_ind/")
result_dir = "sig"
os.makedirs(result_dir, exist_ok=True)

# For every per-individual table keep the SNPs whose effect has the same
# direction in every sample where it was measured: all > 1.5 or all < 0.67.
ind_list = glob.glob("individual_*.txt")
for ind in ind_list:
    print(ind)
    df = pd.read_table(ind, sep="\t")
    sample_num = len([x for x in df.columns if "effect_" in x])
    if sample_num == 1:
        # A single sample cannot be checked for consistency; the table is copied as is.
        count = len(df)
        shutil.copy(ind, result_dir)
    else:
        # Everything after the four key columns is an effect column.
        effects = df.iloc[:, 4:]
        measured = effects.notna().sum(axis=1)
        # Comparisons with NaN are False, so only measured values are counted.
        all_up = (effects > 1.5).sum(axis=1) == measured
        all_down = (effects < 0.67).sum(axis=1) == measured
        consistent = (measured > 1) & (all_up | all_down)  # at least 2 samples
        df_sig = df[consistent]
        # to_csv writes missing effects as empty fields (the manual writer wrote
        # the literal "nan"); pandas reads both back as NaN.
        df_sig.to_csv(os.path.join(result_dir, ind), sep="\t", index=False)
        count = len(df_sig)
    print(count)

individual_6.txt
2117
individual_7.txt
1280
individual_2.txt
723
individual_3.txt
479
individual_5.txt
257
individual_9.txt
377
individual_13.txt
2000
individual_11.txt
1670
individual_8.txt
1423
individual_12.txt
2559
individual_1.txt
244
individual_4.txt
724
individual_10.txt
398


###################################################################################################
################################   allele-specific expression #####################################

In [3]:
from statsmodels.stats.multitest import fdrcorrection
os.chdir("/home/galaxy/project/alleleSpecific_analysis/results/ASE_analysis/readCount/input_count/")
result_dir = "../binomial_test/"
# The result directory must exist before to_csv can write into it.
os.makedirs(result_dir, exist_ok=True)

# scipy.stats.binom_test is deprecated and absent from recent SciPy releases;
# binomtest is its replacement. Fall back to the old name on older SciPy.
if hasattr(stats, "binomtest"):
    def _binom_pvalue(k, n, p):
        """Two-sided binomial test p-value for k successes out of n at probability p."""
        return stats.binomtest(k, n=n, p=p).pvalue
else:
    def _binom_pvalue(k, n, p):
        """Two-sided binomial test p-value for k successes out of n at probability p."""
        return stats.binom_test(k, n=n, p=p)

input_list = glob.glob("*.readcounts.txt")
for x in input_list:
    # "<sample>.readcounts.txt" -> "<sample>.binomials.txt"
    result_file = os.path.join(result_dir, x.replace("readcount", "binomial"))
    df = pd.read_table(x)
    # NOTE(review): the threshold here is strict (> 5) while preprocess_df uses >= 5 -- confirm.
    # .copy() so the new columns are not written into a slice of the raw table.
    df = df[(df['refCount'] > 5) & (df['altCount'] > 5)].copy()
    df['allelicRatio'] = df['altCount'] / (df['refCount'] + df['altCount'])
    # The sample-wide median alt fraction is the null probability (instead of 0.5).
    pro = np.median(df['allelicRatio'])
    print(x, pro)
    pvalue_list = []
    for ref, alt in zip(df['refCount'], df['altCount']):
        # binomtest requires true integers.
        ref, alt = int(ref), int(alt)
        pvalue_list.append(_binom_pvalue(alt, ref + alt, pro))
    qvalue_list = list(fdrcorrection(pvalue_list)[1])
    df['bino_pvalue'] = pvalue_list
    df['qvalue'] = qvalue_list
    df.to_csv(result_file, sep="\t", index=False)

CRR042293.readcounts.txt 0.44
CRR055538.readcounts.txt 0.4122137404580153
CRR042299.readcounts.txt 0.4444444444444444
CRR042305.readcounts.txt 0.45454545454545453
sto_4_L3.readcounts.txt 0.4857142857142857
CRR055562.readcounts.txt 0.4074074074074074
CRR055542.readcounts.txt 0.4239062915964149
CRR055532.readcounts.txt 0.43478260869565216
CRR042311.readcounts.txt 0.4358974358974359
CRR055528.readcounts.txt 0.4666666666666667
CRR042307.readcounts.txt 0.4375
lung_5_L7.readcounts.txt 0.4827586206896552
brain_1_L6.readcounts.txt 0.4888888888888889
kid_3_L8.readcounts.txt 0.49612403100775193
CRR042315.readcounts.txt 0.4375
CRR042297.readcounts.txt 0.4375
liver_3_L6.readcounts.txt 0.5
CRR073017.readcounts.txt 0.42857142857142855
liver_1_L6.readcounts.txt 0.48936170212765956
CRR055535.readcounts.txt 0.41379310344827586
CRR042317.readcounts.txt 0.4444444444444444
CRR055544.readcounts.txt 0.43159536541889487
CRR042285.readcounts.txt 0.4375
CRR042309.readcounts.txt 0.44986772486772486
CRR042291.re

In [4]:
os.chdir("/home/galaxy/project/alleleSpecific_analysis/results/ASE_analysis/readCount/binomial_test/")
result_dir = "sig"
# os.makedirs avoids spawning a shell and raises if the directory cannot be created.
os.makedirs(result_dir, exist_ok=True)

# Keep only the SNPs with both the raw and the FDR-adjusted p-value below 0.05,
# writing each filtered table under sig/ with the same file name.
file_list = glob.glob("*.txt")
for x in file_list:
    df = pd.read_table(x)
    significant = (df['bino_pvalue'] < 0.05) & (df['qvalue'] < 0.05)
    df_sub = df[significant]
    df_sub.to_csv(os.path.join(result_dir, x), sep="\t", index=False)

In [7]:
# Merge all samples that come from the same individual.
os.chdir("/home/galaxy/project/alleleSpecific_analysis/results/ASE_analysis/readCount/binomial_test/sig/")
result_dir = "../by_ind/"
# os.makedirs avoids spawning a shell and raises if the directory cannot be created.
os.makedirs(result_dir, exist_ok=True)

# Maps each individual to the sample prefixes merged for that individual.
# Each prefix names a "<prefix>.binomials.txt" table read from the current working directory.
ind_dict = {"individual_1": ['brain_1_L6', 'heart_1_L6', 'liver_1_L6'],  
            "individual_2": ['brain_2_L7', 'heart_2_L7', 'pla_2_L3', 'kid_2_L7', 'liver_2_L7'],  
            "individual_3": ['brain_3_L68', 'heart_3_L6', 'liver_3_L6', 'kid_3_L8'], 
            "individual_4": ['sto_4_L3', 'mus_4_L6', 'lung_4_L7', 'kid_4_L6', 'pla_4_L6'], 
            "individual_5": ['sto_5_L7', 'mus_5_L7', 'lung_5_L7'], 
            "individual_6": ['pla_6_L6'],
            "individual_7": ['CRR073021', 'CRR042307', 'CRR042297', 'CRR042319', 'CRR042311', 'CRR042301', 'CRR042281', 'CRR042313', 'CRR055562', 'CRR055552', 'CRR055564', 'CRR042303', 'CRR042295', 'CRR042291'], 
            "individual_8": ['CRR042309', 'CRR042287', 'CRR042283', 'CRR042285', 'CRR042299', 'CRR042317', 'CRR042315', 'CRR055558', 'CRR055556', 'CRR042289', 'CRR055560', 'CRR042321', 'CRR042293'], 
            "individual_9": ['CRR073019', 'CRR055538', 'CRR055535', 'CRR055540', 'CRR055534'],
            "individual_10": ['CRR055550', 'CRR055548', 'CRR055546', 'CRR055544', 'CRR055542'],
            "individual_11": ['CRR055526', 'CRR055528', 'CRR042279', 'CRR055530', 'CRR055532', 'CRR042305'], 
            "individual_12": ['CRR055554'],
            "individual_13": ['CRR073017'],
           }
# Individuals to process (a live view of the dict keys).
ind_list = ind_dict.keys()

# Build one table per individual: the SNP key columns plus one
# "effect_<sample>" column (altCount / refCount of that sample) per sample.
key_cols = ["contig", "position", "refAllele", "altAllele"]
for ind, sample_list in ind_dict.items():
    df = None
    for sample in sample_list:
        df_i = pd.read_table('%s.binomials.txt' % sample, sep="\t")
        # Only the key and effect columns are carried into the merge. Merging the
        # full tables repeatedly makes pandas generate colliding "_x"/"_y"
        # suffixes from the fourth sample on, which recent pandas rejects.
        df_i = df_i[key_cols].assign(**{"effect_%s" % sample: df_i['altCount'] / df_i['refCount']})
        df = df_i if df is None else df.merge(df_i, on=key_cols, how="outer")
    if len(sample_list) >= 2:
        # Keep SNPs measured in at least two samples: the key columns are always
        # present, so the threshold is len(key_cols) + 2 non-missing values.
        df = df.dropna(thresh=len(key_cols) + 2)
    df.to_csv(os.path.join(result_dir, ind+".txt"), sep="\t", index=False)

In [8]:
os.chdir("/home/galaxy/project/alleleSpecific_analysis/results/ASE_analysis/readCount/binomial_test/by_ind/")
result_dir = "sig"
os.makedirs(result_dir, exist_ok=True)

# For every per-individual table keep the SNPs whose alt/ref ratio lies on the
# same side of 1 in every sample where it was measured (all >= 1 or all < 1).
ind_list = glob.glob("individual_*.txt")
for ind in ind_list:
    print(ind)
    df = pd.read_table(ind, sep="\t")
    sample_num = len([x for x in df.columns if "effect_" in x])
    if sample_num == 1:
        # A single sample cannot be checked for consistency; the table is copied as is.
        count = len(df)
        shutil.copy(ind, result_dir)
    else:
        # Everything after the four key columns is an effect column.
        effects = df.iloc[:, 4:]
        measured = effects.notna().sum(axis=1)
        # Comparisons with NaN are False, so only measured values are counted.
        all_up = (effects >= 1).sum(axis=1) == measured
        all_down = (effects < 1).sum(axis=1) == measured
        consistent = (measured > 1) & (all_up | all_down)  # at least 2 samples
        df_sig = df[consistent]
        # to_csv writes missing effects as empty fields (the manual writer wrote
        # the literal "nan"); pandas reads both back as NaN.
        df_sig.to_csv(os.path.join(result_dir, ind), sep="\t", index=False)
        count = len(df_sig)
    print(count)

individual_6.txt
1980
individual_7.txt
4095
individual_2.txt
927
individual_3.txt
851
individual_5.txt
522
individual_9.txt
1372
individual_13.txt
33338
individual_11.txt
4492
individual_8.txt
4944
individual_12.txt
15735
individual_1.txt
319
individual_4.txt
1248
individual_10.txt
1266
