In [1]:
import os
import sys
import csv
import datetime
import shutil
import random
import glob
import pandas as pd
import numpy as np
from scipy import stats
import subprocess
from multiprocessing import Pool

In [8]:
# Input/output locations used by the first stage (building one BED file per miRNA).
# Tab-separated miRNA-target interaction table; it is loaded with pd.read_table below.
# NOTE(review): this path says human_hg19 while HapMap_snp says GRCh38 and later cells
# read from a GRCh38_bed_dir - presumably a liftover happens outside this notebook; confirm.
mirna = "/home/galaxy/data/miRNA_area/starBase2/human_hg19/miRNA_interaction.txt"
# Directory holding the m6a-related eQTL SNP BED files (later cells glob it for *.bed).
eQTL_dir = "/home/galaxy/data/QTL/m6a_related_SNPs/eQTL/m6a_qtl_bed/"
# BED file of HapMap SNPs; fisher_test intersects it with each feature BED as the background row.
HapMap_snp = "/home/galaxy/data/QTL/HapMap_SNP/autosomal_GRCh38_snp.bed"
# Directory that format_data writes its "<miRNA>.bed" files into.
# NOTE(review): result_dir is reassigned by later cells and format_data reads it at call
# time, so re-running format_data after those cells writes to a different directory.
result_dir = "/home/galaxy/data/miRNA_area/starBase2/human_hg19/bed_dir"

In [4]:
# Read the tab-separated interaction table into memory, then preview its first five rows.
df_total = pd.read_table(filepath_or_buffer=mirna, sep="\t")
df_total.head(5)

Unnamed: 0,name,geneName,position
0,hsa-miR-200b-3p,DEK,chr6:18225142-18225166[-]
1,hsa-miR-200b-3p,PELO,chr5:52097819-52097842[+]
2,hsa-miR-200b-3p,FBN2,chr5:127593766-127593772[-]
3,hsa-miR-200b-3p,MAN2A1,chr5:109202925-109202945[+]
4,hsa-miR-200b-3p,RNF160,chr21:30301135-30301157[-]


In [26]:
# Write one BED file per miRNA: every group of rows sharing a "name" is handed to format_data.
# format_data is defined in the cell listed after this one, so that cell must be run first.
rows_by_mirna = df_total.groupby(["name"])
rows_by_mirna.apply(format_data)

In [25]:
def format_data(df, out_dir=None):
    """Write the interaction rows of one miRNA as a sorted, headerless BED-like file.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for a single miRNA. Must contain the columns ``name``, ``geneName``
        and ``position``; ``position`` is expected to look like
        ``chr6:18225142-18225166[-]``. The frame is NOT modified.
    out_dir : str, optional
        Directory to write into. When omitted, the module-level ``result_dir``
        is used (the original behaviour, kept so ``groupby(...).apply(format_data)``
        still works unchanged).

    Returns
    -------
    None
        The result is the side effect: ``<out_dir>/<miRNA name>.bed`` with the
        tab-separated columns chromosome, start, end and
        ``name;geneName;position``, sorted by chromosome (as text) and start
        (numerically).

    Raises
    ------
    ValueError
        If a ``position`` value does not match ``<chrom>:<start>-<end>``.
    """
    target_dir = result_dir if out_dir is None else out_dir
    # All rows belong to one miRNA, so the first name identifies the output file.
    mirna_name = df["name"].iloc[0]

    # One anchored regex instead of chained splits: the greedy first group keeps
    # working even if the contig name itself contains ':' or '-'.
    coords = df["position"].str.extract(r"^(.+):(\d+)-(\d+)", expand=True)

    # Build a fresh frame rather than adding columns to `df`; `df` is a slice of
    # the caller's table when this runs under groupby.apply.
    # Coordinates are copied verbatim; no 0-/1-based conversion is applied.
    bed = pd.DataFrame(
        {
            "Chr name": coords[0],
            "Start": coords[1].astype(int),
            "End": coords[2].astype(int),
            "ID": df["name"] + ";" + df["geneName"] + ";" + df["position"],
        },
        columns=["Chr name", "Start", "End", "ID"],
    )
    bed = bed.sort_values(["Chr name", "Start"])

    result_file = os.path.join(target_dir, "%s.bed" % mirna_name)
    bed.to_csv(result_file, sep="\t", header=False, index=False)

#### Generate Data Table

In [33]:
# Map m6a-related eQTL SNPs to the miRNA sites
# For every (eQTL BED, miRNA BED) pair, run `bedtools intersect -wa -wb` and store the
# overlaps in "<eQTL name>-<miRNA file name>" inside result_dir.
##############################################################################################
data_dir = "/home/galaxy/data/miRNA_area/starBase2/GRCh38_bed_dir"
eqtl_dir = "/home/galaxy/data/QTL/m6a_related_SNPs/eQTL/m6a_qtl_bed/"
# NOTE: this rebinds the module-level result_dir that format_data also reads.
result_dir = "/home/galaxy/project/QTL_analysis/related_with_miRNA/eQTL/data_table"
##############################################################################################
mirna_list = glob.glob("%s/*.bed" % data_dir)
eqtl_list = glob.glob("%s/*.bed" % eqtl_dir)

# The output redirect cannot succeed if the directory is missing, so create it up front.
if not os.path.isdir(result_dir):
    os.makedirs(result_dir)

for eqtl in eqtl_list:
    # The loop variable is deliberately not called `mirna`: that name holds the path of the
    # interaction table in the configuration cell and must not be overwritten.
    for mirna_bed in mirna_list:
        basename = "%s-%s" % (os.path.basename(eqtl).split(".bed")[0], os.path.basename(mirna_bed))
        result_file = os.path.join(result_dir, basename)
        # argv list + explicit file handle instead of a shell string: paths containing
        # spaces or shell metacharacters are passed through untouched.
        with open(result_file, "w") as out_handle:
            status = subprocess.call(
                ["bedtools", "intersect", "-a", eqtl, "-b", mirna_bed, "-wa", "-wb"],
                stdout=out_handle,
            )
        if status != 0:
            # Keep going (as the original did) but make the failure visible.
            sys.stderr.write(
                "bedtools intersect exited with status %d for %s vs %s\n" % (status, eqtl, mirna_bed)
            )

#### Enrichment analysis

In [30]:
# Inputs for the enrichment (Fisher exact test) stage.
# Directory of per-miRNA BED files; every *.bed in it is tested as one feature set.
data_dir = "/home/galaxy/data/miRNA_area/starBase2/GRCh38_bed_dir"
mirna_list = glob.glob("%s/*.bed" % data_dir)
# Directory of eQTL SNP BED files; one result file is written per BED found here.
eQTL_dir = "/home/galaxy/data/QTL/m6a_related_SNPs/eQTL/m6a_qtl_bed/"
eQTL_list = glob.glob("%s/*.bed" % eQTL_dir)
# Background total used by fisher_test: it computes d = HapMap_NUMBER - c.
# NOTE(review): presumably the number of records in HapMap_snp - confirm, and keep the
# two in sync if that file is ever regenerated.
HapMap_NUMBER = 81039058
# Output directory for the per-eQTL Fisher test tables (rebinds the module-level result_dir).
result_dir = "/home/galaxy/project/QTL_analysis/related_with_miRNA/eQTL/Fisher_exact_test"

In [31]:
def write_to_file(res_list, res_file):
    """Resolve every pending result and save them, one per line, to ``res_file``.

    ``res_list`` holds objects exposing ``.get()`` (e.g. the handles returned by
    ``Pool.apply_async``). All results are collected before the file is opened,
    so a failing ``.get()`` leaves any existing file untouched.
    """
    lines = []
    for pending in res_list:
        lines.append("%s\n" % pending.get())
    with open(res_file, "w") as handle:
        handle.write("".join(lines))

def _count_unique_overlaps(a_bed, b_bed):
    """Count the distinct records of ``a_bed`` that overlap an interval in ``b_bed``."""
    # argv list (no shell): paths are passed verbatim, and a bedtools failure raises
    # CalledProcessError instead of silently being counted as zero overlaps.
    raw = subprocess.check_output(
        ["bedtools", "intersect", "-a", a_bed, "-b", b_bed, "-wa"]
    )
    # Same count as `| sort | uniq | wc -l`: number of distinct output lines.
    return len(set(raw.splitlines()))


def fisher_test(eqtl_bed, RPB_bed):
    """Fisher exact test for enrichment of QTL SNPs inside the intervals of ``RPB_bed``.

    Contingency table (the HapMap SNP set is the background row)::

                        in RPB_bed      not in RPB_bed
        QTL SNPs            a           QTL_num - a
        HapMap SNPs         c           HapMap_NUMBER - c

    Parameters
    ----------
    eqtl_bed : str
        Path to the BED file of QTL SNPs.
    RPB_bed : str
        Path to the BED file of the feature intervals being tested.

    Returns
    -------
    str
        Tab-separated line: eQTL file name up to its first ``-``, ``RPB_bed``
        file name, odds ratio, p-value. The p-value is written with ``%g`` so
        that values below 5e-7 are not printed as ``0.000000``.

    Raises
    ------
    subprocess.CalledProcessError
        If ``bedtools intersect`` exits with a non-zero status.
    """
    QTL_num = stat_file_line_numbers(eqtl_bed)
    a = _count_unique_overlaps(eqtl_bed, RPB_bed)
    c = _count_unique_overlaps(HapMap_snp, RPB_bed)
    b, d = QTL_num - a, HapMap_NUMBER - c
    oddsratio, pvalue = stats.fisher_exact([[a, b], [c, d]])
    line = "%s\t%s\t%f\t%g" % (
        os.path.basename(eqtl_bed).split("-")[0],
        os.path.basename(RPB_bed),
        oddsratio,
        pvalue,
    )
    return line

def stat_file_line_numbers(in_file):
    """Return the number of lines (records) in ``in_file``.

    The file is read in binary chunks, so no shell is involved: paths with
    spaces or shell metacharacters work and no pipe is left open. A final line
    that lacks a trailing newline is counted as a line (``wc -l`` would miss it).

    Parameters
    ----------
    in_file : str
        Path of the file to count.

    Returns
    -------
    int
        Line count; 0 for an empty file.

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.
    """
    line_num = 0
    last_byte = b""
    with open(in_file, "rb") as handle:
        # 1 MiB chunks keep memory flat even for very large BED files.
        for chunk in iter(lambda: handle.read(1 << 20), b""):
            line_num += chunk.count(b"\n")
            last_byte = chunk[-1:]
    # A non-empty file that does not end in "\n" still has one more record.
    if last_byte and last_byte != b"\n":
        line_num += 1
    return line_num

#####
# Run fisher_test for every (eQTL BED, miRNA BED) pair and write one result table per
# eQTL file, named after the part of the eQTL file name before its first "-".
if not os.path.isdir(result_dir):
    os.makedirs(result_dir)

# A single worker pool serves all eQTL files; it is always closed and joined, even if
# a task or the file write raises.
pool = Pool(processes=30)
try:
    for eqtl in eQTL_list:
        # `mirna_bed` rather than `mirna`, so the interaction-table path stored in
        # `mirna` by the configuration cell is not overwritten.
        res_list = [
            pool.apply_async(fisher_test, (eqtl, mirna_bed))
            for mirna_bed in mirna_list
        ]
        # write_to_file calls .get() on each handle, which waits for that task to finish.
        write_to_file(res_list, os.path.join(result_dir, os.path.basename(eqtl).split("-")[0]))
finally:
    pool.close()
    pool.join()