In [7]:
import os
import sys
import csv
import datetime
import shutil
import random
import glob
import pandas as pd
import numpy as np
from scipy import stats
import subprocess
from multiprocessing import Pool
from statsmodels.stats import multitest

In [3]:
# Local Haploview jar, kept for reference only (not used below):
## haploview = "/home/galaxy/software/Haploview.jar"
# For every GWAS SNP, its LD mutations (r2 > 0.8) were obtained from the
# ICSNPathway web server: http://icsnpathway.psych.ac.cn/
# data_dir holds the exported tables; derived files are written to result_dir.
data_dir = "/home/galaxy/data/disease/GWAS/ICSNPathway"
result_dir = "%s/result_dir" % data_dir

In [8]:
# Reduce every ICSNPathway export in data_dir to (query SNP, LD SNP, Type)
# rows, de-duplicate across files and write a single tab-separated table.
data_list = glob.glob("%s/*.txt" % data_dir)
result_file = os.path.join(result_dir, "GWAS_and_LD-mutations.txt")
keep_cols = ["query SNP", "LD SNP", "Type"]
df_list = []
for data in data_list:
    df = pd.read_table(data, sep="\t")
    # Only the first whitespace-separated token of each field is kept.
    query_snp = df["Candidate causal SNP"].str.split().str[0]
    ld_snp = df["In LD with"].str.split().str[0]
    # Rows where the LD partner equals the query SNP are labelled "TagSNP".
    snp_type = np.where(query_snp == ld_snp, "TagSNP", "LD_derived")
    df = df.assign(**{"query SNP": query_snp, "LD SNP": ld_snp, "Type": snp_type})
    df_list.append(df[keep_cols])
df_result = pd.concat(df_list).drop_duplicates()
df_result.to_csv(result_file, sep="\t", index=False)

In [10]:
# Attach the ICSNPathway LD table to the GWAS SNP table.
# Join key: GWAS column "SNPS" against LD-table column "query SNP".
icsn_file = "/home/galaxy/data/disease/GWAS/ICSNPathway/result_dir/GWAS_and_LD-mutations.txt"
gwas = "/home/galaxy/data/disease/GWAS/total_result.txt"
result_file = "/home/galaxy/data/disease/GWAS/total_result_with_LD.txt"
df_icsn = pd.read_table(icsn_file, sep="\t")
df_gwas = pd.read_table(gwas, sep="\t")
df_merge = pd.merge(
    df_gwas,
    df_icsn,
    how="left",
    left_on="SNPS",
    right_on="query SNP",
)
# Drops every row with a missing value in ANY column: this removes GWAS rows
# without an LD match, and also rows that already had a NaN in the GWAS table.
df_merge = df_merge.dropna(how="any")
df_merge.to_csv(result_file, sep="\t", index=False)

In [11]:
# Annotate each m6A-related eQTL BED file with the LD-expanded GWAS table
# and write one tab-separated .txt per BED file into result_dir.
##############################################################################################
eqtl_dir = "/home/galaxy/data/QTL/m6a_related_SNPs/eQTL/m6a_qtl_bed/"
gwas_file = "/home/galaxy/data/disease/GWAS/total_result_with_LD.txt"
result_dir = "/home/galaxy/project/QTL_analysis/related_with_disease/GWAS"
##############################################################################################
df_gwas = pd.read_table(gwas_file, sep="\t")
eqtl_list = glob.glob("%s/*.bed" % eqtl_dir)
bed_columns = ["Chr", "Start", "End", "Info"]
for eqtl in eqtl_list:
    out_name = os.path.basename(eqtl).replace(".bed", ".txt")
    result_file = os.path.join(result_dir, out_name)
    df_eqtl = pd.read_table(eqtl, sep="\t", header=None, names=bed_columns)
    # The 4th BED column is ";"-separated: field 0 is stored as "Gene",
    # field 2 as "SNPS".
    info_fields = df_eqtl["Info"].str.split(";")
    df_eqtl["SNPS"] = info_fields.str[2]
    df_eqtl["Gene"] = info_fields.str[0]
    # No explicit key: pandas joins on every column name the two frames share.
    # NOTE(review): confirm "SNPS" is the only shared column name.
    df_merge = pd.merge(df_eqtl, df_gwas, how="left")
    df_merge.to_csv(result_file, sep="\t", index=False)

#### Enrichment analysis

In [3]:
# Inputs and output location for the GWAS enrichment analysis.
eqtl_dir = "/home/galaxy/data/QTL/m6a_related_SNPs/eQTL/m6a_qtl_bed/"
gwas_dir = "/data5/galaxy/project/GWAS_db/04_class_snp/cluster_by_map/snp_classification/"
result_dir = "/home/galaxy/project/QTL_analysis/related_with_disease/GWAS/enrichment_result"
# Background SNP total; fisher_test uses it as the HapMap row total.
# NOTE(review): presumably the number of rows in HapMap_snp — confirm.
HapMap_NUMBER = 81039058
HapMap_snp = "/home/galaxy/data/QTL/HapMap_SNP/autosomal_GRCh38_snp.bed"
# Last column of the headerless, tab-separated background file as a list.
hapmap_list = (
    pd.read_table(HapMap_snp, sep="\t", header=None)
    .iloc[:, -1]
    .tolist()
)

In [5]:
# Background SNP total (same value as in the configuration cell).
HapMap_NUMBER = 81039058
# Set view of hapmap_list so fisher_test can intersect against it cheaply.
hapset = set()
hapset.update(hapmap_list)

In [6]:
# Run fisher_test for every (eQTL file, GWAS class file) pair and write one
# result file per eQTL file. fisher_test and write_to_file are defined in a
# later cell of this file, so that cell must have been executed first.
gwas_list = glob.glob("%s/*.bed" % gwas_dir)
eqtl_list = glob.glob("%s/*.bed" % eqtl_dir)
for eqtl in eqtl_list:
    # A fresh pool of 25 workers per eQTL file; one async job per GWAS file.
    pool = Pool(processes=25)
    res_list = [pool.apply_async(fisher_test, (eqtl, gwas)) for gwas in gwas_list]
    pool.close()
    pool.join()
    # Output file name: the eQTL basename up to its first "-".
    out_path = os.path.join(result_dir, os.path.basename(eqtl).split("-")[0])
    write_to_file(res_list, out_path)

0 30
0 100
0 268
1 239
0 334
0 247
1 370
0 73
0 312
3 439
0 547
2 654
0 904
2 898
3 952
3 2431
30 1100
2 1225
0 100
0 268
18 898
4 1225
0 30
0 73
3 239
0 247
14 439
6 334
0 312
1 547
4 370
2 904
5 952
4 654
7 1100
14 2431
0 30
0 247
0 268
0 239
0 370
1 654
0 547
0 904
0 952
0 100
0 334
2 439
0 312
17 1100
0 1225
0 73
1 898
0 2431
0 312
0 370
1 898
21 1100
1 654
0 30
0 904
0 73
0 100
0 247
0 239
0 268
0 334
2 439
0 547
0 952
0 1225
1 2431
0 30
0 100
0 73
0 247
1 312
4 334
10 439
0 268
3 239
1 547
4 370
4 654
1 904
7 952
4 1100
15 898
8 1225
7 2431
0 334
0 30
0 100
0 312
0 239
0 904
0 247
1 952
0 268
0 73
2 439
2 898
0 370
0 547
23 1100
0 1225
2 654
2 2431
0 100
0 73
5 370
0 547
9 654
7 904
0 30
0 247
0 268
3 312
2 334
2 239
8 439
11 952
19 898
13 1100
3 1225
17 2431
0 100
1 239
0 30
0 247
0 268
0 73
3 439
0 547
0 334
3 370
0 312
4 952
5 898
3 654
0 904
26 1100
1 1225
6 2431
0 30
0 73
0 100
1 268
1 239
0 247
0 312
0 334
3 439
0 370
1 547
1 654
0 952
4 898
3 904
23 1100
1 1225
3 2431
0 24

In [1]:
def stat_file_line_numbers(in_file):
    """Return the number of newline characters in ``in_file``.

    This is the same count ``wc -l`` reports, but it is computed in-process:
    no shell is spawned, so paths containing spaces or shell metacharacters
    are handled correctly and no pipe is left open.

    Parameters
    ----------
    in_file : str
        Path of the file to count.

    Returns
    -------
    int
        Number of ``\\n`` bytes in the file (a final line without a trailing
        newline is not counted, exactly like ``wc -l``).

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    line_num = 0
    with open(in_file, "rb") as handle:
        # Read in 1 MiB chunks so large BED files are never loaded whole.
        for chunk in iter(lambda: handle.read(1 << 20), b""):
            line_num += chunk.count(b"\n")
    return line_num

def fisher_test(eqtl_bed, gwas_bed):
    """One-sided Fisher's exact test for enrichment of eQTL SNPs in a GWAS set.

    Contingency table passed to ``scipy.stats.fisher_exact``::

                        in gwas_bed     not in gwas_bed
        QTL SNPs             a            QTL_num - a
        HapMap SNPs          c          HapMap_NUMBER - c

    Relies on the module-level ``hapset`` and ``HapMap_NUMBER`` and on
    ``stat_file_line_numbers`` from this file.

    Parameters
    ----------
    eqtl_bed : str
        Headerless tab-separated file; the SNP id is field 2 of the
        ";"-separated last column.
    gwas_bed : str
        Headerless tab-separated file; the SNP id is the last column.

    Returns
    -------
    str
        Tab-separated line: eQTL basename up to its first "-", GWAS basename,
        odds ratio, p-value, and ``a`` (number of eQTL SNPs found in the
        GWAS file).
    """
    qtl_list = pd.read_table(eqtl_bed, sep="\t", header=None).iloc[:, -1].str.split(";").str[2].tolist()
    # A set makes each membership test O(1); scanning a list for every eQTL
    # SNP was quadratic in the file sizes.
    gw_set = set(pd.read_table(gwas_bed, sep="\t", header=None).iloc[:, -1].tolist())
    QTL_num = stat_file_line_numbers(eqtl_bed)
    # Counted per eQTL row, so a SNP listed twice in the eQTL file counts twice.
    a = sum(1 for snp in qtl_list if snp in gw_set)
    c = len(hapset.intersection(gw_set))
    b, d = QTL_num - a, HapMap_NUMBER - c
    # Progress trace: overlap counts for this (eQTL, GWAS) pair.
    print(a, c)
    # Enrichment is the upper tail P(X >= a). Computing it as 1 - P(X <= a)
    # drops the probability of the observed table itself and loses all
    # precision for small p-values through cancellation.
    oddsratio, pvalue = stats.fisher_exact([[a, b], [c, d]], alternative="greater")
    # %g keeps small p-values distinguishable; %f would print every value
    # below 5e-7 as 0.000000 and flatten the ranking used for FDR correction.
    line = "%s\t%s\t%f\t%g\t%d" % (
        os.path.basename(eqtl_bed).split("-")[0],
        os.path.basename(gwas_bed),
        oddsratio,
        pvalue,
        a,
    )
    return line

def write_to_file(res_list, res_file):
    """Resolve every pending result and write them to ``res_file``, one per line.

    Parameters
    ----------
    res_list : iterable
        Objects exposing ``.get()`` (e.g. the results of ``Pool.apply_async``).
    res_file : str
        Output path; the file is created or overwritten.

    All results are resolved before the file is opened, so an exception
    raised by ``.get()`` leaves any existing file untouched.
    """
    resolved = []
    for pending in res_list:
        resolved.append("%s\n" % pending.get())
    with open(res_file, "w") as fw:
        fw.write("".join(resolved))

#### FDR correction

In [11]:
def fdr_correction():
    """Add a ``qvalue`` column to every enrichment result file.

    Reads each ``*.bed`` file in the enrichment result directory as a
    headerless tab-separated table, passes its p-values to ``bh_correction``
    (defined in a later cell of this file) and writes the table, with the
    extra column and a header row, to the FDR directory as ``.txt``.

    NOTE(review): the enrichment step names its outputs by the eQTL basename
    up to the first "-"; confirm those names end in ".bed", otherwise the
    glob below matches nothing.
    """
    data_dir = "/home/galaxy/project/QTL_analysis/related_with_disease/GWAS/enrichment_result"
    result_dir = "/home/galaxy/project/QTL_analysis/related_with_disease/GWAS/FDR_correction"
    columns = ["tissue", "disease", "oddsratio", "pvalue", "BothPos"]
    for enrich_path in glob.glob("%s/*.bed" % data_dir):
        out_name = os.path.basename(enrich_path).replace(".bed", ".txt")
        res_file = os.path.join(result_dir, out_name)
        df = pd.read_table(enrich_path, sep="\t", header=None, names=columns)
        pvalue_list = list(map(float, df["pvalue"].tolist()))
        df["qvalue"] = bh_correction(pvalue_list)
        df.to_csv(res_file, sep="\t", index=False)
fdr_correction()

In [8]:
def bh_correction(pvalue_list):
    """Return the FDR-adjusted p-values for ``pvalue_list``.

    Thin wrapper around ``multitest.fdrcorrection(..., method="indep")``.
    That call returns a pair; only its second element (the corrected
    p-values) is returned here, and the first (reject flags) is discarded.
    """
    return multitest.fdrcorrection(pvalue_list, method="indep")[1]