In [2]:
import os
import sys
import csv
import glob
import datetime
import shutil
import random
import subprocess
import pandas as pd
import numpy as np
from multiprocessing import Pool
pd.options.mode.chained_assignment = None  # default='warn'

#### Generate hg19 VCF-format files, which are required by m6ASNP

In [10]:
# Convert every hg19 BED file in `in_dir` into the tab-separated variant file
# written by process_each_bed.
# NOTE: process_each_bed must already be defined in the kernel before this
# cell is executed.
in_dir = "/home/galaxy/data/QTL/m6a_related_SNPs/sQTL/sQTLSeekeR_Transcript-Ratio/hg19_m6a_qtl_bed"
in_list = glob.glob("%s/*.bed" % in_dir)
result_dir = "/home/galaxy/project/QTL_analysis/SNP_gain_or_loss/sQTL/sQTLSeekeR_Transcript-Ratio/vcf_formatted"
# exist_ok=True creates the directory only when needed, without a separate
# (race-prone) existence check.
os.makedirs(result_dir, exist_ok=True)
for x in in_list:
    process_each_bed(x)

In [1]:
def process_each_bed(bed, out_dir=None):
    """Convert one 4-column BED file into a 6-column tab-separated variant file.

    The input is read without a header as ``chr, start, end, info``. The
    ``info`` field is split on ``;`` and its items at index 2, 3 and 4 are used
    as the RS ID, reference allele and alternative allele. The text before the
    first ``.`` of the file name is written as the sample column.

    The output has the columns ``chr, end, RS ID, Ref, Alt, Sample`` (no header,
    no index), sorted by ``chr`` and ``end`` with duplicate rows removed. It is
    written under the same file name as the input (extension included).

    Parameters
    ----------
    bed : str
        Path of the input BED file.
    out_dir : str, optional
        Directory that receives the output file. When omitted, the module-level
        ``result_dir`` is looked up at call time (the original behaviour).
    """
    if out_dir is None:
        out_dir = result_dir
    result_vcf = os.path.join(out_dir, os.path.basename(bed))
    tissue = os.path.basename(bed).split(".")[0]
    df = pd.read_table(bed, sep="\t", header=None, names=["chr", "start", "end", "info"])
    # Split the info field once and reuse the result for all three columns.
    info_fields = df["info"].str.split(";")
    df["RS ID"] = info_fields.str[2]
    df["Ref"] = info_fields.str[3]
    df["Alt"] = info_fields.str[4]
    df["Sample"] = tissue
    df_res = df[["chr", "end", "RS ID", "Ref", "Alt", "Sample"]].sort_values(["chr", "end"]).drop_duplicates()
    df_res.to_csv(result_vcf, sep="\t", header=False, index=False)

#### Run m6ASNP prediction

In [11]:
######################################################################################################################
# Paths read by predict_single_file: the m6ASNP jar, the directory holding the
# tab-formatted input files, and the files passed to m6ASNP via -a and -g.
m6aSNP_jar = "/data/software/m6ASNP/m6ASNP.jar"
eQTL_dir = "/home/galaxy/project/QTL_analysis/SNP_gain_or_loss/sQTL/sQTLSeekeR_Transcript-Ratio/vcf_formatted"
ucsc_annotation = "/data/software/m6ASNP/knownGeneAnnotation_hg19"
genome_sequence = "/data/software/m6ASNP/hg19.2bit"
# NOTE: this rebinds the module-level `result_dir`; functions that read it at
# call time will write here from now on.
result_dir = "/home/galaxy/project/QTL_analysis/SNP_gain_or_loss/sQTL/sQTLSeekeR_Transcript-Ratio/m6ASNP_result"
# exist_ok=True creates the directory only when needed, without a separate
# (race-prone) existence check.
os.makedirs(result_dir, exist_ok=True)
###################################


def predict_single_file(eQTL_vcf):
    """Run ``m6ASNP -predict`` on one input file.

    The output path is ``result_dir`` joined with the input's base name cut at
    the first ``.bed``. The jar, annotation and genome paths are taken from the
    module-level ``m6aSNP_jar``, ``ucsc_annotation`` and ``genome_sequence``.

    The command is passed to ``subprocess`` as an argument list, so no shell is
    involved and paths containing spaces or shell metacharacters are handled
    correctly.

    Parameters
    ----------
    eQTL_vcf : str
        Path of the input file handed to m6ASNP with ``-i`` (``-it tab``).

    Returns
    -------
    int
        Exit status of the ``java`` process.

    Raises
    ------
    OSError
        If the ``java`` executable cannot be started.
    """
    result_txt = os.path.join(result_dir, os.path.basename(eQTL_vcf).split(".bed")[0])
    # Equivalent shell command, for reference:
    # java -jar /data/software/m6ASNP/m6ASNP.jar -predict -i liver.txt -it tab -sp Human -a /data/software/m6ASNP/knownGeneAnnotation_hg19 -g /data/software/m6ASNP/hg19.2bit -o test_result
    command = [
        "java", "-jar", m6aSNP_jar,
        "-predict",
        "-i", eQTL_vcf,
        "-it", "tab",
        "-sp", "Human",
        "-a", ucsc_annotation,
        "-g", genome_sequence,
        "-o", result_txt,
    ]
    return subprocess.call(command)


if __name__ == '__main__':
    # Run m6ASNP on every *.bed file in eQTL_dir, one worker task per file.
    eQTL_vcf_list = glob.glob("%s/*.bed" % eQTL_dir)
    pool = Pool()
    try:
        # map() blocks until every task has finished and re-raises an exception
        # raised inside a worker; with apply_async and no .get() such failures
        # were silently discarded.
        pool.map(predict_single_file, eQTL_vcf_list)
    finally:
        pool.close()
        pool.join()

##### Parse m6ASNP results and prepare input data for WebLogo

In [7]:
#############################################################################################################
# m6ASNP result tables (*.txt) to be parsed by process_each_file.
# NOTE(review): this directory is under pQTL/, whereas the prediction cell above
# writes under sQTL/sQTLSeekeR_Transcript-Ratio/ - confirm this is intended.
data_dir = "/home/galaxy/project/QTL_analysis/SNP_gain_or_loss/pQTL/m6ASNP_result"
info_list = glob.glob("%s/*.txt" % data_dir)
# postfix = ".txt"
# m6a_dir = "/data6/revised_version/data/total_m6a_peak/hg19_version"
# m6a_list = glob.glob("%s/*.bed" % m6a_dir)
# tissue_list = ["BRAIN", "HEART", "LIVER", "LUNG", "MUSCLE", "STOMACH"]
# NOTE: this rebinds the module-level `result_dir`; functions that read it at
# call time will write here from now on.
result_dir = "/home/galaxy/project/QTL_analysis/SNP_gain_or_loss/pQTL/input_for_m6ASNP"
# exist_ok=True creates the directory only when needed, without a separate
# (race-prone) existence check.
os.makedirs(result_dir, exist_ok=True)
# result_file = os.path.join(result_dir, "result_info_GRCh37.bed")
##############################################################################################################

In [8]:
# Write the Ref/Alt .fa files for every m6ASNP result table, once per
# mutation-event class. process_each_file must already be defined in the
# kernel before this cell is executed.
mutation_events = ("Functional Gain", "Functional Loss")
for txt in info_list:
    for term in mutation_events:
        process_each_file(txt, term)

In [6]:
def process_each_file(in_file, GAIN_OR_LOSS, out_dir=None): # "Functional Gain" or "Functional Loss"
    """Write reference and mutated sequences of one mutation-event class as FASTA-style files.

    ``in_file`` is read as a tab-separated table whose first line is skipped and
    whose columns are named by ``names`` below. Rows whose ``Mutation event``
    equals ``GAIN_OR_LOSS`` are kept. For each kept row the record ID is the
    fourth-from-last ``|``-separated item of ``Related mutations``.

    Two files are written, named ``Ref-<event>-<prefix>.fa`` and
    ``Alt-<event>-<prefix>.fa``, where ``<event>`` is ``GAIN_OR_LOSS`` with
    whitespace replaced by ``_`` and ``<prefix>`` is the input file name up to
    its first ``.``. Each record is ``>ID`` followed by the sequence on the next
    line (``Reference sequence`` for Ref, ``Mutation sequence`` for Alt).

    Parameters
    ----------
    in_file : str
        Path of the m6ASNP result table.
    GAIN_OR_LOSS : str
        Value of the ``Mutation event`` column to select, e.g.
        "Functional Gain" or "Functional Loss".
    out_dir : str, optional
        Directory that receives the two files. When omitted, the module-level
        ``result_dir`` is looked up at call time (the original behaviour).
    """
    if out_dir is None:
        out_dir = result_dir
    prefix = os.path.basename(in_file).split(".")[0]
    event_tag = "_".join(GAIN_OR_LOSS.split())
    ref_result = os.path.join(out_dir, "Ref-%s-%s.fa" % (event_tag, prefix))
    alt_result = os.path.join(out_dir, "Alt-%s-%s.fa" % (event_tag, prefix))
    names = ["SampleID", "UCSCID", "Gene symbol", "Chromosome", "Position", "Strand", "Reference sequence", "Mutation sequence", "Reference score", "Mutation score", "Significance", "Mutation event", "Related mutations", "unknown"]
    df = pd.read_table(in_file, sep="\t", header=None, skiprows=1, names=names)
    # Copy the filtered rows so that adding a column does not operate on a
    # slice of `df` (chained assignment).
    df_fun = df[df["Mutation event"] == GAIN_OR_LOSS].copy()
    df_fun["RS ID"] = df_fun["Related mutations"].str.split("|").str[-4]
    # The with-block closes both files even if writing fails part-way.
    with open(ref_result, 'w') as fw_ref, open(alt_result, 'w') as fw_alt:
        records = zip(df_fun["RS ID"], df_fun["Reference sequence"], df_fun["Mutation sequence"])
        for rs_id, ref_seq, alt_seq in records:
            fw_ref.write(">%s\n%s\n" % (rs_id, ref_seq))
            fw_alt.write(">%s\n%s\n" % (rs_id, alt_seq))