In [1]:
import os
import glob
import pandas as pd
import numpy as np
from multiprocessing import Pool

In [2]:
# Directory holding one "<RBP>.binding.sites" table per RNA-binding protein.
data_dir = "/home/galaxy/project/m6AQTL/data/rbp/postar/2019_6_20/human_RBP"
# Directory that receives the merged per-RBP BED files.
result_dir = '/home/galaxy/project/m6AQTL/data/rbp/postar/2019_6_20/bed_files/'
# Every binding-site table found directly under data_dir.
rbp_list = glob.glob(data_dir + "/*.binding.sites")

In [7]:
def main_method(rbp):
    """Run the full per-file pipeline for one binding-site table.

    Prints the file's base name, loads and reduces the table with
    ``preprocess_file``, keeps one row per ``key_peak`` group via
    ``select_highest_score``, writes the result as a BED file with
    ``generate_bed`` and finally passes that file to ``merge_sites``.

    Parameters
    ----------
    rbp : str
        Path to a ``*.binding.sites`` file.
    """
    print(os.path.basename(rbp))
    rbp_name, sites = preprocess_file(rbp)
    per_peak = sites.groupby(["key_peak"])
    best_sites = per_peak.apply(select_highest_score)
    merge_sites(generate_bed(rbp_name, best_sites))

# Process every binding-site file in parallel worker processes.
pool = Pool(processes=20)
# Keep the AsyncResult handles: apply_async never raises in the parent, so
# dropping them would silently hide any exception raised inside a worker.
pending = []
for rbp in rbp_list:
    pending.append((rbp, pool.apply_async(main_method, (rbp, ))))
pool.close()
pool.join()

# Surface per-file failures without aborting the report for the other files.
for rbp_path, outcome in pending:
    try:
        outcome.get()
    except Exception as exc:
        print("FAILED %s: %r" % (os.path.basename(rbp_path), exc))

NOL12.binding.sites
EFTUD2.binding.sites
PRKRA.binding.sites
HNRNPD.binding.sites
RPS5.binding.sites
SAFB2.binding.sites
NKRF.binding.sites
HNRNPU.binding.sites
UCHL5.binding.sites
NCBP2.binding.sites
EIF4G2.binding.sites
NCBP3.binding.sites
QKI.binding.sites
YTHDC1.binding.sites
METTL3.binding.sites
ALKBH5.binding.sites
YTHDC2.binding.sites
FUBP3.binding.sites
EIF3H.binding.sites
LIN28A.binding.sites
METTL14.binding.sites
STAU1.binding.sites
LSM11.binding.sites
ZC3H7B.binding.sites
SLTM.binding.sites
CPSF3.binding.sites
IGF2BP1.binding.sites
HNRNPC.binding.sites
TROVE2.binding.sites
FAM120A.binding.sites
ILF3.binding.sites
TAF15.binding.sites
XPO5.binding.sites
RPS11.binding.sites
MTPAP.binding.sites
XRCC6.binding.sites
IGF2BP2.binding.sites
CNBP.binding.sites
FKBP4.binding.sites
DGCR8.binding.sites
RBM22.binding.sites
SRSF9.binding.sites
AARS.binding.sites
YBX3.binding.sites
DDX3X.binding.sites
ALKBH1.binding.sites
METAP2.binding.sites
FASTKD2.binding.sites
RBM27.binding.sites
NPM1.b

In [3]:
def preprocess_file(rbp):
    """Load one binding-site table and reduce it to de-duplicated peak records.

    The file is read as a header-less, tab-separated table with 22 columns.
    Only the annotation columns needed downstream are kept, the target gene
    ID (text before the first ``|``) and gene name are fused into a single
    ``"Target gene"`` column, and the peak coordinates are fused into a
    ``"key_peak"`` column of the form ``<chromosome>:<start>-<end>``.

    Parameters
    ----------
    rbp : str
        Path to the tab-separated binding-site file.

    Returns
    -------
    tuple
        ``(rbp_name, df)`` where ``rbp_name`` is the value of the
        ``"RBP name"`` column in the first row and ``df`` has the columns
        CLIP method, cell line, data accession, target gene type, Score,
        ``"Target gene"`` and ``"key_peak"`` (in that order, ``key_peak``
        last), with exact duplicate rows removed.
    """
    raw = pd.read_table(rbp, sep="\t", header=None)
    raw.columns = ["Chromosome", "Peak start", "Peak end", "Name", "1", "Strand", "RBP name", "CLIP-seq technology and peak calling method", "Cell line or tissue", "Data accession", "Score", "Target gene chromosome", "Target gene start", "Target gene end", "Target gene ID", "2", "Target gene strand", "Target gene type", "Genomic context", "Target gene name", "PhastCons score", "Phylop score"]
    # Positional access: take the first row regardless of the index labels.
    rbp_name = raw["RBP name"].iloc[0]

    # Explicit copy so the column assignments below act on an independent
    # frame instead of a slice of `raw` (avoids SettingWithCopyWarning).
    df = raw[["Chromosome", "Peak start", "Peak end", "CLIP-seq technology and peak calling method", "Cell line or tissue", "Data accession", "Target gene ID", "Target gene name", "Target gene type", "Score"]].copy()

    df["Target gene"] = df["Target gene ID"].str.split("|").str[0] + "|" + df["Target gene name"]
    df = df.drop(columns=["Target gene ID", "Target gene name"])

    df["key_peak"] = df["Chromosome"] + ":" + df["Peak start"].astype(str) + "-" + df["Peak end"].astype(str)
    df = df.drop(columns=["Chromosome", "Peak start", "Peak end"])

    # Rows that differed only in the dropped columns collapse here.
    df = df.drop_duplicates()
    return rbp_name, df

In [4]:
def select_highest_score(df_sub):
    """Return the row of ``df_sub`` with the largest ``"Score"``.

    Parameters
    ----------
    df_sub : pandas.DataFrame
        Non-empty frame containing a ``"Score"`` column (in this notebook,
        one ``key_peak`` group).

    Returns
    -------
    pandas.Series
        The row with the highest score. Missing scores are ranked lowest,
        so a NaN row is only returned when every score is missing. Among
        tied scores the row appearing last in ``df_sub`` is returned.
    """
    # na_position="first": the default puts NaN last, which would make a
    # missing score win. mergesort is stable, so ties resolve deterministically.
    ranked = df_sub.sort_values(["Score"], kind="mergesort", na_position="first")
    return ranked.iloc[-1, :]

In [5]:
# transform into bed
def generate_bed(rbp_name, df_1, out_dir=None):
    """Write the selected peaks of one RBP as a 4-column, tab-separated BED file.

    Parameters
    ----------
    rbp_name : str
        Used as the output file's base name (``<rbp_name>.bed``).
    df_1 : pandas.DataFrame
        Frame whose LAST column is ``"key_peak"`` with values of the form
        ``<chromosome>:<start>-<end>``. It is not modified.
    out_dir : str, optional
        Directory for the BED file. Defaults to the directory this notebook
        has always written to.

    Returns
    -------
    str
        Path of the written BED file. Rows are sorted by chromosome, then
        start; the fourth column is every column except the last one,
        rendered as strings and joined with ``;``.

    Raises
    ------
    ValueError
        If a ``key_peak`` value does not match ``<chromosome>:<int>-<int>``.
    """
    if out_dir is None:
        out_dir = "/home/galaxy/project/m6AQTL/data/rbp/postar/2019_6_20"

    # Everything except the trailing key_peak column becomes the BED name.
    annotation = df_1[df_1.columns[:-1]]
    name = annotation.apply(lambda row: ';'.join(row.astype(str)), axis=1)

    # Anchor on the LAST ':' and the trailing "<start>-<end>" so chromosome or
    # contig names that themselves contain '-' or ':' are parsed correctly.
    coords = df_1["key_peak"].astype(str).str.extract(
        r"^(?P<chromosome>.+):(?P<start>\d+)-(?P<end>\d+)$"
    )
    if coords.isnull().any().any():
        raise ValueError("key_peak values must look like '<chromosome>:<start>-<end>'")

    # Build a fresh frame instead of adding columns to the caller's df_1.
    bed = pd.DataFrame({
        "chromosome": coords["chromosome"],
        "start": coords["start"].astype(int),
        "end": coords["end"].astype(int),
        "name": name,
    })
    bed = bed[["chromosome", "start", "end", "name"]].sort_values(["chromosome", "start"])

    raw_bed = os.path.join(out_dir, "%s.bed" % rbp_name)
    bed.to_csv(raw_bed, sep="\t", index=False, header=False)
    return raw_bed

In [6]:
def merge_sites(raw_bed):
    """Merge the intervals of ``raw_bed`` with ``bedtools merge``.

    Runs ``bedtools merge -i <raw_bed> -c 4 -o collapse -delim "@"`` and
    stores its standard output under ``result_dir`` using the same file
    name. The raw BED file is deleted only when bedtools exits successfully.

    Parameters
    ----------
    raw_bed : str
        Path to the BED file to merge.

    Returns
    -------
    str or None
        Path of the merged BED file on success. ``None`` when bedtools could
        not be run or exited with a non-zero status; in that case the raw
        file is kept, any partial output is removed and a message is printed.
    """
    # Local import: subprocess is not part of the notebook's import cell.
    import subprocess

    final_bed = os.path.join(result_dir, os.path.basename(raw_bed))
    # Argument list (no shell): paths with spaces or shell metacharacters are
    # passed through verbatim instead of being re-parsed by /bin/sh.
    command = ["bedtools", "merge", "-i", raw_bed,
               "-c", "4", "-o", "collapse", "-delim", "@"]

    try:
        with open(final_bed, "w") as merged:
            status = subprocess.call(command, stdout=merged)
        failure = None if status == 0 else "exit status %d" % status
    except OSError as exc:
        # bedtools missing from PATH, or the output file could not be opened.
        failure = str(exc)

    if failure is not None:
        # Opening the output creates it even when bedtools fails, so mere
        # existence proves nothing: drop the partial file and keep the input.
        if os.path.exists(final_bed):
            os.remove(final_bed)
        print("bedtools merge failed for %s (%s); raw BED kept" % (raw_bed, failure))
        return None

    os.remove(raw_bed)
    return final_bed