In [1]:
import os
import sys
import shutil
import glob
import pandas as pd
import numpy as np
from scipy import stats
import subprocess
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
from multiprocessing import Pool
from collections import Counter

In [22]:
# --- Paths -------------------------------------------------------------------
# Directory of per-sample expression tables; only its *.txt files are collected.
data_dir = "/data5/galaxy/project/lncRNA_analysis/m6a_expression/expre_result"
raw_list = glob.glob("%s/*.txt" % data_dir)
# Output root. The three sub-directories below are read as module-level globals
# by class_lncRNA (m6a/unm6a) and get_mRNA_expre (mRNA); they are not created here.
result_dir = "/data5/galaxy/project/lncRNA_analysis/m6a_expression"
m6a_lncRNA_dir = "%s/m6a_lncRNA" % result_dir
unm6a_lncRNA_dir = "%s/unm6a_lncRNA" % result_dir
mRNA_dir = "%s/mRNA" % result_dir
#
# mRNA_bed is read as a global inside get_mRNA_expre.
mRNA_bed = "/data3/xs/tissue_m6a/2018.1/GRCh38_segment/mRNA.bed"
# One BED file per tissue (see the tissue names printed by the loop below).
total_gene_dir = "/data5/galaxy/project/promoter_TF_enrich/data/total_gene/gene_bed"
total_gene_list = glob.glob("%s/*.bed" % total_gene_dir)
expre_file = "/data5/galaxy/project/expression/stringtie/Total-RPKM.txt"
#
# Disabled step: splitting every raw table into m6a / unm6a lncRNA files.
# for x in raw_list:
#     class_lncRNA(x)
#
# --- mRNA expression per tissue ----------------------------------------------
# NOTE: get_mRNA_expre (and read_bed, which it calls) are defined in cells that
# appear further down; those cells must have been executed before this one.
df_exp = pd.read_table(expre_file, sep="\t")
# The header is overwritten with fixed names; pandas raises if the file does not
# have exactly nine columns. Assumes the columns come in this tissue order — TODO confirm.
df_exp.columns = ["gene", "brain", "heart", "kidney", "liver", "placenta", "lung", "muscle", "stomach"]
# Merge key: gene id truncated at the first "." (the same truncation read_bed
# applies to BED names, so both sides can be merged on "name").
df_exp["name"] = df_exp["gene"].str.split(".").str[0]
for gene_bed in total_gene_list:
    # `tissue` is only used for the progress print; get_mRNA_expre derives its
    # own (lower-cased) tissue name from the file name.
    tissue = os.path.basename(gene_bed).split(".bed")[0]
    print(tissue)
    get_mRNA_expre(gene_bed, df_exp)

liver
stomach
placenta
lung
heart
brain
muscle
kidney


In [17]:
def class_lncRNA(in_file):
    """Split one expression table into an m6a file and an unm6a file.

    The tab-separated table at ``in_file`` is read and its three columns are
    renamed to ``Gene_ID``, ``type`` and ``expression``. Rows whose ``type``
    is ``"m6a"`` are written to ``m6a_lncRNA_dir`` and rows whose ``type`` is
    ``"unm6a"`` to ``unm6a_lncRNA_dir`` (both module-level globals). Each
    output keeps only ``Gene_ID`` and ``expression`` and is named after the
    input file with an ``m6a-`` / ``unm6a-`` prefix.

    Parameters
    ----------
    in_file : str
        Path of the tab-separated input table.

    Returns
    -------
    None
    """
    table = pd.read_table(in_file, sep="\t")
    table.columns = ["Gene_ID", "type", "expression"]

    file_name = os.path.basename(in_file)
    kept_columns = ["Gene_ID", "expression"]

    # Rows with any other `type` value end up in neither output.
    m6a_rows = table[table["type"] == "m6a"]
    unm6a_rows = table[table["type"] == "unm6a"]

    m6a_path = os.path.join(m6a_lncRNA_dir, "m6a-%s" % file_name)
    m6a_rows[kept_columns].to_csv(m6a_path, sep="\t", index=False)

    unm6a_path = os.path.join(unm6a_lncRNA_dir, "unm6a-%s" % file_name)
    unm6a_rows[kept_columns].to_csv(unm6a_path, sep="\t", index=False)

In [21]:
def get_mRNA_expre(gene_bed, df_exp):
    """Write the expression of one tissue's mRNA genes to ``mRNA_dir``.

    The tissue name is the part of the BED file name before the first ".",
    lower-cased; it must be a column of ``df_exp``. Genes from ``gene_bed``
    are matched against the mRNA BED (global ``mRNA_bed``) and against
    ``df_exp`` on the ``name`` column, rows with any missing value are
    dropped, and a two-column table (``Gene_ID``, <tissue>) is written to
    ``mRNA_dir`` as ``mRNA-<bed name with .bed replaced by .txt>``.

    Parameters
    ----------
    gene_bed : str
        Path of the tissue's gene BED file.
    df_exp : pandas.DataFrame
        Expression table with a ``name`` column and one column per tissue.

    Returns
    -------
    None
    """
    bed_name = os.path.basename(gene_bed)
    tissue = bed_name.split(".")[0].lower()

    mrna_table = read_bed(mRNA_bed)
    gene_table = read_bed(gene_bed)

    # The right join keeps every mRNA row; rows without a match in gene_bed
    # carry NaN in the gene_bed columns and are removed by dropna() below.
    annotated = pd.merge(gene_table, mrna_table, on="name", how="right")
    with_expression = pd.merge(annotated, df_exp, on="name")
    complete = with_expression.dropna()

    out_path = os.path.join(mRNA_dir, "mRNA-%s" % bed_name.replace(".bed", ".txt"))
    result = complete.assign(Gene_ID=complete["name"])[["Gene_ID", tissue]]
    result.to_csv(out_path, sep="\t", index=False)

In [9]:
def read_bed(in_bed):
    """Read a 6-column, whitespace-delimited BED file into a DataFrame.

    Parameters
    ----------
    in_bed : str
        Path of a header-less BED file.

    Returns
    -------
    pandas.DataFrame
        Columns ``chr``, ``start``, ``end``, ``name``, ``s``, ``strand``.
        ``name`` is truncated at its first "." (e.g. a trailing version
        suffix is removed).
    """
    # Raw string: "\s" in a normal literal is an invalid escape sequence and
    # triggers a DeprecationWarning/SyntaxWarning on recent Python versions.
    # `name` is forced to str so the .str accessor below also works when every
    # identifier in the file happens to look numeric.
    df = pd.read_table(
        in_bed,
        sep=r"\s+",
        header=None,
        names=["chr", "start", "end", "name", "s", "strand"],
        dtype={"name": str},
    )
    df["name"] = df["name"].str.split(".").str[0]
    return df

In [29]:
# --- GO annotation of every gene list ----------------------------------------
# base_dir has the same value as `result_dir` in the expression cell, so the
# three directory globals are re-assigned here to the values they already had.
base_dir = "/data5/galaxy/project/lncRNA_analysis/m6a_expression"
m6a_lncRNA_dir = "%s/m6a_lncRNA" % base_dir
unm6a_lncRNA_dir = "%s/unm6a_lncRNA" % base_dir
mRNA_dir = "%s/mRNA" % base_dir
# GO table, read from the "GO_BP" sheet; annotate_each_dir joins it to each
# query file through its "Gene Symbol" column.
db_file = "/data5/galaxy/project/lncRNA_analysis/m6a_expression/GO_BP.xls"
df = pd.read_excel(db_file, sheet_name="GO_BP")
print(df.head())
# NOTE: annotate_each_dir is defined in a cell that appears further down; that
# cell must have been executed before this one.
for i_dir in [m6a_lncRNA_dir, unm6a_lncRNA_dir, mRNA_dir]:
    print(i_dir)
    annotate_each_dir(i_dir, df)

   Gene ID Gene Symbol       Ensembl ID  GO Term ID  Gene Number in GO Term  \
0        1        A1BG  ENSG00000121410  GO:0008150                     588   
1        2         A2M  ENSG00000175899  GO:0001869                       2   
2        2         A2M  ENSG00000175899  GO:0002576                      81   
3        2         A2M  ENSG00000175899  GO:0007264                     320   
4        2         A2M  ENSG00000175899  GO:0007596                     463   

                                        GO Term Name  
0                                 biological_process  
1  negative regulation of complement activation, ...  
2                             platelet degranulation  
3          small GTPase mediated signal transduction  
4                                  blood coagulation  
/data5/galaxy/project/lncRNA_analysis/m6a_expression/m6a_lncRNA
/data5/galaxy/project/lncRNA_analysis/m6a_expression/unm6a_lncRNA
/data5/galaxy/project/lncRNA_analysis/m6a_expression/mRNA


In [28]:
def annotate_each_dir(i_dir, df):
    """Annotate every ``symbol_*.txt`` gene list in a directory with GO terms.

    For each ``<i_dir>/symbol_<x>.txt`` the tab-separated table is joined to
    ``df`` (query ``Gene_ID`` against the "Gene Symbol" column of ``df``), rows
    with any missing value are dropped (which removes query genes that have no
    entry in ``df``), and the result is written to
    ``<i_dir>/GO_annotation/GO_<x>.txt``.

    Parameters
    ----------
    i_dir : str
        Directory containing the ``symbol_*.txt`` query files.
    df : pandas.DataFrame
        Annotation table with a "Gene Symbol" column.

    Returns
    -------
    None
    """
    result_dir = "%s/GO_annotation" % i_dir
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(result_dir, exist_ok=True)

    # glob returns files in arbitrary order; sort for reproducible processing.
    for query_path in sorted(glob.glob("%s/symbol_*.txt" % i_dir)):
        result_name = os.path.basename(query_path).replace("symbol_", "GO_")
        result_file = os.path.join(result_dir, result_name)
        df_query = pd.read_table(query_path, sep="\t")
        df_merge = df.merge(
            df_query, left_on="Gene Symbol", right_on="Gene_ID", how="right"
        ).dropna()
        df_merge.to_csv(result_file, sep="\t", index=False)