In [1]:
import os
import sys
import shutil
import glob
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
from multiprocessing import Pool
from collections import Counter

In [33]:
# Build a BED file holding every annotated gene that appears in at least one
# per-tissue table, with the score column replaced by the gene length.
data_dir = "/data5/galaxy/project/lncRNA_analysis/m6a_expression"
# NOTE(review): the annotation is read from an hg19 directory while the result
# is written below an hg38 directory -- confirm the genome build is consistent.
gene_bed = "/data/database/hg19/genes_v28.bed"
df_bed = pd.read_table(gene_bed, sep="\t", header=None, names=["chr", "a", "b", "raw_name", "s", "stra"])
# Keep only the part of the identifier in front of the first "." so it can be
# compared with the "name" column of the tissue tables.
df_bed["name"] = df_bed["raw_name"].str.split(".").str[0]

query_names = []
for i_file in glob.glob("%s/*.txt" % data_dir):
    # A later cell writes "*_phastCons*" result tables into this same
    # directory; they are derived outputs, not tissue inputs, so skip them.
    if "_phastCons" in os.path.basename(i_file):
        continue
    print(i_file)
    # Read the table inline instead of through a helper defined in a later
    # cell, so this cell also works on a fresh kernel run top to bottom.
    # extend() appends in place rather than rebuilding the list every pass.
    query_names.extend(pd.read_table(i_file, sep="\t")["name"].tolist())

df_result = df_bed[df_bed["name"].isin(query_names)].dropna()
# Gene length (end - start) takes the place of the original score column;
# assign() returns a new frame instead of writing into the filtered one.
df_result = df_result.assign(score=df_result["b"] - df_result["a"])
df_result = df_result[["chr", "a", "b", "raw_name", "score", "stra"]]
# BED output: tab separated, no header row, no index column.
df_result.to_csv("/data/database/ucsc_phylo/hg38/primate/query_gene.bed", sep="\t", header=False, index=False)

/data5/galaxy/project/lncRNA_analysis/m6a_expression/brain.txt
/data5/galaxy/project/lncRNA_analysis/m6a_expression/kidney.txt
/data5/galaxy/project/lncRNA_analysis/m6a_expression/stomach.txt
/data5/galaxy/project/lncRNA_analysis/m6a_expression/placenta.txt
/data5/galaxy/project/lncRNA_analysis/m6a_expression/liver.txt
/data5/galaxy/project/lncRNA_analysis/m6a_expression/lung.txt
/data5/galaxy/project/lncRNA_analysis/m6a_expression/heart.txt
/data5/galaxy/project/lncRNA_analysis/m6a_expression/muscle.txt


In [28]:
def read_file(in_file):
    """Load a tab-separated table and return its ``name`` column as a list.

    Parameters
    ----------
    in_file : path of a tab-delimited text file whose header row contains a
        ``name`` column.

    Returns
    -------
    list with one entry per data row, in file order.
    """
    name_column = pd.read_csv(in_file, sep="\t")["name"]
    return name_column.tolist()

In [49]:
# Reformat the gene BED file: put the interval length into the score column and
# drop the chrY copy of every gene name that occurs on more than one row.
os.chdir("/data/database/ucsc_phylo/hg38/primate")
df = pd.read_table("Genes_ensembl_dot.bed", sep="\t", header=None, names=["chr", "a", "b", "name", "s", "stra"])
df = df.assign(score=df["b"] - df["a"])
df = df[["chr", "a", "b", "name", "score", "stra"]].drop_duplicates()

# Names that are still present on two or more rows after exact-duplicate removal.
counter = Counter(df["name"].tolist())
dup_names = [gene for gene, n_rows in counter.items() if n_rows >= 2]
dup_lookup = set(dup_names)
uniq_names = [x for x in df["name"].tolist() if x not in dup_lookup]

# A row survives when it is not on chrY or when its name is unique, i.e. only
# chrY rows that carry a repeated name are discarded.
not_on_chr_y = df["chr"] != "chrY"
has_unique_name = df["name"].isin(uniq_names)
df_result = df[not_on_chr_y | has_unique_name]
print(len(df), len(dup_names), len(df_result))
df_result.to_csv("Genes_ensembl_dot_format.bed", sep="\t", header=None, index=False)

58288 45 58243


In [62]:
%%bash
# Stop at the first failing command: without this a failed `cd` would still
# launch bigWigAverageOverBed in whatever directory the cell started in.
set -e
cd /data/database/ucsc_phylo/hg38/vertebrate
# Arguments: input bigWig, input BED, output table (written to this directory).
bigWigAverageOverBed hg38.phyloP100way.bw ../primate/Genes_ensembl_dot_format.bed total_genes_result.tab

processing chromosomes


In [63]:
# Join the per-gene conservation scores onto every per-tissue expression table
# and write one combined, tissue-labelled table.
data_dir = "/data5/galaxy/project/lncRNA_analysis/m6a_expression"
# NOTE: the variable and output names say "phastCons", but this table is the one
# the bash cell produces from hg38.phyloP100way.bw, i.e. the values are phyloP.
phastcons_file = "/data/database/ucsc_phylo/hg38/vertebrate/total_genes_result.tab"
result_dir = data_dir
df_score = pd.read_table(phastcons_file, sep="\t", header=None, names=["n", "size", "covered", "sum", "mean0", "mean"])
# Keep only the part of the identifier in front of the first "." so it matches
# the "name" column of the tissue tables (presumably drops a version suffix).
df_score["name"] = df_score["n"].str.split(".").str[0]
del df_score["n"]
# Remove per-tissue result files left behind by earlier runs.
for x in glob.glob("%s/*_phastCons.txt" % data_dir):
    os.remove(x)
# The combined result below is written into data_dir and is NOT matched by the
# cleanup pattern above, so on a re-run it would be picked up as a tissue table
# and break the three-column rename. Exclude every derived "_phastCons" file.
file_list = [
    path for path in glob.glob("%s/*.txt" % data_dir)
    if "_phastCons" not in os.path.basename(path)
]
if not file_list:
    # pd.concat([]) would fail with a far less helpful message.
    raise FileNotFoundError("no tissue tables (*.txt) found in %s" % data_dir)
df_list = []
for i_file in file_list:
    # Tissue label = lower-cased file name without the ".txt" extension.
    tissue = os.path.basename(i_file).split(".txt")[0].lower()
    df = pd.read_table(i_file, sep="\t")
    df.columns = ["name", "type", "FPKM"]
    # Right join keeps every gene of the tissue table; dropna() then removes
    # rows that contain any missing value, including genes without a score.
    df_merge = df_score.merge(df, on="name", how="right").dropna()
    df_merge["tissue"] = tissue
    df_list.append(df_merge)
df_total = pd.concat(df_list)
df_total = df_total[["tissue", "name", "type", "FPKM", "size", "covered", "sum", "mean0", "mean"]]
df_total = df_total.sort_values(["tissue", "type"])
result_file = "%s/total-tissues_phastCons_vertebrate.txt" % result_dir
df_total.to_csv(result_file, sep="\t", index=False)