In [1]:
%matplotlib inline
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
import pandas as pd
import scipy.stats
import math

In [57]:
from pyfasta import Fasta
import tqdm

In [4]:
def getNames(fastafile):
    f = Fasta(fastafile)
    return f.keys()

In [6]:
def constructGroundTruth(truelist,seq_names):
    count_tsv={}
    for name in seq_names:
        count_tsv[name] = 0
    with open(truelist) as fp:
        lines = fp.readlines()
        for line in lines:
            tr = line.strip().split("/")[-1]
            count_tsv[tr] += 1
    return count_tsv

In [5]:
seq_names = getNames("/mnt/scratch1/bleed_through_human/ref/transcript_clean_200K.fa")

In [59]:
def batchWriteTruth(types):
    for t in tqdm.tqdm(types):
        truth_list = "/".join(["/mnt/scratch1/bleed_through_human/poly_reads",t,"truth.list"])
        count_tsv = constructGroundTruth(truth_list, \
                                         seq_names)
        ground_df = pd.DataFrame(list(count_tsv.items()))
        ground_df.columns = ['transcript_id','count']
        ground_df.to_csv("/".join(["/mnt/scratch1/bleed_through_human/poly_reads/quant",t,"truth.tsv"]), \
                         sep="\t",index=False)

In [47]:
#ground_df.to_csv("/mnt/scratch1/bleed_through_human/poly_reads/quant/reads_101/truth.tsv",sep="\t",index=False)

In [51]:
##samplenames
types = []
import glob
fastq_files=glob.glob("/mnt/scratch1/bleed_through_human/poly_reads/*/*1.fasta.gz")
for f in fastq_files:
    dirname = f.split("/")[-2]
    types.append(dirname)

In [60]:
batchWriteTruth(types)

100%|██████████| 14/14 [07:14<00:00, 31.52s/it]


In [64]:
def getMergedDataFrameFast(typeof,files):
    truth = pd.read_table(files[typeof]["truth"], delim_whitespace=True, \
       usecols=["transcript_id", "count"])
    df = truth
    df.rename(index=str, \
            columns={"transcript_id": "Name", "count": "count"}, inplace = True)
    
    sla = pd.read_table(files[typeof]["SLA"], delim_whitespace=True, \
                                 usecols=["Name", "NumReads"])
    df = pd.merge(df,sla, how="outer", on = "Name").fillna(0.0)
    
    kallisto = pd.read_table(files[typeof]["kallisto"], delim_whitespace=True, \
                                     usecols=["target_id", "est_counts"])
    kallisto.rename(index=str, columns={"target_id": "Name", \
                                                "est_counts": "NumReads_KAL"}, inplace = True)
    df = pd.merge(df,kallisto, how="outer", on = "Name").fillna(0.0)
    
    hera = pd.read_table(files[typeof]["hera"], delim_whitespace=True, \
                                     usecols=["#target_id", "est_counts"])
    hera["#target_id"]= hera["#target_id"].str.split(":",expand=True)[0]
    hera.rename(index=str, columns={"#target_id": "Name", \
                                               "est_counts": "NumReads_hera"}, inplace = True)
    df = pd.merge(df,hera, how="outer", on = "Name").fillna(0.0)
    return (df,truth,sla,kallisto,hera)

In [61]:
prefix = "/mnt/scratch1/bleed_through_human/poly_reads/quant"
#types = ["reads_101"]
files = {}
for t in types:
    fileinfo = {}
    fileinfo["truth"]= "/".join([prefix,t,"truth.tsv"])
    fileinfo["SLA"] =  "/".join([prefix,t,"salmon_out","quant.sf"])
    fileinfo["kallisto"] = "/".join([prefix,t,"kallisto_out","abundance.tsv"])
    #fileinfo["bowtie"] = "/".join([prefix,t,"quant","bowtie_out","quant.sf"])
    fileinfo["hera"] = "/".join([prefix,t,"hera_out","abundance.tsv"])
    files[t] = fileinfo

In [66]:
result_df = pd.DataFrame(columns=['sample','kallisto', 'SLA', 'hera'])
for i,t in enumerate(types):
    df, truth, sla,kallisto,hera = getMergedDataFrameFast(t,files)
    kal=df["count"].corr(df["NumReads_KAL"],method="spearman")
    sla=df["count"].corr(df["NumReads"],method="spearman")
    hera=df["count"].corr(df["NumReads_hera"],method="spearman")
    result_df.loc[i] = [t,kal,sla,hera]
    #print("sample name: {}".format(t))
    #print("kallisto",df["count"].corr(df["NumReads_KAL"],method="spearman"))
    #print("SLA",df["count"].corr(df["NumReads"],method="spearman"))
    #print("BowTie2",df_30["count"].corr(df_30["NumReads_BT"],method="spearman"))
    #print("hera",df["count"].corr(df["NumReads_hera"],method="spearman"))

In [69]:
result_df

Unnamed: 0,sample,kallisto,SLA,hera
0,reads_120,0.838221,0.880553,0.858618
1,reads_183,0.844338,0.883299,0.769599
2,reads_57,0.840198,0.882222,0.835788
3,reads_8,0.84053,0.881696,0.814793
4,reads_89,0.844404,0.882497,0.835946
5,reads_101,0.841968,0.882108,0.846867
6,reads_114,0.841816,0.880559,0.855661
7,reads_174,0.838394,0.880393,0.914232
8,reads_198,0.84196,0.882128,0.819621
9,reads_137,0.840609,0.882348,0.796499
