In [1]:
%matplotlib inline
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
import pandas as pd
import scipy.stats
import math

In [2]:
def getMergedDataFrameFast(typeof, file_map=None):
    """Merge the truth counts with each quantifier's estimated counts.

    Four whitespace-delimited tables are read for the experiment ``typeof``
    and outer-joined on the transcript name, so a transcript present in any
    one table appears in the result. Values missing after each join are
    filled with 0.0.

    Parameters
    ----------
    typeof : str
        Key of the experiment in the path mapping.
    file_map : dict, optional
        Mapping ``{experiment: {"truth": path, "SLA": path,
        "kallisto": path, "hera": path}}``. When omitted, the notebook-level
        ``files`` dict is used, so existing one-argument calls keep working.

    Returns
    -------
    tuple
        ``(df, truth, sla, kallisto, hera)`` where ``df`` is the merged frame
        with columns ``Name``, ``count``, ``NumReads``, ``NumReads_KAL`` and
        ``NumReads_hera``, and the other four are the individual tables after
        renaming their key column to ``Name``.

    Raises
    ------
    KeyError
        If ``typeof`` or one of the four path keys is missing from the mapping.
    """
    # Fall back to the notebook-global `files` only when no mapping is given;
    # passing one explicitly avoids depending on a cell defined further down.
    paths = (files if file_map is None else file_map)[typeof]

    # sep=r"\s+" is the supported spelling of the deprecated
    # delim_whitespace=True and parses the same whitespace-delimited input.
    truth = pd.read_table(paths["truth"], sep=r"\s+",
                          usecols=["transcript_id", "count"])
    # index=str is kept from the original so the returned frame is unchanged.
    truth = truth.rename(index=str, columns={"transcript_id": "Name"})

    sla = pd.read_table(paths["SLA"], sep=r"\s+",
                        usecols=["Name", "NumReads"])
    df = pd.merge(truth, sla, how="outer", on="Name").fillna(0.0)

    kallisto = pd.read_table(paths["kallisto"], sep=r"\s+",
                             usecols=["target_id", "est_counts"])
    kallisto = kallisto.rename(index=str, columns={"target_id": "Name",
                                                   "est_counts": "NumReads_KAL"})
    df = pd.merge(df, kallisto, how="outer", on="Name").fillna(0.0)

    hera = pd.read_table(paths["hera"], sep=r"\s+",
                         usecols=["#target_id", "est_counts"])
    # Keep only the text before the first ":" so the ids can be joined on Name.
    hera["#target_id"] = hera["#target_id"].str.split(":", expand=True)[0]
    hera = hera.rename(index=str, columns={"#target_id": "Name",
                                           "est_counts": "NumReads_hera"})
    df = pd.merge(df, hera, how="outer", on="Name").fillna(0.0)
    return (df, truth, sla, kallisto, hera)

## Result on whole transcriptome

In [12]:
# Input locations for the "bleed_through_200K" experiment.
prefix = "/mnt/scratch1/bleed_through_human"
types = ["bleed_through_200K"]
files = {}
for t in types:
    # One entry per experiment: the truth table under rsem_sim plus the
    # output table of each quantifier under quant/.
    fileinfo = {
        "truth": "/".join((prefix, t, "rsem_sim", "sim.sim.isoforms.results")),
        "SLA": "/".join((prefix, t, "quant", "salmon_out", "quant.sf")),
        "kallisto": "/".join((prefix, t, "quant", "kallisto_out", "abundance.tsv")),
        # Bowtie entry is disabled:
        # "bowtie": "/".join((prefix, t, "quant", "bowtie_out", "quant.sf")),
        "hera": "/".join((prefix, t, "quant", "hera_out", "abundance.tsv")),
    }
    files[t] = fileinfo

In [13]:
# Read and merge the truth and quantifier tables for "bleed_through_200K".
(df, truth, sla, kallisto, hera) = getMergedDataFrameFast("bleed_through_200K")

In [14]:
# Spearman correlation of the "count" column against each tool's estimate column.
for label, column in (("kallisto", "NumReads_KAL"),
                      ("SLA", "NumReads"),
                      # ("BowTie2", "NumReads_BT") is disabled
                      ("hera", "NumReads_hera")):
    print(label, df["count"].corr(df[column], method="spearman"))

('kallisto', 0.74505572204396908)
('SLA', 0.81987109742375885)
('hera', 0.78666705404644033)


## Result on sampled GTF + transcriptome

In [15]:
# Input locations for the "bleed_through_hera" experiment.
prefix = "/mnt/scratch1/bleed_through_human"
types = ["bleed_through_hera"]
files = {}
for t in types:
    # One entry per experiment: the truth table under rsem_sim plus the
    # output table of each quantifier under quant/.
    fileinfo = {
        "truth": "/".join((prefix, t, "rsem_sim", "sim.sim.isoforms.results")),
        "SLA": "/".join((prefix, t, "quant", "salmon_out", "quant.sf")),
        "kallisto": "/".join((prefix, t, "quant", "kallisto_out", "abundance.tsv")),
        # Bowtie entry is disabled:
        # "bowtie": "/".join((prefix, t, "quant", "bowtie_out", "quant.sf")),
        "hera": "/".join((prefix, t, "quant", "hera_out", "abundance.tsv")),
    }
    files[t] = fileinfo

In [16]:
# Read and merge the truth and quantifier tables for "bleed_through_hera".
(df, truth, sla, kallisto, hera) = getMergedDataFrameFast("bleed_through_hera")

In [17]:
# Spearman correlation of the "count" column against each tool's estimate column.
for label, column in (("kallisto", "NumReads_KAL"),
                      ("SLA", "NumReads"),
                      # ("BowTie2", "NumReads_BT") is disabled
                      ("hera", "NumReads_hera")):
    print(label, df["count"].corr(df[column], method="spearman"))

('kallisto', 0.75366744405042541)
('SLA', 0.842216148287251)
('hera', 0.7572425813340482)
