In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import linregress
from scipy.stats import spearmanr
from false_positive_analysis import percent_false_positive
%config InlineBackend.figure_format='retina'

In [None]:
# Load one genebycell.csv per threshold directory (Threshold_0 .. Threshold_10).
# The first CSV column becomes the row index of each frame.
thresh_mtx = [
    pd.read_csv(
        f"/groups/CaiLab/personal/Lex/raw/112221_20kdash_3t3/notebook_pyfiles/genebycell/daostar/Threshold_{idx}/genebycell.csv",
        index_col=0,
    )
    for idx in range(11)
]

In [None]:
# Bulk RNA-seq reference tables, read from the notebook's working directory.
# rnaseq_1 gets its column names here; rnaseq_2 is renamed in a later cell.
rnaseq_1 = pd.read_csv("nih3t3_FPKM.csv")
rnaseq_1.columns = ["Genes", "FPKM"]

rnaseq_2 = pd.read_csv("kallisto_NIH3T3.csv")

In [None]:
# Barcode codebook; the three columns are renamed to Gene / Brcd1 / Brcd2.
# Rows whose gene name starts with "fake" are split off into `fakebook` in the next cell.
codebook_path = "/groups/CaiLab/personal/Lex/raw/112221_20kdash_3t3/barcode_key/corrected_codebook.csv"
codebook = pd.read_csv(codebook_path)
codebook.columns = ["Gene", "Brcd1", "Brcd2"]

In [None]:
# Partition the codebook: rows whose gene name starts with "fake" go to
# `fakebook`, everything else stays in `codebook`.
# NOTE: this overwrites `codebook`, so re-running the cell without reloading
# the codebook yields an empty `fakebook`.
is_fake = codebook["Gene"].str.startswith("fake")
fakebook = codebook[is_fake]
codebook = codebook[~is_fake]

In [None]:
# Build one summary row per threshold:
# [mean of "FP raw", normalized false positive rate, mean total counts, summed total counts].
fp_list = []
for i in range(11):
    fp, fake, norm_fpr = percent_false_positive(thresh_mtx[i], codebook, fakebook)
    total_counts = fp["total_counts"]
    fp_list.append(
        [
            fp["FP raw"].mean(),
            norm_fpr,
            total_counts.mean(),
            total_counts.sum(),
        ]
    )

In [None]:
# Tabulate the per-threshold summary rows; row i corresponds to Threshold_i.
df_stats = pd.DataFrame(
    fp_list,
    columns=["percent fp", "false positive rate", "mean counts", "total sum"],
)

In [None]:
# Display the per-threshold summary table (one row per threshold).
df_stats

In [None]:
# Overlay both false-positive metrics against the threshold index.
fp_series = [
    ("percent fp", "Percent False Positive"),
    ("false positive rate", "False Positive Rate"),
]
for column, series_label in fp_series:
    plt.plot(df_stats[column], label=series_label)
plt.xlabel("Thresholds tested")
plt.ylabel("Percent")
plt.legend()

In [None]:
# Mean total counts per cell-matrix as a function of the threshold index.
# The line is labeled so that plt.legend() has an artist to show; calling
# legend() with no labeled artists only emits a warning and an empty legend.
plt.plot(df_stats["mean counts"], label="Mean Total Counts")
plt.ylabel("Mean Total Counts")
plt.xlabel("Thresholds tested")
plt.legend()

In [None]:
# Load the dot-location tables: for every threshold (0..10) read z = 0 then z = 1,
# so dot_list[2*t] and dot_list[2*t + 1] belong to threshold t.
dot_list = [
    pd.read_csv(
        f"/groups/CaiLab/personal/Lex/raw/112221_20kdash_3t3/notebook_pyfiles/dots_comb/channels_combined_daostar/Threshold_{thresh}/locations_z_{z_idx}.csv"
    )
    for thresh in range(11)
    for z_idx in range(2)
]

In [None]:
# Per-threshold dot total: add the two z-slice tables of each threshold,
# each row count divided by 4.
# NOTE(review): the divisor 4 is presumably the number of rounds each dot
# appears in — confirm against the dot-detection pipeline.
total_dots = [
    (len(dot_list[first]) / 4) + (len(dot_list[first + 1]) / 4)
    for first in range(0, 22, 2)
]

In [None]:
# Ratio of summed decoded counts to total detected dots, per threshold.
# The values are plain ratios (not multiplied by 100) even though the axis
# label says "Percent".
decoded_ratio = df_stats["total sum"].values / np.array(total_dots)
plt.plot(decoded_ratio)
plt.xlabel("Thresholds tested")
plt.ylabel("Percent Decoded")
print(decoded_ratio)

In [None]:
# Recompute the false-positive statistics for threshold index 3, the same
# threshold the pseudobulk comparison below uses.
fp, fake, norm_fpr = percent_false_positive(
    thresh_mtx[3],
    codebook,
    fakebook,
)

In [None]:
# Number of fake (off-target) barcodes in the codebook.
fakebook.shape[0]

In [None]:
# Mean of the "total_real" column returned by percent_false_positive for threshold 3.
fp["total_real"].mean()

In [None]:
# Collapse the threshold-3 matrix into a pseudobulk profile: average each row
# across all columns, then expose the row index as a regular "Genes" column.
row_means = thresh_mtx[3].mean(axis=1)
bulk = row_means.to_frame().reset_index()
bulk.columns = ["Genes", "Counts"]

In [None]:
# Give the kallisto table the same "Genes" key column name as the other tables.
# NOTE(review): assumes the file has exactly two columns (gene, TPM) — confirm.
rnaseq_2.columns = ["Genes", "TPM"]

In [None]:
# Lowercase the gene names in both RNA-seq tables before merging.
# NOTE(review): `bulk["Genes"]` is not lowercased anywhere in this notebook;
# the merge only matches if the genebycell index is already lowercase — confirm.
for rnaseq_table in (rnaseq_1, rnaseq_2):
    rnaseq_table["Genes"] = rnaseq_table["Genes"].str.lower()

In [None]:
# Inner-join each RNA-seq table with the pseudobulk counts on the shared
# "Genes" column (the only column the tables have in common).
comb_1 = rnaseq_1.merge(bulk, on="Genes")
comb_2 = rnaseq_2.merge(bulk, on="Genes")

In [None]:
# Order each merged table by its bulk expression value, ascending.
sort_comb_1 = comb_1.sort_values(by="FPKM")
sort_comb_2 = comb_2.sort_values(by="TPM")

In [None]:
# Pearson correlation between FPKM (older RNA-seq table) and pseudobulk counts,
# taken from the r-value of a linear regression and rounded to 2 decimals.
linreg = linregress(sort_comb_1["FPKM"], sort_comb_1["Counts"])
pearsonr = round(linreg.rvalue, 2)
pearsonr

In [None]:
# Pearson correlation between TPM (newer RNA-seq table) and pseudobulk counts,
# taken from the r-value of a linear regression and rounded to 2 decimals.
# NOTE: this overwrites the `pearsonr` value computed for the FPKM table.
linreg = linregress(sort_comb_2["TPM"], sort_comb_2["Counts"])
pearsonr = round(linreg.rvalue, 2)
pearsonr

In [None]:
# Log-log scatter of pseudobulk counts against bulk RNA-seq FPKM.
#
# The correlation is recomputed here from the FPKM table. The shared `pearsonr`
# variable is overwritten by the TPM calculation in the preceding cell, so
# annotating with it would label this FPKM plot with the TPM correlation.
# As in the earlier cells, r is computed on the untransformed values.
fpkm_fit = linregress(x=sort_comb_1["FPKM"], y=sort_comb_1["Counts"])
fpkm_pearson_r = round(fpkm_fit.rvalue, 2)

plt.scatter(np.log10(sort_comb_1["Counts"]), np.log10(sort_comb_1["FPKM"]), s=5, alpha=0.5)
plt.ylabel("Bulk RNAseq Log10(FPKM)", fontsize=12)
plt.xlabel("Pseudobulk Log10(Counts)", fontsize=12)
plt.xticks(fontsize=12, rotation=0)
plt.yticks(fontsize=12, rotation=0)
plt.annotate(f"Pearson's r= {fpkm_pearson_r}", (-1.5, 1.5), fontsize=12)
sns.despine()