In [None]:
#general packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import linregress
import tifffile as tf
from scipy.stats import pearsonr
from tqdm import tqdm
#custom function
from false_positive_analysis import percent_false_positive
%config InlineBackend.figure_format='retina'

# Look at false positive rate

In [None]:
#load the gene-by-cell matrix of the selected channel for each of the 11 thresholds
channel = 4

thresh_mtx = []
for threshold in range(11):
    mtx_path = f"/groups/CaiLab/personal/Lex/raw/031322_11kgenes_experiment/notebook_pyfiles/genebycell/screen_3_3_comb/Threshold_{threshold}/comb_genebycell_{channel}.csv"
    #transpose so that the total-count filter can be applied row-wise
    by_cell = pd.read_csv(mtx_path, index_col=0).T
    #cells above 1000 total counts
    passes_min_counts = by_cell.sum(axis=1)>1000
    #transpose back to the orientation of the csv before storing
    thresh_mtx.append(by_cell[passes_min_counts].T)

In [None]:
#bulk RNA-seq reference tables: the FPKM file first, then the kallisto file
rnaseq_1 = pd.read_csv("./RNAseq_files/nih3t3_FPKM.csv")
#give the FPKM table fixed column names
rnaseq_1.columns = ["Genes","FPKM"]
rnaseq_2 = pd.read_csv("./RNAseq_files/kallisto_NIH3T3.csv")

In [None]:
#codebook file names, looked up by channel number minus one
codebook_list = [
    "codebook_string_750.csv",
    "codebook_string_647.csv",
    "codebook_string_561.csv",
    "codebook_string_488.csv",
]
#codebook of the selected channel
codebook_name = codebook_list[channel-1]
codebook = pd.read_csv(f"/groups/CaiLab/personal/Lex/raw/031322_11kgenes_experiment/barcode_key/{codebook_name}", index_col=0)

In [None]:
#split the codebook: entries whose name starts with "fake" go to fakebook, the rest stay
#NOTE: codebook is overwritten here, so running this cell a second time leaves fakebook empty
is_fake = codebook.index.str.startswith("fake")
fakebook = codebook[is_fake]
codebook = codebook[~is_fake]

In [None]:
#false positive statistics for each of the 11 thresholds
fp_list = []
for i in range(11):
    #project helper; only its first and third return values are used below
    fp, fake, norm_fpr = percent_false_positive(thresh_mtx[i], codebook, fakebook)
    total_counts = fp["total_counts"]
    fp_list.append([fp["FP raw"].mean(), norm_fpr, total_counts.mean(), total_counts.sum()])

#convert to df, one row per threshold
df_stats = pd.DataFrame(
    fp_list,
    columns=["percent fp","false positive rate","mean counts", "total sum"],
)
df_stats

In [None]:
#plot percent false positive and false positive rate, both scaled by 100, per threshold
for column, label in [("percent fp", "Percent False Positive"),
                      ("false positive rate", "False Positive Rate")]:
    plt.plot(df_stats[column]*100, label = label)
plt.xlabel("Thresholds tested")
plt.ylabel("Percent")
sns.despine()
plt.legend()

In [None]:
#plot average counts per threshold
#the line is given a label so that plt.legend() has an entry to draw;
#without any labelled artist the legend call only emits a warning
plt.plot(df_stats["mean counts"], label = "Mean Total Counts")
plt.ylabel("Mean Total Counts")
plt.xlabel("Thresholds tested")
sns.despine()
plt.legend()

# Percent decoded by cells

In [None]:
#collect the percent decoded value of every position (z = 0 only)
#NOTE(review): this path points at the 060322_4kgenes experiment and Channel_3, while the
#matrices above come from 031322_11kgenes with channel = 4 -- confirm this is intended
percent_decoded_list = []
for i in range(52):
    for z in range(1):
        src = f"/groups/CaiLab/personal/Lex/raw/060322_4kgenes/notebook_pyfiles/decoded/final_11p52_33_heg_svm_0p10_diff0_fdr10_seed2/Channel_3/Pos_{i}/percent_decoded_z_{z}.txt"
        try:
            #the with-block closes the file; only the first line is needed
            with open(src) as f:
                first_line = f.readline()
        except FileNotFoundError:
            #positions without a result file are skipped
            continue
        #the value is the last space-separated token of the first line
        decoded = first_line.split(" ")[-1]
        percent_decoded_list.append(float(decoded))

In [None]:
#average percent decoded over all positions that had a result file
np.asarray(percent_decoded_list).mean()

# RNA-seq correlation screen

In [None]:
#rename rnaseq columns and lowercase gene names
#the kallisto table is renamed first so the cell can be re-run: looking up its
#pre-rename "gene" column would raise KeyError once the rename has happened
rnaseq_2.columns = ["Genes","TPM"]
rnaseq_1["Genes"]=rnaseq_1["Genes"].str.lower()
rnaseq_2["Genes"]=rnaseq_2["Genes"].str.lower()

In [None]:
#Pearson's r between bulk RNA-seq FPKM and pseudobulk counts, one value per threshold
r_list = []
for threshold in range(11):
    #pseudobulk: row-wise mean of the matrix, as a two-column table
    bulk = thresh_mtx[threshold].mean(axis=1).rename_axis("Genes").reset_index(name="Counts")
    #inner merge on the shared "Genes" column
    comb_1 = rnaseq_1.merge(bulk)
    #first element of the pearsonr result is r; keep two decimals
    r = round(pearsonr(comb_1["FPKM"],comb_1["Counts"])[0],2)
    r_list.append(r)

In [None]:
#plot Pearson's r with RNA-seq for every threshold
rnaseq_r = np.array(r_list)
plt.plot(rnaseq_r)
plt.xlabel("Thresholds tested")
plt.ylabel("Pearson's r correlations with RNA-seq")
sns.despine()
plt.show()

# smFISH correlations

In [None]:
def get_corr_and_eff(df):
    """Fit "Counts" against "smFISH Counts" with the intercept forced to zero.

    Use non logged data.

    df: DataFrame with the columns "smFISH Counts" and "Counts".

    Returns (slope, intercept, Pearson's r). The intercept comes from an
    all-zero design column, so it carries no fitted information.
    """
    smfish_counts = df["smFISH Counts"].values
    counts = df["Counts"].values
    # second column is all zeros, so the fit has no usable intercept term
    design = np.vstack([smfish_counts, np.zeros(len(smfish_counts))]).T
    solution = np.linalg.lstsq(design, counts, rcond=None)[0]
    slope, intercept = solution
    # correlation is computed on the raw values, independent of the fit
    corr = pearsonr(smfish_counts, counts)[0]

    return slope, intercept, corr

In [None]:
#read in the smFISH results table (pseudobulk means are computed from it in the next cell)
smfish = pd.read_csv("./smFISH_results.csv")

In [None]:
#pseudobulk smFISH: one mean per column of the smFISH table, with lowercased names
smfish_df = (
    smfish.T.mean(axis=1)
    .rename_axis("Genes")
    .reset_index(name="smFISH Counts")
)
smfish_df["Genes"] = smfish_df["Genes"].str.lower()

In [None]:
#check smFISH correlation and efficiency for all thresholds
corr_list = []
for threshold in range(11):
    #convert data to pseudobulk: row-wise mean of the matrix
    bulk = pd.DataFrame(thresh_mtx[threshold].mean(axis=1)).reset_index()
    bulk.columns = ["Genes", "Counts"]
    #merge on the shared "Genes" column
    comb_2 = pd.merge(smfish_df,bulk)
    #slope of the zero-intercept fit, its intercept term and Pearson's r
    m,c,r = get_corr_and_eff(comb_2)
    corr_list.append([m,c,r])

#one row per threshold; the middle column label is spelled "intercept"
corr_df = pd.DataFrame(corr_list, columns=["slope","intercept","pearson's r"])

In [None]:
#show the per-threshold fit results (one row per threshold)
corr_df

In [None]:
#Pearson's r with smFISH for every threshold
smfish_r = corr_df["pearson's r"]
plt.plot(smfish_r)
plt.xlabel("Thresholds tested")
plt.ylabel("Pearson's r with smFISH")
sns.despine()
plt.show()

In [None]:
#slope of the zero-intercept fit (plotted as efficiency) for every threshold
efficiency = corr_df["slope"]
plt.plot(efficiency)
plt.xlabel("Thresholds tested")
plt.ylabel("Efficiency with smFISH")
sns.despine()
plt.show()

# Look at best threshold

In [None]:
#index of the threshold used for the detailed plots below
best_thresh = 4

#convert data to pseudobulk: row-wise mean of the matrix at the chosen threshold
bulk = pd.DataFrame(thresh_mtx[best_thresh].mean(axis=1)).reset_index()
bulk.columns = ["Genes", "Counts"]
#merge on the shared "Genes" column
comb_1 = pd.merge(rnaseq_1,bulk)
#pearson's correlation, rounded to two decimals for the plot annotation
r = pearsonr(comb_1["FPKM"],comb_1["Counts"])
r = round(r[0],2)
#r is not appended to r_list: that list holds exactly one value per threshold of the
#sweep, and appending here would add an extra entry every time this cell runs

In [None]:
#log2 of counts and FPKM with a 0.1 pseudocount, stored as "Log Counts" and "Log FPKM"
for col in ("Counts", "FPKM"):
    comb_1[f"Log {col}"] = np.log2(comb_1[col]+0.1)

In [None]:
#hexbin plot of log pseudobulk counts against log bulk FPKM
sns.set_style("white")
hexplot = sns.jointplot(
    data=comb_1,
    x="Log Counts",
    y="Log FPKM",
    kind="hex",
    mincnt=0.1,
    cmap="plasma",
    dropna=True,
    joint_kws={"gridsize": 100},
)
plt.xlabel("Pseudobulk Log2(Counts+0.1)", fontsize=12)
plt.ylabel("Bulk RNAseq Log2(FPKM+0.1)", fontsize=12)
#drop both marginal axes so only the joint panel remains
hexplot.ax_marg_x.remove()
hexplot.ax_marg_y.remove()
#r is the rounded RNA-seq correlation computed for the chosen threshold
plt.annotate(f"Pearson's r= {r}", (-1.0,8.4), fontsize=12)
plt.title("Channel 488 nm", fontweight="bold")
sns.despine()

In [None]:
#convert data to pseudobulk: row-wise mean of the matrix at the chosen threshold
bulk = pd.DataFrame(thresh_mtx[best_thresh].mean(axis=1)).reset_index()
bulk.columns = ["Genes", "Counts"]
#merge on the shared "Genes" column
comb_2 = pd.merge(smfish_df,bulk)
#x and y are kept as arrays because the plotting cell draws them directly
x = comb_2["smFISH Counts"].values
y = comb_2["Counts"].values
#reuse the helper from the threshold sweep instead of repeating its fit inline,
#so the numbers shown here are computed the same way as the ones in corr_df
m,c,r = get_corr_and_eff(comb_2)

In [None]:
#scatter of pseudobulk counts against smFISH counts, with the zero-intercept fit
r_text = f"Pearson's r= {round(r,2)}"
eff_text = f"Efficiency = {round(m,2)}"
plt.plot(x, y, 'bo')
#fitted line through the origin with slope m
plt.plot(x, x*m, c = "k")
plt.title("Channel 488 nm", fontweight="bold")
plt.ylabel("Average LANTERN Counts")
plt.xlabel("Average smFISH counts")
plt.annotate(r_text, (0,30), fontsize=12)
plt.annotate(eff_text, (0,28), fontsize=12)
sns.despine()
plt.show()

In [None]:
# Disabled cell: log2-scale variant of the smFISH scatter plot; nothing here is executed.
# NOTE(review): the title below says 750 nm while the active plots are titled 488 nm --
# update it before re-enabling, or delete this cell if it is no longer needed.
# #plot correlation and efficiency
# plt.scatter(x = np.log2(comb_2["smFISH Counts"]), y = np.log2(comb_2["Counts"]))
# plt.xlabel("Log2(Average smFISH Counts)", fontsize=12)
# plt.ylabel("Log2(Average LANTERN Counts)", fontsize=12)
# plt.ylim(0,5)
# plt.annotate(f"Pearson's r= {round(r,2)}", (-0.8,4.8), fontsize=12)
# plt.annotate(f"Efficiency = {round(m,2)}", (-0.8,4.5), fontsize=12)
# plt.title("Channel 750 nm", fontweight="bold")
# sns.despine()