In [None]:
#general packages
import pandas as pd
import numpy as np
from collections import Counter
import tifffile as tf
from skimage.measure import regionprops
#plotting packages
import matplotlib.pyplot as plt
import seaborn as sns
#custom function
from post_analysis import *
%config InlineBackend.figure_format='retina'

In [None]:
# Load the final gene-by-cell count table and the matching gene-density table
# for every channel listed in `channels`, then stack the channels row-wise.
# Both files live in the same output directory, so it is spelled out once.
final_dir = "/groups/CaiLab/personal/Lex/raw/06122022_4kgenes/notebook_pyfiles/genebycell/final_222_33_heg_svm_0p15_diff1_fdr10_test/final"
channels = [3]

all_channels = []
all_channels_den = []

# NOTE: `c` is intentionally left as a module-level loop variable. The codebook
# cell further down picks its file with `codebooks[c-1]`, so `c` must keep the
# last channel number after this loop finishes.
for c in channels:
    all_channels.append(pd.read_csv(f"{final_dir}/genebycell_{c}.csv", index_col=0))
    all_channels_den.append(pd.read_csv(f"{final_dir}/gene_density_{c}.csv", index_col=0))

# Entries that are missing after stacking are filled with zero.
mtx = pd.concat(all_channels, axis=0).fillna(0)
mtx_den = pd.concat(all_channels_den, axis=0).fillna(0)
# An optional filter that keeps only columns whose total exceeds a threshold
# (100 and 300 were tried) is currently disabled.

In [None]:
# Show the stacked count matrix as this cell's output for a quick visual check.
mtx

In [None]:
# Bulk RNA-seq reference table. Its columns are relabelled to "Genes" / "TPM",
# the names the merge and correlation cells rely on.
rnaseq = pd.read_csv("./RNAseq_files/NIH3T3_CCS_TPM_REP1.csv").set_axis(
    ["Genes", "TPM"], axis="columns"
)

In [None]:
# Codebook for the channel held in `c`. `c` is not set in this cell: it is the
# loop variable left over from the matrix-loading cell, and channel number c
# selects entry c-1 of the list below.
codebooks = ["codebook_string_647.csv","codebook_string_561.csv","codebook_string_488.csv"]
codebook_with_fakes = pd.read_csv(
    f"/groups/CaiLab/personal/Lex/raw/06122022_4kgenes/notebook_pyfiles/decoding_files/SVM_Feature_Radial_Decoding/codebook_converter/{codebooks[c-1]}",
    index_col=0,
)
# Rows whose label starts with "fake" form the false codebook; every other row
# is kept as the true codebook.
is_fake_row = codebook_with_fakes.index.str.startswith("fake")
fakebook = codebook_with_fakes[is_fake_row]
codebook = codebook_with_fakes.drop(fakebook.index)

In [None]:
# False-positive statistics. `percent_false_positive` is not defined in this
# notebook (presumably supplied by the post_analysis star import); it returns a
# table (`fp`) plus a second object (`fake`) that the "problematic hybs"
# section reads its "genes" column from.
fp, fake = percent_false_positive(mtx, codebook, fakebook)

# Column means of the three rate/count columns, then the grand total of counts.
percent_fp, norm_fpr, mean_counts = (
    fp[column].mean() for column in ("FP raw", "FDR", "total_real")
)
sum_counts = fp["total_counts"].sum()

# Order matters: the summary table labels these values positionally.
fp_list = [percent_fp,norm_fpr,mean_counts,sum_counts]

In [None]:
# One-row summary of the FDR statistics, displayed as the cell output.
# The values are loaded as a single column and transposed so that each
# statistic becomes its own labelled column.
stat_labels = ["percent fp","false positive rate","mean true counts", "total sum"]
df_stats = pd.DataFrame(fp_list, index=stat_labels).T
df_stats

In [None]:
# Pseudobulk profile: row-wise mean of the count matrix, one value per row label (gene).
bulk = mtx.mean(axis=1).reset_index()
bulk.columns = ["Genes", "Counts"]

# Lower-case gene names on both sides so the join is case-insensitive.
bulk["Genes"] = bulk["Genes"].str.lower()
rnaseq["Genes"] = rnaseq["Genes"].str.lower()

# Default merge: inner join on the only shared column ("Genes").
comb_1 = rnaseq.merge(bulk)

# Pearson's r between bulk TPM and pseudobulk counts, rounded to two decimals
# for the plot annotation. `pearsonr` is not imported by name in this notebook;
# presumably it arrives through the post_analysis star import.
r = round(pearsonr(comb_1["TPM"],comb_1["Counts"])[0], 2)

In [None]:
# log10 transform with a 0.1 pseudocount (so zeros stay finite); adds the
# "Log Counts" and "Log TPM" columns used by the hexbin plot.
for log_col, raw_col in (("Log Counts", "Counts"), ("Log TPM", "TPM")):
    comb_1[log_col] = np.log10(comb_1[raw_col]+0.1)

In [None]:
# Hexbin plot of pseudobulk "Log Counts" against bulk RNA-seq "Log TPM",
# annotated with the Pearson r computed in the pseudobulk cell.
# NOTE(review): every plt.* call below acts on whichever axes is current after
# jointplot returns — presumably the joint axes; confirm for the installed
# seaborn version before reordering or refactoring these statements.
sns.set_style("white")
# Hexbin grid resolution, forwarded to the joint plot.
joint_kws=dict(gridsize=50)
# mincnt and cmap are passed as extra keyword arguments alongside joint_kws.
hexplot = sns.jointplot(data=comb_1, x="Log TPM", y="Log Counts", kind="hex",mincnt=0.1, 
              cmap="plasma", dropna=True, joint_kws=joint_kws)
# Axis labels spell out the transform applied in the previous cell.
plt.xlabel("Bulk RNAseq Log10(TPM+0.1)", fontsize=12)
plt.ylabel("Pseudobulk Log10(Counts+0.1)", fontsize=12)
# Drop the marginal histograms; only the joint panel is kept.
hexplot.ax_marg_x.remove()
hexplot.ax_marg_y.remove()
# The annotation position (-1.0, 1.6) is hard-coded.
plt.annotate(f"Pearson's r= {r}", (-1.0,1.6), fontsize=12)
plt.title("All Channels", fontweight="bold")
# Colorbar for the most recently drawn mappable (expected to be the hexbin).
plt.colorbar()
sns.despine()
plt.show()

In [None]:
# Reference tables for the correlation cells: smFISH (27 genes) and the
# 150-gene set, each as a count table and as a density table.
# (An earlier variant read ./nih3t3_smfish/smFISH_results.csv transposed instead.)
smfish, _150genes, smfish_density, _150genes_density = (
    pd.read_csv(reference_path, index_col=0)
    for reference_path in (
        "./nih3t3_smfish/smfish_27gene_custom_thresh_2.csv",
        "./nih3t3_smfish/150genes_diff0.csv",
        "./nih3t3_smfish/27gene_smfish_density.csv",
        "./nih3t3_smfish/150_genes_density.csv",
    )
)

In [None]:
# 27-gene smFISH reference vs. this dataset's counts, with log=False.
# `correlation` is an external helper (presumably from post_analysis).
correlation(
    mtx,
    smfish,
    label_x="smFISH",
    label_y="LANTERN",
    title="All Channels",
    cell_size_normalized=False,
    return_comb_df=False,
    log=False,
)

In [None]:
# 27-gene smFISH reference vs. this dataset's counts, same call as the
# previous cell but with log=True.
correlation(
    mtx,
    smfish,
    label_x="smFISH",
    label_y="LANTERN",
    title="All Channels",
    cell_size_normalized=False,
    return_comb_df=False,
    log=True,
)

In [None]:
# 150-gene reference vs. this dataset's counts, with log=False.
correlation(
    mtx,
    _150genes,
    label_x="150 genes",
    label_y="LANTERN",
    title="All Channels",
    cell_size_normalized=False,
    return_comb_df=False,
    log=False,
)

In [None]:
# 150-gene reference vs. this dataset's counts, same call as the previous
# cell but with log=True.
correlation(
    mtx,
    _150genes,
    label_x="150 genes",
    label_y="LANTERN",
    title="All Channels",
    cell_size_normalized=False,
    return_comb_df=False,
    log=True,
)

In [None]:
# 27-gene smFISH density table vs. this dataset's gene-density table;
# cell_size_normalized=True is passed for the density comparison.
correlation(
    mtx_den,
    smfish_density,
    label_x="smFISH",
    label_y="LANTERN",
    title="All Channels",
    cell_size_normalized=True,
    return_comb_df=False,
    log=False,
)

In [None]:
# 150-gene density table vs. this dataset's gene-density table;
# cell_size_normalized=True is passed for the density comparison.
correlation(
    mtx_den,
    _150genes_density,
    label_x="150 genes",
    label_y="LANTERN",
    title="All Channels",
    cell_size_normalized=True,
    return_comb_df=False,
    log=False,
)

# Percent decoded

In [None]:
# Collect the percent-decoded value written for each position / z-slice of
# Channel_3. The value is the last space-separated token on the first line of
# each report file.
percent_decoded_list = []
for i in range(52):
    for z in range(1):
        src = f"/groups/CaiLab/personal/Lex/raw/06122022_4kgenes/notebook_pyfiles/decoded/final_222_33_heg_svm_0p15_diff1_fdr10/Channel_3/Pos_{i}/percent_decoded_z_{z}.txt"
        try:
            # The context manager closes the file on exit, so no explicit
            # close() is needed; only the first line is read.
            with open(src) as f:
                decoded = f.readline().split(" ")[-1]
        except FileNotFoundError:
            # Positions without a report file are skipped on purpose.
            continue
        # Parsing stays outside the try block so a malformed report still
        # raises instead of being mistaken for a missing file.
        percent_decoded_list.append(float(decoded))

In [None]:
# Average of the collected percent-decoded values (NaN if no report file was found).
np.mean(percent_decoded_list)

# Identify problematic hybs

In [None]:
# Tally how many times each entry of fake["genes"] occurs, then turn the tally
# into a two-column frame: "index" holds the name, column 0 holds the count.
fake_gene_tally = Counter(fake["genes"])
counts_df = pd.DataFrame.from_dict(fake_gene_tally, orient="index").reset_index()
counts_df

In [None]:
# Load the 647 nm codebook that the following cells index by fake-barcode name.
# NOTE(review): this rebinds `codebook`, which earlier held the true-barcode
# table passed to percent_false_positive, so re-running the FDR cell after this
# one would use a different table. The path also points at another experiment
# folder (150genes3bind_040622) than the rest of the notebook (06122022_4kgenes)
# — confirm this is the intended file.
codebook = pd.read_csv("/groups/CaiLab/personal/Lex/raw/150genes3bind_040622/barcode_key/codebook_647nm.csv", index_col=0)

In [None]:
# Lower-case the codebook row labels in place.
# NOTE(review): the lookups below use names taken from fake["genes"]; presumably
# those are already lower-case — verify, otherwise .loc will raise KeyError.
codebook.index = codebook.index.str.lower()

In [None]:
# The 30 most frequent fake barcodes: sort by the count column (labelled 0)
# in descending order and keep the first 30 rows.
maj_fakes = counts_df.sort_values(by=0, ascending=False).iloc[:30]

In [None]:
# For each of the most frequent fake barcodes, pull its codebook row as a
# plain list of values (one entry per codebook column).
troubled_hybs = [
    codebook.loc[fake_name].values.tolist()
    for fake_name in maj_fakes["index"]
]

In [None]:
# Stack the per-barcode lists into one array: rows are barcodes, columns are
# codebook columns.
hybstocheck = np.asarray(troubled_hybs)

In [None]:
# Display the array of codebook rows for the most frequent fake barcodes.
hybstocheck

In [None]:
# For every column of hybstocheck (one column per barcoding round), find the
# most common value; ties resolve to the smallest value because np.unique
# returns sorted values and argmax picks the first maximum.
n_rounds = hybstocheck.shape[1]
bad_hybs = []
for round_idx in range(n_rounds):
    values, tallies = np.unique(hybstocheck[:, round_idx], return_counts=True)
    bad_hybs.append(values[tallies.argmax()])

print(f"Following hybs by rounds are problematic:{bad_hybs}" )

# Random checks

In [None]:
# Compare two decoding runs position by position: for each run, the number of
# rows in dots_used_undefined_z_0.csv (hyb < 48) is divided by the number of
# rows in the detected-dots locations file (hyb < 48), and the difference
# current - old is recorded.
percent_und_list = []
channel=3
notebook_dir = "/groups/CaiLab/personal/Lex/raw/06122022_4kgenes/notebook_pyfiles"
current_run = "final_222_33_heg_svm_0p15_diff1_fdr10"
old_run = "final_11p52_33_heg_svm_0p15_diff1_fdr10"
for i in range(25):
    current_und = pd.read_csv(f"{notebook_dir}/decoded/{current_run}/Channel_{channel}/Pos_{i}/dots_used_undefined_z_0.csv")
    current_und = current_und[current_und["hyb"] < 48]

    # Both runs are normalised by the same locations file, so it is read and
    # filtered once and shared instead of being loaded twice per position.
    current_loc = pd.read_csv(f"{notebook_dir}/dots_detected/Channel_{channel}/genes_in_cells/Pos{i}/locations_z_0.csv")
    current_loc = current_loc[current_loc["hyb"] < 48]
    old_loc = current_loc
    current_per = len(current_und)/len(current_loc)

    old_und = pd.read_csv(f"{notebook_dir}/decoded/{old_run}/Channel_{channel}/Pos_{i}/dots_used_undefined_z_0.csv")
    old_und = old_und[old_und["hyb"] < 48]
    old_per = len(old_und)/len(old_loc)

    # Stored as a fraction; positive means the current run has more undefined dots.
    diff = current_per-old_per

    percent_und_list.append(diff)

In [None]:
# Mean (current - old) difference in the undefined-dot fraction, scaled by 100
# so it reads as percentage points.
np.mean(percent_und_list)*100

In [None]:
# Compare two decoding runs position by position: for each run, the number of
# rows in dots_used_fakes_z_0.csv (hyb < 48) is divided by the number of rows
# in the detected-dots locations file (hyb < 48), and the difference
# current - old is recorded.
percent_fake_list = []
channel=3
notebook_dir = "/groups/CaiLab/personal/Lex/raw/06122022_4kgenes/notebook_pyfiles"
current_run = "final_222_33_heg_svm_0p15_diff1_fdr10"
old_run = "final_11p52_33_heg_svm_0p15_diff1_fdr10"
for i in range(25):
    current_fake = pd.read_csv(f"{notebook_dir}/decoded/{current_run}/Channel_{channel}/Pos_{i}/dots_used_fakes_z_0.csv")
    current_fake = current_fake[current_fake["hyb"] < 48]

    # Both runs are normalised by the same locations file, so it is read and
    # filtered once and shared instead of being loaded twice per position.
    current_loc = pd.read_csv(f"{notebook_dir}/dots_detected/Channel_{channel}/genes_in_cells/Pos{i}/locations_z_0.csv")
    current_loc = current_loc[current_loc["hyb"] < 48]
    old_loc = current_loc
    current_per = len(current_fake)/len(current_loc)

    old_fake = pd.read_csv(f"{notebook_dir}/decoded/{old_run}/Channel_{channel}/Pos_{i}/dots_used_fakes_z_0.csv")
    old_fake = old_fake[old_fake["hyb"] < 48]
    old_per = len(old_fake)/len(old_loc)

    # Stored as a fraction; positive means the current run has more fake dots.
    diff = current_per-old_per

    percent_fake_list.append(diff)

In [None]:
# Mean (current - old) difference in the fake-dot fraction, in percentage points.
# The list stores fractions, so the mean is scaled by 100 — the same scaling the
# undefined-dot comparison applies to its mean.
np.mean(percent_fake_list)*100