In [None]:
#general packages
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from collections import Counter
#plotting packages
import matplotlib.pyplot as plt
import seaborn as sns
#custom function
from false_positive_analysis import percent_false_positive
%config InlineBackend.figure_format='retina'

In [None]:
#read in the final gene-by-cell matrix for each requested channel
genebycell_dir = "/groups/CaiLab/personal/Lex/raw/050222_150genes4binding/notebook_pyfiles/genebycell/final_11p52_22_heg_svm_0p20/final"
channel_ids = [1]

all_channels = []
for channel in channel_ids:
    #transpose so each row is one column of the csv (presumably one cell -- TODO confirm)
    per_column = pd.read_csv(f"{genebycell_dir}/genebycell_{channel}.csv", index_col=0).T
    #keep only rows whose summed counts exceed 100, then restore the csv orientation
    kept = per_column[per_column.sum(axis=1) > 100]
    all_channels.append(kept.T)

#join channels side by side; values absent from a channel are filled with 0
mtx = pd.concat(all_channels, axis=1).fillna(0)

In [None]:
#take a look at the filtered, concatenated matrix (a bare name on the last line makes Jupyter render it)
mtx

In [None]:
#load the bulk RNA-seq table and give its columns fixed names
rnaseq_csv = "./RNAseq_files/nih3t3_FPKM_CCS.csv"
rnaseq = pd.read_csv(rnaseq_csv)
#the assignment below requires exactly two columns; assumes they are gene name then FPKM -- TODO confirm
rnaseq.columns = ["Genes", "FPKM"]

In [None]:
#read the full codebook, indexed by its first column
full_codebook = pd.read_csv("/groups/CaiLab/personal/Lex/raw/050222_150genes4binding/notebook_pyfiles/decoding_files/SVM_Feature_Radial_Decoding/codebook_converter/codebook_string_488.csv", index_col=0)
#rows whose index label starts with "fake" form the false codebook; the rest form the true codebook
is_fake = full_codebook.index.str.startswith("fake")
fakebook = full_codebook[is_fake]
codebook = full_codebook[~is_fake]

In [None]:
#run the false-positive analysis on the matrix with the true and fake codebooks
#NOTE: `codebook` is rebound further down (codebook_647nm.csv), so re-running this cell
#      after that point would pass a different codebook
fp, fake, norm_fpr = percent_false_positive(mtx, codebook, fakebook)

#summarise two columns of the returned `fp` table
fp_raw = fp["FP raw"]
total_counts = fp["total_counts"]
percent_fp = fp_raw.mean()
mean_counts = total_counts.mean()
sum_counts = total_counts.sum()

#order matters: it must match the column labels applied when the stats table is built
fp_list = [percent_fp, norm_fpr, mean_counts, sum_counts]

In [None]:
#lay the four summary values out as a single labelled row
stat_names = ["percent fp","false positive rate","mean counts", "total sum"]
#label the values first, then transpose so the labels become the column headers
df_stats = pd.DataFrame(fp_list, index=stat_names).T
df_stats

In [None]:
#collapse the matrix to one mean value per row label (pseudobulk)
bulk = mtx.mean(axis=1).to_frame().reset_index()
bulk.columns = ["Genes", "Counts"]
#lowercase gene names in both tables so the join ignores case (rnaseq is modified in place)
for table in (bulk, rnaseq):
    table["Genes"] = table["Genes"].str.lower()
#inner join on the columns the two tables share
comb_1 = rnaseq.merge(bulk)
#pearson's correlation on the untransformed values, rounded to 2 decimals
r = round(pearsonr(comb_1["FPKM"], comb_1["Counts"])[0], 2)

In [None]:
#add a log2(value + 1) column for each raw column; the +1 keeps zeros finite
for raw_col in ("Counts", "FPKM"):
    comb_1[f"Log {raw_col}"] = np.log2(comb_1[raw_col] + 1)

In [None]:
#scatter pseudobulk counts against bulk RNA-seq, both as log2(value + 1)
fig, ax = plt.subplots()
ax.plot(comb_1["Log Counts"], comb_1["Log FPKM"], "bo", alpha=0.5)
ax.set_title("Channel 647 nm", fontweight="bold")
ax.set_xlabel("Pseudobulk Log2(Counts+1)", fontsize=12)
ax.set_ylabel("Bulk RNAseq Log2(FPKM+1)", fontsize=12)
#the annotation is placed at data coordinates (0.5, 10)
ax.annotate(f"Pearson's r= {round(r,2)}", (0.5, 10), fontsize=12)
sns.despine()
plt.show()

In [None]:
#read in the smFISH table, using its first column as the index
#(an alternative input, ./nih3t3_smfish/smFISH_results.csv, was previously tried here)
smfish_csv = "./nih3t3_smfish/smfish_27gene_custom_thresh_2.csv"
smfish = pd.read_csv(smfish_csv, index_col=0)

In [None]:
#take a look at the smFISH table (a bare name on the last line makes Jupyter render it)
smfish

In [None]:
#average each row of the smFISH table to get one pseudobulk value per row label
#assumes rows are genes (the labels end up in the "Genes" column) -- TODO confirm orientation
smfish_df = smfish.mean(axis=1).to_frame().reset_index()
smfish_df.columns = ["Genes", "smFISH Counts"]
#lowercase so the names can be matched case-insensitively when merging
smfish_df["Genes"] = smfish_df["Genes"].str.lower()

In [None]:
#convert data to pseudobulk: one mean value per row label of mtx
bulk = mtx.mean(axis=1).to_frame().reset_index()
bulk.columns = ["Genes", "Counts"]
bulk["Genes"] = bulk["Genes"].str.lower()
#inner join on gene name (key stated explicitly rather than inferred from shared columns)
comb_2 = pd.merge(smfish_df, bulk, on="Genes")

#paired values: x = mean smFISH count, y = mean count from mtx
x = comb_2["smFISH Counts"].values
y = comb_2["Counts"].values

#least-squares line through the origin, y ~ m*x.
#A single-column design matrix states the zero-intercept model directly; the earlier
#form padded the matrix with an all-zero column, which made the problem rank-deficient
#and produced a zero intercept only because lstsq returns the minimum-norm solution.
x_t = x[:, np.newaxis]
m = np.linalg.lstsq(x_t, y, rcond=None)[0][0]
#intercept is zero by construction; the name is kept because this cell has always defined it
c = 0.0

#pearson's correlation on the untransformed values
r = pearsonr(x, y)[0]

In [None]:
#scatter the paired means and overlay the fitted line through the origin
fig, ax = plt.subplots()
ax.plot(x, y, "bo", alpha=0.5)
ax.plot(x, m * x, c="k")
ax.set_title("Channel 647 nm", fontweight="bold")
ax.set_xlabel("Average smFISH counts")
ax.set_ylabel("Average LANTERN Counts")
#annotations are placed at fixed data coordinates
ax.annotate(f"Pearson's r= {round(r,2)}", (0, 230), fontsize=12)
#the slope of the fitted line is reported as the efficiency
ax.annotate(f"Efficiency = {round(m,2)}", (0, 210), fontsize=12)
sns.despine()
plt.show()

In [None]:
#plot correlation and efficiency on log2 axes
#log2 is undefined at 0 (numpy returns -inf and emits a RuntimeWarning), so only genes
#with strictly positive means in both measurements are plotted
positive = (comb_2["smFISH Counts"] > 0) & (comb_2["Counts"] > 0)
log_smfish = np.log2(comb_2.loc[positive, "smFISH Counts"])
log_lantern = np.log2(comb_2.loc[positive, "Counts"])

plt.scatter(x=log_smfish, y=log_lantern, c="b", alpha=0.5)
plt.xlabel("Log2(Average smFISH Counts)", fontsize=12)
plt.ylabel("Log2(Average LANTERN Counts)", fontsize=12)
#r and m are whatever was last assigned to those names; annotations sit at fixed data coordinates
plt.annotate(f"Pearson's r= {round(r,2)}", (-1,8), fontsize=12)
plt.annotate(f"Efficiency = {round(m,2)}", (-1,7.3), fontsize=12)
plt.title("Channel 647 nm", fontweight="bold")
sns.despine()
#render explicitly, matching the other plotting cells
plt.show()

# Percent decoded

In [None]:
#get average percent decoded: collect one value per position file
#NOTE(review): this run directory (final_11p52_33_heg_svm_0p25_diff0) differs from the one the
#              gene-by-cell matrix is read from (final_11p52_22_heg_svm_0p20) -- confirm this is intended
decoded_dir = "/groups/CaiLab/personal/Lex/raw/050222_150genes4binding/notebook_pyfiles/decoded/final_11p52_33_heg_svm_0p25_diff0/Channel_1"
n_positions = 64

percent_decoded_list = []
for i in range(n_positions):
    src = f"{decoded_dir}/Pos_{i}/percent_decoded_z_0.txt"
    try:
        #the with-block closes the file on exit, so no explicit close() is needed;
        #only the first line is read
        with open(src) as f:
            first_line = f.readline()
    except FileNotFoundError:
        #positions without an output file are skipped
        continue
    #the value is the last space-separated token on the first line
    percent_decoded_list.append(float(first_line.split(" ")[-1]))

In [None]:
#mean of the collected per-position values (nan, with a warning, if the list is empty)
np.mean(percent_decoded_list)

# Identify problematic hybs

In [None]:
#tally how many times each entry of fake["genes"] occurs
gene_tally = Counter(fake["genes"])
#turn the tally into a table: after reset_index the names are in column "index" and the tallies in column 0
counts_df = pd.DataFrame.from_dict(gene_tally, orient="index").reset_index()
counts_df

In [None]:
#load the 647 nm barcode key, indexed by its first column
#NOTE: this rebinds `codebook`, replacing the true-barcode table built earlier in the notebook
codebook_647_csv = "/groups/CaiLab/personal/Lex/raw/150genes3bind_040622/barcode_key/codebook_647nm.csv"
codebook = pd.read_csv(codebook_647_csv, index_col=0)

In [None]:
#lowercase the codebook's index labels so rows can be looked up by lowercase gene name
codebook.index = codebook.index.str.lower()

In [None]:
#keep the rows with the largest tallies (column 0 holds the tally)
n_top = 30
ranked = counts_df.sort_values(by=0, ascending=False)
maj_fakes = ranked.iloc[:n_top]

In [None]:
#get troubled hybs: fetch the codebook row for each of the most frequent names
#the codebook index was lowercased beforehand, so each name is lowercased the same way
#before the lookup; otherwise a mixed-case name would raise KeyError
troubled_hybs = [
    codebook.loc[str(fakes).lower()].values.tolist()
    for fakes in maj_fakes["index"]
]

In [None]:
#stack the per-name lists into one array (one row per name) so columns can be sliced
hybstocheck = np.asarray(troubled_hybs)

In [None]:
#take a look at the array of codebook rows (a bare name on the last line makes Jupyter render it)
hybstocheck

In [None]:
#for every column of the array (one per round), find the most frequent value
bad_hybs = []
n_rounds = hybstocheck.shape[1]
for round_idx in range(n_rounds):
    values, tallies = np.unique(hybstocheck[:, round_idx], return_counts=True)
    #argmax returns the first maximum, so ties resolve to the smallest value (np.unique sorts)
    bad_hybs.append(values[tallies.argmax()])

print(f"Following hybs by rounds are problematic:{bad_hybs}" )