In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import linregress
from scipy.stats import spearmanr
import tifffile as tf
from false_positive_analysis import percent_false_positive
%config InlineBackend.figure_format='retina'

# Look at false positive rate

In [None]:
# Load the gene-by-cell count matrix for one channel.
# `deepcell` selects the dot source: True -> deepcell dots (single matrix `mtx`),
# False -> daostar dots (one matrix per tested threshold, collected in `thresh_mtx`).
deepcell = True
channel = 4

if deepcell:
    cell_by_gene = pd.read_csv(f"/groups/CaiLab/personal/Lex/raw/031322_11kgenes_experiment/notebook_pyfiles/genebycell/min1/final/genebycell_{channel}.csv", index_col=0).T
    # keep only cells above 3000 total counts, then flip back to gene-by-cell
    mtx = cell_by_gene[cell_by_gene.sum(axis=1) > 3000].T
else:
    thresh_mtx = []
    for i in range(11):
        cell_by_gene = pd.read_csv(f"/groups/CaiLab/personal/Lex/raw/031322_11kgenes_experiment/notebook_pyfiles/genebycell/Threshold_{i}/comb_genebycell_{channel}.csv", index_col=0).T
        # keep only cells above 1000 total counts, then flip back to gene-by-cell
        high_count_cells = cell_by_gene.sum(axis=1) > 1000
        thresh_mtx.append(cell_by_gene[high_count_cells].T)

In [None]:
# Load the two bulk RNA-seq reference tables.
# The FPKM table's columns are renamed to ("Genes", "FPKM"); the assignment
# raises if the file does not have exactly two columns.
rnaseq_1 = pd.read_csv("./RNAseq_files/nih3t3_FPKM.csv")
rnaseq_1.columns = ["Genes","FPKM"]

rnaseq_2 = pd.read_csv("./RNAseq_files/kallisto_NIH3T3.csv")

In [None]:
# Load the codebook; the first CSV column is used as the index
# (later cells test that index for a "fake" prefix).
codebook_path = "/groups/CaiLab/personal/Lex/raw/031322_11kgenes_experiment/barcode_key/codebook_string_488.csv"
codebook = pd.read_csv(codebook_path, index_col=0)

In [None]:
# Split the codebook in two: rows whose index label starts with "fake" go to
# `fakebook`; `codebook` keeps only the remaining rows.
is_fake = codebook.index.str.startswith("fake")
fakebook = codebook[is_fake]
codebook = codebook[~is_fake]

In [None]:
# Calculate false-positive statistics for every matrix under consideration:
# the 11 thresholded matrices (daostar) or the single deepcell matrix.
matrices = [mtx] if deepcell else [thresh_mtx[i] for i in range(11)]

fp_list = []
for counts_mtx in matrices:
    fp, fake, norm_fpr = percent_false_positive(counts_mtx, codebook, fakebook)
    percent_fp = fp["FP raw"].mean()
    mean_counts = fp["total_counts"].mean()
    sum_counts = fp["total_counts"].sum()
    fp_list.append([percent_fp, norm_fpr, mean_counts, sum_counts])

# convert to df (one row per matrix evaluated above)
df_stats = pd.DataFrame(
    fp_list,
    columns=["percent fp","false positive rate","mean counts", "total sum"],
)
df_stats

In [None]:
# Plot false positive rate and percent false positive, one point per row of df_stats.
curves = (
    ("percent fp", "Percent False Positive"),
    ("false positive rate", "False Positive Rate"),
)
for stat_column, legend_label in curves:
    plt.plot(df_stats[stat_column], label=legend_label)
plt.xlabel("Thresholds tested")
plt.ylabel("Percent")
sns.despine()
plt.legend()

In [None]:
# Plot the mean total counts, one point per row of df_stats.
# The line is given a label so that plt.legend() has an artist to show;
# without a label matplotlib warns and draws an empty legend.
plt.plot(df_stats["mean counts"], label="Mean Total Counts")
plt.ylabel("Mean Total Counts")
plt.xlabel("Thresholds tested")
sns.despine()
plt.legend()

# RNA seq correlations

In [None]:
# Choose the desired threshold. It is only consulted for the daostar threshold
# sweep (deepcell == False), but it is always defined so the pseudobulk cell
# below can reference it without a NameError.
threshold = 6

# Recompute the false-positive tables for the matrix used in the correlations.
# `mtx` exists only when deepcell is True and `thresh_mtx` only when it is
# False, so pick the matrix that was actually loaded.
if deepcell:
    fp, fake, norm_fpr = percent_false_positive(mtx, codebook, fakebook)
else:
    fp, fake, norm_fpr = percent_false_positive(thresh_mtx[threshold], codebook, fakebook)

In [None]:
# Convert the gene-by-cell matrix to pseudobulk: the mean of each row across
# all columns, returned as a two-column table ("Genes", "Counts").
if deepcell:
    per_gene_mean = mtx.mean(axis=1)
else:
    per_gene_mean = thresh_mtx[threshold].mean(axis=1)

bulk = pd.DataFrame(per_gene_mean).reset_index()
bulk.columns = ["Genes", "Counts"]

In [None]:
# Lowercase the gene names of both RNA-seq tables (modifies the frames in place).
# At this point the second table's gene column is still called "gene".
for rnaseq_table, gene_column in ((rnaseq_1, "Genes"), (rnaseq_2, "gene")):
    rnaseq_table[gene_column] = rnaseq_table[gene_column].str.lower()

In [None]:
# Rename the second RNA-seq table's columns to ("Genes", "TPM"); the assignment
# raises if the table does not have exactly two columns.
# NOTE(review): after this runs, the "gene" column no longer exists, so
# re-running the lowercase cell above on the same kernel raises KeyError.
rnaseq_2.columns = ["Genes","TPM"]

In [None]:
# Inner-join the FPKM table with the pseudobulk counts on the shared "Genes" column.
comb_1 = rnaseq_1.merge(bulk, how="inner", on="Genes")

In [None]:
# Pearson correlation between bulk FPKM and pseudobulk counts, computed on the
# raw (untransformed) values and rounded to two decimals.
linreg = linregress(x=comb_1["FPKM"], y=comb_1["Counts"])
pearsonr = round(linreg.rvalue, 2)
pearsonr

In [None]:
# Display the merged table ordered from highest to lowest pseudobulk counts
# (display only; comb_1 itself is not reordered).
comb_1.sort_values(by="Counts", ascending=False)

In [None]:
# Add log2-transformed columns to comb_1; 0.1 is added before the log so that
# zero values stay finite.
for log_column, raw_column in (("Log Counts", "Counts"), ("Log FPKM", "FPKM")):
    comb_1[log_column] = np.log2(comb_1[raw_column] + 0.1)

In [None]:
# Hexbin density of log pseudobulk counts vs log bulk FPKM.
sns.set_style("white")
joint_kws = {"gridsize": 100}
hexplot = sns.jointplot(
    data=comb_1,
    x="Log Counts",
    y="Log FPKM",
    kind="hex",
    mincnt=0.1,
    cmap="plasma",
    dropna=True,
    joint_kws=joint_kws,
)
plt.xlabel("Pseudobulk Log2(Counts+0.1)", fontsize=12)
plt.ylabel("Bulk RNAseq Log2(FPKM+0.1)", fontsize=12)
# drop the marginal histograms, leaving only the joint hexbin panel
for marginal_ax in (hexplot.ax_marg_x, hexplot.ax_marg_y):
    marginal_ax.remove()
# the annotated r is the value computed earlier on the untransformed data
plt.annotate(f"Pearson's r= {pearsonr}", (-0.8,8.0), fontsize=12)
plt.title("Channel 488 nm", fontweight="bold")
sns.despine()

In [None]:
# Read in the smFISH results table (path is relative to the notebook's working
# directory; no index column is set).
smfish = pd.read_csv("./smFISH_results.csv")

In [None]:
# Convert smFISH results to pseudobulk: average every column of `smfish` over
# its rows, then tabulate as ("Genes", "smFISH Counts") with lowercase gene
# names so they can be matched against `bulk`.
smfish_df = smfish.T.mean(axis=1).to_frame().reset_index()
smfish_df.columns = ["Genes", "smFISH Counts"]
smfish_df["Genes"] = smfish_df["Genes"].str.lower()

In [None]:
# Inner-join smFISH pseudobulk with seqFISH+ pseudobulk on the shared "Genes" column.
comb_2 = smfish_df.merge(bulk, how="inner", on="Genes")

In [None]:
# Linear regression of pseudobulk counts on smFISH counts.
# The rounded r-value is reported as the correlation and the fitted slope as
# the detection efficiency.
linreg = linregress(x=comb_2["smFISH Counts"], y=comb_2["Counts"])
slope = linreg.slope
pearsonr = round(linreg.rvalue, 2)
print(f"smFISH correlation = {pearsonr}, efficiency = {round(slope,2)}")

In [None]:
# Scatter of log2 average counts (smFISH vs pseudobulk) annotated with the
# correlation and efficiency computed in the previous cell.
# No pseudocount is added here, so zero counts map to -inf under log2.
log_smfish = np.log2(comb_2["smFISH Counts"])
log_counts = np.log2(comb_2["Counts"])
plt.scatter(x=log_smfish, y=log_counts)
plt.xlabel("Log2(Average smFISH Counts)", fontsize=12)
plt.ylabel("Log2(Average LANTERN Counts)", fontsize=12)
plt.ylim(0,4)
# annotation positions are given in data coordinates
plt.annotate(f"Pearson's r= {pearsonr}", (-0.8,3.8), fontsize=12)
plt.annotate(f"Efficiency = {round(slope,2)}", (-0.8,3.5), fontsize=12)
plt.title("Channel 488 nm", fontweight="bold")
sns.despine()

# Dots per pixel

In [None]:
#import custom function
from neighbor_search_across import *
#import gen packages
import tifffile as tf
import pandas as pd

In [None]:
# Source paths: segmentation mask and detected dot locations.
mask = "/groups/CaiLab/personal/Lex/raw/020422_20kdash_3t3/notebook_pyfiles/edges_deleted/MMStack_Pos0.ome.tif"
dot_locations = "/groups/CaiLab/personal/Lex/raw/020422_20kdash_3t3/notebook_pyfiles/dots_detected/Pos0/locations_z_0.csv"

dots = (
    keep_dots_in_cells(mask, dot_locations)        # only keep dots in cells
    .loc[lambda frame: frame["hyb"] != 12]         # remove rehyb (hyb 12)
    .sort_values("hyb")                            # sort by hyb
    .reset_index(drop=True)
    .drop(columns="Unnamed: 0")                    # leftover index column from the CSV
)

In [None]:
# Display the filtered dot table (dots inside cells, rehyb removed, sorted by hyb).
dots

In [None]:
# Calculate the mean number of neighbors for every (seed, radius) combination,
# with the search radius expanding from 1 to 6 in 20 evenly spaced steps.
seeds = [0, 1, 2, 3]
radii = np.linspace(1, 6, 20)

# each entry is [seed, mean number of neighbors, radius]
neighbor_list = [
    [
        seed,
        count_neighbors(
            neighbor_search(dots, hybs=12, num_barcodes=4, seed=seed, radius=radius)
        ),
        radius,
    ]
    for seed in seeds
    for radius in radii
]

In [None]:
# Convert to df. Columns are left unnamed and are addressed positionally by the
# plotting cell: 0 = seed, 1 = mean number of neighbors, 2 = radius.
neighbor_df = pd.DataFrame(neighbor_list)

In [None]:
# Display the (seed, mean neighbors, radius) table.
neighbor_df

In [None]:
import matplotlib.pyplot as plt
# Plot the average number of neighbors across rounds against the search radius,
# one line per seed (columns: 0 = seed, 1 = mean neighbors, 2 = radius).
for seed in neighbor_df[0].unique():
    seed_rows = neighbor_df[neighbor_df[0] == seed]
    plt.plot(seed_rows[2], seed_rows[1], label=f"Seed {seed}", linewidth=0.5)
plt.legend()
plt.xlabel("Radius in Pixels")
plt.ylabel("Average number of neighbors")

# Check clusters within barcoding round

In [None]:
#import custom function
from neighbor_search_within import *
#import ecdf plotting function
from ecdf import ecdf

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import tifffile as tf
import seaborn as sns
import pandas as pd

def _cross_marker_trace(df_locs_2d, name):
    """Build a Scattergl trace of size-6 cross markers from the x/y columns of `df_locs_2d`."""
    return go.Scattergl(
        x=df_locs_2d.x,
        y=df_locs_2d.y,
        mode='markers',
        marker_symbol='cross',
        marker=dict(
            size=6
            ),
        name=name
    )


def plot_2d_locs_on_2d_image(df_locs_2d_1, df_locs_2d_2, img_2d,add_trace = True, zmax=1000):
    """Show a 2D image with one or two sets of dot locations overlaid.

    Parameters
    ----------
    df_locs_2d_1 : object with `.x` and `.y` attributes
        Locations drawn as the "Round 1" trace.
    df_locs_2d_2 : object with `.x` and `.y` attributes, or None
        Locations drawn as the "Round 2" trace. Skipped when None or when
        `add_trace` is false.
    img_2d : 2D image
        Passed to `plotly.express.imshow` as the background.
    add_trace : bool
        Whether to draw the second set of locations.
    zmax : number
        Forwarded to `imshow` as `zmax`.

    Returns
    -------
    None
        The 700x700 figure is displayed with `fig.show()`.
    """
    # Background image
    fig = px.imshow(
        img_2d,
        width=700,
        height=700,
        binary_string=True,
        binary_compression_level=4,
        binary_backend='pil',
        zmax = zmax
    )

    # First set of dots is always drawn
    fig.add_trace(_cross_marker_trace(df_locs_2d_1, "Round 1"))

    # Second set is optional; guard against None so that leaving add_trace at
    # its default with no second table does not raise AttributeError.
    if add_trace and df_locs_2d_2 is not None:
        fig.add_trace(_cross_marker_trace(df_locs_2d_2, "Round 2"))

    fig.show()


In [None]:
# Source paths: segmentation mask and detected dot locations.
mask = "/groups/CaiLab/personal/Lex/raw/020422_20kdash_3t3/notebook_pyfiles/edges_deleted/MMStack_Pos0.ome.tif"
dot_locations = "/groups/CaiLab/personal/Lex/raw/020422_20kdash_3t3/notebook_pyfiles/dots_detected/Pos0/locations_z_0.csv"

# Keep only dots inside cells. Note this rebinds `dots` to the unfiltered
# result (rehyb rows included); the rehyb-free table is `locations`.
dots = keep_dots_in_cells(mask, dot_locations)
not_rehyb = dots["hyb"] != 12
locations = dots[not_rehyb]

In [None]:
# Separate locations into barcoding rounds.
# 12 hybs split into 4 barcoding rounds gives 3 consecutive hybs per round
# (hybs 0-2, 3-5, 6-8, 9-11). Within a round the per-hyb tables are
# concatenated in ascending hyb order.
total_hybs = 12
n_barcoding_rounds = 4
hybs_per_round = total_hybs // n_barcoding_rounds

hyb_rounds = np.arange(0, total_hybs, 1)

barcoding_round = [
    pd.concat(
        [locations[locations["hyb"] == h] for h in hyb_rounds[start:start + hybs_per_round]]
    )
    for start in range(0, total_hybs, hybs_per_round)
]

In [None]:
# Read in the image stack for HybCycle_0, Pos0.
# NOTE(review): this path is relative to the notebook's working directory,
# unlike the absolute paths used for the mask and dot locations.
img = tf.imread("../aberration_corrected/HybCycle_0/MMStack_Pos0.ome.tif")

In [None]:
# Plot the first barcoding round's dots on top of slice 4 of the image stack;
# no second set of dots is drawn.
plot_2d_locs_on_2d_image(
    df_locs_2d_1=barcoding_round[0],
    df_locs_2d_2=None,
    img_2d=img[4],
    add_trace=False,
    zmax=1000,
)

In [None]:
# Perform neighbor searches at several radii and collect the non-averaged
# density estimate (and its accompanying index) for each radius.
radius_list = [1, 2, 3, 4, 5]
neigh_list, index_list = [], []
for radius in radius_list:
    neighbors, seed = neighbor_search_within(
        locations, hybs=12, num_barcodes=4, seed=0, radius=radius
    )
    density, index = density_estimate(neighbors, radius=radius, average=False)
    neigh_list.append(density)
    index_list.append(index)
In [None]:
# Stack the per-radius density tables into one frame with a fresh 0..n-1 index.
comb = pd.concat(neigh_list, ignore_index=True)

In [None]:
import matplotlib.pyplot as plt
# Plot the ECDF of neighbors/pixel, one curve per search radius
# (five colors for the five radii searched).
radius_colors = ["red","blue","green","purple","orange"]
ecdf.plot(
    comb,
    label_column="radius",
    val_column="number of neighbors/pixel",
    conf=False,
    color=radius_colors,
)
plt.xlabel("# of neighbors/pixel")
plt.ylabel("ECDF")
plt.show()

In [None]:
# Perform neighbor searches over 20 radii between 1 and 5 and keep the
# averaged density for each radius.
radius_list = np.linspace(1, 5, 20)
neigh_list = [
    density_estimate(
        neighbor_search_within(locations, hybs=12, num_barcodes=4, seed=0, radius=radius)[0],
        radius=radius,
        average=True,
    )[0]
    for radius in radius_list
]

In [None]:
# Plot average density with expanding search radius.
# The x-values are the radii that were actually searched (`radius_list`)
# rather than a second hand-typed linspace, so the axis cannot drift out of
# sync with `neigh_list` if the sweep is changed.
plt.plot(radius_list, neigh_list)
sns.despine()
plt.xlabel("Search Radius in Pixels")
plt.ylabel("Average Neighbors/Pixel")

In [None]:
# Path of the segmentation mask.
mask = "/groups/CaiLab/personal/Lex/raw/020422_20kdash_3t3/notebook_pyfiles/edges_deleted/MMStack_Pos0.ome.tif"
# Area of the mask = number of pixels with a value of at least 1.
mask_img = tf.imread(mask)
area_mask = np.sum(mask_img >= 1)

In [None]:
# Dots per pixel: number of dots in the first barcoding round divided by the
# number of mask pixels with a value of at least 1.
len(barcoding_round[0])/area_mask