# Dot dispersion of decoded transcripts

This notebook measures the spread of distances between a decoded transcript and the spots (dots) that were used to decode it. This information may be useful when comparing different probe design methodologies.

In [None]:
import tifffile as tf
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def used_dots_decoded_genes(df_locs_2d, df_locs_2d_2, img_2d,add_trace = True, zmax=1000):
    """
    Show a 2D image with dot locations overlaid as scatter markers.

    Parameters
    ----------
    df_locs_2d : object exposing ``.x`` and ``.y`` (e.g. a DataFrame)
        Locations drawn in blue and labelled "Used".
    df_locs_2d_2 : object exposing ``.x`` and ``.y``
        Locations drawn in green and labelled "Decoded On". Only read when
        ``add_trace`` is truthy.
    img_2d : 2D array
        Image passed to ``plotly.express.imshow`` as the background.
    add_trace : bool
        If truthy, overlay the second set of locations and pin the legend
        to the top-left corner of the plot.
    zmax : number
        Forwarded to ``px.imshow`` as ``zmax``.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """

    def _dot_trace(df_locs, color, name):
        # Both overlays share every setting except data, colour and legend name.
        return go.Scattergl(
            x=df_locs.x,
            y=df_locs.y,
            mode='markers',
            marker_symbol='circle',
            marker=dict(size=5, color=color),
            name=name,
        )

    # Background image
    fig = px.imshow(
        img_2d,
        width=700,
        height=700,
        binary_string=True,
        binary_compression_level=4,
        binary_backend='pil',
        zmax=zmax,
    )

    # Dots used in decoding
    fig.add_trace(_dot_trace(df_locs_2d, "blue", "Used"))

    if add_trace:
        # Decoded transcript locations, plus a legend placed inside the plot area
        fig.add_trace(_dot_trace(df_locs_2d_2, "green", "Decoded On"))
        fig.update_layout(legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="left",
            x=0.01,
        ))

    fig.show()
    

In [None]:
def keep_dots_in_cells(mask, dot_locations):
    """
    Keep only the dots that land on a labelled (non-zero) pixel of a mask.

    Parameters
    ----------
    mask : path or numpy.ndarray
        Path to a label mask image (read with ``tifffile.imread``), or an
        already-loaded label array. The array is indexed as ``mask[y, x]``
        and label 0 is treated as "not in a cell".
    dot_locations : path or pandas.DataFrame
        Path to a CSV of dot locations (read with ``pandas.read_csv``), or an
        already-loaded table. Must contain ``x`` and ``y`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows whose (x, y) falls on a non-zero label, keeping
        their original row labels, with an added ``"cell number"`` column
        holding that label. If no dot falls in a cell the result is empty
        but still has the ``"cell number"`` column.
    """
    # Accept already-loaded objects as well as paths.
    if isinstance(dot_locations, pd.DataFrame):
        locations = dot_locations
    else:
        locations = pd.read_csv(dot_locations)
    img = mask if isinstance(mask, np.ndarray) else tf.imread(mask)

    # Coordinates are truncated to integer pixel indices.
    locations_xy = locations[["x", "y"]].values.astype(int)
    xs = locations_xy[:, 0]
    ys = locations_xy[:, 1]

    # One vectorised lookup of the label under every dot (row = y, col = x).
    cell_labels = img[ys, xs]
    in_cell = cell_labels != 0

    # Copy so that adding a column never touches (or warns about) the input table.
    dots_in_cells = locations[in_cell].copy()
    dots_in_cells["cell number"] = cell_labels[in_cell].astype(np.int64)

    return dots_in_cells

In [None]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pandas as pd

from sklearn.neighbors import KDTree


def dot_displacement_from_gene(ref,dots, rounds=3, distance_cutoff=2):

    """
    A function to measure the distance between decoded transcripts and the dots that make up that decoded
    transcript.

    Dots are grouped into barcoding rounds by their ``hyb`` value: with ``H``
    distinct hybs, round ``r`` contains hybs ``r*H/rounds`` up to
    ``(r+1)*H/rounds - 1``. Hyb values are assumed to be the integers
    ``0 .. H-1``. For every round, each transcript in ``ref`` is matched to
    its nearest dot (Euclidean distance on ``x``, ``y``).

    Parameters
    -----------
    ref: decoded transcript locations with fakes filtered and ambiguity=0
         (needs ``x`` and ``y`` columns)
    dots: locations of dots used in decoding mapped to cell masks
          (needs ``x``, ``y`` and ``hyb`` columns)
    rounds: number of barcoding rounds; must divide the number of distinct hybs
    distance_cutoff: the maximum search radius used in decoding; only distances
                     strictly below this value are kept

    Returns
    -------
    distance_per_round: list with one 1D array per round holding the
                        nearest-dot distances that are below ``distance_cutoff``
    index_list: list with one (n_ref, 2) array per round holding the (x, y)
                of the nearest dot for EVERY row of ``ref``. This is not
                filtered by the cutoff, so its length can differ from the
                matching entry of ``distance_per_round``.

    Raises
    ------
    ValueError: if ``rounds`` is smaller than 1 or does not evenly divide the
                number of distinct hybs
    """

    hybs = len(dots["hyb"].unique())
    if hybs == 0:
        # No dots at all: nothing to group or query.
        return [], []
    if rounds < 1 or hybs % rounds != 0:
        raise ValueError(
            f"rounds={rounds} must be >= 1 and evenly divide the "
            f"{hybs} distinct hybs found in dots"
        )
    hybs_per_round = int(hybs // rounds)

    #separate locations by barcoding round (hybs kept in ascending order within a round)
    barcoding_round = [
        pd.concat([dots[dots["hyb"] == h] for h in range(start, start + hybs_per_round)])
        for start in range(0, hybs, hybs_per_round)
    ]

    ref_xy = ref[["x","y"]].values
    distance_per_round = []
    index_list = []
    for seed in barcoding_round:
        seed_xy = seed[["x","y"]].values
        #initialize KDTree on this round's dots
        kdt = KDTree(seed_xy, leaf_size=40, metric='euclidean')
        #get nearest neighbor of every transcript
        distance, index = kdt.query(ref_xy, k=1, return_distance=True)
        #get distances below cutoff
        distance = distance.ravel()
        distance_per_round.append(distance[distance < distance_cutoff])
        #coordinates of the nearest dot for every transcript (unfiltered)
        index_list.append(seed_xy[index[:, 0]])

    return distance_per_round, index_list

In [None]:
#read in one of the gene mapped locations file
locations = pd.read_csv("/groups/CaiLab/personal/Lex/raw/150genes3bind_040622/notebook_pyfiles/decoded/final_thresh8_11p52_33_heg/Channel_1/genes_in_cells/Pos_0/gene_locations.csv", index_col=0)
#get dots used
dots_used_locations = "/groups/CaiLab/personal/Lex/raw/150genes3bind_040622/notebook_pyfiles/decoded/final_thresh8_11p52_33_heg/Channel_1/Pos_0/dots_used_z_0.csv"
mask = "/groups/CaiLab/personal/Lex/raw/150genes3bind_040622/notebook_pyfiles/edges_deleted/MMStack_Pos0.tif"
#map to mask
dots_used_3bind = keep_dots_in_cells(mask, dots_used_locations)
#only isolate genes that had no other dot choices
location_noneighbors = locations[locations["ambiguity score"]==0]
#remove fakes
#a boolean mask is used instead of dropping by index label, so rows that
#happen to share a label with a fake are not removed with it; rows with a
#missing gene name count as not fake
is_fake = location_noneighbors["genes"].str.startswith("fake", na=False)
fakes = location_noneighbors[is_fake]
location_noneighbors_3bind = location_noneighbors[~is_fake]

In [None]:
#load the fiducial image stack and take plane [0][2] as the background
#NOTE(review): presumably axis 0 is z/time and index 2 is a channel - confirm against the acquisition layout
img_2d = tf.imread(
    "/groups/CaiLab/personal/Lex/raw/150genes3bind_040622/Fiducials/MMStack_Pos0.ome.tif"
)[0][2]
#overlay the used dots (blue) and the unambiguous decoded transcripts (green)
used_dots_decoded_genes(
    dots_used_3bind,
    location_noneighbors_3bind,
    img_2d,
    add_trace=True,
    zmax=1000,
)

In [None]:
#renumber the rows 0..n-1, discarding the row labels kept by keep_dots_in_cells
dots_used_3bind = dots_used_3bind.reset_index(drop=True)
#nearest-dot distance per barcoding round for the 3 binding site probes
distances_3bind, index = dot_displacement_from_gene(
    location_noneighbors_3bind,
    dots_used_3bind,
    rounds=3,
    distance_cutoff=2,
)

In [None]:
#factor applied to the distances before plotting them on a "Distance (nm)" axis
#NOTE(review): presumably 100 nm per pixel - confirm against the imaging pixel size
scale_to_nm = 100
distances_3bind_nm = [np.asarray(d) * scale_to_nm for d in distances_3bind]

#shared histogram range over all rounds; pooling the rounds means a single
#empty round no longer breaks the min/max computation
pooled_3bind_nm = np.concatenate(distances_3bind_nm)
_min = pooled_3bind_nm.min()
_max = pooled_3bind_nm.max()

color = ["red","blue","green"]
#one histogram per round returned by dot_displacement_from_gene (colors repeat if there are more rounds than colors)
for i, round_nm in enumerate(distances_3bind_nm):
    plt.hist(round_nm, bins=20, range=(_min,_max), alpha=0.2, color = color[i % len(color)], label=f"Round {i+1}")
plt.legend()
sns.despine()
plt.xlabel("Distance (nm)")
plt.ylabel("Counts")
plt.show()

In [None]:
#read in one of the gene mapped locations file
locations = pd.read_csv("/groups/CaiLab/personal/Lex/raw/150genes_040122/notebook_pyfiles/decoded/final_11p52_44_thresh2_fid_rem/Channel_1/genes_in_cells/Pos_0/gene_locations.csv", index_col=0)
#get dots used
dots_used_locations = "/groups/CaiLab/personal/Lex/raw/150genes_040122/notebook_pyfiles/decoded/final_11p52_44_thresh2_fid_rem/Channel_1/Pos_0/dots_used_z_0.csv"
mask = "/groups/CaiLab/personal/Lex/raw/150genes_040122/notebook_pyfiles/edges_deleted/MMStack_Pos0.tif"
#map dots
dots_used_2bind = keep_dots_in_cells(mask, dots_used_locations)
#isolate only genes with no other dot choices
location_noneighbors_2bind = locations[locations["ambiguity score"]==0]
#remove fakes
#a boolean mask is used instead of dropping by index label, so rows that
#happen to share a label with a fake are not removed with it; rows with a
#missing gene name count as not fake
is_fake = location_noneighbors_2bind["genes"].str.startswith("fake", na=False)
fakes = location_noneighbors_2bind[is_fake]
location_noneighbors_2bind = location_noneighbors_2bind[~is_fake]

In [None]:
#renumber the rows 0..n-1, discarding the row labels kept by keep_dots_in_cells
dots_used_2bind = dots_used_2bind.reset_index(drop=True)

In [None]:
#nearest-dot distance per barcoding round for the 2 binding site probes (4 rounds)
distances_2bind, index = dot_displacement_from_gene(
    location_noneighbors_2bind,
    dots_used_2bind,
    rounds=4,
    distance_cutoff=2,
)

In [None]:
#factor applied to the distances before plotting them on a "Distance (nm)" axis
#NOTE(review): presumably 100 nm per pixel - confirm against the imaging pixel size
scale_to_nm = 100
distances_2bind_nm = [np.asarray(d) * scale_to_nm for d in distances_2bind]

#shared histogram range over all rounds; pooling the rounds means a single
#empty round no longer breaks the min/max computation
pooled_2bind_nm = np.concatenate(distances_2bind_nm)
_min = pooled_2bind_nm.min()
_max = pooled_2bind_nm.max()

color = ["red","blue","green","orange"]
#one histogram per round returned by dot_displacement_from_gene (colors repeat if there are more rounds than colors)
for i, round_nm in enumerate(distances_2bind_nm):
    plt.hist(round_nm, bins=20, range=(_min,_max), alpha=0.2, color = color[i % len(color)], label=f"Round {i+1}")
plt.legend()
sns.despine()
plt.xlabel("Distance (nm)")
plt.ylabel("Counts")
plt.show()

In [None]:
#compare 2 vs 3 binding site probes
color = ["red","blue","green","orange"]  #not used in this cell
#factor applied to the distances before plotting them on a "Distance (nm)" axis
#NOTE(review): presumably 100 nm per pixel - confirm against the imaging pixel size
scale_to_nm = 100
#only rounds present in both experiments can be compared
n_shared_rounds = min(len(distances_2bind), len(distances_3bind))
for i in range(n_shared_rounds):
    d2_nm = distances_2bind[i] * scale_to_nm
    d3_nm = distances_3bind[i] * scale_to_nm
    #common bin edges so the two histograms are directly comparable
    min_ = min(d2_nm.min(), d3_nm.min())
    max_ = max(d2_nm.max(), d3_nm.max())
    hist1, binedge1 = np.histogram(d2_nm, bins=20, range=(min_,max_))
    hist2, binedge2 = np.histogram(d3_nm, bins=20, range=(min_,max_))
    #normalize hist height so each histogram sums to 1 (proportion of dots)
    hist1 = hist1/len(d2_nm)
    hist2 = hist2/len(d3_nm)
    plt.stairs(hist1, binedge1, fill=True, color = "blue", alpha=0.5, label=f"Round {i+1}: 2 binding site")
    plt.stairs(hist2, binedge2, fill=True, color = "green", alpha=0.5, label=f"Round {i+1}: 3 binding site")
    plt.legend()
    sns.despine()
    plt.xlabel("Distance (nm)")
    plt.ylabel("Proportion of dots")
    diff = np.mean(d2_nm)- np.mean(d3_nm)
    diff_var = np.var(d2_nm)- np.var(d3_nm)
    plt.annotate(f"Mean difference = {round(diff,2)} nm", (75,0.05))
    #variance has squared units
    plt.annotate(f"Variance difference = {round(diff_var,2)} nm$^2$", (75,0.04))
    plt.show()