## Notebook for obtaining the smFISH gene-by-cell matrix

In [None]:
#general packages
import pandas as pd
import numpy as np
from pathlib import Path
from glob import glob
import tifffile as tf
from skimage.measure import regionprops

## Identify the z-slice with the most spots detected. Make sure spots have already been matched to their respective z-slices.

In [None]:
#read in spots for channel 1
#for every position keep the path of the spot file that holds the most spots
channel_1_best = []

#positions Pos0 to Pos54 are scanned
for pos in range(55):
    #glob returns files in arbitrary order, sorting keeps tie-breaking reproducible
    spot_paths = sorted(glob(f"/groups/CaiLab/personal/Lex/raw/230810_43genes_smfish/pyfish_tools/output/dots_detected/Channel_1/spots_in_cells/Pos{pos}/*"))
    #positions without any spot files are skipped
    if not spot_paths:
        continue
    #number of rows (spots) in every file of this position
    counts = [len(pd.read_csv(path)) for path in spot_paths]
    #argmax returns the first maximum, so ties go to the first path in sorted order
    highest_counts = int(np.argmax(counts))
    channel_1_best.append(spot_paths[highest_counts])

In [None]:
#read in spots for channel 2
#for every position keep the path of the spot file that holds the most spots
channel_2_best = []

#positions Pos0 to Pos54 are scanned
for pos in range(55):
    #glob returns files in arbitrary order, sorting keeps tie-breaking reproducible
    spot_paths = sorted(glob(f"/groups/CaiLab/personal/Lex/raw/230810_43genes_smfish/pyfish_tools/output/dots_detected/Channel_2/spots_in_cells/Pos{pos}/*"))
    #positions without any spot files are skipped
    if not spot_paths:
        continue
    #number of rows (spots) in every file of this position
    counts = [len(pd.read_csv(path)) for path in spot_paths]
    #argmax returns the first maximum, so ties go to the first path in sorted order
    highest_counts = int(np.argmax(counts))
    channel_2_best.append(spot_paths[highest_counts])

## Convert mapped spots to actual gene names

In [None]:
#combine channels: channel 1 paths first, channel 2 paths after
channel_all = [*channel_1_best, *channel_2_best]

In [None]:
#read in csvs and attach cell and pos id
all_df = []
for path in channel_all:
    #grab pos, i.e. the name of the folder that holds the csv
    pos = Path(path).parent.name
    #the first csv column is dropped
    df = pd.read_csv(path).iloc[:,1:]
    #a file without spots has no z to read and adds no rows, so skip it
    if df.empty:
        continue
    #z is read from the first row; assumes one z per file - TODO confirm
    z = df["z"].iloc[0]
    #cell id has the form cell_<cell number>_<pos>_z<z>
    df["cell id"] = "cell_" + df["cell number"].astype(str).values+ f"_{pos}_z{int(z)}"
    all_df.append(df)

In [None]:
#concat dfs into one table with a fresh 0..n-1 index
#note: all_df changes from a list to a DataFrame here, rerun the cell above before rerunning this one
all_df = pd.concat(all_df, ignore_index=True)

In [None]:
#read in codebook
codebook_path = Path("/groups/CaiLab/personal/Lex/raw/230810_43genes_smfish/barcode_key/smfish_key.csv")
codebook = pd.read_csv(codebook_path)

In [None]:
#remove useless hybs and channels
#keep hybs below 22, except for channel 2 of hyb 21
valid_hyb = all_df.hyb < 22
hyb21_ch2 = (all_df.hyb == 21) & (all_df.ch == 2)
all_df = all_df[valid_hyb & ~hyb21_ch2].reset_index(drop=True)

In [None]:
#generate dictionary to convert channel and hyb info to gene names
#every codebook row is unpacked as (gene, hyb, channel); later rows overwrite duplicate (hyb, channel) keys
codebook_map = {(hyb, channel): gene for gene, hyb, channel in codebook.values}

In [None]:
#convert to gene names
#one lookup per spot; a (hyb, ch) pair missing from the codebook raises KeyError
hyb_ch_pairs = all_df[["hyb", "ch"]].values.astype(int)
genes = [codebook_map[(hyb, ch)] for hyb, ch in hyb_ch_pairs]

In [None]:
#add in info
#gene names go in front of everything that follows the first two columns of all_df
gene_col = pd.Series(genes, index=all_df.index, name="Genes")
all_df_new = pd.concat([gene_col, all_df.iloc[:,2:]], axis=1)

## Final gene by cell

In [None]:
#count spots per (cell id, gene); combinations without spots are filled with 0
genebycell = all_df_new.pivot_table(
    index="cell id",
    columns="Genes",
    aggfunc="size",
    fill_value=0,
)

In [None]:
#take a look at the gene-by-cell counts (rows are cell ids, columns are genes)
genebycell

There will be some zeros if different channels had more spots on different z's. Rows belonging to the same cell are not merged yet; merging across z is done only after normalizing by cell size.

## Normalize smFISH counts by cell size.

In [None]:
import tifffile as tf
from skimage.measure import regionprops

In [None]:
#edges deleted masks directory
mask_dir = Path("/groups/CaiLab/personal/Lex/raw/230810_43genes_smfish/pyfish_tools/output/edges_deleted/")
#obtain cell mask area, one entry per row of genebycell and in the same order
cell_area = []
#label -> area of every mask already read, keyed by (pos, z), so each tif is loaded only once
areas_by_mask = {}
for cell_id in genebycell.index:
    #cell ids have the form cell_<cell number>_Pos<pos>_z<z>
    id_parts = cell_id.split("_")
    pos_info = int(id_parts[2].replace("Pos",""))
    cell_info = int(id_parts[1])
    z_info = int(id_parts[-1].replace("z",""))
    mask_key = (pos_info, z_info)
    if mask_key not in areas_by_mask:
        mask = tf.imread(str(mask_dir / f"MMStack_Pos{pos_info}_z{z_info}.tif"))
        #get area per cell
        areas_by_mask[mask_key] = {cell.label: cell.area for cell in regionprops(mask)}
    label_areas = areas_by_mask[mask_key]
    #a missing label would leave cell_area shorter than genebycell, so stop with a clear message
    if cell_info not in label_areas:
        raise KeyError(f"{cell_id}: label {cell_info} not found in MMStack_Pos{pos_info}_z{z_info}.tif")
    cell_area.append(label_areas[cell_info])

In [None]:
#add cell area as an extra column next to the gene counts
genebycell = genebycell.assign(cell_area_pixels=cell_area)

In [None]:
#divide all counts by cell area in pixels, then divide that value by actual pixel area in um
#0.108 is presumably the pixel side length in um - confirm against the microscope settings
pixel_area = 0.108**2
area_in_pixels = genebycell["cell_area_pixels"].values
genebycell_norm = genebycell.div(area_in_pixels, axis=0)/pixel_area
#the last column is the area column itself, leave it out of the normalized table
genebycell_norm = genebycell_norm.iloc[:,:-1]

In [None]:
#grab all cell and pos id
#cut the trailing _z<z> part off every index entry
ids = genebycell_norm.index.str.rsplit("_", n=1).str[0]

In [None]:
#merge z for same cells
#of the two z's just return highest value in column
#rows are grouped on the exact cell and pos id: a startswith match would also
#pull rows of e.g. cell_1_Pos10 into cell_1_Pos1
#np.asarray drops the index name so the merged table keeps an unnamed index
final_df = genebycell_norm.groupby(np.asarray(ids)).max()

In [None]:
#take a look at the merged table (one row per cell and pos id)
final_df

In [None]:
#save the merged table as csv; the relative path resolves against the current working directory
final_df.to_csv("43gene_smfish_norm.csv")