## Notebook for obtaining the smFISH gene-by-cell matrix

In [1]:
#general packages
import pandas as pd
import numpy as np
from pathlib import Path
from glob import glob
import sys
sys.path.append("..")
#custom function
from make_gene_by_cell import *

## Identify the z-slice with the most detected spots. Make sure spots are already matched to their respective z-slices.

In [2]:
#for every position, keep the channel 1 file that contains the most rows
#(NOTE(review): presumably one file per z-slice and one row per spot -- confirm)
channel_1_best = []

for pos in range(55):
    #glob returns matches in arbitrary order; sorting makes ties between files
    #with equal counts resolve the same way on every run
    spot_paths = sorted(glob(f"/groups/CaiLab/personal/Lex/raw/230726_43gene_smfish/pyfish_tools/output/dots_detected/Channel_1/spots_in_cells/Pos{pos}/*"))
    #positions without any file are skipped
    if not spot_paths:
        continue
    #number of rows in each file of this position
    counts = [len(pd.read_csv(path)) for path in spot_paths]
    #np.argmax returns the first maximum, so ties go to the first path in sorted order
    best_idx = int(np.argmax(counts))
    channel_1_best.append(spot_paths[best_idx])

In [3]:
#for every position, keep the channel 2 file that contains the most rows
#(NOTE(review): presumably one file per z-slice and one row per spot -- confirm)
channel_2_best = []

for pos in range(55):
    #glob returns matches in arbitrary order; sorting makes ties between files
    #with equal counts resolve the same way on every run
    spot_paths = sorted(glob(f"/groups/CaiLab/personal/Lex/raw/230726_43gene_smfish/pyfish_tools/output/dots_detected/Channel_2/spots_in_cells/Pos{pos}/*"))
    #positions without any file are skipped
    if not spot_paths:
        continue
    #number of rows in each file of this position
    counts = [len(pd.read_csv(path)) for path in spot_paths]
    #np.argmax returns the first maximum, so ties go to the first path in sorted order
    best_idx = int(np.argmax(counts))
    channel_2_best.append(spot_paths[best_idx])

## Convert mapped spots to actual gene names

In [23]:
#merge the selected files of both channels into one list (channel 1 entries first)
channel_all = [*channel_1_best, *channel_2_best]

In [24]:
#load every selected csv and tag each row with an id built from cell number and position
all_df = []
for csv_path in channel_all:
    #the parent directory name of the file is used as the position label
    pos_name = Path(csv_path).parent.name
    spots = pd.read_csv(csv_path)
    #drop the first column (NOTE(review): presumably a saved index column -- confirm)
    spots = spots.iloc[:, 1:]
    cell_numbers = spots["cell number"].astype(str).values
    #id format: cell_<cell number>_<position label>
    spots["cell id"] = "cell_" + cell_numbers + f"_{pos_name}"
    all_df.append(spots)

In [25]:
#stack the per-file tables into a single dataframe with a fresh 0..n-1 index
#NOTE: all_df is rebound from a list to a DataFrame here; re-run the cell that
#builds the list before re-running this one
all_df = pd.concat(all_df, ignore_index=True)

In [26]:
#load the codebook csv (its rows are later unpacked as gene, hyb, channel)
codebook_path = "/groups/CaiLab/personal/Lex/raw/230726_43gene_smfish/barcode_key/smfish_key.csv"
codebook = pd.read_csv(codebook_path)

In [27]:
#keep only rows with hyb below 22, and drop the hyb 21 / channel 2 combination
#NOTE(review): presumably these hybs/channels carry no gene in the codebook -- confirm
hyb_in_range = all_df.hyb < 22
not_hyb21_ch2 = (all_df.hyb != 21) | (all_df.ch != 2)
all_df = all_df[hyb_in_range & not_hyb21_ch2].reset_index(drop=True)

In [28]:
#build a lookup from (hyb, channel) to gene name, one entry per codebook row
#(a later row with the same (hyb, channel) pair overwrites an earlier one)
codebook_map = {
    (hyb, channel): gene
    for gene, hyb, channel in codebook.values
}

In [29]:
#look up the gene of every row from its (hyb, channel) pair
#a pair missing from codebook_map raises a KeyError
hyb_ch_pairs = all_df[["hyb", "ch"]].values.astype(int)
genes = [codebook_map[(hyb, ch)] for hyb, ch in hyb_ch_pairs]

In [30]:
#drop the first two columns and put the gene name in front instead
#NOTE(review): assumes the first two columns are hyb and ch -- confirm
all_df_new = all_df.iloc[:, 2:]
all_df_new.insert(loc=0, column="Genes", value=genes)

## Final gene-by-cell matrix

In [32]:
#count rows per (cell id, gene); combinations that never occur are filled with 0
genebycell = all_df_new.pivot_table(
    index=["cell id"],
    columns=["Genes"],
    aggfunc="size",
    fill_value=0,
)

In [33]:
#display the pivoted table (rows: cell id, columns: genes)
genebycell

Genes,AATF,ACTN1,B3GALT4,BGN,CALM1,CBL,CBLB,CCND1,CDK1,CDKN1A,...,SLC25A3,SMARCA4,SMARCB1,SOX10,SPARC,TOMM34,TRIM8,VANGL1,VANGL2,VCP
cell id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cell_10_Pos1,17,85,0,32,159,19,2,5,29,4,...,311,36,5,0,15,17,27,10,8,100
cell_10_Pos14,32,155,0,484,174,21,17,8,236,3,...,355,63,2,0,10,11,36,10,7,123
cell_10_Pos15,51,159,1,562,122,20,6,42,152,3,...,466,64,1,0,10,15,38,12,13,132
cell_10_Pos26,44,107,0,300,104,10,5,9,4,1,...,273,43,3,0,53,12,21,10,6,120
cell_10_Pos27,20,213,1,440,77,13,10,10,12,1,...,276,33,2,1,21,6,43,11,4,79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
cell_9_Pos38,17,114,0,188,222,15,22,9,214,5,...,285,51,6,0,13,17,47,7,3,143
cell_9_Pos40,33,104,3,303,209,13,37,17,41,0,...,305,41,1,0,14,16,56,12,7,143
cell_9_Pos45,37,171,1,404,228,18,10,23,59,3,...,387,41,2,1,22,17,66,8,13,155
cell_9_Pos48,33,121,2,235,146,11,15,5,95,0,...,244,36,1,0,15,7,24,6,4,165


In [34]:
#write the table to a csv file at a path relative to the current working directory
genebycell.to_csv(path_or_buf="43gene_counts_mtx.csv")