In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from thalamus_merfish_analysis import ccf_plots as cplot
from thalamus_merfish_analysis import ccf_images as cimg
from thalamus_merfish_analysis import abc_load as abc

get_ipython().run_line_magic('matplotlib', 'inline') 

## Load brain3 data

### with thalamus-focused section-wise affine alignment

In [2]:
realigned=True

obs = abc.get_combined_metadata(realigned=True)
obs = abc.label_thalamus_spatial_subset(obs, realigned=realigned, filter_cells=True)
ccf_label = 'parcellation_substructure_realigned' if realigned else 'parcellation_substructure'
nn_classes = [
    "31 OPC-Oligo",
    "30 Astro-Epen",
    "33 Vascular",
    "34 Immune",
]
# subset just the neurons
obs_neurons = obs[~obs['class'].isin(nn_classes)]

In [3]:
th_names = abc.get_thalamus_substructure_names()
th_subregion_names = list(set(th_names).difference(['TH-unassigned']))


In [4]:
# define celltype lists based on strict spatial subset
# obs_th_neurons = obs_neurons[obs_neurons[ccf_label].isin(th_names)]
obs_th_neurons = obs_neurons[obs_neurons['parcellation_substructure_realigned'].isin(th_names) |
                             obs_neurons['parcellation_substructure'].isin(th_names)]
th_celltypes = dict()
th_celltypes['subclass'] = obs_th_neurons['subclass'].value_counts().loc[lambda x: x>100].index
print(f"{len(th_celltypes['subclass'])=}")

th_celltypes['supertype'] = obs_th_neurons['supertype'].value_counts().loc[lambda x: x>20].index
print(f"{len(th_celltypes['supertype'])=}")

th_celltypes['cluster'] = obs_th_neurons['cluster'].value_counts().loc[lambda x: x>10].index
print(f"{len(th_celltypes['cluster'])=}")

len(th_celltypes['subclass'])=29
len(th_celltypes['supertype'])=128
len(th_celltypes['cluster'])=448


## matching metrics

In [5]:
from sklearn.metrics import f1_score, precision_score, recall_score
    
def get_nucleus_celltype_metrics(obs, ccf_label, celltype_label, 
                                 ccf_list=None, celltype_list=None):
    if celltype_list is None:
        celltype_list = obs[celltype_label].unique()
    # else: obs = obs[obs[celltype_label].isin(celltype_list)]
    if ccf_list is None:
        ccf_list = obs[ccf_label].unique()
    # else: obs = obs[obs[ccf_label].isin(ccf_list)]
    # could subset like this, but need to track the negatives...
    
    records = []
    for celltype_name in celltype_list:
        celltype = obs[celltype_label] == celltype_name
        for ccf_name in ccf_list:
            nucleus = obs[ccf_label] == ccf_name
            tp = (nucleus & celltype).sum()
            fp = (~nucleus & celltype).sum()
            fn = (nucleus & ~celltype).sum()
            recall = tp/(tp+fn)
            precision = tp/(tp+fp)
            jaccard = tp/(tp+fp+fn)
            f1 = 2*recall*precision/(recall+precision)
            if precision>0.5 or recall>0.5 or f1>0.4:
                record = {
                    'nucleus': ccf_name,
                    'celltype': celltype_name,
                    'nucleus_precision': precision,
                    'nucleus_recall': recall,
                    'nucleus_f1': f1,
                    'jaccard':jaccard
                }
                records.append(record)
    return pd.DataFrame.from_records(records)

## results

In [6]:
subclass_df = get_nucleus_celltype_metrics(obs_neurons, ccf_label, 'subclass', 
                                           ccf_list=th_subregion_names,
                                           celltype_list=th_celltypes['subclass'])

  f1 = 2*recall*precision/(recall+precision)


In [7]:
subclass_set = subclass_df.query('nucleus_precision>0.6 & nucleus_recall>0.15')
subclass_set.sort_values('nucleus_f1', ascending=False)

Unnamed: 0,nucleus,celltype,nucleus_precision,nucleus_recall,nucleus_f1,jaccard
24,MH,145 MH Tac2 Glut,0.801533,0.979953,0.881809,0.788603
38,AV,148 AV Col27a1 Glut,0.867015,0.814956,0.84018,0.724406
23,RT,093 RT-ZI Gnb3 Gaba,0.641743,0.928089,0.758801,0.611345
41,AD,147 AD Serpinb7 Glut,0.660504,0.826498,0.734236,0.580074
42,LH,146 LH Pou4f1 Sox1 Glut,0.718574,0.593338,0.649979,0.481458


In [8]:
subclass_region_names = set(subclass_set['nucleus']).difference(['LGd-sh'])
nonsubclass_region_names = set(th_subregion_names).difference(subclass_region_names)

In [9]:
supertype_df = get_nucleus_celltype_metrics(obs_neurons, ccf_label, 'supertype', 
                                           ccf_list=nonsubclass_region_names,
                                           celltype_list=th_celltypes['supertype'])

  f1 = 2*recall*precision/(recall+precision)


In [10]:
supertype_set = supertype_df.query('nucleus_precision>0.6 & nucleus_recall>0.15')
supertype_set.sort_values('nucleus_f1', ascending=False)

Unnamed: 0,nucleus,celltype,nucleus_precision,nucleus_recall,nucleus_f1,jaccard
8,PF,0675 PF Fzd5 Glut_2,0.636688,0.637826,0.637257,0.467628
11,RE,0670 RE-Xi Nox4 Glut_3,0.757943,0.543722,0.633205,0.463277
9,LD,0658 TH Prkcd Grin2c Glut_5,0.639803,0.534702,0.58255,0.410985
7,MD,0663 TH Prkcd Grin2c Glut_10,0.608185,0.379051,0.467027,0.304655
13,ZI,0462 ZI Pax6 Gaba_4,0.790488,0.258132,0.389179,0.241603
24,PT,0648 PVT-PT Ntrk1 Glut_6,0.727575,0.162222,0.265294,0.152933


In [11]:
cluster_df = get_nucleus_celltype_metrics(obs_neurons, ccf_label, 'cluster', 
                                           ccf_list=nonsubclass_region_names,
                                           celltype_list=th_celltypes['cluster'])

  f1 = 2*recall*precision/(recall+precision)


In [12]:
cluster_set = cluster_df.query('nucleus_precision>0.6 & nucleus_recall>0.1')
cluster_set.sort_values('nucleus_f1', ascending=False)

Unnamed: 0,nucleus,celltype,nucleus_precision,nucleus_recall,nucleus_f1,jaccard
2,VPM,2649 TH Prkcd Grin2c Glut_1,0.658752,0.660979,0.659863,0.492385
18,AMd,2674 TH Prkcd Grin2c Glut_9,0.624071,0.487805,0.547588,0.37702
6,LD,2658 TH Prkcd Grin2c Glut_5,0.75091,0.423409,0.541492,0.371264
17,PT,2677 TH Prkcd Grin2c Glut_10,0.739255,0.382222,0.503906,0.336815
3,MD,2678 TH Prkcd Grin2c Glut_10,0.769601,0.367343,0.497311,0.330947
8,VM,2687 TH Prkcd Grin2c Glut_13,0.665842,0.360429,0.467691,0.305219
10,RE,2698 RE-Xi Nox4 Glut_3,0.813364,0.292582,0.430357,0.274175
5,PO,2663 TH Prkcd Grin2c Glut_6,0.746333,0.288176,0.415801,0.262468
14,PF,2723 PF Fzd5 Glut_2,0.703325,0.196639,0.307348,0.181578
15,PF,2724 PF Fzd5 Glut_2,0.708556,0.189489,0.299013,0.175788


# Metrics with whole-brain nonlinear alignment (original)

In [13]:
ccf_label = 'parcellation_substructure'

In [14]:
subclass_df = get_nucleus_celltype_metrics(obs_neurons, ccf_label, 'subclass', 
                                           ccf_list=th_subregion_names,
                                           celltype_list=th_celltypes['subclass'])

  f1 = 2*recall*precision/(recall+precision)


In [15]:
subclass_set = subclass_df.query('nucleus_precision>0.6 & nucleus_recall>0.15')
subclass_set.sort_values('nucleus_f1', ascending=False)

Unnamed: 0,nucleus,celltype,nucleus_precision,nucleus_recall,nucleus_f1,jaccard
27,MH,145 MH Tac2 Glut,0.937819,0.976713,0.956871,0.917309
26,RT,093 RT-ZI Gnb3 Gaba,0.836096,0.979011,0.901927,0.821373
41,AD,147 AD Serpinb7 Glut,0.870588,0.920071,0.894646,0.809375
42,LH,146 LH Pou4f1 Sox1 Glut,0.798311,0.751767,0.77434,0.631774
40,AV,148 AV Col27a1 Glut,0.703833,0.795798,0.746995,0.596163
30,ZI,101 ZI Pax6 Gaba,0.622503,0.540986,0.578889,0.40735


In [16]:
subclass_region_names = set(subclass_set['nucleus']).difference(['LGd-sh'])
nonsubclass_region_names = set(th_subregion_names).difference(subclass_region_names)

In [17]:
supertype_df = get_nucleus_celltype_metrics(obs_neurons, ccf_label, 'supertype', 
                                           ccf_list=nonsubclass_region_names,
                                           celltype_list=th_celltypes['supertype'])

  f1 = 2*recall*precision/(recall+precision)


In [18]:
supertype_set = supertype_df.query('nucleus_precision>0.6 & nucleus_recall>0.15')
supertype_set.sort_values('nucleus_f1', ascending=False)

Unnamed: 0,nucleus,celltype,nucleus_precision,nucleus_recall,nucleus_f1,jaccard
15,RE,0670 RE-Xi Nox4 Glut_3,0.630272,0.624857,0.627552,0.457251
12,LD,0658 TH Prkcd Grin2c Glut_5,0.678133,0.430846,0.526919,0.357698
10,MD,0663 TH Prkcd Grin2c Glut_10,0.623616,0.330901,0.432376,0.275816
20,PVT,0644 PVT-PT Ntrk1 Glut_2,0.707555,0.256932,0.376975,0.232267
21,PVT,0645 PVT-PT Ntrk1 Glut_3,0.660759,0.230973,0.342295,0.206487
25,PT,0648 PVT-PT Ntrk1 Glut_6,0.634551,0.161727,0.25776,0.147947


In [19]:
cluster_df = get_nucleus_celltype_metrics(obs_neurons, ccf_label, 'cluster', 
                                           ccf_list=nonsubclass_region_names,
                                           celltype_list=th_celltypes['cluster'])

  f1 = 2*recall*precision/(recall+precision)


In [20]:
cluster_set = cluster_df.query('nucleus_precision>0.6 & nucleus_recall>0.1')
cluster_set.sort_values('nucleus_f1', ascending=False)

Unnamed: 0,nucleus,celltype,nucleus_precision,nucleus_recall,nucleus_f1,jaccard
4,VPM,2649 TH Prkcd Grin2c Glut_1,0.625702,0.654472,0.639764,0.470333
9,LD,2658 TH Prkcd Grin2c Glut_5,0.841952,0.360912,0.505245,0.338012
18,PT,2677 TH Prkcd Grin2c Glut_10,0.666189,0.393734,0.494944,0.328854
13,RE,2698 RE-Xi Nox4 Glut_3,0.670507,0.333333,0.445295,0.286417
5,MD,2678 TH Prkcd Grin2c Glut_10,0.770477,0.313101,0.44526,0.286389
16,PF,2724 PF Fzd5 Glut_2,0.708556,0.262897,0.383502,0.237243
8,PO,2663 TH Prkcd Grin2c Glut_6,0.677262,0.242451,0.357074,0.21734
24,AMd,2676 TH Prkcd Grin2c Glut_9,0.601824,0.24937,0.352627,0.214054
14,PVT,2621 PVT-PT Ntrk1 Glut_2,0.77284,0.184661,0.298095,0.175154
10,LP,2664 TH Prkcd Grin2c Glut_6,0.649559,0.181574,0.283812,0.165374
