In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt
import itertools
import numpy as np
import pandas as pd
import scanpy as sp

#distribution of distances between 2 types instead of just observation frequency

filepaths = ['/home/imaging_mfish/MERSCOPENAS04_data/human/atlas/merfish_output/202204251522_H1930002Cx58V10200710104_VMSC02501/processed/202204251522_H1930002Cx58V10200710104_VMSC02501.h5ad']
cell_h5ad = sp.read_h5ad(filepaths[0]) 
#obs has class, obsm spatial has location. IDs are matched (so spatial 0 = obs 0)
cell_data = cell_h5ad.obs
cell_data[['x', 'y']] = cell_h5ad.obsm['spatial']
#TODO:
# double check that allxall is 1e7 cells total, not 1e10 (if 10+ then use knn for storage, otherwise doesn't matter)

In [17]:
import timeit

#uniformly random values within 50um circle
radius = np.random.uniform(low=0, high=50, size=10)  # varies within 50
theta = np.random.uniform(low=0, high=2*np.pi, size=10)  # angle
rand_circle_points = pd.DataFrame({'x': np.sqrt(radius) * np.cos(theta), 'y':np.sqrt(radius) * np.sin(theta)})
rand_circle_points.loc[len(rand_circle_points)] = [1, 1] #add base case

#create all combinations of subclasses
subclasses = list(cell_data.groupby('subclass').groups.keys())[0:2]
unique_subset_pairs = list(itertools.combinations(subclasses, 2))
print(unique_subset_pairs)
#iterate over pairs, then over circle point multipliers to get all possible pairwise distance groupings
def for_timeit(rand_circle_points, cell_data):
    obs_freqs = {}
    for class_pair in unique_subset_pairs:
        container_df = pd.DataFrame()
        pair_iters = [[class_pair[0], class_pair[1]], [class_pair[0], class_pair[0]], [class_pair[1], class_pair[1]]]
        for group in pair_iters:
            group_name = str(group[0])+','+str(group[1])
            for point in rand_circle_points.itertuples(index=False):
                clust_dists_pair = np.tril(pairwise_distances(cell_data[cell_data['subclass']==group[0]][['x', 'y']].to_numpy()*point[0], 
                                                         cell_data[cell_data['subclass']==group[1]][['x', 'y']].to_numpy()*point[1], 
                                                         n_jobs=2)) 
                clust_dists_pair[clust_dists_pair==0] = np.nan
                pair_col = group_name+'_'+str(round(point[0], 2))+','+str(round(point[1], 2))
                pair_df = pd.DataFrame(pd.Series(clust_dists_pair.flatten()[:]), columns = [pair_col])
                container_df = pd.concat([container_df, pair_df], axis=1) 
            # calculate one-tailed z-test of observed vs randomized
            group_df = container_df.loc[:, container_df.columns.str.startswith(group_name)]
            group_df = group_df[group_df.transform(lambda x: x <= 15)].dropna(how='all')
            sample_colname = [x for x in list(group_df.columns) if '_1.0,1.0' in x][0]
            sample = group_df.loc[:, group_df.columns == sample_colname].count()
            population = group_df.loc[:, group_df.columns != sample_colname].count()
            # z = (sample mean – population mean) / [population standard deviation/sqrt(n)]
            z = (sample - np.mean(population)) / (np.std(population)/np.sqrt(10))
            obs_freqs[''.join(group)] = z
    container_df.dropna(how='all')
    return container_df, obs_freqs

# t = timeit.Timer(lambda: for_timeit(rand_circle_points, cell_data))
# print(t.timeit(1))
container_df, obs_freqs = for_timeit(rand_circle_points, cell_data)
print(obs_freqs)
display(container_df)

[('Astro', 'Chandelier')]
{'AstroChandelier': Astro,Chandelier_1.0,1.0    1.941676
dtype: float64, 'AstroAstro': Astro,Astro_1.0,1.0    6.847235
dtype: float64, 'ChandelierChandelier': Chandelier,Chandelier_1.0,1.0    34.082326
dtype: float64}


Unnamed: 0,"Astro,Chandelier_-1.37,-0.02","Astro,Chandelier_3.59,1.15","Astro,Chandelier_2.12,2.48","Astro,Chandelier_0.3,-0.61","Astro,Chandelier_-0.51,6.93","Astro,Chandelier_-5.16,-1.54","Astro,Chandelier_-2.98,5.08","Astro,Chandelier_-0.94,-5.85","Astro,Chandelier_0.56,0.4","Astro,Chandelier_-2.74,2.2",...,"Chandelier,Chandelier_3.59,1.15","Chandelier,Chandelier_2.12,2.48","Chandelier,Chandelier_0.3,-0.61","Chandelier,Chandelier_-0.51,6.93","Chandelier,Chandelier_-5.16,-1.54","Chandelier,Chandelier_-2.98,5.08","Chandelier,Chandelier_-0.94,-5.85","Chandelier,Chandelier_0.56,0.4","Chandelier,Chandelier_-2.74,2.2","Chandelier,Chandelier_1.0,1.0"
0,11627.434388,22648.087882,1275.224363,6899.814698,53146.070748,33452.564123,61262.261029,33091.601161,2026.092885,38998.913612,...,17129.223512,2526.564219,6438.843796,52370.033497,25510.374231,56722.282355,34522.211273,1152.587049,34812.188723,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4791716,,,,,,,,,,,...,,,,,,,,,,
4791717,,,,,,,,,,,...,,,,,,,,,,
4791718,,,,,,,,,,,...,,,,,,,,,,
4791719,,,,,,,,,,,...,,,,,,,,,,


### test/scratch below

cell-cell interactions arising from somatic contact or paracrine signaling, which can be inferred based on contact or proximity between cells that occurred with higher frequency than by random chance (29, 30). 

We examined whether these potential cell-cell interactions were cell-type specific. To this end, we only considered cell types at the subclass level and determined the frequency with which cell-cell contacts (or proximity) were observed between two subclasses of cells. 

Two neighboring cells were considered contacting (or in proximity) if their centroid distance was <15 μm, which is approximately the size of the soma of a single cell in both human and mouse cortex (31). 

We determined how much this frequency was above random chance and how significant this difference was by comparing the observed contact frequency with the expected contact frequencies from spatial permutations. 

To avoid artifacts arising from the laminar organization of cells and spatial variation of cells density (namely, cell types in the regions of higher cell density or with similar laminar distributions can result in a higher contact frequency by random chance), we designed our spatial permutations to only disrupt the spatial relationship between neighboring cells while still preserving the laminar distribution and local density of each cell type


![image.png](attachment:image.png)
Fig S10: 
(A) Schematic of spatial permutation test that determines the significance of interactions between cell types. Two cells were considered contacting if their nucleus centroids were within 15 μm in the imaging plane, which is approximately the size of the cell body of a single neuron. Contact frequency between any two cell types was determined as the observed frequency. Then, spatial localization of each cell was randomized within a radius of 50 μm, unless otherwise mentioned. Expected contact frequency between any two cell types was determined in each permutation and such permutation was performed 1,000 times to obtain the distribution of expected contact frequencies. The significance of observed contact frequency was calculated using one-tailed z-test and P-values were corrected to FDR (false discovery rate) using Benjamini-Hochberg Procedure. 
(B) Spatial map of L2/3 IT cells in a human MTG slice. (Left) Measured spatial map. (Middle and right) Two example spatial maps after spatial permutations described in (A). 
(C) Cortical depth distributions of L2/3 IT cells (light blue) and other cells (light red) in the human MTG slice shown in (B). (Left) Measured cortical depth distributions. (Middle and right) Cortical depth distributions after two example spatial permutations described in (A).

Summary: 
- For Each Z plane, for each specimen
    - Grab all cells within said plane
    - Find all sets of individual cells that are within 15um of one another (maybe go cell by cell? Is there a faster way?)
    - group by cell type and calculate observed frequency of cell type - cell type interaction
    - randomize spatial location of all cells by 50um and recalculate observed frequency (save two of these)
    - do so 1000 times to get a distribution
    - calculate one-tailed z-test of observed vs randomized. Benjamini-Hochberg goes here?
    - plot spatial map of specific cell types of interest pre randomization and post 2 randomizations (this will probably involve saving two randomizations and creating new h5ad files for them, possibly concatting them together so they can all be cirro doritod)
    - plot histogram of cortical depth distro of celltypes of interest vs all other cells pre and post 2 permutations.