In [1]:
import pandas as pd 
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import itertools

# graph libraries
import networkx as nx
import sklearn.neighbors
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import radius_neighbors_graph
from scipy.spatial import distance
from scipy.spatial.distance import cdist
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
import skimage

# local imports
import utils as ut
import graph as gp
from importlib import reload
reload(ut)
reload(gp)

<module 'graph' from '/home/cstansbu/git_repositories/stx_graph/notebooks/graph.py'>

In [2]:
# load the data
fpath = "/nfs/turbo/umms-indikar/shared/projects/spatial_transcriptomics/graph_data/global_card_outputs.csv"

df = pd.read_csv(fpath)
print(f"{df.shape=}")
print()
print(df['key'].value_counts())
print()
df.head()


df.shape=(7424, 18)

key
HFD8     3394
ND       2036
HFD14    1994
Name: count, dtype: int64



Unnamed: 0,spotId,x,y,key,B cells,Mac1,Mac3,Monocytes,T cells,NK cells,Stromal cells,Dendritic cells,Mac4,Mac2,Mac5,nodeLabel,keyId,nodeId
0,AAACAAGTATCTCCCA.1_HFD8,1244.795278,1189.669724,HFD8,0.077635,0.022735,0.104811,0.111453,0.125867,0.195064,0.059765,0.111978,0.082445,0.066571,0.041676,1,2,2_1
1,AAACACCAATAACTGC.1_HFD8,405.548324,1349.579809,HFD8,0.076987,0.041831,0.047343,0.139484,0.082711,0.093085,0.148247,0.071763,0.058744,0.142045,0.09776,2,2,2_2
2,AAACAGCTTTCAGAAG.1_HFD8,303.880251,1068.178931,HFD8,0.079216,0.043327,0.075868,0.086821,0.086695,0.097747,0.186142,0.09052,0.06967,0.094246,0.089748,3,2,2_3
3,AAACAGGGTCTATATT.1_HFD8,344.496391,1138.478061,HFD8,0.037682,0.029618,0.093281,0.109521,0.052151,0.061401,0.201332,0.094533,0.143671,0.088173,0.088637,4,2,2_4
4,AAACAGTGTTCCTGGG.1_HFD8,648.734268,1595.524585,HFD8,0.092691,0.001555,0.002745,0.075636,0.054631,0.052849,0.112912,0.008552,0.258918,0.056771,0.28274,5,2,2_5


In [3]:
reload(gp)

keys = df['key'].unique()

outdir = "/nfs/turbo/umms-indikar/shared/projects/spatial_transcriptomics/graph_data/distance_mats/"
metric = 'euclidean'
threshold = 25

for key in keys:
    d = gp.get_2D_distances(positions=df[df['key'] == key], 
                            id_column='nodeId', 
                            metric='euclidean')

    fname = f"{key}_{metric}_full.csv"
    d.to_csv(f"{outdir}{fname}", index=True)

    print(f"Saved {fname} {d.shape=}")
    
    # make binary adjancency for nodes within a certain
    # physical radius
    A = np.where(d < threshold, 1, 0)
    np.fill_diagonal(A, 0) # no self loops!
    A = pd.DataFrame(A, index=d.index, columns=d.index)
    fname = f"{key}_{metric}_t{threshold}_adj.csv"
    A.to_csv(f"{outdir}{fname}", index=True)
    print(f"Saved {fname} {A.shape=}")
    
print('done')

Saved HFD8_euclidean_full.csv d.shape=(3394, 3394)
Saved HFD8_euclidean_t25_adj.csv A.shape=(3394, 3394)
Saved HFD14_euclidean_full.csv d.shape=(1994, 1994)
Saved HFD14_euclidean_t25_adj.csv A.shape=(1994, 1994)
Saved ND_euclidean_full.csv d.shape=(2036, 2036)
Saved ND_euclidean_t25_adj.csv A.shape=(2036, 2036)
done


In [4]:
# save the adjancy list 

outdir = "/nfs/turbo/umms-indikar/shared/projects/spatial_transcriptomics/graph_data/edge_lists/"

for key in keys:
    d = gp.get_2D_distances(positions=df[df['key'] == key], 
                            id_column='nodeId', 
                            metric='euclidean')

    # make binary adjancency for nodes within a certain
    # physical radius
    A = np.where(d < threshold, 1, 0)
    np.fill_diagonal(A, 0) # no self loops!
    i, j = np.nonzero(A)

    nodeI = d.index[i]
    nodeJ = d.index[j]
    
    edges = pd.DataFrame({'node1' : nodeI, 'node2' : nodeJ})
    edges['key'] = key

    fname = f"{key}_{metric}_t{threshold}_edgelist.csv"
    edges.to_csv(f"{outdir}{fname}", index=False)
    print(f"Saved {fname} {edges.shape=}")

print('done.')


Saved HFD8_euclidean_t25_edgelist.csv edges.shape=(19872, 3)
Saved HFD14_euclidean_t25_edgelist.csv edges.shape=(11430, 3)
Saved ND_euclidean_t25_edgelist.csv edges.shape=(11804, 3)
done.


In [5]:
outdir = "/nfs/turbo/umms-indikar/shared/projects/spatial_transcriptomics/graph_data/edge_lists/"

ctypes = {
    'B cells' : "BD",
    'Dendritic cells' : "DC",
    'Mac1' : "M1",
    'Mac2' : "M2",
    'Mac3' : "M3",
    'Mac4' : "M4",
    'Mac5' : "M5",
    'Monocytes' : "MN",
    'NK cells' : "NK",
    'Stromal cells' : "OTH",
    'T cells' : "TC",
}


for key in keys:
    print(f"{key}...")
    kdf = df[df['key'] == key].set_index('nodeId')

    pairs = pd.DataFrame(itertools.combinations(kdf.index, 2))
    pairs.columns = ['node1', 'node2']
    pairs['key'] = key

    # make a general purpose merge column
    merger = kdf.rename(columns=ctypes)
    merger = merger[list(ctypes.values())]

    # merge cell types_i
    combs = pd.merge(pairs, merger.add_suffix("_i"), 
                     how='left',
                     left_on='node1',
                     right_index=True)

    # merge cell types_j
    combs = pd.merge(combs, merger.add_suffix("_j"), 
                     how='left',
                     left_on='node2',
                     right_index=True)

    print(f"{key} {pairs.shape=}")
    
    for ci, cj in itertools.combinations_with_replacement(ctypes.values(), 2):
        # get the column names
        newCol = f"{ci}_{cj}"
        colI  = f"{ci}_i"
        colJ  = f"{cj}_j"

        # compute the harmonic mean between all spots for these two cell types
        pairs[newCol] = scipy.stats.hmean([combs[colI], combs[colJ]], axis=0)
        
    fname = f"{key}_harmonic_edgelist.csv"
    pairs.to_csv(f"{outdir}{fname}", index=False)
    print(f"Saved {fname} {pairs.shape=}")

print('done')

HFD8...
HFD8 pairs.shape=(5757921, 3)
Saved HFD8_harmonic_edgelist.csv pairs.shape=(5757921, 69)
HFD14...
HFD14 pairs.shape=(1987021, 3)
Saved HFD14_harmonic_edgelist.csv pairs.shape=(1987021, 69)
ND...
ND pairs.shape=(2071630, 3)
Saved ND_harmonic_edgelist.csv pairs.shape=(2071630, 69)
done


In [6]:
pairs.head()

Unnamed: 0,node1,node2,key,BD_BD,BD_DC,BD_M1,BD_M2,BD_M3,BD_M4,BD_M5,...,MN_MN,MN_NK,MN_OTH,MN_TC,NK_NK,NK_OTH,NK_TC,OTH_OTH,OTH_TC,TC_TC
0,1_1,1_2,ND,0.01157,0.026706,0.008736,0.024476,0.025065,0.019336,0.022447,...,0.079217,0.084229,0.126049,0.057643,0.088736,0.136418,0.059719,0.608151,0.090423,0.044069
1,1_1,1_3,ND,0.021139,0.021957,0.013658,0.026945,0.021138,0.025879,0.024019,...,0.088375,0.089342,0.123318,0.071871,0.094429,0.133225,0.075126,0.549456,0.131151,0.051928
2,1_1,1_4,ND,0.020624,0.027666,0.016459,0.024936,0.026268,0.030818,0.027096,...,0.078066,0.076794,0.122896,0.060964,0.080523,0.132732,0.06329,0.541169,0.098872,0.045984
3,1_1,1_5,ND,0.020191,0.006212,0.006294,0.021717,0.002326,0.020665,0.019008,...,0.049469,0.065737,0.12972,0.038442,0.068451,0.140728,0.039354,0.704317,0.0507,0.031891
4,1_1,1_6,ND,0.0058,6.6e-05,0.001233,0.016203,7.3e-05,0.014746,0.010819,...,0.040116,0.046854,0.131104,0.019961,0.048217,0.142359,0.020205,0.74716,0.022827,0.018037


In [7]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)