In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

# graph libraries
import networkx as nx
import sklearn.neighbors
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import radius_neighbors_graph
from scipy.spatial import distance
import skimage

import utils as ut
from importlib import reload
reload(ut)

<module 'utils' from '/home/cstansbu/git_repositories/stx_graph/notebooks/utils.py'>

In [2]:
cardOutDir = "/nfs/turbo/umms-indikar/shared/projects/spatial_transcriptomics/data/CARDOuputs/"

dfList = []

for f in os.listdir(cardOutDir):
    if 'global' in f:
        fullPath = f"{cardOutDir}{f}"
        key = f.split("_")[0]
        df = pd.read_csv(fullPath)
        df = df.rename(columns={'Unnamed: 0' : 'spotId'})
        df['key'] = key
        dfList.append(df)

df = pd.concat(dfList, ignore_index=True)
print(f"{df.shape=}")
print(df['key'].value_counts())
print()
df.head()

df.shape=(7418, 13)
key
HFD8     3391
ND       2034
HFD14    1993
Name: count, dtype: int64



Unnamed: 0,spotId,B cells,Mac1,Mac3,Monocytes,T cells,NK cells,Stromal cells,Dendritic cells,Mac4,Mac2,Mac5,key
0,AAACAAGTATCTCCCA.1_ND,0.018691,0.00941,0.010737,0.070782,0.040298,0.077389,0.645332,0.015281,0.045516,0.039354,0.027211,ND
1,AAACAGCTTTCAGAAG.1_ND,0.008378,0.0057,0.038038,0.089932,0.048618,0.103983,0.575021,0.046758,0.020027,0.035451,0.028094,ND
2,AAACAGGGTCTATATT.1_ND,0.024325,0.010761,0.024323,0.117606,0.072993,0.121093,0.478383,0.026608,0.042051,0.048258,0.033598,ND
3,AAACATTTCCCGGATT.1_ND,0.023002,0.014704,0.044181,0.087019,0.053537,0.083922,0.465958,0.053223,0.087758,0.037452,0.049242,ND
4,AAACCCGAACGAAATC.1_ND,0.021954,0.003784,0.00124,0.03802,0.026386,0.061364,0.77517,0.003725,0.023106,0.025914,0.019336,ND


In [3]:
sptDir =  "/nfs/turbo/umms-indikar/shared/projects/spatial_transcriptomics/data/CARDInputs/"

spt = {}

for f in os.listdir(sptDir):
    if "spt" in f and "global" in f:
        fullPath = f"{sptDir}{f}"
        key = f.split("_")[0]
        sdf = pd.read_csv(fullPath)
        sdf = sdf.rename(columns={'Unnamed: 0' : 'gene'})
        sdf = sdf.set_index('gene')
        sdf = sdf.T
        sdf.index = sdf.index.str.replace("-", ".")
        sdf = ut.normalize(sdf, 1e6)
        spt[key] = sdf
        print(f, sdf.shape)
    
print('done')

HFD8_global_spt.csv (3394, 31053)
ND_global_spt.csv (2036, 31053)
HFD14_global_spt.csv (1994, 31053)
done


In [4]:
coordDir = "/nfs/turbo/umms-indikar/shared/projects/spatial_transcriptomics/data/CARDInputs/"

dfList = []

for f in os.listdir(coordDir):
    if "coord" in f and "global" in f:
        fullPath = f"{coordDir}{f}"
        key = f.split("_")[0]
        cdf = pd.read_csv(fullPath)
        cdf = cdf.rename(columns={'Unnamed: 0' : 'spotId'})
        cdf['key'] = key
        dfList.append(cdf)

cdf = pd.concat(dfList, ignore_index=True)
print(f"{cdf.shape=}")
print(cdf['key'].value_counts())
print()
cdf.head()

cdf.shape=(7424, 4)
key
HFD8     3394
ND       2036
HFD14    1994
Name: count, dtype: int64



Unnamed: 0,spotId,x,y,key
0,AAACAAGTATCTCCCA.1_HFD8,1244.795278,1189.669724,HFD8
1,AAACACCAATAACTGC.1_HFD8,405.548324,1349.579809,HFD8
2,AAACAGCTTTCAGAAG.1_HFD8,303.880251,1068.178931,HFD8
3,AAACAGGGTCTATATT.1_HFD8,344.496391,1138.478061,HFD8
4,AAACAGTGTTCCTGGG.1_HFD8,648.734268,1595.524585,HFD8


In [5]:
"""Merge CARD predictions with spatial coordinates"""
df = pd.merge(cdf, df,
              how='left', 
              left_on=['spotId', 'key'],
              right_on=['spotId', 'key'])

df = df.fillna(0)
df = df.drop_duplicates() # very important!
print(df['key'].value_counts())
df.head()

key
HFD8     3394
ND       2036
HFD14    1994
Name: count, dtype: int64


Unnamed: 0,spotId,x,y,key,B cells,Mac1,Mac3,Monocytes,T cells,NK cells,Stromal cells,Dendritic cells,Mac4,Mac2,Mac5
0,AAACAAGTATCTCCCA.1_HFD8,1244.795278,1189.669724,HFD8,0.077635,0.022735,0.104811,0.111453,0.125867,0.195064,0.059765,0.111978,0.082445,0.066571,0.041676
1,AAACACCAATAACTGC.1_HFD8,405.548324,1349.579809,HFD8,0.076987,0.041831,0.047343,0.139484,0.082711,0.093085,0.148247,0.071763,0.058744,0.142045,0.09776
2,AAACAGCTTTCAGAAG.1_HFD8,303.880251,1068.178931,HFD8,0.079216,0.043327,0.075868,0.086821,0.086695,0.097747,0.186142,0.09052,0.06967,0.094246,0.089748
3,AAACAGGGTCTATATT.1_HFD8,344.496391,1138.478061,HFD8,0.037682,0.029618,0.093281,0.109521,0.052151,0.061401,0.201332,0.094533,0.143671,0.088173,0.088637
4,AAACAGTGTTCCTGGG.1_HFD8,648.734268,1595.524585,HFD8,0.092691,0.001555,0.002745,0.075636,0.054631,0.052849,0.112912,0.008552,0.258918,0.056771,0.28274


In [6]:
## relabel some things to make things easier and save

keyMap = {
    'ND' : 1,
    'HFD8' : 2,
    'HFD14' : 3,
}

df['nodeLabel'] = df.groupby('key').cumcount() + 1
df["keyId"] = df["key"].map(keyMap)
df['nodeId'] = df["keyId"].astype(str) + "_" + df['nodeLabel'].astype(str)

outdir = "/nfs/turbo/umms-indikar/shared/projects/spatial_transcriptomics/graph_data/"
fname = "global_card_outputs.csv"

df.to_csv(f"{outdir}{fname}", index=False)
df.head()

Unnamed: 0,spotId,x,y,key,B cells,Mac1,Mac3,Monocytes,T cells,NK cells,Stromal cells,Dendritic cells,Mac4,Mac2,Mac5,nodeLabel,keyId,nodeId
0,AAACAAGTATCTCCCA.1_HFD8,1244.795278,1189.669724,HFD8,0.077635,0.022735,0.104811,0.111453,0.125867,0.195064,0.059765,0.111978,0.082445,0.066571,0.041676,1,2,2_1
1,AAACACCAATAACTGC.1_HFD8,405.548324,1349.579809,HFD8,0.076987,0.041831,0.047343,0.139484,0.082711,0.093085,0.148247,0.071763,0.058744,0.142045,0.09776,2,2,2_2
2,AAACAGCTTTCAGAAG.1_HFD8,303.880251,1068.178931,HFD8,0.079216,0.043327,0.075868,0.086821,0.086695,0.097747,0.186142,0.09052,0.06967,0.094246,0.089748,3,2,2_3
3,AAACAGGGTCTATATT.1_HFD8,344.496391,1138.478061,HFD8,0.037682,0.029618,0.093281,0.109521,0.052151,0.061401,0.201332,0.094533,0.143671,0.088173,0.088637,4,2,2_4
4,AAACAGTGTTCCTGGG.1_HFD8,648.734268,1595.524585,HFD8,0.092691,0.001555,0.002745,0.075636,0.054631,0.052849,0.112912,0.008552,0.258918,0.056771,0.28274,5,2,2_5


In [7]:
## save node ID map
gf = df.copy()
gf = gf[['spotId', 'key', 'nodeLabel', 'keyId', 'nodeId']]
gf['spot'] = gf['spotId'].apply(lambda x: x.split("_")[0])

outdir = "/nfs/turbo/umms-indikar/shared/projects/spatial_transcriptomics/graph_data/"
fname = "node_names.csv"

gf.to_csv(f"{outdir}{fname}", index=False)
gf.head()


Unnamed: 0,spotId,key,nodeLabel,keyId,nodeId,spot
0,AAACAAGTATCTCCCA.1_HFD8,HFD8,1,2,2_1,AAACAAGTATCTCCCA.1
1,AAACACCAATAACTGC.1_HFD8,HFD8,2,2,2_2,AAACACCAATAACTGC.1
2,AAACAGCTTTCAGAAG.1_HFD8,HFD8,3,2,2_3,AAACAGCTTTCAGAAG.1
3,AAACAGGGTCTATATT.1_HFD8,HFD8,4,2,2_4,AAACAGGGTCTATATT.1
4,AAACAGTGTTCCTGGG.1_HFD8,HFD8,5,2,2_5,AAACAGTGTTCCTGGG.1


In [8]:
# save the spatial data
keys = sorted(df['key'].unique(), reverse=True)

outdir = "/nfs/turbo/umms-indikar/shared/projects/spatial_transcriptomics/graph_data/gene_expression/"

for key in keys:

    # get new node names
    keyDf = gf[gf['key'] == key]
    keyMap = pd.Series(keyDf.nodeId.values,index=keyDf.spotId).to_dict()

    # load spatial data
    sdf = spt[key]

    # convert index
    sdf.index = sdf.index.map(keyMap)

    # drop zero-sum columns
    sdf = sdf.loc[:, (sdf.sum(axis=0) != 0)]

    fname  = f"{key}_spatial_cpm.csv"
    sdf.to_csv(f"{outdir}{fname}", index=True)
    print(f"{key} {sdf.shape=}")
    

print('done')

ND sdf.shape=(2036, 11288)
HFD8 sdf.shape=(3394, 13208)
HFD14 sdf.shape=(1994, 12703)
done


In [9]:
# make a simple coordinates table

cf = df.copy()
cf = cf[['spotId', 'key', 'nodeId', 'x', 'y']]

outdir = "/nfs/turbo/umms-indikar/shared/projects/spatial_transcriptomics/graph_data/"
fname = "coordinates.csv"

cf.to_csv(f"{outdir}{fname}", index=False)
cf.head()


Unnamed: 0,spotId,key,nodeId,x,y
0,AAACAAGTATCTCCCA.1_HFD8,HFD8,2_1,1244.795278,1189.669724
1,AAACACCAATAACTGC.1_HFD8,HFD8,2_2,405.548324,1349.579809
2,AAACAGCTTTCAGAAG.1_HFD8,HFD8,2_3,303.880251,1068.178931
3,AAACAGGGTCTATATT.1_HFD8,HFD8,2_4,344.496391,1138.478061
4,AAACAGTGTTCCTGGG.1_HFD8,HFD8,2_5,648.734268,1595.524585


In [10]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:
break