In [1]:
import pandas as pd
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import gget

from sklearn.feature_extraction.text import CountVectorizer

import sys
sys.path.append('../')
import utils as ut

In [2]:
# show which pandas version this notebook was executed with
pd.__version__

'2.0.1'

In [2]:
fpath = "/nfs/turbo/umms-indikar/Joshua/Main/Projects/WoundHealing/SpatialData/04272023_edgeset.csv"


vertexPath = '/nfs/turbo/umms-indikar/Joshua/Main/Projects/WoundHealing/SpatialData/04272023_vertexLabels.txt'

# Map every vertex id to its label pair. Each line of the label file is split on
# " - " into two parts (cell type, then gene). The id is the 0-based line number.
# NOTE(review): assumes the ids stored in the edge set are 0-based as well - confirm.
# The context manager closes the file handle, which the bare open() call never did.
with open(vertexPath) as vertexFile:
    edgeMap = {i: x.strip().split(" - ") for i, x in enumerate(vertexFile)}


# one row per hyperedge, one column per member vertex (three vertices per hyperedge)
df = pd.read_csv(fpath, header=None, names=['n1', 'n2', 'n3'])
index = [f"h{i}" for i in range(len(df))]
df['index'] = index
df = df.set_index('index')

# replace the vertex ids by their [cell type, gene] pairs;
# ids that are missing from edgeMap become NaN
for vertexCol in ['n1', 'n2', 'n3']:
    df[vertexCol] = df[vertexCol].map(edgeMap)

print(f"{df.shape=}")
df.head()

df.shape=(7482, 3)


Unnamed: 0_level_0,n1,n2,n3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
h0,"[Stromal cells, ACADM]","[Stromal cells, HP]","[T cells, BC1]"
h1,"[Stromal cells, ACADM]","[Stromal cells, HP]","[T cells, ATP13A3]"
h2,"[Stromal cells, ACADM]","[Stromal cells, BC1]","[T cells, HP]"
h3,"[Stromal cells, ACADM]","[Stromal cells, ATP13A3]","[T cells, HP]"
h4,"[NK cells, TPM3]","[Stromal cells, HP]","[T cells, FTL1]"


In [4]:
# get everything in its own column

parts = []

for i, c in enumerate(df.columns):
    print(i, c)

    # Every entry of df[c] is a two-element [cell type, gene] list, so the whole
    # column can be expanded in one DataFrame construction. This replaces
    # `.apply(pd.Series)`, which builds a separate Series object for each row.
    # Assumes every vertex id was mapped (no NaN entries in df[c]).
    tdf = pd.DataFrame(
        df[c].tolist(),
        index=df.index,
        columns=[f'c{i+1}', f'g{i+1}'],
    )
    parts.append(tdf)

# side-by-side: c1, g1, c2, g2, c3, g3
hdf = pd.concat(parts, axis=1)
print(f"{hdf.shape=}")
hdf.head()

0 n1
1 n2
2 n3
hdf.shape=(7482, 6)


Unnamed: 0_level_0,c1,g1,c2,g2,c3,g3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
h0,Stromal cells,ACADM,Stromal cells,HP,T cells,BC1
h1,Stromal cells,ACADM,Stromal cells,HP,T cells,ATP13A3
h2,Stromal cells,ACADM,Stromal cells,BC1,T cells,HP
h3,Stromal cells,ACADM,Stromal cells,ATP13A3,T cells,HP
h4,NK cells,TPM3,Stromal cells,HP,T cells,FTL1


In [5]:
cellCols = ['c1', 'c2', 'c3']
geneCols = ['g1', 'g2', 'g3']

# for each hyperedge (row), count how many distinct labels appear
# among its three cell-type columns and among its three gene columns
for countCol, labelCols in {'cellCount': cellCols, 'geneCount': geneCols}.items():
    hdf[countCol] = hdf[labelCols].apply(lambda row: len(set(row)), axis=1)

hdf.head()

Unnamed: 0_level_0,c1,g1,c2,g2,c3,g3,cellCount,geneCount
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
h0,Stromal cells,ACADM,Stromal cells,HP,T cells,BC1,2,3
h1,Stromal cells,ACADM,Stromal cells,HP,T cells,ATP13A3,2,3
h2,Stromal cells,ACADM,Stromal cells,BC1,T cells,HP,2,3
h3,Stromal cells,ACADM,Stromal cells,ATP13A3,T cells,HP,2,3
h4,NK cells,TPM3,Stromal cells,HP,T cells,FTL1,3,3


In [6]:
# summary statistics of the number of distinct cell types per hyperedge
hdf['cellCount'].describe()

count    7482.000000
mean        2.990377
std         0.097631
min         2.000000
25%         3.000000
50%         3.000000
75%         3.000000
max         3.000000
Name: cellCount, dtype: float64

In [7]:
# NOTE(review): `break` outside a loop is a SyntaxError (see the recorded output of this cell).
# Presumably a deliberate stop so that "Run All" halts before the cells below - confirm,
# and remove this cell once the notebook is meant to run end to end.
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:
# hyperedges in which a cell type repeats:
# only 2 distinct cell types among the 3 vertices
hasTwoCellTypes = hdf['cellCount'].eq(2)
pdf = hdf.loc[hasTwoCellTypes]
print(f"{pdf.shape=}")
pdf.head()

In [None]:
# extract the hyperedges as "sentences": one document per hyperedge whose
# "words" are its three cell-type labels and its three gene symbols

cols = cellCols + geneCols
# Work on a string copy so that `hdf` keeps all of its columns.
# Every document is already a list of tokens (one token per label).
docs = hdf[cols].astype(str).values.tolist()

# The default analyzer lowercases the text and tokenises it with the pattern
# (?u)\b\w\w+\b. That splits "Stromal cells" into "stromal" + "cells", drops the
# one-letter "T" of "T cells", and lowercases the gene symbols. An identity
# analyzer keeps each label as a single, unmodified vocabulary entry.
# binary=True records presence/absence instead of per-document counts.
count_model = CountVectorizer(analyzer=lambda tokens: tokens, binary=True)
X = count_model.fit_transform(docs)

# `@` is always a matrix product, whereas `*` is only one for scipy's spmatrix classes
Xc = (X.T @ X) # this is co-occurrence matrix in sparse format
Xc.setdiag(0) # drop the co-occurrence of a label with itself
X = Xc.toarray() # plain ndarray rather than the np.matrix returned by todense()
print(f"{X.shape=}")

# column position -> vocabulary term
indexMap = {v: k for k, v in count_model.vocabulary_.items()}

# build the co-occurrence matrix, labelled by vocabulary term on both axes
labels = [indexMap[i] for i in range(X.shape[0])]
counts = pd.DataFrame(X, index=labels, columns=labels)
counts

In [None]:
# high-resolution, square canvas for this and all later figures
plt.rcParams.update({
    'figure.dpi': 300,
    'figure.figsize': (10, 10),
})

# plot the square root of the co-occurrence counts
rootCounts = np.sqrt(counts)
sns.heatmap(data=rootCounts, square=True, lw=0.05)

In [None]:
# Enrichr library to query through gget (swap in e.g. 'ontology' to use another one)
db = 'KEGG_2019_Mouse'


# the query list is every row label of the co-occurrence matrix
ef = gget.enrichr(counts.index.to_list(), database=db)
ef.head()

In [None]:
# load the incidence matrices directly

gPath = "/nfs/turbo/umms-indikar/shared/projects/spatial_transcriptomics/data/kronecker_stuff/B.csv"
cPath = "/nfs/turbo/umms-indikar/shared/projects/spatial_transcriptomics/data/kronecker_stuff/A.csv"

# B.csv: rows are keyed by its 'gene' column
G = pd.read_csv(gPath).set_index('gene')
print(f"{G.shape=}")

# A.csv: the key column is read as 'Unnamed: 0' (presumably it has no header
# in the file), so it is renamed to 'cell' before it becomes the index
C = (
    pd.read_csv(cPath)
    .rename(columns={'Unnamed: 0' : 'cell'})
    .set_index('cell')
)
print(f"{C.shape=}")

In [None]:
# peek at the last rows of G, the matrix indexed by 'gene'
G.tail()

In [None]:
pathwayNumber = "04110"
kid = f"mmu{pathwayNumber}"
pathname = ut.getPathname(kid)
print(f"{pathname=}")

# upper-case the pathway's gene names and keep only those that are rows of G
genes = [
    symbol
    for symbol in (x.upper() for x in ut.parseKEGG(kid))
    if symbol in G.index
]

print(len(genes))


In [None]:
# Ghat = G[G.index.isin(genes)]
# print(f"{Ghat.shape=}")

# K = np.kron(C, Ghat)
# print(f"{K.shape=}")

In [None]:
# NOTE(review): `break` outside a loop raises a SyntaxError when this cell runs.
# Presumably a deliberate stop so that "Run All" halts before the scratch cells below - confirm.
break

In [None]:
# toy check of how np.kron combines the shapes of its operands
# NOTE(review): this rebinds `C`, the name that holds the matrix read from A.csv
# earlier in the notebook.
A, B = (np.random.random(size=shape) for shape in ((10, 2), (9, 3)))

print(f"{A.shape=} {B.shape=}")

C = np.kron(A, B)
print(f"{C.shape=}")

In [None]:
# NOTE(review): scratch arithmetic; what 3394 refers to is not stated in this notebook -
# presumably a matrix dimension. Confirm and label it, or remove the cell.
3394 ** 2

In [None]:
# count_model.vocabulary_

In [None]:

# ideas:
#    1. TF-IDF
