In [1]:
import pandas as pd
import numpy as np
import scipy
import os 
import scanpy as sc
import umap
import tabulate
from sklearn.cluster import SpectralClustering
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from importlib import reload
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

# locals
# Project-local modules. reload() re-executes an already-imported module, so
# edits made to their source files are picked up without restarting the kernel.
import utils as ut
import classes
reload(ut)
reload(classes)

<module 'classes' from '/home/cstansbu/git_repositories/spatial_transcriptomics/clustering/classes.py'>

In [2]:
# Read the PanglaoDB table and keep only the rows for the cell types of interest.
pandPath = "/nfs/turbo/umms-indikar/shared/projects/spatial_transcriptomics/data/panglaodb/pandb.tsv.gz"
pandf = ut.read_panglaodb(pandPath)

# Cell types retained for the analysis (list order is kept as written).
controlList = [
    'Adipocyte progenitor cells', 'Adipocytes',
    'B cells', 'B cells memory', 'B cells naive',
    'Gamma delta T cells',
    'Macrophages', 'Monocytes',
    'Natural killer T cells', 'NK cells',
    'T cells', 'T cells naive', 'T cytotoxic cells',
    'T follicular helper cells', 'T helper cells',
    'T memory cells', 'T regulatory cells',
]

# boolean row mask: True where the row's cell type is one of the controls
keepRows = pandf['cell type'].isin(controlList)
pandf = pandf.loc[keepRows]
pandf.head()

Unnamed: 0,species,official gene symbol,cell type,nicknames,ubiquitousness index,product description,gene type,canonical marker,germ layer,organ,sensitivity_human,sensitivity_mouse,specificity_human,specificity_mouse,gene
52,Mm Hs,CEBPA,Adipocyte progenitor cells,C/EBP-alpha|CEBP,0.025,CCAAT enhancer binding protein alpha,protein-coding gene,,Mesoderm,Connective tissue,0.0,0.0,0.0,0.029965,CEBPA
53,Mm Hs,EGFR,Adipocyte progenitor cells,ERBB1|ERBB,0.014,epidermal growth factor receptor,protein-coding gene,,Mesoderm,Connective tissue,0.0,0.0,0.022243,0.012769,EGFR
54,Mm Hs,FGF10,Adipocyte progenitor cells,,0.001,fibroblast growth factor 10,protein-coding gene,1.0,Mesoderm,Connective tissue,0.0,0.0,0.000627,0.001732,FGF10
55,Mm Hs,GSC,Adipocyte progenitor cells,,0.001,goosecoid homeobox,protein-coding gene,,Mesoderm,Connective tissue,0.0,0.0,0.0,0.00077,GSC
56,Mm Hs,MMP3,Adipocyte progenitor cells,STMY1|STMY,0.003,matrix metallopeptidase 3,protein-coding gene,,Mesoderm,Connective tissue,0.0,0.0,0.002193,0.003016,MMP3


In [None]:
reload(classes)
# Directory scanned for processed tables; every "*.csv" file becomes one dataset.
dirPath = "/nfs/turbo/umms-indikar/shared/projects/spatial_transcriptomics/data/processed/"

# dataset name -> classes.Data instance
adata = {}

# os.listdir() returns entries in arbitrary order; sorting makes the load order
# (and therefore the order of every later loop over `adata`) the same on each run.
for f in sorted(os.listdir(dirPath)):
    if not f.endswith(".csv"):
        continue

    # dataset name is the part of the file name before the first "."
    dataName = f.split(".")[0]
    # os.path.join is correct whether or not dirPath ends with a separator
    fullPath = os.path.join(dirPath, f)
    df = pd.read_csv(fullPath)
    df = df.set_index('Row')

    data = classes.Data(df, dataName)
    adata[dataName] = data
    print(data.name, data.X.shape)

print('done')

ND (1231, 19949)
HFD14 (6258, 19949)


In [None]:
reload(classes)
# Per-dataset preprocessing: call svd() and getOHT() on every loaded dataset,
# then print the dataset key together with its `oht` attribute.

for d in adata:
    data = adata[d]
    data.svd()
    data.getOHT()
    print(d, data.oht)

print('done')

In [None]:
reload(classes)
# UMAP settings shared by every dataset. All of them are passed through to
# data.UMAP() below, so changing a value here actually changes the embedding.

random_state=0 
n_neighbors=17
min_dist=0.01 
n_components=2
metric='euclidean'

for d, data in adata.items():
    # previously random_state and metric were hard-coded literals in this call,
    # which silently ignored the variables defined above
    data.UMAP(random_state=random_state, 
              n_neighbors=n_neighbors,
              min_dist=min_dist, 
              n_components=n_components,
              metric=metric)
    
print('done')

In [None]:
# k: number of clusters requested from simpleClustering();
# c: one entry per cluster from ut.ncolor (indexed by cluster number below).
k = 13
c = ut.ncolor(k, cmap='viridis')

plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 6, 4

for d, data in adata.items():
    data.simpleClustering(k)

    for i in range(k):
        # rows whose label equals i; computed once and reused for both axes
        # (assumes data.labels lines up row-for-row with data.umap_embedding — TODO confirm)
        inCluster = data.labels == i

        # annotate all cells
        plt.scatter(data.umap_embedding[inCluster, 0],
                    data.umap_embedding[inCluster, 1],
                    marker=".",
                    label=i+1,  # legend shows 1-based cluster numbers
                    alpha=0.4,
                    # one colour for the whole group: `color=` is unambiguous,
                    # whereas `c=` may be interpreted as per-point values
                    color=c[i])

    plt.legend(bbox_to_anchor=(1.2, 1.025))
    plt.xlabel("UMAP 1")
    plt.ylabel("UMAP 2")
    plt.title(d)
    plt.show()
    

In [None]:
## warning, this takes a long time!
# Call querylClusters() on each dataset, then list the keys of its `results`.

for d in adata:
    data = adata[d]
    print(d)
    data.querylClusters()
    print(data.results.keys())

print('done')

In [None]:
sortby = 'median'  # column the per-cluster gene tables are sorted by before scoring
useCol = 'mean'    # default column whose values are rank-weighted in scoring()

def scoring(matches, useCol=useCol):
    """Return a rank-weighted score for a table of matched genes.

    Each row is weighted by ``1 / (index + 1)``, so with a 0..n-1 index the
    weights are 1, 1/2, 1/3, ...  The weighted ``useCol`` values are summed
    and the sum is multiplied by ``sqrt(n)``, where ``n`` is the row count.

    Parameters
    ----------
    matches : pandas.DataFrame
        Table with a numeric index and a ``useCol`` column. It is NOT
        modified (earlier versions added 'ordering' and 'weightedExp'
        columns to the caller's frame as a side effect).
    useCol : str
        Name of the column to weight; the default is bound to the
        module-level ``useCol`` at definition time.

    Returns
    -------
    float
        The score; 0 for an empty table.
    """
    n = len(matches)
    # weights come from the index values, so callers should pass a frame
    # whose index encodes the rank (e.g. after reset_index(drop=True))
    weights = pd.Series(1.0 / (matches.index.to_numpy() + 1), index=matches.index)
    weighted = weights * matches[useCol]
    return weighted.sum() * np.sqrt(n)

def getScores(label, clusterGenes, pandf, controlList):
    """Score one cluster's gene table against every cell type in ``controlList``.

    For each cell type, the rows of ``clusterGenes`` whose 'gene' value is
    listed for that cell type in ``pandf`` are selected (order preserved,
    index reset) and passed to ``scoring``.

    Parameters
    ----------
    label : int
        Zero-based cluster number; the score column is named
        ``"cluster {label+1}"``.
    clusterGenes : pandas.DataFrame
        Cluster gene table with a 'gene' column (plus whatever ``scoring`` reads).
    pandf : pandas.DataFrame
        Reference table with 'cell type' and 'gene' columns.
    controlList : list of str
        Cell types to score.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'type', with a single score column sorted in descending
        order. Empty (but with the expected column and index name) when
        ``controlList`` is empty.
    """
    # the column name depends only on `label`, so build it once up front
    newCol = f"cluster {label+1}"

    newRows = []
    for ctype in controlList:
        ctypeGenes = pandf.loc[pandf['cell type'] == ctype, 'gene'].to_list()
        matches = clusterGenes[clusterGenes['gene'].isin(ctypeGenes)].reset_index(drop=True)
        newRows.append({'type': ctype, newCol: scoring(matches)})

    # explicit columns keep the frame well-formed when there are no rows;
    # without them set_index('type') fails on an empty controlList
    scores = pd.DataFrame(newRows, columns=['type', newCol])
    scores = scores.set_index('type')
    scores = scores.sort_values(by=newCol, ascending=False)
    return scores
    

# For every dataset: score each cluster against every control cell type,
# print the full score table, then print the best-scoring cell type per cluster.
for d, data in adata.items():
    print("\n", d)
    print("----------------------------------------")

    dfList = []
    for i in range(data.k):
        # sort this cluster's genes by `sortby` (descending) and add an
        # upper-cased copy of 'Row' as the 'gene' column used for matching
        clusterGenes = data.results[i].sort_values(by=sortby, ascending=False)
        clusterGenes['gene'] = clusterGenes['Row'].str.upper()
        scores = getScores(i, clusterGenes, pandf, controlList)
        dfList.append(scores)

    # one column per cluster, rows aligned on the cell-type index
    h = pd.concat(dfList, axis=1, ignore_index=False)

    table = tabulate.tabulate(h, headers='keys', numalign='right', showindex=True)
    print(table)

    print()
    for c in h.columns:
        colScores = h[c]
        atMax = (colScores == colScores.max()).values
        # first index label whose score equals the column maximum
        maxScore = h.index.values[atMax][0]
        print(c, maxScore)

In [None]:
# NOTE(review): a bare `break` outside any loop is a SyntaxError, so this cell always fails; presumably a deliberate stop for "Run All" — confirm or remove.
break

In [None]:
# NOTE(review): `dd` is not defined in any cell visible here (NameError on a fresh kernel); looks like stray input — confirm or remove.
dd

In [None]:
# NOTE(review): a bare `break` outside any loop is a SyntaxError, so this cell always fails; presumably a deliberate stop for "Run All" — confirm or remove.
break

In [None]:
# NOTE(review): a bare `break` outside any loop is a SyntaxError, so this cell always fails; presumably a deliberate stop for "Run All" — confirm or remove.
break

In [None]:
def writeResults(outpath, res):
    """Write every entry of ``res`` to its own sheet of a single Excel workbook.

    Parameters
    ----------
    outpath : str
        Destination path of the workbook.
    res : mapping
        Maps a key to an object exposing ``to_excel`` (presumably a pandas
        DataFrame — confirm against the caller). Each entry is written to a
        sheet named ``cluster_<key>``.

    Prints a confirmation line once the workbook has been closed.
    """
    with pd.ExcelWriter(outpath) as writer:
        for sheetKey, frame in res.items():
            sheetName = f'cluster_{sheetKey}'
            frame.to_excel(writer, sheet_name=sheetName)

    print(f'done writing: {outpath}')
        
        
# Build the workbook path from `key` and write every entry of `res` to it.
# NOTE(review): `key` and `res` are not defined in any cell visible here; on a
# fresh kernel this cell raises NameError unless they are set elsewhere — confirm.
outpath = f"/nfs/turbo/umms-indikar/shared/projects/spatial_transcriptomics/cluster{key}.xlsx"
writeResults(outpath, res)

In [None]:
# NOTE(review): a bare `break` outside any loop is a SyntaxError, so this cell always fails; presumably a deliberate stop for "Run All" — confirm or remove.
break

In [None]:
# Scratch/debug cell: opens an ExcelWriter on the workbook path and prints each
# key of `res` with the type of its value; the to_excel() call is commented out,
# so nothing is written to the workbook from this loop.
# NOTE(review): `key` and `res` are not defined in any cell visible here — confirm.
# NOTE(review): the loop rebinds the notebook-level names `k` and `d`.
outpath = f"/nfs/turbo/umms-indikar/shared/projects/spatial_transcriptomics/cluster{key}.xlsx"

with pd.ExcelWriter(outpath) as writer: 
    for k, d in res.items():
        print(k, type(d))
        # d['stat/s'].to_excel(writer, sheet_name=f'cluster_{k}')
        # print(k)