In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib 
import matplotlib.pyplot as plt
import seaborn as sns
import cProfile, pstats, io
from pstats import SortKey
import warnings
import scanpy as sc
from collections import Counter
import scipy
from mpl_toolkits.axes_grid1.axes_divider import make_axes_locatable
from sklearn import metrics
import gget
import tabulate
import itertools
from sklearn.neighbors import BallTree
from scipy.stats import gmean

# locals
import utils as ut


In [2]:
fPath = "/nfs/turbo/umms-indikar/shared/projects/spatial_transcriptomics/data/LR/LewisLabUSCS/Mouse/Mouse-2020-Baccin-LR-pairs.xlsx"

lr = pd.read_excel(fPath)
print(lr.shape)

lr['ligand'] = lr['Ligand.Mouse'].str.upper()
lr['receptor'] = lr['Receptor.Mouse'].str.upper()
lr.head()

(2000, 8)


Unnamed: 0,Pair.Name,Ligand.Mouse,Receptor.Mouse,Source,ManualAnnotation,Ligand.CC,Ligand.GO,Reference (PMID / KEGG),ligand,receptor
0,A2m-Lrp1,A2m,Lrp1,Ramilowski,Correct,Secreted,Other,"1702392,10652313, 12194978",A2M,LRP1
1,Adgre5-Cd55,Adgre5,Cd55,Baccin,Correct,Membrane,Other,23447688,ADGRE5,CD55
2,Adipoq-Adipor1,Adipoq,Adipor1,Ramilowski,Correct,Secreted,Other,12802337,ADIPOQ,ADIPOR1
3,Adipoq-Adipor2,Adipoq,Adipor2,Ramilowski,Correct,Secreted,Other,"12802337, 12802330",ADIPOQ,ADIPOR2
4,Adm-Calcrl&Ramp2,Adm,Calcrl&Ramp2,Baccin,Correct,Secreted,Other,10342881,ADM,CALCRL&RAMP2


In [3]:
xyPath =  "/nfs/turbo/umms-indikar/shared/projects/spatial_transcriptomics/data/combinedEmbedding.csv"

xy = pd.read_csv(xyPath)

cTypes = sorted(list(xy['cellTypes'].unique()))
print(cTypes)
print()
xy.head()

['B cells', 'Dendritic cells', 'Macrophages', 'Monocytes', 'NK cells', 'Stromal cells', 'T cells']



Unnamed: 0,x,y,key,clusterId,colors,cellTypes,cellId
0,6.783486,1.839065,ND,4,#9e0142,B cells,AAACGAACAGCAGTAG.1
1,3.089298,-1.72603,ND,3,#fdbf6f,Macrophages,AAAGAACGTCTACAGT.1
2,3.61413,-1.571004,ND,3,#fdbf6f,Macrophages,AAAGAACTCCTCGCAT.1
3,3.086352,6.166065,ND,1,#fdbf6f,Macrophages,AAAGGGCGTTAGGGTG.1
4,3.178745,4.929769,ND,5,#ffffbe,Monocytes,AACAAAGAGTATAGGT.1


In [4]:
cardDir =  "/nfs/turbo/umms-indikar/shared/projects/spatial_transcriptomics/data/CARDInputs/"
keys = ['ND', 'HFD8', 'HFD14']

rna = {}
labels = {}

for key in keys:
    labelPath = f"{cardDir}{key}_macrophage_clusters.csv"
    lf = pd.read_csv(labelPath)
    
    lf = pd.merge(lf, xy[['x', 'y', 'cellId', 'colors']], 
                  how='left',
                  left_on='cellId',
                  right_on='cellId')
    
    labels[key] = lf
    
    
    rnaPath = f"{cardDir}{key}_macrophage_rna.csv"
    rf = pd.read_csv(rnaPath)
    rf = rf.T
    new_header = rf.iloc[0] 
    rf = rf[1:] 
    rf.columns = new_header 
    rf.index.names = ['cellId']
    
    rna[key] = rf

print('done')

done


In [5]:
%load_ext line_profiler

In [6]:
warnings.simplefilter("ignore")
printt = 1000
lrType = "ligand"


res = []
for key in keys:
    lf = labels[key]
    rf = rna[key]
    
    # some duplicate cellIds
    lf = lf[lf['cellId'].isin(rf.index)]
    lf = lf.drop_duplicates(subset='cellId')
    
    # get all expressed ligands or recptors
    expr = list(set([x for x in lr[lrType] if x in rf.columns]))
    exf = rf[expr].astype(bool).astype(int)
    allN = len(expr)
    
    # drop non-expressed ligands or recptors
    colSums = exf.sum(axis=0).ravel()
    exf = exf[exf.columns[np.nonzero(colSums)]]
    expr = exf.columns
    n = exf.shape[1]
    
    print(f"{n} ({n/allN:.3f}%) {lrType}s expressed in {key}")

    # merge cell types with boolean ligand/receptor exp
    oh = pd.get_dummies(lf['cellType'])
    oh = oh.set_index(lf['cellId'])
    exf = pd.merge(exf, oh, 
                   how='left', 
                   left_index=True,
                   right_index=True)


    # prepare the LFC computation
    rf = rf[rf.index.isin(exf.index)]
    rf = rf[expr]

    cTypes = sorted(list(lf['cellType'].unique()))
    
    lcount = 0
    for ctype, q in itertools.product(cTypes, expr):
        lcount += 1
        if lcount % printt == 0:
            print(f"{key} {lcount}")

        y_true = exf[ctype]
        y_pred = exf[q]
        
        # get the log fold change of expression
        mask = y_true == 1
        meanGroup = rf[mask][q].mean(skipna=True)
        meanOther = rf[~mask][q].mean(skipna=True)
        lfc = np.log2(meanGroup) - np.log2(meanOther)
                
        tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_pred).ravel()

        row = {
            'key' : key,
            'cellType' : ctype,
            'gene' : q,
            'type' : lrType,
            'true_positive' : tp,
            'true_negative' : tn,
            'false_positive' : fp,
            'false_negative' : fn,
            'sensitivity' : tp / (tp + fn),
            'specificity' : tn / (tn + fp),
            'precision' : tp / (tp + fp),
            'F1' : (2*tp) / ((2*tp) + fp + fn),
            'meanGroup' : meanGroup, 
            'meanOther' : meanOther,
            'LFC' : lfc
        }

        res.append(row)
    
        
res = pd.DataFrame(res)
outpath = f"/nfs/turbo/umms-indikar/shared/projects/spatial_transcriptomics/data/LRspec_{lrType}.csv"
res.to_csv(outpath, index=False)
print(f"{res.shape=}")
    
res.head()

542 (0.725%) ligands expressed in ND
ND 1000
ND 2000
ND 3000
ND 4000
ND 5000
560 (0.749%) ligands expressed in HFD8
HFD8 1000
HFD8 2000
HFD8 3000
HFD8 4000
HFD8 5000
HFD8 6000
550 (0.735%) ligands expressed in HFD14
HFD14 1000
HFD14 2000
HFD14 3000
HFD14 4000
HFD14 5000
HFD14 6000
res.shape=(18172, 15)


Unnamed: 0,key,cellType,gene,type,true_positive,true_negative,false_positive,false_negative,sensitivity,specificity,precision,F1,meanGroup,meanOther,LFC
0,ND,B cells,HRAS,ligand,53,703,196,309,0.146409,0.78198,0.212851,0.173486,0.20442,0.307008,-0.58674
1,ND,B cells,CP,ligand,10,790,109,352,0.027624,0.878754,0.084034,0.04158,0.055249,0.401557,-2.861596
2,ND,B cells,HSP90AA1,ligand,291,192,707,71,0.803867,0.213571,0.291583,0.427941,5.51105,6.581758,-0.256146
3,ND,B cells,NECTIN3,ligand,1,895,4,361,0.002762,0.995551,0.2,0.00545,0.002762,0.005562,-1.009597
4,ND,B cells,KDR,ligand,1,885,14,361,0.002762,0.984427,0.066667,0.005305,0.005525,0.047831,-3.113933


In [7]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

# Receptors

In [8]:
warnings.simplefilter("ignore")
printt = 1000
lrType = "receptor"


res = []
for key in keys:
    lf = labels[key]
    rf = rna[key]
    
    # some duplicate cellIds
    lf = lf[lf['cellId'].isin(rf.index)]
    lf = lf.drop_duplicates(subset='cellId')
    
    # get all expressed ligands or recptors
    expr = list(set([x for x in lr[lrType] if x in rf.columns]))
    exf = rf[expr].astype(bool).astype(int)
    allN = len(expr)
    
    # drop non-expressed ligands or recptors
    colSums = exf.sum(axis=0).ravel()
    exf = exf[exf.columns[np.nonzero(colSums)]]
    expr = exf.columns
    n = exf.shape[1]
    
    print(f"{n} ({n/allN:.3f}%) {lrType}s expressed in {key}")

    # merge cell types with boolean ligand/receptor exp
    oh = pd.get_dummies(lf['cellType'])
    oh = oh.set_index(lf['cellId'])
    exf = pd.merge(exf, oh, 
                   how='left', 
                   left_index=True,
                   right_index=True)


    # prepare the LFC computation
    rf = rf[rf.index.isin(exf.index)]
    rf = rf[expr]

    cTypes = sorted(list(lf['cellType'].unique()))
    
    lcount = 0
    for ctype, q in itertools.product(cTypes, expr):
        lcount += 1
        if lcount % printt == 0:
            print(f"{key} {lcount}")

        y_true = exf[ctype]
        y_pred = exf[q]
        
        # get the log fold change of expression
        mask = y_true == 1
        meanGroup = rf[mask][q].mean(skipna=True)
        meanOther = rf[~mask][q].mean(skipna=True)
        lfc = np.log2(meanGroup) - np.log2(meanOther)
                
        tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_pred).ravel()

        row = {
            'key' : key,
            'cellType' : ctype,
            'gene' : q,
            'type' : lrType,
            'true_positive' : tp,
            'true_negative' : tn,
            'false_positive' : fp,
            'false_negative' : fn,
            'sensitivity' : tp / (tp + fn),
            'specificity' : tn / (tn + fp),
            'precision' : tp / (tp + fp),
            'F1' : (2*tp) / ((2*tp) + fp + fn),
            'meanGroup' : meanGroup, 
            'meanOther' : meanOther,
            'LFC' : lfc
        }

        res.append(row)
    
        
res = pd.DataFrame(res)
outpath = f"/nfs/turbo/umms-indikar/shared/projects/spatial_transcriptomics/data/LRspec_{lrType}.csv"
res.to_csv(outpath, index=False)
print(f"{res.shape=}")
    
res.head()

542 (0.796%) receptors expressed in ND
ND 1000
ND 2000
ND 3000
ND 4000
ND 5000
548 (0.805%) receptors expressed in HFD8
HFD8 1000
HFD8 2000
HFD8 3000
HFD8 4000
HFD8 5000
HFD8 6000
546 (0.802%) receptors expressed in HFD14
HFD14 1000
HFD14 2000
HFD14 3000
HFD14 4000
HFD14 5000
HFD14 6000
res.shape=(17996, 15)


Unnamed: 0,key,cellType,gene,type,true_positive,true_negative,false_positive,false_negative,sensitivity,specificity,precision,F1,meanGroup,meanOther,LFC
0,ND,B cells,HRAS,receptor,53,703,196,309,0.146409,0.78198,0.212851,0.173486,0.20442,0.307008,-0.58674
1,ND,B cells,FZD8,receptor,1,894,5,361,0.002762,0.994438,0.166667,0.005435,0.002762,0.007786,-1.495024
2,ND,B cells,IL15RA,receptor,25,849,50,337,0.069061,0.944383,0.333333,0.114416,0.096685,0.07119,0.441614
3,ND,B cells,NECTIN3,receptor,1,895,4,361,0.002762,0.995551,0.2,0.00545,0.002762,0.005562,-1.009597
4,ND,B cells,KDR,receptor,1,885,14,361,0.002762,0.984427,0.066667,0.005305,0.005525,0.047831,-3.113933


In [None]:
res.head()

In [None]:
lrType

In [None]:
break