In [1]:
import os
import re
import math
import numpy as np
import pandas as pd
from scipy import stats
from typing import Union
import matplotlib.pyplot as plt
from matplotlib.figure import figaspect
import anndata as ad
import scanpy as sc
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from tqdm import tqdm 

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Load data

In [3]:
# Input and output locations used by this notebook.
data_dir = './../../../neighborhood/CRC_related/'
results_dir = './results_crc'

# CODEX protein table (~258,000 cells).
protein_csv = os.path.join(data_dir, 'crc_codex/CRC_clusters_neighborhoods_markersV2.csv')
protein = pd.read_csv(protein_csv)

In [5]:
# Drop tumor cells, then collapse the ClusterNameV2 labels into the coarse
# cell-type codes stored in a new ClusterNameV3 column.
# .copy() makes protein_new an independent frame, so the column assignment below
# never writes into a slice of `protein` (the chained-assignment pattern that
# raises SettingWithCopyWarning).
# NOTE(review): the tumor filter reads 'ClusterName' while the mapping reads
# 'ClusterNameV2' — confirm that mixing the two columns is intended.
protein_new = protein[protein['ClusterName'] != 'tumor cells'].copy()

# ClusterNameV2 label -> coarse code; rows mapped to 'remove' are dropped below.
di = {'macrophages': "Macro", 'smooth muscle': "SmoothMuscle", 'granulocytes': "Granulo",
      'stroma': "Fibro", 'B cells': "B", 'CD4+ T cells': "TCD4", 'CD8+ T cells': "TCD8", 'plasma cells': "Plasma",
      'CD68+ macrophages': "Macro", 'CD11b+ monocytes': "Mono", 'CD11c+ DCs': "DC", 'NK cells': "NK",
     'remove': "remove", 'vasculature': "Endo", 'immune cells': "Other", 'Other T cells': "Other", # remove or need to change
     'immune cells / vasculature': "Other", 'adipocytes': "Other",
      'tumor cells / immune cells': "Other",  'nerves': "Other", 'lymphatics': "Other"}

# Labels that are not keys of `di` become NaN here; NaN != 'remove', so those
# rows survive the filter on the next line.
protein_new['ClusterNameV3'] = protein_new['ClusterNameV2'].map(di)
protein_new = protein_new[protein_new['ClusterNameV3'] != 'remove']

In [6]:
# Keep only the rows whose coarse label is 'B'.
protein_b = protein_new.loc[protein_new['ClusterNameV3'].eq('B')]

In [7]:
# (rows, columns) of the 'B'-labelled subset.
protein_b.shape

(13043, 102)

In [15]:
# Load the processed scRNA-seq AnnData object (file name suggests the B-cell subset).
# ad.read is a deprecated alias of ad.read_h5ad, so call the reader by its real
# name to keep this cell working on newer anndata releases.
rna_adata = ad.read_h5ad(os.path.join(data_dir, './hacohen_scrna/data/rna_processed_bcell_0307.h5'))

In [16]:
# Display the AnnData summary (dimensions and annotation fields).
rna_adata

AnnData object with n_obs × n_vars = 13543 × 43113
    var: 'gene_ids', 'feature_types', 'genome'

In [17]:
# Normalize total counts per cell and log1p-transform rna_adata in place, then
# flag the 2000 most highly variable genes.
# NOTE(review): these calls modify rna_adata itself, so re-running this cell
# without reloading the data applies the transforms a second time.
sc.pp.normalize_total(rna_adata)
sc.pp.log1p(rna_adata)
sc.pp.highly_variable_genes(rna_adata, n_top_genes=2000)

# Subset to the flagged genes and densify into a DataFrame with one column per
# gene; rows keep the default 0..n-1 index.
# .toarray() assumes X is stored as a sparse matrix — TODO confirm.
rna_adata_2k = rna_adata[:, rna_adata.var['highly_variable']].copy()
rna_adata_2k_df = pd.DataFrame(
    data=rna_adata_2k.X.toarray(),
    columns=rna_adata_2k.var_names,
)

In [20]:
# Matching table linking RNA rows (mod1_indx) to protein rows (mod2_indx).
# index_col=False tells pandas not to use the first CSV column as the index.
matching_csv = './../../multiomics_spatial/multiomics_fusion/results/crc_codex_rna_b_cell_0307.csv'
matching = pd.read_csv(matching_csv, index_col=False)
matching.shape

(10526, 3)

In [21]:
# Largest protein-side row index referenced by the matching table.
matching['mod2_indx'].max()

13040

In [22]:
# Largest RNA-side row index referenced by the matching table.
matching['mod1_indx'].max()

13541

In [27]:
# Build the protein table used for matching in one idempotent step: it is always
# rebuilt from protein_b, so re-running this cell cannot stack extra index
# columns the way a repeated `protein_new = protein_new.reset_index()` would.
# reset_index() moves the original row labels into an 'index' column and gives
# the frame a fresh 0..n-1 index, which is then stored as the join key.
# NOTE(review): assumes mod2_indx in the matching table refers to this row order
# of the 'B' subset — confirm against the code that produced the matching file.
protein_new = protein_b.reset_index()
protein_new['mod2_indx'] = protein_new.index

In [31]:
# Left-join the matching table onto the protein cells (key: mod2_indx), then
# left-join the highly-variable-gene expression rows (key: mod1_indx, i.e. the
# row position in rna_adata_2k_df). Protein cells without a match keep NaN in
# the expression columns.
# NOTE(review): protein_new.reset_index() inserts yet another copy of the row
# index as a column (shown as 'level_0' in the output) — likely redundant.
protein_rna = pd.merge(
    pd.merge(protein_new.reset_index(), matching, how='left', on='mod2_indx'),
    rna_adata_2k_df.reset_index().rename(columns={'index': 'mod1_indx'}),
    how='left',
    on='mod1_indx',
)

In [32]:
# Display the merged protein + RNA table (pandas truncates the view of large frames).
protein_rna

Unnamed: 0.2,level_0,index,Unnamed: 0.1,Unnamed: 0,CellID,ClusterID,EventID,File Name,Region,TMA_AB,...,MT-CO2,MT-ATP6,MT-CO3,MT-ND3,MT-ND4,MT-TH,MT-TE,MT-CYB,MT-TP,CH507-154B10.2
0,0,197714,197714,197714,197714,10679,168,reg001_A,reg001,A,...,4.047706,3.313389,2.864985,2.864985,3.529227,0.0,0.0,3.185043,0.0,0.0
1,1,197715,197715,197715,197715,10679,307,reg001_A,reg001,A,...,4.366381,4.490049,4.600093,4.780738,3.819498,0.0,0.0,4.269707,0.0,0.0
2,2,197716,197716,197716,197716,10679,314,reg001_A,reg001,A,...,0.000000,1.633099,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0
3,3,197717,197717,197717,197717,10679,356,reg001_A,reg001,A,...,0.000000,0.000000,2.649437,0.000000,2.278707,0.0,0.0,0.000000,0.0,0.0
4,4,197718,197718,197718,197718,10679,364,reg001_A,reg001,A,...,3.854438,3.664812,4.151177,4.151177,3.795137,0.0,0.0,2.769317,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13038,13038,210752,210752,210752,210752,10679,255134,reg069_B,reg069,B,...,,,,,,,,,,
13039,13039,210753,210753,210753,210753,10679,255257,reg069_B,reg069,B,...,,,,,,,,,,
13040,13040,210754,210754,210754,210754,10679,255391,reg069_B,reg069,B,...,3.767950,3.200020,3.783583,2.854790,3.694406,0.0,0.0,2.930395,0.0,0.0
13041,13041,210755,210755,210755,210755,10679,257726,reg070_B,reg070,B,...,,,,,,,,,,


# Merge with hotspot data

In [42]:
# Load the hotspot and coldspot cell tables from the working directory.
hotspot, coldspot = (
    pd.read_csv(csv_name)
    for csv_name in ('cells_in_hotspots.csv', 'cells_in_coldspots.csv')
)

In [39]:
# Boolean flag: True where the cell's CellID is listed in the hotspot table.
# NOTE(review): assumes CellID alone identifies a cell across regions — confirm.
hotspot_ids = hotspot['CellID']
protein_rna['hotspot'] = protein_rna['CellID'].isin(hotspot_ids)

In [43]:
# Boolean flag: True where the cell's CellID is listed in the coldspot table.
# NOTE(review): assumes CellID alone identifies a cell across regions — confirm.
coldspot_ids = coldspot['CellID']
protein_rna['coldspot'] = protein_rna['CellID'].isin(coldspot_ids)

In [44]:
# Number of rows flagged as hotspot cells.
protein_rna['hotspot'].sum()

4685

In [45]:
# Number of rows flagged as coldspot cells.
protein_rna['coldspot'].sum()

784

In [46]:
# Persist the merged table. index=True (the default, spelled out here) writes the
# row index as an unnamed first column, which pandas shows as 'Unnamed: 0' when
# the file is read back without index_col.
protein_rna.to_csv('protein_rna_matched_CRC_hotspot.csv', index=True)