In [1]:
import scanpy as sc
import pandas as pd
from pathlib import Path
import anndata as ad
import numpy as np
import os

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Plot-export constants. FONTSIZE is defined here but is not referenced in the
# cells visible in this part of the notebook.
DPI = 300
FONTSIZE = 20  # 42

from matplotlib import rcParams

# scanpy figure defaults: dpi=100 for rendered figures, DPI for saved ones.
sc.settings.set_figure_params(
    scanpy=True,
    dpi=100,
    dpi_save=DPI,
    transparent=True,
    vector_friendly=True,
)

# Font type 42 for PDF output (see matplotlib's `pdf.fonttype` rcParam).
rcParams["pdf.fonttype"] = 42

In [2]:
# Output locations: the final-object directory and a figures/ folder inside it.
DIR2SAVE = Path("/data/BCI-CRC/nasrine/data/CRC/Primary_CRC_dataset/final_object/20mt/")
FIG2SAVE = DIR2SAVE / "figures/"

# Create both directories if they do not exist yet.
DIR2SAVE.mkdir(parents=True, exist_ok=True)
FIG2SAVE.mkdir(parents=True, exist_ok=True)

# Point scanpy's global figure directory at FIG2SAVE so all plots are saved there.
sc.settings.figdir = FIG2SAVE

### Read in final annotations (TME + epithelial)

In [3]:
# Load the annotated object saved in DIR2SAVE and show its (cells, genes) shape.
adata_annot = sc.read_h5ad(
    DIR2SAVE / "SMC_KUL_Pelka_Che_Wu_pCRC_annotations.h5ad"
)
adata_annot.shape

(246779, 2000)

In [4]:
# Number of cells per general annotation label.
adata_annot.obs["Annotation_scVI"].value_counts()

T-NK-ILC       94119
Epithelial     60526
Myeloid        36905
Plasma         24526
B              19701
Stromal         5367
Endothelial     3047
Mast            2588
Name: Annotation_scVI, dtype: int64

In [5]:
# List the per-cell metadata columns available in the annotated object.
adata_annot.obs.columns

Index(['Patient', 'Sample', 'Cell_type', 'Cell_subtype', 'Tissue', 'Therapy',
       'doublet_score', 'predicted_doublet', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'cell_source', '_scvi_batch', '_scvi_labels',
       'S_score', 'G2M_score', 'phase', 'cell_cycle_diff', 'leiden_scVI',
       'Annotation_scVI_20mt', 'Annotation_scVI_epi',
       'Annotation_scVI_detailed_epi', 'Annotation_scVI_tme',
       'Annotation_scVI_detailed_tme', 'Annotation_scVI',
       'Annotation_scVI_detailed'],
      dtype='object')

### Read in raw data

In [6]:
# Load the raw-count object from the data-integration step and show its shape.
adata_raw = sc.read_h5ad(
    "/data/BCI-CRC/nasrine/data/CRC/Primary_CRC_dataset/20mt/data_integration/SMC_KUL_Pelka_Che_Wu_CRC_raw.h5ad"
)
adata_raw.shape

(328063, 39609)

In [7]:
# Largest entry of the expression matrix; a large value suggests un-normalised counts.
adata_raw.X.max()  # check data is raw

59020.0

In [8]:
# Densify the top-left 5x5 corner of the matrix for a quick look at the values.
adata_raw.X[:5, :5].todense()

matrix([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]], dtype=float32)

In [9]:
# Display the per-cell metadata table of the raw object.
adata_raw.obs

Unnamed: 0,Patient,Sample,Cell_type,Cell_subtype,Tissue,Therapy,doublet_score,predicted_doublet,n_genes_by_counts,total_counts,total_counts_mt,pct_counts_mt,total_counts_ribo,pct_counts_ribo,cell_source
SMC01-T_AAACCTGCATACGCCG-SMC,SMC01,SMC01-T,Epithelial cells,CMS2,CRC,naive,0.089005,False,4866,38052.0,4917.0,12.921791,8800.0,23.126249,SMC
SMC01-T_AAACCTGGTCGCATAT-SMC,SMC01,SMC01-T,Epithelial cells,CMS2,CRC,naive,0.089005,False,5268,33750.0,2957.0,8.761481,7815.0,23.155556,SMC
SMC01-T_AAACCTGTCCCTTGCA-SMC,SMC01,SMC01-T,Epithelial cells,CMS2,CRC,naive,0.033954,False,1714,7356.0,1450.0,19.711800,2574.0,34.991844,SMC
SMC01-T_AAACGGGAGGGAAACA-SMC,SMC01,SMC01-T,Epithelial cells,CMS2,CRC,naive,0.121019,False,1229,3752.0,358.0,9.541578,1143.0,30.463753,SMC
SMC01-T_AAACGGGGTATAGGTA-SMC,SMC01,SMC01-T,Epithelial cells,CMS2,CRC,naive,0.121019,False,3914,23991.0,4154.0,17.314825,2913.0,12.142054,SMC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P17_Colon_T_TAAGCCATCTATCGCC-1-Wu,Wu2021_P17,P17_Colon_T,B,TCL1A+ Naive B cells,CRC,naive,0.214545,False,515,880.0,14.0,1.590909,250.0,28.409090,Wu
P17_Colon_T_TATACCTTCTAGTTCT-1-Wu,Wu2021_P17,P17_Colon_T,B,TCL1A+ Naive B cells,CRC,naive,0.171779,False,532,1056.0,28.0,2.651515,386.0,36.553032,Wu
P17_Colon_T_TTCTGTAGTGCATTAC-1-Wu,Wu2021_P17,P17_Colon_T,B,AIM2+ Memory B cells,CRC,naive,0.171779,False,757,1686.0,57.0,3.380783,520.0,30.842230,Wu
P19_Colon_T_TCTAACTGTTGCTGAT-1-Wu,Wu2021_P19,P19_Colon_T,B,AIM2+ Memory B cells,CRC,NAC,0.023923,False,642,1457.0,51.0,3.500343,569.0,39.052849,Wu


### Subset the raw data to barcodes that have annotations

In [10]:
# Keep only the cells of the raw object whose barcode is present in the annotated
# object; .copy() turns the resulting view into an independent AnnData.
adata = adata_raw[adata_raw.obs_names.isin(adata_annot.obs_names)].copy()

In [11]:
# Shape after subsetting: the cell count should match the annotated object.
adata.shape

(246779, 39609)

### Append general and detailed annotations (TME + epithelial)

In [12]:
# Attach the general and detailed annotation labels to the raw-count object,
# aligning on the cell-barcode index of both `obs` tables.
annotation_cols = ["Annotation_scVI", "Annotation_scVI_detailed"]

adata.obs = (
    adata.obs
    # Drop any copies left by a previous run of this cell. Without this, a re-run
    # would silently create `*_x` / `*_y` suffixed columns instead.
    .drop(columns=annotation_cols, errors="ignore")
    .merge(
        right=adata_annot.obs[annotation_cols],
        how="left",
        left_index=True,
        right_index=True,
        # Each barcode must appear at most once in the annotation table. A duplicate
        # would multiply rows and break the assignment back to `adata.obs`.
        validate="many_to_one",
    )
)

In [13]:
# Cells per general annotation label after the merge.
adata.obs["Annotation_scVI"].value_counts()

T-NK-ILC       94119
Epithelial     60526
Myeloid        36905
Plasma         24526
B              19701
Stromal         5367
Endothelial     3047
Mast            2588
Name: Annotation_scVI, dtype: int64

In [14]:
# Count cells that were left without a general annotation (expected: 0).
adata.obs["Annotation_scVI"].isna().sum()

0

In [15]:
# Cells per detailed annotation label after the merge.
adata.obs["Annotation_scVI_detailed"].value_counts()

Plasma               24526
B                    19701
CD8 Tex              13812
CD4 Tn               12849
CD8 Tem              10032
Treg                  9746
pEMT                  9603
Stem                  9255
CD4 Th                8345
Treg HSP              7767
SPP1 Mac              7017
Stem (NOTUM high)     6635
C1QC Mac              6449
CD4 Th17              6170
TA1                   5762
Intermediate          5545
T cycling             5200
gdT                   4972
Goblet                4938
Hypoxia               4895
FCN1 Mono             4685
TA2                   4186
CD4 Th HSP            3607
CD4 Tfh               3469
Colonocyte            3069
NK2                   2840
FCGR3A Mono           2697
cDC2                  2664
Mast                  2588
CD8 Tem HSP           2306
Myeloid cycling       2166
PLTP LYVE1 Mac        2150
HLA high              2026
NLRP3 Mac             1887
HSP Mono              1755
ipEMT                 1689
IL1B Mac              1457
N

In [16]:
# Count cells that were left without a detailed annotation (expected: 0).
adata.obs["Annotation_scVI_detailed"].isna().sum()

0

### Save the annotated raw data

In [17]:
# Show the directory the annotated raw object will be written to.
DIR2SAVE

PosixPath('/data/BCI-CRC/nasrine/data/CRC/Primary_CRC_dataset/final_object/20mt')

In [18]:
# Write the raw-count object, now carrying the annotation columns, into DIR2SAVE.
adata.write(DIR2SAVE / "SMC_KUL_Pelka_Che_Wu_pCRC_annotations_raw.h5ad")

In [19]:
# Shape of the object that was just written.
adata.shape

(246779, 39609)