In [1]:
import warnings 
warnings.simplefilter('ignore')

import scanpy as sc
import scparadise
import pandas as pd
import numpy as np
from scipy.sparse import csc_matrix
import scvi
import pandas as pd
import os

In [2]:
# Read the expression matrix and transpose it so that the file's columns
# become observations (rows of the AnnData object)
adata = sc.read_text('Mouse_aging_brain/exprMatrix.tsv.gz').T
# Store the matrix in sparse CSC form as float32
adata.X = csc_matrix(adata.X, dtype=np.float32)
# Metadata table; its first column becomes the index
meta = pd.read_csv(
    'Mouse_aging_brain/meta.tsv',
    sep='\t',
    index_col=0,
)
# NOTE(review): meta is attached as-is, without re-aligning on cell names —
# confirm its rows are in the same order as the matrix columns
adata.obs = meta
# Keep a snapshot of the full matrix in .raw before any gene subsetting
adata.raw = adata

In [3]:
# Rank candidate marker genes for every 'Celltype' group.
# pts=True additionally stores the fraction of expressing cells per group,
# which the gene-selection cell reads as pct_nz_group / pct_nz_reference.
sc.tl.rank_genes_groups(
    adata,
    groupby='Celltype',
    method='t-test_overestim_var',
    pts=True,
)

In [4]:
# Apply fold-change and in-group expression-fraction thresholds to the ranked
# markers; the filtered table is stored in adata.uns under its own key
sc.tl.filter_rank_genes_groups(
    adata,
    min_fold_change=1.0,
    min_in_group_fraction=0.4,
    key_added='filtered_rank_genes_groups',
)

In [5]:
# Build the gene panel used for model training.
# For every cell type, take the 50 top filtered markers by log fold change and
# the 50 top markers by the ratio of expressing-cell fractions (group / reference).
lst_l1 = []
for celltype in adata.obs['Celltype'].unique():
    markers = sc.get.rank_genes_groups_df(
        adata,
        group=celltype,
        key='filtered_rank_genes_groups',
        pval_cutoff=0.05,
    )
    markers = markers.assign(
        pts_comparizon=markers['pct_nz_group'] / markers['pct_nz_reference']
    )
    for sort_key in ('logfoldchanges', 'pts_comparizon'):
        ranked = markers.sort_values(by=sort_key, ascending=False)
        lst_l1.extend(ranked.head(50)['names'].tolist())
# Genes picked for several cell types (or by both criteria) are kept once;
# np.unique also returns them sorted
lst_l1 = np.unique(lst_l1).tolist()

In [6]:
# Drop the filtered marker table before saving
# (presumably it cannot be written to .h5ad as-is — TODO confirm).
# pop(..., None) keeps this cell re-runnable: a plain `del` raises KeyError
# once the key has already been removed.
adata.uns.pop('filtered_rank_genes_groups', None)
# Persist the annotated, unintegrated dataset
adata.write_h5ad('Mouse_aging_brain/adata_unintegrated.h5ad')

In [7]:
# Write the gene panel to disk in a reproducibly shuffled order
# (frac=1 keeps every row; random_state fixes the permutation)
(
    pd.DataFrame(lst_l1, columns=['genes'])
    .sample(frac=1, random_state=42)
    .to_csv(
        'Mouse_aging_brain/genes_for_AI.csv',
        index=False,
        header=True,
    )
)

In [8]:
# Restrict the dataset to the saved gene panel, in the order stored in the CSV
genes = pd.read_csv('Mouse_aging_brain/genes_for_AI.csv')
adata = adata[:, genes['genes']].copy()

In [9]:
# Samples ('orig.ident' values) used as the reference set for model training
lst_reference = [
    'old1',
    'oldex1',
    'oldex2',
    'young2',
]

In [10]:
# Training data: only the cells that come from the reference samples
is_reference = adata.obs['orig.ident'].isin(lst_reference)
adata_train = adata[is_reference].copy()

In [11]:
# Balance cell-type representation in the training data; the cell output lists
# which cell types were undersampled and which were oversampled
adata_balanced = scparadise.scnoah.balance(
    adata_train,
    sample='orig.ident',
    celltype_l1='Celltype',
)

Successfully undersampled cell types: EC, MG, AC

Successfully oversampled cell types: SMC, CPC, OPC, PC, MAC, EPC, imNeur, OLG, mNeur, TNC, MNC, NRP, Hb_EC


In [12]:
# Fit the scAdam classifier on the balanced dataset.
# model_name is the same path that is later handed to scadam.predict;
# both listed metrics are reported for every epoch in the training log.
scparadise.scadam.train(
    adata_balanced,
    path='',
    model_name='Mouse_aging_brain_scAdam',
    celltype_l1='celltype_l1',
    eval_metric=['balanced_accuracy', 'accuracy'],
)

Successfully saved genes names for training model

Successfully saved dictionary of dataset annotations

Train dataset contains: 24336 cells, it is 90.0 % of input dataset
Test dataset contains: 2704 cells, it is 10.0 % of input dataset

Accelerator: cuda
Start training
epoch 0  | loss: 2.82444 | train_balanced_accuracy: 0.21294 | train_accuracy: 0.21294 | valid_balanced_accuracy: 0.22189 | valid_accuracy: 0.22189 |  0:00:01s
epoch 1  | loss: 2.18031 | train_balanced_accuracy: 0.45488 | train_accuracy: 0.45488 | valid_balanced_accuracy: 0.47152 | valid_accuracy: 0.47152 |  0:00:02s
epoch 2  | loss: 1.52993 | train_balanced_accuracy: 0.63988 | train_accuracy: 0.63988 | valid_balanced_accuracy: 0.64941 | valid_accuracy: 0.64941 |  0:00:03s
epoch 3  | loss: 1.05558 | train_balanced_accuracy: 0.83954 | train_accuracy: 0.83954 | valid_balanced_accuracy: 0.83173 | valid_accuracy: 0.83173 |  0:00:04s
epoch 4  | loss: 0.7667  | train_balanced_accuracy: 0.89608 | train_accuracy: 0.89608 | valid

In [13]:
# Held-out samples ('orig.ident' values) on which the trained model is evaluated
lst_test = [
    'young4',
    'old2',
    'old4',
    'young1',
    'oldex4',
]

In [18]:
# Evaluate the trained scAdam model on each held-out sample and save one
# classification report per sample in its own sub-folder.
reports_root = 'Mouse_aging_brain/reports_model_Mouse_brain_scAdam_default'
for folder in lst_test:
    # Build the report directory once; forward slashes keep the path identical
    # whether os.path.join uses '\\' or '/'
    report_dir = os.path.join(reports_root, folder).replace("\\", "/")
    # exist_ok=True lets the cell be re-run without raising FileExistsError
    os.makedirs(report_dir, exist_ok=True)
    # Cells belonging to the current test sample only
    adata_test = adata[adata.obs['orig.ident'].isin([folder])].copy()
    # Predict annotation levels using pretrained scadam model
    adata_test = scparadise.scadam.predict(adata_test,
                                           path_model = 'Mouse_aging_brain_scAdam')
    # Create and save classification report of annotation levels
    scparadise.scnoah.report_classif_full(adata_test,
                                          celltype = 'Celltype',
                                          pred_celltype = 'pred_celltype_l1',
                                          report_name = 'report_test_model_scAdam_default_celltype_l1.csv',
                                          save_path = report_dir,
                                          save_report = True)

Successfully loaded list of genes used for training model

Successfully loaded dictionary of dataset annotations

Successfully loaded model

Successfully added predicted celltype_l1 and cell type probabilities
Successfully saved report

Successfully loaded list of genes used for training model

Successfully loaded dictionary of dataset annotations

Successfully loaded model

Successfully added predicted celltype_l1 and cell type probabilities
Successfully saved report

Successfully loaded list of genes used for training model

Successfully loaded dictionary of dataset annotations

Successfully loaded model

Successfully added predicted celltype_l1 and cell type probabilities
Successfully saved report

Successfully loaded list of genes used for training model

Successfully loaded dictionary of dataset annotations

Successfully loaded model

Successfully added predicted celltype_l1 and cell type probabilities
Successfully saved report

Successfully loaded list of genes used for training 

In [7]:
pip list

Package                   Version
------------------------- --------------
absl-py                   2.1.0
adjustText                1.3.0
aiobotocore               2.5.4
aiohappyeyeballs          2.4.2
aiohttp                   3.8.4
aioitertools              0.12.0
aiosignal                 1.3.1
airr                      1.5.1
alembic                   1.13.3
anndata                   0.9.1
annoy                     1.17.3
anyio                     3.6.2
appdirs                   1.4.4
argon2-cffi               23.1.0
argon2-cffi-bindings      21.2.0
array_api_compat          1.8
arrow                     1.2.3
asciitree                 0.3.3
asttokens                 2.4.1
async-lru                 2.0.4
async-timeout             4.0.2
attrs                     23.1.0
awkward                   2.7.1
awkward_cpp               42
babel                     2.16.0
bamnostic                 1.1.10
bbknn                     1.6.0
bcrypt                    4.2.1
beautifulsoup4            