In [27]:
# Import dependencies
%matplotlib inline
import os
import numpy as np
import pandas as pd
import scanpy as sc
import scanpy.external as sce
import seaborn as sns
import anndata
import matplotlib.pyplot as plt
import yaml
import scvi
import ray
import hyperopt
from ray import tune
from scvi import autotune

# Timestamp of this run. `e` is read again further down to name the output
# directory, so the variable name must stay as it is.
import datetime
e = datetime.datetime.now()
print("Current date and time = %s" % e)

# Scanpy settings
# verbosity levels: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 3
# Record the versions of every loaded package alongside the results
sc.logging.print_versions()
sc.set_figure_params(
    dpi=150,
    fontsize=10,
    dpi_save=600,
)

Current date and time = 2024-07-26 11:14:55.086320
-----
anndata     0.9.1
scanpy      1.9.3
-----
PIL                         9.4.0
absl                        NA
aiohttp                     3.9.5
aiohttp_cors                0.7.0
aiosignal                   1.3.1
annotated_types             0.5.0
anyio                       NA
arrow                       1.2.3
asttokens                   NA
async_timeout               4.0.3
attr                        23.1.0
attrs                       23.1.0
babel                       2.14.0
backcall                    0.2.0
backoff                     2.2.1
brotli                      NA
bs4                         4.12.2
certifi                     2024.07.04
cffi                        1.15.1
charset_normalizer          3.2.0
chex                        0.1.83
click                       8.1.5
cloudpickle                 3.0.0
colorama                    0.4.6
comm                        0.1.3
contextlib2                 NA
croniter             

In [28]:
# Set the working directory; every relative path below resolves against it.
# NOTE(review): this is an absolute, machine-specific path - consider moving it
# into the YAML config so the notebook can run elsewhere.
wdir = '/media/prom/apc1/ccohen/chromium/analysis/20240711_Achilles/'
os.chdir(wdir)

# Name the output directory after the run timestamp `e` (taken when the
# settings cell ran), formatted as YYYYMMDD_HH-MM in a single strftime call.
dmyt = e.strftime('%Y%m%d_%H-%M')
directory = f'{dmyt}_integration-scvi.dir'

# folder structure
RESULTS_FOLDERNAME = f'{directory}/results/'
FIGURES_FOLDERNAME = f'{directory}/figures/'

# exist_ok=True replaces the check-then-create pattern: re-running the cell
# within the same minute is a no-op instead of a potential FileExistsError.
os.makedirs(RESULTS_FOLDERNAME, exist_ok=True)
os.makedirs(FIGURES_FOLDERNAME, exist_ok=True)

# Set folder for saving figures into
sc.settings.figdir = FIGURES_FOLDERNAME

print(directory)

20240726_11-14_integration-scvi.dir


In [29]:
# Read in the yml file (resolved relative to the working directory set above).
# The context manager closes the file handle as soon as parsing is done;
# a bare open() inside the call would leave it open until garbage collection.
with open('integration-scvi.yaml') as fh:
    ini = yaml.safe_load(fh)
# Echo the parsed configuration so the run parameters are recorded in the output
print(yaml.safe_dump(ini))

datadir: 20240722_13-47_concat_norm.dir
neighbours:
  n_pcs: 30
variable_genes:
  batch: patient.seqbatch
  flavor: seurat
  n_genes: 5000



Read in the concatenated object.
In the concat_norm script, normalisation and dimensionality reduction were performed, but these are not needed here because we start again from the raw counts.
The only open question is whether to work on the whole object or to subset to highly variable genes (HVGs) and, if so, how many.

In [30]:
# Echo the working directory that relative paths resolve against
wdir

'/media/prom/apc1/ccohen/chromium/analysis/20240711_Achilles/'

In [31]:
# Previous hard-coded location, kept for reference:
# path = os.path.join(wdir, 'concat_norm/results/merged_normalised.h5ad')
# The input directory now comes from the `datadir` key of the YAML config.
# NOTE(review): an earlier comment said "for testing use the subsetted object
# with only 3 samples in it" - confirm whether `datadir` still points at a test subset.
path = os.path.join(wdir, ini['datadir'], 'results/merged_normalised.h5ad')
path

'/media/prom/apc1/ccohen/chromium/analysis/20240711_Achilles/20240722_13-47_concat_norm.dir/results/merged_normalised.h5ad'

In [32]:
# Progress message printed before the AnnData file is read
print('Reading adata object')

Reading adata object


In [33]:
# This will be the unintegrated reference data
# NB for some integration methods, here the data is subsetted to only hvg (see Alina's tutorial)
# Load the merged object from the path built above; the bare name on the last
# line displays its summary (dimensions, obs/var columns, layers).
adata_ref = sc.read_h5ad(path)
adata_ref

AnnData object with n_obs × n_vars = 69589 × 30639
    obs: 'sample', 'sum', 'detected', 'subsets_mito_percent', 'total', 'log10GenesPerUMI', 'patient', 'age', 'sex', 'ethnicity', 'surgical_procedure', 'disease_status', 'anatomical_site', 'affected_side', 'time_to_freezing', 'sequencing_date', 'microanatomical_site', 'seurat_clusters', 'decontX_contamination', 'sizeFactor', 'scDblFinder.class', 'patient.seqbatch', 'n_genes'
    var: 'n_counts', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches', 'mean', 'std'
    uns: 'hvg', 'log1p', 'microanatomical_site_colors', 'neighbors', 'patient_colors', 'pca', 'sample_colors', 'sequencing_date_colors', 'sex_colors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'counts', 'decontX', 'log1p_norm', 'scaled', 'soupX'
    obsp: 'connectivities', 'distances'

In [8]:
# Confirmation message once the read has returned
print('Adata object read successfully')

Adata object read successfully


In [34]:
# scvi works with raw counts
# Overwrite X with an independent copy of the 'counts' layer, so later changes
# to X do not touch the layer itself.
adata_ref.X = adata_ref.layers['counts'].copy()

In [35]:
# make a new object to perform the integration
# Work on a copy so `adata_ref` stays untouched as the unintegrated reference
adata_scvi = adata_ref.copy()


In [36]:
# take a snapshot
# Store the full-gene object in `.raw` before the HVG subsetting in the next cell
adata_scvi.raw = adata_scvi

In [37]:
# Subset to the genes flagged in `var.highly_variable` of the loaded object.
# TODO: add a parameter for this subsetting to be optional.
# NOTE(review): the flag comes from the input file, not from this notebook's
# YAML. The output below shows 7000 genes kept while the YAML lists
# `variable_genes.n_genes: 5000` - confirm which value is intended.
adata_scvi = adata_scvi[:, adata_scvi.var.highly_variable].copy()
adata_scvi


AnnData object with n_obs × n_vars = 69589 × 7000
    obs: 'sample', 'sum', 'detected', 'subsets_mito_percent', 'total', 'log10GenesPerUMI', 'patient', 'age', 'sex', 'ethnicity', 'surgical_procedure', 'disease_status', 'anatomical_site', 'affected_side', 'time_to_freezing', 'sequencing_date', 'microanatomical_site', 'seurat_clusters', 'decontX_contamination', 'sizeFactor', 'scDblFinder.class', 'patient.seqbatch', 'n_genes'
    var: 'n_counts', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches', 'mean', 'std'
    uns: 'hvg', 'log1p', 'microanatomical_site_colors', 'neighbors', 'patient_colors', 'pca', 'sample_colors', 'sequencing_date_colors', 'sex_colors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'counts', 'decontX', 'log1p_norm', 'scaled', 'soupX'
    obsp: 'connectivities', 'distances'

Optimise the scVI model hyperparameters using Ray Tune

In [38]:
# Register the AnnData object with scVI (counts layer as input, one batch per
# `patient.seqbatch` value), then list the parameters that can be tuned.
model_cls = scvi.model.SCVI
model_cls.setup_anndata(
    adata_scvi,
    layer="counts",
    batch_key='patient.seqbatch',
)

scvi_tuner = autotune.ModelTuner(model_cls)
scvi_tuner.info()

In [39]:
# specify which variables will be tested
# Discrete options are sampled with tune.choice; the learning rate is sampled
# log-uniformly between 1e-4 and 1e-2.
# NOTE(review): n_hidden includes 60 next to 128 and 256 - confirm that 64 was not intended.
search_space = {
    "n_latent": tune.choice([10, 30, 50]),
    "n_hidden": tune.choice([60, 128, 256]),
    "n_layers": tune.choice([1, 2, 3]),
    "lr": tune.loguniform(1e-4, 1e-2),
    "gene_likelihood": tune.choice(["nb", "zinb"])
}

In [41]:
# Start a local Ray instance for the tuning run; worker logs are not forwarded
# to the notebook output (log_to_driver=False).
# ignore_reinit_error=True keeps this cell re-runnable: without it, calling
# ray.init() again in a kernel where Ray is already running raises an error.
ray.init(log_to_driver=False, ignore_reinit_error=True)

2024-07-26 11:16:10,633	INFO worker.py:1633 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


0,1
Python version:,3.9.16
Ray version:,2.7.0
Dashboard:,http://127.0.0.1:8265


In [42]:
# Progress message printed before the tuning run starts
print("Performing parameter tuning")

Performing parameter tuning


In [43]:
# run the optimisation
# NOTE(review): no random seed is set in this notebook, so the sampled trials
# and their losses will presumably differ between runs - consider seeding.
# NOTE(review): num_samples / max_epochs / resources are hard-coded here;
# consider moving them into the YAML config with the other run parameters.

results = scvi_tuner.fit(
    adata_scvi,
    metric="validation_loss",  # the lowest value is treated as best later in the notebook
    search_space=search_space,
    searcher='hyperopt',
    num_samples=100,  # presumably the number of configurations sampled - confirm in the scvi autotune docs
    max_epochs=30,  # presumably the per-trial epoch cap - confirm in the scvi autotune docs
    resources={"gpu": 1}
)

0,1
Current time:,2024-07-26 11:35:50
Running for:,00:19:31.81
Memory:,96.8/503.5 GiB

Trial name,status,loc,n_latent,n_hidden,n_layers,lr,gene_likelihood,validation_loss
_trainable_e4088a53,TERMINATED,163.1.64.158:3322867,50,60,1,0.002586,zinb,1509.6
_trainable_49c3d8eb,TERMINATED,163.1.64.158:3323090,10,128,1,0.00556295,zinb,1521.25
_trainable_0b4c1abc,TERMINATED,163.1.64.158:3323090,50,256,1,0.000571577,zinb,1591.58
_trainable_a674ea37,TERMINATED,163.1.64.158:3323090,10,60,1,0.000554989,zinb,1663.55
_trainable_19a3a3f4,TERMINATED,163.1.64.158:3323090,10,128,3,0.00339445,zinb,1552.57
_trainable_292015d4,TERMINATED,163.1.64.158:3323090,30,256,1,0.000104798,nb,1770.82
_trainable_83aab986,TERMINATED,163.1.64.158:3323090,50,128,2,0.000105707,nb,1882.56
_trainable_69861ced,TERMINATED,163.1.64.158:3323090,10,60,1,0.00391437,nb,1591.78
_trainable_22a71d74,TERMINATED,163.1.64.158:3323090,50,60,3,0.000191133,nb,1894.65
_trainable_44693976,TERMINATED,163.1.64.158:3323090,50,256,1,0.00131464,zinb,1508.33


2024-07-26 11:16:18,298	INFO tune.py:645 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949
2024-07-26 11:35:50,130	INFO tune.py:1143 -- Total run time: 1171.83 seconds (1171.79 seconds for the tuning loop).


In [18]:
# Progress messages: tuning has finished and the result cells follow
print("Parameter tuning complete")
print("Results of parameter tuning")

Parameter tuning complete
Results of parameter tuning


We are looking for the parameters that give the lowest validation loss.

In [19]:
# Show the model and training keyword arguments exposed by the tuning result
# (presumably those of the best trial - confirm in the scvi autotune docs)
print(results.model_kwargs)
print(results.train_kwargs)

{'n_latent': 30, 'n_hidden': 60, 'n_layers': 1, 'gene_likelihood': 'zinb'}
{'plan_kwargs': {'lr': 0.0022991134682944915}}


In [20]:
# Find the best trial: scan every result and keep the index of the one with
# the lowest validation loss (the first one wins on ties).
# The search starts from +inf rather than an arbitrary ceiling (previously
# 10000), so it stays correct whatever the scale of the loss.
best_vl = float('inf')
best_i = None
for i, res in enumerate(results.results):
    # A trial that stopped before reporting may lack the metric; skip it
    # instead of failing with a KeyError.
    vl = (res.metrics or {}).get('validation_loss')
    if vl is None:
        continue
    # A NaN loss compares False here, so a diverged trial is never selected.
    if vl < best_vl:
        best_vl = vl
        best_i = i

# Fail loudly rather than silently reporting trial 0 as the winner.
if best_i is None:
    raise ValueError("No trial reported a usable 'validation_loss'")



In [21]:
# Display the position of the best trial within `results.results`
print("Index of optimal parameters")
best_i

Index of optimal parameters


63

In [22]:
# Display the full result record of the best trial
print("Optimal parameters")
results.results[best_i]

Optimal parameters


Result(
  metrics={'validation_loss': 1018.7122192382812},
  path='/media/prom/apc1/ccohen/chromium/analysis/20240711_Achilles/ray/tune_scvi_2024-07-26-10:50:07/_trainable_29df8f7f_64_gene_likelihood=zinb,lr=0.0023,n_hidden=60,n_latent=30,n_layers=1_2024-07-26_10-50-41',
  filesystem='local',
  checkpoint=None
)

In [40]:
# Shut down the Ray instance started for tuning, now that the results have been inspected
ray.shutdown()

In [None]:
# Final marker so logs of a non-interactive run show the notebook reached the end
print("script completed")