In [1]:
# Import dependencies
%matplotlib inline
import os
import numpy as np
import pandas as pd
import scanpy as sc
import scanpy.external as sce
import seaborn as sns
import anndata
import matplotlib.pyplot as plt
import yaml
import scvi
import ray
import hyperopt
from ray import tune
from scvi import autotune

# Record when this run started; `e` is reused further down to name the output directory.
import datetime
e = datetime.datetime.now()
print("Current date and time = %s" % (e,))

# Scanpy settings
# verbosity levels: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.set_figure_params(
    dpi=150,
    fontsize=10,
    dpi_save=600,
)

  self.seed = seed
  self.dl_pin_memory_gpu_training = (
  jax.tree_util.register_keypaths(data_clz, keypaths)
  jax.tree_util.register_keypaths(data_clz, keypaths)


Current date and time = 2024-08-29 14:14:18.179136
-----
anndata     0.9.1
scanpy      1.9.3
-----
PIL                         9.4.0
absl                        NA
aiohttp                     3.9.5
aiosignal                   1.3.1
annotated_types             0.5.0
anyio                       NA
arrow                       1.2.3
asttokens                   NA
async_timeout               4.0.3
attr                        23.1.0
attrs                       23.1.0
babel                       2.14.0
backcall                    0.2.0
backoff                     2.2.1
brotli                      NA
bs4                         4.12.2
certifi                     2024.07.04
cffi                        1.15.1
charset_normalizer          3.2.0
chex                        0.1.83
click                       8.1.5
cloudpickle                 3.0.0
colorama                    0.4.6
comm                        0.1.3
contextlib2                 NA
croniter                    NA
cycler                  

In [2]:
# set a working directory
# NOTE(review): absolute, machine-specific path -- consider moving it to the yaml config.
wdir = '/media/prom/apc1/ccohen/chromium/analysis/20240711_Achilles/'
os.chdir(wdir)

# create an output directory named after the run's start date and time
# (`e` is the timestamp captured in the first cell), e.g. 20240829_14-14
dmyt = e.strftime('%Y%m%d_%H-%M')
directory = f'{dmyt}_ray_autotune.dir'

# folder structures (relative to wdir because of the chdir above)
RESULTS_FOLDERNAME = f'{directory}/results/'
FIGURES_FOLDERNAME = f'{directory}/figures/'

# exist_ok=True makes this cell safe to re-run and avoids the
# check-then-create race of testing os.path.exists() first
os.makedirs(RESULTS_FOLDERNAME, exist_ok=True)
os.makedirs(FIGURES_FOLDERNAME, exist_ok=True)

# Set folder for saving figures into
sc.settings.figdir = FIGURES_FOLDERNAME

print(directory)

20240829_14-14_ray_autotune.dir


In [3]:
# Read in the yml file
# A context manager closes the file handle as soon as parsing is done,
# instead of leaving it open until garbage collection.
with open('integration-scvi.yaml') as config_file:
    ini = yaml.safe_load(config_file)
# Echo the parsed configuration so it is recorded in the notebook output
print(yaml.safe_dump(ini))

datadir: 20240829_14-04_concat_norm.dir
neighbours:
  n_pcs: 30
variable_genes:
  batch: patient.seqbatch
  flavor: seurat
  hvg_subset: true
  n_genes: 5000



Read in the concatenated object.
In the concat_norm script, normalisation and dimensionality reduction were performed, but neither is needed here because we start again from the raw counts.
The only open question is whether to work on the whole object or to subset to highly variable genes (HVGs) and, if so, how many.

In [4]:
# display the working directory to confirm where relative paths resolve
wdir

'/media/prom/apc1/ccohen/chromium/analysis/20240711_Achilles/'

In [5]:
# Full path to the merged, normalised AnnData file inside the configured data directory
path = os.path.join(
    wdir,
    ini['datadir'],
    'results/merged_normalised.h5ad',
)
path

'/media/prom/apc1/ccohen/chromium/analysis/20240711_Achilles/20240829_14-04_concat_norm.dir/results/merged_normalised.h5ad'

In [6]:
# Load the merged object from disk; this will be the unintegrated reference data.
# NB: some integration methods subset the data to highly variable genes (HVGs)
# at this point (see Alina's tutorial); here the full gene set is loaded.
adata_ref = sc.read_h5ad(path)
adata_ref

AnnData object with n_obs × n_vars = 69476 × 61552
    obs: 'sample', 'sum', 'detected', 'subsets_mito_percent', 'total', 'log10GenesPerUMI', 'patient', 'age', 'sex', 'ethnicity', 'surgical_procedure', 'disease_status', 'anatomical_site', 'affected_side', 'time_to_freezing', 'sequencing_date', 'microanatomical_site', 'seurat_clusters', 'decontX_contamination', 'sizeFactor', 'scDblFinder.class', 'patient.seqbatch'
    var: 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches', 'mean', 'std'
    uns: 'hvg', 'log1p', 'microanatomical_site_colors', 'neighbors', 'patient_colors', 'pca', 'sample_colors', 'sequencing_date_colors', 'sex_colors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'counts', 'decontX', 'log1p_norm', 'scaled', 'soupX'
    obsp: 'connectivities', 'distances'

In [7]:
# keep only the cells whose decontX contamination value is below 0.3
low_contamination = adata_ref.obs['decontX_contamination'] < 0.3
adata_ref = adata_ref[low_contamination, :].copy()
adata_ref

AnnData object with n_obs × n_vars = 67668 × 61552
    obs: 'sample', 'sum', 'detected', 'subsets_mito_percent', 'total', 'log10GenesPerUMI', 'patient', 'age', 'sex', 'ethnicity', 'surgical_procedure', 'disease_status', 'anatomical_site', 'affected_side', 'time_to_freezing', 'sequencing_date', 'microanatomical_site', 'seurat_clusters', 'decontX_contamination', 'sizeFactor', 'scDblFinder.class', 'patient.seqbatch'
    var: 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches', 'mean', 'std'
    uns: 'hvg', 'log1p', 'microanatomical_site_colors', 'neighbors', 'patient_colors', 'pca', 'sample_colors', 'sequencing_date_colors', 'sex_colors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'counts', 'decontX', 'log1p_norm', 'scaled', 'soupX'
    obsp: 'connectivities', 'distances'

In [8]:
# scvi works with raw counts, so overwrite X with a copy of the 'counts' layer
# (X is replaced, so whatever matrix it held before is no longer in X)
adata_ref.X = adata_ref.layers['counts'].copy()

In [9]:
# make a new object to perform the integration
# (an independent copy, so adata_ref stays as the unintegrated reference)
adata_scvi = adata_ref.copy()


In [10]:
# take a snapshot: store the current state (all genes, X = raw counts) in .raw
# before the optional subset to highly variable genes in the next cell
adata_scvi.raw = adata_scvi

In [11]:
# Subset to the highly variable genes (HVGs) when the config requests it.
# `hvg_subset` is parsed from YAML as a bool, so test it directly rather than
# comparing with `== True` (PEP 8 / E712).
if ini['variable_genes']['hvg_subset']:
    # boolean mask over genes taken from the precomputed 'highly_variable' column
    adata_scvi = adata_scvi[:, adata_scvi.var.highly_variable].copy()

adata_scvi

AnnData object with n_obs × n_vars = 67668 × 5000
    obs: 'sample', 'sum', 'detected', 'subsets_mito_percent', 'total', 'log10GenesPerUMI', 'patient', 'age', 'sex', 'ethnicity', 'surgical_procedure', 'disease_status', 'anatomical_site', 'affected_side', 'time_to_freezing', 'sequencing_date', 'microanatomical_site', 'seurat_clusters', 'decontX_contamination', 'sizeFactor', 'scDblFinder.class', 'patient.seqbatch'
    var: 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches', 'mean', 'std'
    uns: 'hvg', 'log1p', 'microanatomical_site_colors', 'neighbors', 'patient_colors', 'pca', 'sample_colors', 'sequencing_date_colors', 'sex_colors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'counts', 'decontX', 'log1p_norm', 'scaled', 'soupX'
    obsp: 'connectivities', 'distances'

Optimise the scVI model using ray

In [12]:
# Register the AnnData object with scVI (counts layer, batch = patient.seqbatch),
# then list the parameters that can be tuned.
model_cls = scvi.model.SCVI
model_cls.setup_anndata(
    adata_scvi,
    layer="counts",
    batch_key='patient.seqbatch',
)

scvi_tuner = autotune.ModelTuner(model_cls)
scvi_tuner.info()

In [13]:
# specify which variables will be tested
# Each key is a tunable parameter name; each value is the Ray Tune sampling
# rule used to draw a value for that parameter in every trial.
search_space = {
    # categorical choices
    "n_latent": tune.choice([10, 30, 50]),
    "n_hidden": tune.choice([60, 128, 256]),
    "n_layers": tune.choice([1, 2, 3]),
    # learning rate sampled log-uniformly between 1e-4 and 1e-2
    "lr": tune.loguniform(1e-4, 1e-2),
    # count likelihood: "nb" or "zinb"
    "gene_likelihood": tune.choice(["nb", "zinb"])
}

In [14]:
# Start a local Ray instance for the tuning run.
# log_to_driver=False: presumably stops worker logs being echoed into the
# notebook output -- confirm against the Ray docs.
ray.init(log_to_driver=False)

2024-08-29 14:15:38,306	INFO worker.py:1633 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


0,1
Python version:,3.9.16
Ray version:,2.7.0
Dashboard:,http://127.0.0.1:8265


In [None]:
# run the optimisation
# Samples 100 configurations from `search_space` with the hyperopt searcher and
# scores each one by its validation loss; max_epochs=30 caps training per trial.
# NOTE(review): `resources={"gpu": 1}` presumably limits the experiment to one
# GPU -- confirm the exact semantics in the scvi-tools autotune docs.

results = scvi_tuner.fit(
    adata_scvi,
    metric="validation_loss",
    search_space=search_space,
    searcher='hyperopt',
    num_samples=100,
    max_epochs=30,
    resources={"gpu": 1}
)

0,1
Current time:,2024-08-29 14:17:36
Running for:,00:01:56.10
Memory:,424.8/503.5 GiB

Trial name,status,loc,n_latent,n_hidden,n_layers,lr,gene_likelihood,validation_loss
_trainable_262559a5,RUNNING,163.1.64.158:2757207,10,60,3,0.00146256,zinb,1080.01
_trainable_2e923c37,RUNNING,163.1.64.158:2757699,30,256,3,0.000742686,zinb,1063.55
_trainable_d2270652,PENDING,,30,60,2,0.00502355,nb,
_trainable_e615d9f2,TERMINATED,163.1.64.158:2757699,30,256,2,0.000209012,zinb,1169.43
_trainable_875eb43c,TERMINATED,163.1.64.158:2757699,50,128,3,0.00396559,nb,1104.04
_trainable_efdbb223,TERMINATED,163.1.64.158:2757699,10,128,3,0.000832249,nb,1168.01


2024-08-29 14:15:40,108	INFO tune.py:645 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


We are looking for the combination of hyperparameters that gives the lowest validation loss.

In [None]:
# Show the model and training keyword arguments stored on the tuning result
# (presumably those of the best trial -- TODO confirm against scvi-tools docs)
print(results.model_kwargs)
print(results.train_kwargs)

In [None]:
# Collect the tuning results into a DataFrame
# (presumably one row per trial -- verify against the Ray Tune docs)
df = results.results.get_dataframe()
df

In [None]:
# Rank trials from lowest to highest validation loss; reset_index() moves each
# trial's original row label into a new 'index' column.
df2 = (
    df
    .sort_values(by='validation_loss')
    .reset_index()
)
df2

In [None]:
print("Index of optimal parameters")
# The first row of the sorted table has the lowest validation loss; its 'index'
# column holds that trial's row label in the unsorted table `df`.
row_number = df2.loc[0, 'index']
row_number

In [None]:
print("Optimal parameters")
# `row_number` is an index *label* of `df` (recovered through reset_index()),
# so look it up by label with .loc. The positional .iloc only gives the same
# row when df happens to have a default 0..n-1 index.
df.loc[row_number]

In [None]:
# Shut down the Ray instance started by ray.init() above
ray.shutdown()