# DataLab bespin functions

In [None]:
from pyprojroot import here
%run {here()}/common_utils.ipynb

## Useful imports

In [None]:
import google.auth
from tqdm.notebook import tqdm
from dask_gateway import Gateway
import gzip
import allel
import numpy.testing
import shutil
import numpy as np

## Determine current conda environment

In [None]:
import os
import re

# Read the active conda prefix from the process environment; a missing
# CONDA_PREFIX variable raises KeyError here.
conda_prefix = os.environ["CONDA_PREFIX"]
print("conda_prefix: ", conda_prefix)

# Rewrite "/home/conda/<a>/envs/<b>" as "<a>/<b>". A prefix that does not
# match the pattern passes through the substitution unchanged.
current_environment = re.compile("/home/conda/(.+)/envs/(.+)").sub(
    "\\1/\\2", conda_prefix
)
print("current_environment: ", current_environment)

## Dask clusters

In [None]:
# Lazily-initialised dask gateway state. All four names are populated
# together by the first successful call to get_gateway().
_gateway = None
_cluster_options = None
_cluster_environments = None
_cluster_profiles = None


def get_gateway():
    """Return the shared dask ``Gateway``, creating it on first use.

    On the first successful call this also caches the gateway's cluster
    options and the available values of its ``conda_environment`` and
    ``profile`` fields in the module-level ``_cluster_options``,
    ``_cluster_environments`` and ``_cluster_profiles`` variables.

    Returns
    -------
    gateway : Gateway
        The cached gateway instance.

    """
    global _gateway
    global _cluster_options
    global _cluster_environments
    global _cluster_profiles
    if _gateway is None:
        # Build everything in locals first: if any step raises, the module
        # state stays fully uninitialised and the next call retries, rather
        # than returning a gateway while the option lists are still None.
        gateway = Gateway()
        options = gateway.cluster_options()
        # NOTE(review): `_fields` looks like a private attribute of the
        # options object - confirm it is stable across dask-gateway versions.
        environments = options._fields['conda_environment'].options
        profiles = options._fields['profile'].options

        # Commit all cached values together.
        _gateway = gateway
        _cluster_options = options
        _cluster_environments = environments
        _cluster_profiles = profiles
    return _gateway


In [None]:
def new_cluster(*, profile="standard", environment=current_environment, n_workers=0):
    """Create a new dask cluster through the gateway.

    By default the cluster uses the same conda environment as the user's
    notebook kernel, so that the two stay matched.

    Parameters
    ----------
    profile : str
        Cluster worker profile; must be one of the profiles offered by
        the gateway.
    environment : str
        Conda environment, defaults to current environment; must be one
        of the environments offered by the gateway.
    n_workers : int
        Value passed to ``cluster.scale`` once the cluster has been created.

    Returns
    -------
    cluster : Cluster
        A dask cluster.

    Raises
    ------
    ValueError
        If `profile` or `environment` is not among the gateway's options.

    """

    # Fetch the gateway first: this is what populates the option lists
    # that the parameter checks below rely on.
    gw = get_gateway()

    # Reject values the gateway does not offer.
    if profile not in _cluster_profiles:
        raise ValueError(f"profile {profile} not available, must be one of {_cluster_profiles}")
    if environment not in _cluster_environments:
        raise ValueError(f"environment {environment} not available, must be one of {_cluster_environments}")

    # Launch the cluster with the requested options.
    print(f"creating cluster with profile {profile} and environment {environment}...")
    requested = dict(conda_environment=environment, profile=profile)
    cluster = gw.new_cluster(**requested)

    # Scale to the requested number of workers.
    cluster.scale(n_workers)

    # Obtain a client to connect to the cluster; the client object itself
    # is not kept here.
    cluster.get_client()

    return cluster


In [None]:
def shutdown_clusters():
    """Shut down every dask cluster that the gateway lists for the user.

    Each cluster's name is printed before it is connected to and shut down.
    """

    gateway = get_gateway()
    for report in gateway.list_clusters():
        print(f"shutting down cluster: {report.name}")
        # Connect by name and shut the cluster down in one step.
        gateway.connect(report.name).shutdown()
        

## GCS initialisation

**N.B., to authenticate for gsutil, open a terminal and run:**

```
conda activate binder-4.2.0
gcloud auth login
```

**N.B., to authenticate for gcsfs, open a terminal and run:**

```
conda activate binder-4.2.0
gcloud auth application-default login
```

The second step (the application-default login) must be completed before running the `init_gcs()` function below, which obtains its credentials via `google.auth.default()`.

In [None]:
def init_gcs(
    *,
    cache_timeout=0,
    requests_timeout=None,
    block_size=None
):
    """Authenticate with Google Cloud and return a GCS filesystem object.

    Credentials and the project are obtained from ``google.auth.default()``.
    The project is checked against the expected project before the
    filesystem is created.

    Parameters
    ----------
    cache_timeout, requests_timeout, block_size
        Passed through unchanged to ``gcsfs.GCSFileSystem``.

    Returns
    -------
    gcs : gcsfs.GCSFileSystem
        Filesystem bound to the default credentials and project.

    Raises
    ------
    AssertionError
        If the default project is not ``'malariagen-jupyterhub'``.

    """
    expected_project = 'malariagen-jupyterhub'

    # authenticate
    credentials, project = google.auth.default()

    # Check the project explicitly rather than with `assert`, which is
    # stripped under `python -O`. AssertionError is kept so that existing
    # callers catching it are unaffected.
    if project != expected_project:
        raise AssertionError(
            f"unexpected default project {project!r}, "
            f"expected {expected_project!r}"
        )

    # NOTE(review): `gcsfs` is not imported in the visible part of this
    # file - presumably it is brought into scope by common_utils.ipynb;
    # confirm.
    gcs = gcsfs.GCSFileSystem(
        token=credentials,
        project=project,
        cache_timeout=cache_timeout,
        requests_timeout=requests_timeout,
        block_size=block_size,
    )

    return gcs