Use the previously trained sequence model to choose the most and least disease-specific sequences from real Covid, HIV, and Healthy repertoires:
- 10 individuals per class
- 1000 sequences per isotype
- PCA’ed to 10 components.

Also filter to known important V genes:
- HIV: V4-34, V4-61, V4-4, V3-20
- Covid19: V1-24, V3-13, V3-9, V3-53

Signal to noise ratios: 25%, 50%, 75%

Recapitulates what logistic regression excels at.

In [1]:
from typing import Any, Dict, List, Optional, Union
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
import pandas as pd
import seaborn as sns
import scanpy as sc
import anndata
import genetools
import gc
import shutil
from pathlib import Path
from collections import defaultdict

import malid.external.genetools_scanpy_helpers
from malid import io
from malid.trained_model_wrappers import SequenceClassifier
from malid import config, helpers, logger
from malid.datamodels import (
    GeneLocus,
    TargetObsColumnEnum,
    SampleWeightStrategy,
    healthy_label,
)

In [2]:
# In an effort to generate more convergent sequence clusters, restrict each disease's
# sequences to V genes known to be important for that disease.
# Structure: gene locus -> disease name -> list of V genes to keep.
# If a disease is not listed for a locus, its sequences are not filtered by V gene.
# Must stay in sync with the lists in the notebook header
# (HIV: V4-34, V4-61, V4-4, V3-20; Covid19: V1-24, V3-13, V3-9, V3-53).
v_gene_filter: Dict[GeneLocus, Dict[str, List[str]]] = {
    GeneLocus.BCR: {
        "HIV": ["IGHV4-34", "IGHV4-61", "IGHV4-4", "IGHV3-20"],
        # Fixed: was "IGHV3-43", which contradicted the documented V3-53.
        "Covid19": ["IGHV1-24", "IGHV3-13", "IGHV3-9", "IGHV3-53"],
    },
    # No V gene filtering for TCR.
    GeneLocus.TCR: {},
}

In [3]:
# Dataset version to sample from. Defaults to the configured version;
# to generate from an older dataset instead, replace it here (e.g. "20220930_expand").
dataset_version = config.DATASET_VERSION

# Rebuild the path configuration so that it points at the chosen dataset version.
config.paths = config.make_paths(
    dataset_version=dataset_version,
    embedder=config.embedder,
)

In [4]:
def copy_metadata(destination_dir: Path):
    """Copy each entry of the dataset-specific metadata directory into destination_dir.

    The destination directory (and any missing parents) is created first.
    The source path of every copied entry is printed after it has been copied.
    """
    destination_dir.mkdir(parents=True, exist_ok=True)
    metadata_entries = config.paths.dataset_specific_metadata.glob("*")
    for source_path in metadata_entries:
        shutil.copy2(source_path, destination_dir)
        print(source_path)

In [5]:
def _sample_one_specimen(
    adata_specimen: anndata.AnnData,
    specimen_label: str,
    gene_locus: GeneLocus,
    disease: str,
    clf: SequenceClassifier,
    n_sequences_per_patient_per_isotype: int,
    fraction_disease_specific: float,
) -> Union[Dict[str, List[np.ndarray]], None]:
    """Select a subset of obs names from one specimen's anndata for one gene locus.

    For every isotype group of the specimen, sequences are scored with ``clf`` and
    exactly ``n_sequences_per_patient_per_isotype`` obs names are chosen:

    - if ``disease`` is the healthy label: a random sample (fixed seed), stored under "healthy";
    - otherwise: the lowest-scoring sequences for ``disease`` are stored under "not_disease"
      and the highest-scoring ones under "true_disease". ``fraction_disease_specific``
      (expected to lie in [0, 1]) sets the share that goes to "true_disease".

    Returns:
        A dict mapping sequence identity ("healthy", "not_disease", "true_disease")
        to a list with one array of obs names per isotype,
        or None if the specimen is missing an isotype or has too few sequences in any isotype.

    Raises:
        ValueError: if any predicted probability for ``disease`` is NaN.
    """
    # We are already guaranteed, by the construction of the original dataset, that all isotypes are present for this specimen,
    # but their counts aren't guaranteed.
    # Just in case, we will sanity check that all isotypes are present:
    if set(adata_specimen.obs["isotype_supergroup"].unique()) != set(
        helpers.isotype_groups_kept[gene_locus]
    ):
        logger.warning(
            f"Specimen {specimen_label} from disease {disease} missing some isotypes altogether for {gene_locus} - skipping specimen"
        )
        return None

    # Decide the disease-specific / non-specific split once, up front.
    # Compute one count by rounding and derive the other as the remainder, so the two always
    # sum to exactly n_sequences_per_patient_per_isotype. Truncating each product independently
    # loses a sequence to floating point error: e.g. int((1 - 0.9) * 50) == 4, not 5.
    n_true_disease = int(
        round(fraction_disease_specific * n_sequences_per_patient_per_isotype)
    )
    n_not_disease = n_sequences_per_patient_per_isotype - n_true_disease

    obs_names_to_keep_for_this_specimen = defaultdict(list)

    for isotype, adata_subset in helpers.anndata_groupby_obs(
        adata_specimen, "isotype_supergroup", observed=False
    ):
        if adata_subset.shape[0] < n_sequences_per_patient_per_isotype:
            logger.warning(
                f"Specimen {specimen_label}, isotype {isotype}, disease {disease}, {gene_locus}: only had {adata_subset.shape[0]} sequences - skipping specimen"
            )
            # stop looking at this patient - don't include any isotypes staged so far
            return None

        # score the sequences - get predicted probabilities for this disease
        # doesn't matter if we use adjusted decision thresholds because that is just reweighting the entire class by a factor. rankings within the class won't change (unless we renormalize the rows)
        featurized = clf.featurize(adata_subset)
        scores = pd.DataFrame(
            clf.predict_proba(
                featurized.X,
            ),
            index=featurized.sample_names,
            columns=clf.classes_,
        )
        # pull out sequence probability for this disease - still indexed by obsname
        scores = scores[disease]
        if scores.isna().any():
            raise ValueError(f"Specimen {specimen_label}: disease_pr contains NaN")

        # get obsnames in ascending order of predicted probability for this disease
        sorted_order = scores.sort_values().index.to_series()

        if disease == healthy_label:
            # take a random smattering of n_sequences_per_patient_per_isotype sequences
            obs_names_to_keep_for_this_specimen["healthy"].append(
                sorted_order.sample(
                    n=n_sequences_per_patient_per_isotype, random_state=0
                ).values
            )
        else:
            # take from top of the ascending order: lowest scores (unlikely to be disease specific)
            obs_names_to_keep_for_this_specimen["not_disease"].append(
                sorted_order.head(n=n_not_disease).values
            )
            # take from bottom of the ascending order: highest scores (likely to be disease specific)
            obs_names_to_keep_for_this_specimen["true_disease"].append(
                sorted_order.tail(n=n_true_disease).values
            )

    return obs_names_to_keep_for_this_specimen

In [6]:
def sample_from_fold(
    fold_id: int,
    fold_label: str,
    gene_loci: GeneLocus,
    n_specimens_per_disease: int,
    n_sequences_per_patient_per_isotype: int,
    fraction_disease_specific: float,
    diseases_kept: Optional[List[str]] = None,
    v_genes_kept: Optional[Dict[GeneLocus, Dict[str, List[str]]]] = None,
) -> Dict[GeneLocus, anndata.AnnData]:
    """
    Sample from fold for all gene loci simultaneously,
    so that we have matching specimen lists for all gene loci.
    (in other words, a specimen must pass the relevant BCR *and* TCR filters to be included in either)

    Args:
        fold_id, fold_label: identify which fold embeddings to load.
        gene_loci: gene loci to iterate over; one anndata and one sequence classifier are loaded per locus.
        n_specimens_per_disease: exact number of specimens (from distinct participants) to keep for each disease.
        n_sequences_per_patient_per_isotype: forwarded to _sample_one_specimen.
        fraction_disease_specific: forwarded to _sample_one_specimen.
        diseases_kept: if provided, only sequences whose obs "disease" is in this list are considered.
        v_genes_kept: optional gene locus -> disease -> V gene list. Where an entry exists
            for a (locus, disease) pair, that disease's sequences are restricted to those V genes.

    Returns:
        Dict from gene locus to an anndata holding only the sampled sequences, taken from ``.raw``,
        with a categorical obs column "sequence_identity_is_true_disease" recording the identity
        each sequence was sampled under, and with ``.obsm`` removed.

    Raises:
        ValueError: if obs names are duplicated, if the disease lists differ between gene loci,
            if a disease is not among the classifier's classes, or if the number of specimens
            selected for a disease does not equal n_specimens_per_disease.
    """
    # Load data
    adatas: Dict[GeneLocus, anndata.AnnData] = {}
    clfs: Dict[GeneLocus, SequenceClassifier] = {}
    # Disease list is taken from the first gene locus loaded, then checked against the others
    diseases = None

    for gene_locus in gene_loci:
        adata = io.load_fold_embeddings(
            fold_id=fold_id,
            fold_label=fold_label,
            gene_locus=gene_locus,
            # Require that all participants have all demographic columns defined
            target_obs_column=TargetObsColumnEnum.disease_all_demographics_present,
            sample_weight_strategy=SampleWeightStrategy.ISOTYPE_USAGE,
        )

        if diseases_kept is not None:
            adata = adata[adata.obs["disease"].isin(diseases_kept)]

        # obs names are used below as the selection key, so they must be unique
        if adata.obs_names.duplicated().any():
            raise ValueError("obs_names had dupes")

        # Get diseases list and confirm all anndatas match it
        if diseases is None:
            diseases = adata.obs["disease"].unique()
        else:
            if set(diseases) != set(adata.obs["disease"].unique()):
                raise ValueError("Disease list mismatch between anndatas.")

        # Keep our own copy, then drop the loader's cache to free memory
        adatas[gene_locus] = adata.copy()
        del adata
        io.clear_cached_fold_embeddings()
        gc.collect()

        # load individual sequence classifier
        # (always the one trained on train_smaller, regardless of which fold_label is being sampled)
        clf = SequenceClassifier(
            fold_id=fold_id,
            model_name_sequence_disease="lasso_multiclass",
            fold_label_train="train_smaller",
            gene_locus=gene_locus,
            # Match above
            target_obs_column=TargetObsColumnEnum.disease_all_demographics_present,
            sample_weight_strategy=SampleWeightStrategy.ISOTYPE_USAGE,
        )
        clfs[gene_locus] = clf

        if not set(diseases) <= set(clf.classes_):
            # all diseases should be in the classifier's classes
            raise ValueError(
                f"Disease list {diseases} should be a subset of (or equal to) clf classes {clf.classes_}"
            )

    # Accumulator: gene locus -> sequence identity -> list of obs name arrays (one per specimen and isotype)
    obs_names_to_keep_for_all_specimens_by_gene_locus: Dict[
        GeneLocus, Dict[str, List[np.ndarray]]
    ] = {gene_locus: defaultdict(list) for gene_locus in gene_loci}

    # Sample from each disease.
    for disease in diseases:
        # For each gene locus,
        # limit the full anndata to sequences originating from that disease's patients and with particular V genes
        adatas_filtered: Dict[GeneLocus, anndata.AnnData] = {}
        for gene_locus, adata in adatas.items():
            adatas_filtered[gene_locus] = adata[adata.obs["disease"] == disease]
            # Only apply the V gene filter if one is configured for this locus and this disease
            if (
                v_genes_kept is not None
                and v_genes_kept.get(gene_locus) is not None
                and disease in v_genes_kept[gene_locus]
            ):
                adatas_filtered[gene_locus] = adatas_filtered[gene_locus][
                    adatas_filtered[gene_locus]
                    .obs["v_gene"]
                    .isin(v_genes_kept[gene_locus][disease])
                ]

        # track how many specimens we successfully included already
        n_specimens_kept_from_this_disease = 0
        # track which participants these specimens came from
        participants_represented = set()

        # Get participant+specimen list from first anndata (should match the rest)
        first_anndata = next(iter(adatas_filtered.values()))
        specimen_list = (
            first_anndata.obs[["participant_label", "specimen_label"]]
            .drop_duplicates()
            .values
        )

        # Sample a certain number of sequences per isotype (so we get all isotypes)
        for (participant_label, specimen_label) in specimen_list:
            if n_specimens_kept_from_this_disease >= n_specimens_per_disease:
                # we have enough patients already, stop looking at more specimens to add
                break

            if participant_label in participants_represented:
                logger.warning(
                    f"Specimen {specimen_label} from disease {disease} will be skipped because we already have another specimen from same participant {participant_label}"
                )
                # skip to next specimen
                continue

            # Find which obsnames we'd sample from this specimen in each gene locus dataset
            # (a value of None means the specimen failed the filters for that locus)
            obs_names_to_keep_for_this_specimen_by_locus: Dict[
                GeneLocus, Union[Dict[str, List[np.ndarray]], None]
            ] = {
                gene_locus: _sample_one_specimen(
                    adata_specimen=adatas_filtered[gene_locus][
                        adatas_filtered[gene_locus].obs["specimen_label"]
                        == specimen_label
                    ],
                    specimen_label=specimen_label,
                    gene_locus=gene_locus,
                    disease=disease,
                    clf=clfs[gene_locus],
                    n_sequences_per_patient_per_isotype=n_sequences_per_patient_per_isotype,
                    fraction_disease_specific=fraction_disease_specific,
                )
                for gene_locus in gene_loci
            }

            # Confirm that this specimen passed filters in each gene locus
            if any(
                obsnames_in_one_locus is None
                for obsnames_in_one_locus in obs_names_to_keep_for_this_specimen_by_locus.values()
            ):
                # Not all isotypes were sampled
                # Skip this specimen for all isotypes and gene loci,
                # so we don't have any specimens missing some isotypes (leads to NaNs in model1 feature matrix)
                # or missing in some gene loci (breaks metamodel).
                logger.warning(
                    f"Specimen {specimen_label} from participant {participant_label}, disease {disease} will not be included due to missing/incomplete isotypes (in one or more gene loci)."
                )
            else:
                # Specimen accepted: merge its per-locus selections into the overall accumulator
                for (
                    gene_locus,
                    obsnames_in_one_locus,
                ) in obs_names_to_keep_for_this_specimen_by_locus.items():
                    for (
                        sequence_identity,
                        sequence_obsnames,
                    ) in obsnames_in_one_locus.items():
                        obs_names_to_keep_for_all_specimens_by_gene_locus[gene_locus][
                            sequence_identity
                        ].extend(sequence_obsnames)
                n_specimens_kept_from_this_disease += 1
                participants_represented.add(participant_label)
                logger.info(
                    f"Added specimen {specimen_label} from participant {participant_label}, disease {disease}"
                )

        if n_specimens_kept_from_this_disease != n_specimens_per_disease:
            # Confirm we got the right amount of specimens
            raise ValueError(
                f"We selected only {n_specimens_kept_from_this_disease} specimens from {disease}, rather than desired {n_specimens_per_disease} - fold {fold_id}-{fold_label}, {gene_loci}"
            )
        del adatas_filtered
        gc.collect()

    # Actually perform the sampling and arrive at resulting anndatas
    returned_anndatas: Dict[GeneLocus, anndata.AnnData] = {}
    for gene_locus, adata in adatas.items():
        obs_names_to_keep: Dict[
            str, List[np.ndarray]
        ] = obs_names_to_keep_for_all_specimens_by_gene_locus[gene_locus]

        # Flatten lists of indices from all participants (still separated by healthy/not-disease/true-disease sequence identities)
        # NOTE(review): np.array(...).ravel() relies on every array within one identity having the same length - confirm
        obs_names_to_keep_flattened: Dict[str, np.ndarray] = {
            sequence_identity: np.array(
                list_of_np_arrays_for_one_sequence_identity
            ).ravel()
            for sequence_identity, list_of_np_arrays_for_one_sequence_identity in obs_names_to_keep.items()
        }

        # Combine all indices across all sequence identities
        all_obs_names_to_keep = np.hstack(list(obs_names_to_keep_flattened.values()))

        # Return adata at selected indices (across all identities)
        # and undo any scaling
        adata_export = adata[all_obs_names_to_keep, :].raw.to_adata()
        del adata
        gc.collect()

        # Mark identities in obs
        # Start with an empty categorical column whose categories are the identities, then fill it in per identity
        adata_export.obs["sequence_identity_is_true_disease"] = pd.Series(
            dtype=pd.CategoricalDtype(categories=obs_names_to_keep_flattened.keys())
        )
        for sequence_identity, obsnames in obs_names_to_keep_flattened.items():
            adata_export.obs.loc[
                obsnames, "sequence_identity_is_true_disease"
            ] = sequence_identity

        # remove unused labels, if these variables are Categoricals
        adata_export.obs["participant_label"] = (
            adata_export.obs["participant_label"]
            .astype("category")
            .cat.remove_unused_categories()
        )
        adata_export.obs["specimen_label"] = (
            adata_export.obs["specimen_label"]
            .astype("category")
            .cat.remove_unused_categories()
        )

        # no need to pass old PCA info along
        del adata_export.obsm

        returned_anndatas[gene_locus] = adata_export

    del adatas
    gc.collect()
    return returned_anndatas

In [7]:
def run(
    output_dir_anndatas: Path,
    gene_loci: GeneLocus,
    n_specimens_per_disease: int,
    n_sequences_per_patient_per_isotype: int,
    fraction_disease_specific: float,
    scale_data=False,
    store_raw_pre_scaling=True,
    pca_n_comps: Optional[int] = None,
    diseases_kept: Optional[List[str]] = None,
    v_genes_kept: Optional[Dict[GeneLocus, Dict[str, List[str]]]] = None,
    write_csvs=False,
    include_global_fold=True,
):
    """Generate and write simulated anndatas for every fold, fold label, and gene locus.

    For each fold ID and each of the fold labels "train_smaller", "validation", "test"
    (no "test" for fold -1), samples sequences with sample_from_fold, optionally scales
    and/or PCA-reduces them, trims obs columns and uns keys, and writes the result to
    ``output_dir_anndatas / <gene locus name> / fold.<fold_id>.<fold_label>.h5ad``.

    Args:
        output_dir_anndatas: base output directory; one subdirectory is created per gene locus.
        gene_loci: gene loci to process.
        n_specimens_per_disease, n_sequences_per_patient_per_isotype,
        fraction_disease_specific, diseases_kept, v_genes_kept: forwarded to sample_from_fold.
        scale_data: if True, scale each anndata with scale_anndata.
        store_raw_pre_scaling: passed as ``set_raw`` to scale_anndata (only used when scale_data is True).
        pca_n_comps: if not None, run PCA with this many components and replace .X with the PCA coordinates.
        write_csvs: if True, additionally write obs and X as gzipped TSV files next to the h5ad file.
        include_global_fold: if True, iterate over config.all_fold_ids; otherwise over
            config.cross_validation_fold_ids.

    Raises:
        ValueError: if PCA was requested but the resulting matrix does not have pca_n_comps columns.
        AssertionError: if the "disease" or "sequence_identity_is_true_disease" obs column
            is missing after dropping columns.
    """
    for fold_id in (
        config.all_fold_ids if include_global_fold else config.cross_validation_fold_ids
    ):
        # These transformations will be fit on train_smaller set and applied to others
        # so they start as None and then will be replaced.
        # indexed by gene_locus - i.e. the transformations are different for each gene locus (because coming from different language models)
        scale_transformers: Dict[GeneLocus, Any] = {
            gene_locus: None for gene_locus in gene_loci
        }
        pca_transformers: Dict[GeneLocus, Any] = {
            gene_locus: None for gene_locus in gene_loci
        }

        # train_smaller must come first, so that the transformers returned for it are the ones reused below
        for fold_label in ["train_smaller", "validation", "test"]:
            if fold_id == -1 and fold_label == "test":
                # skip: global fold does not have a test set
                continue

            # Sample from fold for all gene loci simultaneously,
            # so that we have matching specimen lists for all gene loci.
            # (in other words, a specimen must pass the relevant BCR *and* TCR filters to be included in either)
            adatas_sampled: Dict[GeneLocus, anndata.AnnData] = sample_from_fold(
                fold_id=fold_id,
                fold_label=fold_label,
                gene_loci=gene_loci,
                n_specimens_per_disease=n_specimens_per_disease,
                n_sequences_per_patient_per_isotype=n_sequences_per_patient_per_isotype,
                fraction_disease_specific=fraction_disease_specific,
                diseases_kept=diseases_kept,
                v_genes_kept=v_genes_kept,
            )

            # Now scale, PCA, and export the data separately for each gene locus (because coming from different language models).
            for gene_locus in gene_loci:
                output_dir_anndatas_for_gene_locus = (
                    output_dir_anndatas / gene_locus.name
                )
                output_dir_anndatas_for_gene_locus.mkdir(exist_ok=True, parents=True)

                fname_out = (
                    output_dir_anndatas_for_gene_locus
                    / f"fold.{fold_id}.{fold_label}.h5ad"
                )
                logger.info(f"Fold {fold_id}-{fold_label}, {gene_locus} -> {fname_out}")

                if scale_data:
                    # Scale inplace and set raw (if requested)
                    # Use transformer if available (starts as None)
                    # The returned transformer is stored so later fold labels reuse it
                    (
                        adatas_sampled[gene_locus],
                        scale_transformers[gene_locus],
                    ) = malid.external.genetools_scanpy_helpers.scale_anndata(
                        adatas_sampled[gene_locus],
                        scale_transformer=scale_transformers[gene_locus],
                        inplace=True,
                        set_raw=store_raw_pre_scaling,
                    )

                # that N x 1900 matrix is too big. let's save a PCA'ed version for our tests.
                if pca_n_comps is not None:
                    # PCA inplace
                    # Use transformer if available (starts as None)
                    # The returned transformer is stored so later fold labels reuse it
                    (
                        adatas_sampled[gene_locus],
                        pca_transformers[gene_locus],
                    ) = malid.external.genetools_scanpy_helpers.pca_anndata(
                        adatas_sampled[gene_locus],
                        pca_transformer=pca_transformers[gene_locus],
                        n_components=pca_n_comps,
                        inplace=True,
                    )
                    # Replace .X with X_pca
                    # (the new anndata carries over only obs and uns)
                    adatas_sampled[gene_locus] = anndata.AnnData(
                        X=adatas_sampled[gene_locus].obsm["X_pca"],
                        obs=adatas_sampled[gene_locus].obs,
                        uns=adatas_sampled[gene_locus].uns,
                    )
                    if adatas_sampled[gene_locus].shape[1] != pca_n_comps:
                        raise ValueError(
                            "PCA did not produce the expected number of components"
                        )

                # Some columns like "cmv" may be all NaN in this simulated dataset.
                # This can lead to an anndata / h5py bug:
                # "TypeError: Can't implicitly convert non-string objects to strings
                # Above error raised while writing key 'cmv' of <class 'h5py._hl.group.Group'> to /"
                # This seems caused by adatas_sampled[gene_locus].obs['cmv'].dtype being dtype('O') instead of dtype("float64").
                # We can cast to float:
                #         for col in adatas_sampled[gene_locus].obs.columns:
                #             if adatas_sampled[gene_locus].obs[col].isna().all():
                #                 adatas_sampled[gene_locus].obs[col] = adatas_sampled[gene_locus].obs[col].astype("float")

                # Reduce disk space usage by removing unnecessary obs columns
                # Drops every column not listed in uns["original_obs_columns"] (except the identity column),
                # plus the explicitly listed columns below; errors="ignore" tolerates columns that are absent.
                adatas_sampled[gene_locus].obs.drop(
                    columns=list(
                        set(adatas_sampled[gene_locus].obs.columns)
                        - set(
                            adatas_sampled[gene_locus].uns.get(
                                "original_obs_columns", []
                            )
                        )
                        # do not delete this column
                        - {"sequence_identity_is_true_disease"}
                    )
                    + [
                        "num_reads",
                        "total_clone_num_reads",
                        "num_clone_members",
                        "cdr1_seq_aa_q_trim",
                        "cdr2_seq_aa_q_trim",
                        "extracted_isotype",
                        "igh_or_tcrb_clone_id",
                        "cdr3_aa_sequence_trim_len",
                        "disease_subtype",
                    ],
                    errors="ignore",
                    inplace=True,
                )
                # Sanity check: make sure we did not drop these columns
                assert "disease" in adatas_sampled[gene_locus].obs.columns
                assert (
                    "sequence_identity_is_true_disease"
                    in adatas_sampled[gene_locus].obs.columns
                )

                # Also remove any uns keys that were added after the original read-from-disk step within load_fold_embeddings
                for key in set(adatas_sampled[gene_locus].uns.keys()) - set(
                    adatas_sampled[gene_locus].uns.get("original_uns_keys", [])
                ):
                    del adatas_sampled[gene_locus].uns[key]

                # Also remove large string index
                # this is a RangeIndex, but after reading back in, these will become strings automatically (ImplicitModificationWarning: Transforming to str index.)
                adatas_sampled[gene_locus].obs_names = range(
                    adatas_sampled[gene_locus].shape[0]
                )

                # Save some space on this field too
                adatas_sampled[gene_locus].obs["v_mut"] = (
                    adatas_sampled[gene_locus].obs["v_mut"].astype(np.float32)
                )

                # Write to disk
                adatas_sampled[gene_locus].write(fname_out, compression="gzip")
                if write_csvs:
                    # Optional plain-text exports: obs table (without index) and X matrix, both tab-separated
                    adatas_sampled[gene_locus].obs.to_csv(
                        output_dir_anndatas_for_gene_locus
                        / f"fold.{fold_id}.{fold_label}.obs.tsv.gz",
                        index=None,
                        sep="\t",
                    )
                    np.savetxt(
                        output_dir_anndatas_for_gene_locus
                        / f"fold.{fold_id}.{fold_label}.X.tsv.gz",
                        adatas_sampled[gene_locus].X,
                        fmt="%0.4f",
                        delimiter="\t",
                    )

            # Free memory before moving on to the next fold label
            io.clear_cached_fold_embeddings()
            gc.collect()

# Generate small simulation dataset for end-to-end test

In [8]:
# Mirror the dataset-specific metadata files into the test snapshot directory.
copy_metadata(
    destination_dir=config.paths.tests_snapshot_dir / "dataset_specific_metadata"
)

# Generate the small simulated dataset used by the end-to-end automated tests.
run(
    # Where to write, and what to cover
    output_dir_anndatas=config.paths.tests_snapshot_dir / "scaled_anndatas_dir",
    gene_loci=config.gene_loci_used,
    include_global_fold=True,
    write_csvs=False,
    # Which sequences are eligible
    diseases_kept=[healthy_label, "HIV", "Covid19"],
    # To encourage more convergent sequence clusters, restrict to V genes important to each disease
    # (a disease that is not listed in the filter is left unfiltered)
    v_genes_kept=v_gene_filter,
    # How much to sample in each fold
    n_specimens_per_disease=3,
    # Required number of sequences _after_ V gene filtering; kept small because this only feeds automated end-to-end tests
    n_sequences_per_patient_per_isotype=50,
    # Signal to noise ratio
    fraction_disease_specific=0.9,
    # Transformations
    # No scaling, to save space: the test suite scales afterwards (independently for each fold label)
    scale_data=False,
    # Reduce dimensions for testing. Strictly we would scale before PCA, but it does not matter for this test dataset
    pca_n_comps=2,
)

2022-11-28 22:58:05,651 - malid.external.scratch_cache - INFO - Reading network file from local machine cache: /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/embedded/unirep_fine_tuned/anndatas_scaled/BCR/fold.0.train_smaller.h5ad -> /srv/scratch/maximz/cache/66129a966816aa6a85c118c3dc6d9c0f23b3578351875decfa05839b.0.train_smaller.h5ad


/users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/metadata/all_v_genes.in_order.TCR.txt
/users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/metadata/cross_validation_divisions.participants.tsv
/users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/metadata/all_j_genes.in_order.TCR.txt
/users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/metadata/computed_metadata_for_confounder_model.all.tsv
/users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/metadata/computed_metadata_for_confounder_model.cmv.tsv
/users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/metadata/participant_specimen_disease_map.tsv
/users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/metadata/specimens_kept_in_embedding_anndatas.tsv
/users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/metadata/all_v_genes.in_order.BCR.txt
/

Only considering the two last: ['.train_smaller', '.h5ad'].


2022-11-28 23:00:06,565 - malid.external.scratch_cache - INFO - Reading network file from local machine cache: /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/embedded/unirep_fine_tuned/anndatas_scaled/TCR/fold.0.train_smaller.h5ad -> /srv/scratch/maximz/cache/487fc65b6188e0f971e85fa94bb74f9031090123a696d2e276e9ac45.0.train_smaller.h5ad


Only considering the two last: ['.train_smaller', '.h5ad'].


Only considering the two last: ['.train_smaller', '.h5ad'].


2022-11-28 23:02:52,882 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S033 from participant BFI-0000255, disease HIV


2022-11-28 23:02:54,254 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S038 from participant BFI-0000256, disease HIV


2022-11-28 23:02:55,588 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S043 from participant BFI-0002856, disease HIV


2022-11-28 23:03:10,413 - generate_simulation_datasets.ipynb - INFO - Added specimen M124-S036 from participant BFI-0002862, disease Healthy/Background


2022-11-28 23:03:11,459 - generate_simulation_datasets.ipynb - INFO - Added specimen M124-S038 from participant BFI-0002866, disease Healthy/Background


2022-11-28 23:03:12,871 - generate_simulation_datasets.ipynb - INFO - Added specimen M124-S039 from participant BFI-0002867, disease Healthy/Background






























2022-11-28 23:03:16,901 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S192 from participant BFI-0009120, disease Covid19


2022-11-28 23:03:17,289 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S193 from participant BFI-0009121, disease Covid19


2022-11-28 23:03:17,629 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S001 from participant BFI-0009122, disease Covid19


  return anndata.AnnData(



  return anndata.AnnData(



2022-11-28 23:03:19,196 - generate_simulation_datasets.ipynb - INFO - Fold 0-train_smaller, GeneLocus.BCR -> /users/maximz/code/boyd-immune-repertoire-classification/tests/snapshot/scaled_anndatas_dir/BCR/fold.0.train_smaller.h5ad


2022-11-28 23:03:19,302 - generate_simulation_datasets.ipynb - INFO - Fold 0-train_smaller, GeneLocus.TCR -> /users/maximz/code/boyd-immune-repertoire-classification/tests/snapshot/scaled_anndatas_dir/TCR/fold.0.train_smaller.h5ad


2022-11-28 23:03:19,869 - malid.external.scratch_cache - INFO - Reading network file from local machine cache: /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/embedded/unirep_fine_tuned/anndatas_scaled/BCR/fold.0.validation.h5ad -> /srv/scratch/maximz/cache/61639fd8147102437b0e19c5e4520775cb48d0caab85eee5476f275d.0.validation.h5ad


Only considering the two last: ['.validation', '.h5ad'].


Only considering the two last: ['.validation', '.h5ad'].


2022-11-28 23:04:20,652 - malid.external.scratch_cache - INFO - Reading network file from local machine cache: /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/embedded/unirep_fine_tuned/anndatas_scaled/TCR/fold.0.validation.h5ad -> /srv/scratch/maximz/cache/e6f0b873b2378a089722f0cfcc98363feb98db65d721b93faf8a1581.0.validation.h5ad


Only considering the two last: ['.validation', '.h5ad'].


Only considering the two last: ['.validation', '.h5ad'].


2022-11-28 23:05:33,394 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S018 from participant BFI-0002857, disease HIV


2022-11-28 23:05:34,479 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S040 from participant BFI-0002864, disease HIV


2022-11-28 23:05:35,518 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S035 from participant BFI-0002870, disease HIV


2022-11-28 23:05:41,779 - generate_simulation_datasets.ipynb - INFO - Added specimen M124-S040 from participant BFI-0002868, disease Healthy/Background


2022-11-28 23:05:43,371 - generate_simulation_datasets.ipynb - INFO - Added specimen M64-010 from participant BFI-0003059, disease Healthy/Background


2022-11-28 23:05:44,503 - generate_simulation_datasets.ipynb - INFO - Added specimen M64-016 from participant BFI-0003065, disease Healthy/Background






2022-11-28 23:05:45,590 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S094 from participant BFI-0009047, disease Covid19


2022-11-28 23:05:46,098 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S095 from participant BFI-0009048, disease Covid19






2022-11-28 23:05:46,800 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S162 from participant BFI-0009093, disease Covid19


  return anndata.AnnData(



  return anndata.AnnData(



2022-11-28 23:05:47,521 - generate_simulation_datasets.ipynb - INFO - Fold 0-validation, GeneLocus.BCR -> /users/maximz/code/boyd-immune-repertoire-classification/tests/snapshot/scaled_anndatas_dir/BCR/fold.0.validation.h5ad


2022-11-28 23:05:47,567 - generate_simulation_datasets.ipynb - INFO - Fold 0-validation, GeneLocus.TCR -> /users/maximz/code/boyd-immune-repertoire-classification/tests/snapshot/scaled_anndatas_dir/TCR/fold.0.validation.h5ad


2022-11-28 23:05:47,942 - malid.external.scratch_cache - INFO - Reading network file from local machine cache: /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/embedded/unirep_fine_tuned/anndatas_scaled/BCR/fold.0.test.h5ad -> /srv/scratch/maximz/cache/b9e20d7e814fcc016975bc3fd0baf4c5f4757dea4896be9b7d88e627.0.test.h5ad


Only considering the two last: ['.test', '.h5ad'].


Only considering the two last: ['.test', '.h5ad'].


2022-11-28 23:07:21,388 - malid.external.scratch_cache - INFO - Reading network file from local machine cache: /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/embedded/unirep_fine_tuned/anndatas_scaled/TCR/fold.0.test.h5ad -> /srv/scratch/maximz/cache/fae46b10146880d3232317a027e51b3963872943d02aeed4abcbfd9a.0.test.h5ad


Only considering the two last: ['.test', '.h5ad'].


Only considering the two last: ['.test', '.h5ad'].


2022-11-28 23:09:14,607 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S037 from participant BFI-0000254, disease HIV


2022-11-28 23:09:16,011 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S055 from participant BFI-0000258, disease HIV




2022-11-28 23:09:16,595 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S042 from participant BFI-0002854, disease HIV


2022-11-28 23:09:25,018 - generate_simulation_datasets.ipynb - INFO - Added specimen M124-S037 from participant BFI-0002861, disease Healthy/Background


2022-11-28 23:09:26,121 - generate_simulation_datasets.ipynb - INFO - Added specimen M124-S035 from participant BFI-0002863, disease Healthy/Background


2022-11-28 23:09:27,058 - generate_simulation_datasets.ipynb - INFO - Added specimen M64-008 from participant BFI-0003057, disease Healthy/Background






2022-11-28 23:09:28,097 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S231 from participant BFI-0009036, disease Covid19


2022-11-28 23:09:28,495 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S163 from participant BFI-0009094, disease Covid19






2022-11-28 23:09:28,865 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S007 from participant BFI-0009127, disease Covid19


  return anndata.AnnData(



  return anndata.AnnData(



2022-11-28 23:09:29,682 - generate_simulation_datasets.ipynb - INFO - Fold 0-test, GeneLocus.BCR -> /users/maximz/code/boyd-immune-repertoire-classification/tests/snapshot/scaled_anndatas_dir/BCR/fold.0.test.h5ad


2022-11-28 23:09:29,733 - generate_simulation_datasets.ipynb - INFO - Fold 0-test, GeneLocus.TCR -> /users/maximz/code/boyd-immune-repertoire-classification/tests/snapshot/scaled_anndatas_dir/TCR/fold.0.test.h5ad


2022-11-28 23:09:30,170 - malid.external.scratch_cache - INFO - Reading network file from local machine cache: /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/embedded/unirep_fine_tuned/anndatas_scaled/BCR/fold.1.train_smaller.h5ad -> /srv/scratch/maximz/cache/b8a94a91382404095debf210c8aae25ae93bea98a6a76c76583ef6b2.1.train_smaller.h5ad


Only considering the two last: ['.train_smaller', '.h5ad'].


Only considering the two last: ['.train_smaller', '.h5ad'].


2022-11-28 23:11:55,169 - malid.external.scratch_cache - INFO - Reading network file from local machine cache: /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/embedded/unirep_fine_tuned/anndatas_scaled/TCR/fold.1.train_smaller.h5ad -> /srv/scratch/maximz/cache/09d251db3ac451cb83ae9f7885cccf76ecd2233041b3f2f3493368e0.1.train_smaller.h5ad


Only considering the two last: ['.train_smaller', '.h5ad'].


Only considering the two last: ['.train_smaller', '.h5ad'].


2022-11-28 23:15:21,636 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S037 from participant BFI-0000254, disease HIV


2022-11-28 23:15:22,875 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S033 from participant BFI-0000255, disease HIV


2022-11-28 23:15:24,259 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S038 from participant BFI-0000256, disease HIV


2022-11-28 23:15:35,856 - generate_simulation_datasets.ipynb - INFO - Added specimen M124-S037 from participant BFI-0002861, disease Healthy/Background


2022-11-28 23:15:37,047 - generate_simulation_datasets.ipynb - INFO - Added specimen M124-S035 from participant BFI-0002863, disease Healthy/Background


2022-11-28 23:15:41,924 - generate_simulation_datasets.ipynb - INFO - Added specimen M64-002 from participant BFI-0003051, disease Healthy/Background














2022-11-28 23:15:43,896 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S231 from participant BFI-0009036, disease Covid19






2022-11-28 23:15:44,616 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S163 from participant BFI-0009094, disease Covid19






2022-11-28 23:15:45,412 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S013 from participant BFI-0009131, disease Covid19


  return anndata.AnnData(



  return anndata.AnnData(



2022-11-28 23:15:46,456 - generate_simulation_datasets.ipynb - INFO - Fold 1-train_smaller, GeneLocus.BCR -> /users/maximz/code/boyd-immune-repertoire-classification/tests/snapshot/scaled_anndatas_dir/BCR/fold.1.train_smaller.h5ad


2022-11-28 23:15:46,611 - generate_simulation_datasets.ipynb - INFO - Fold 1-train_smaller, GeneLocus.TCR -> /users/maximz/code/boyd-immune-repertoire-classification/tests/snapshot/scaled_anndatas_dir/TCR/fold.1.train_smaller.h5ad


2022-11-28 23:15:47,580 - malid.external.scratch_cache - INFO - Reading network file from local machine cache: /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/embedded/unirep_fine_tuned/anndatas_scaled/BCR/fold.1.validation.h5ad -> /srv/scratch/maximz/cache/5456ab4b73a69d4db86f62dbc1e3b7f40ad3681a03048431c03b6e77.1.validation.h5ad


Only considering the two last: ['.validation', '.h5ad'].


Only considering the two last: ['.validation', '.h5ad'].


2022-11-28 23:16:43,678 - malid.external.scratch_cache - INFO - Reading network file from local machine cache: /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/embedded/unirep_fine_tuned/anndatas_scaled/TCR/fold.1.validation.h5ad -> /srv/scratch/maximz/cache/41c94dcbfe55df41ced23e6ed6d22a261ac7e2edaee04496be20d334.1.validation.h5ad


Only considering the two last: ['.validation', '.h5ad'].


Only considering the two last: ['.validation', '.h5ad'].


2022-11-28 23:17:51,672 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S055 from participant BFI-0000258, disease HIV




2022-11-28 23:17:52,763 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S016 from participant BFI-0002855, disease HIV


2022-11-28 23:17:53,579 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S034 from participant BFI-0002859, disease HIV


2022-11-28 23:17:59,632 - generate_simulation_datasets.ipynb - INFO - Added specimen M64-008 from participant BFI-0003057, disease Healthy/Background


2022-11-28 23:18:01,515 - generate_simulation_datasets.ipynb - INFO - Added specimen M64-021 from participant BFI-0003070, disease Healthy/Background


2022-11-28 23:18:03,760 - generate_simulation_datasets.ipynb - INFO - Added specimen M64-022 from participant BFI-0003071, disease Healthy/Background






2022-11-28 23:18:05,080 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S095 from participant BFI-0009048, disease Covid19














2022-11-28 23:18:06,218 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S192 from participant BFI-0009120, disease Covid19


2022-11-28 23:18:06,485 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S007 from participant BFI-0009127, disease Covid19


  return anndata.AnnData(



  return anndata.AnnData(



2022-11-28 23:18:07,455 - generate_simulation_datasets.ipynb - INFO - Fold 1-validation, GeneLocus.BCR -> /users/maximz/code/boyd-immune-repertoire-classification/tests/snapshot/scaled_anndatas_dir/BCR/fold.1.validation.h5ad


2022-11-28 23:18:07,503 - generate_simulation_datasets.ipynb - INFO - Fold 1-validation, GeneLocus.TCR -> /users/maximz/code/boyd-immune-repertoire-classification/tests/snapshot/scaled_anndatas_dir/TCR/fold.1.validation.h5ad


2022-11-28 23:18:07,932 - malid.external.scratch_cache - INFO - Reading network file from local machine cache: /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/embedded/unirep_fine_tuned/anndatas_scaled/BCR/fold.1.test.h5ad -> /srv/scratch/maximz/cache/ffeca0ea4f0814bae296a4074147865a0f4003b0f245773aa82aff12.1.test.h5ad


Only considering the two last: ['.test', '.h5ad'].


Only considering the two last: ['.test', '.h5ad'].


2022-11-28 23:19:29,704 - malid.external.scratch_cache - INFO - Reading network file from local machine cache: /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/embedded/unirep_fine_tuned/anndatas_scaled/TCR/fold.1.test.h5ad -> /srv/scratch/maximz/cache/ea501b0a2c0c98168ff5bfe46501a3c154c67f621492bcd5b180d587.1.test.h5ad


Only considering the two last: ['.test', '.h5ad'].


Only considering the two last: ['.test', '.h5ad'].


2022-11-28 23:21:03,585 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S043 from participant BFI-0002856, disease HIV


2022-11-28 23:21:04,868 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S035 from participant BFI-0002870, disease HIV


2022-11-28 23:21:06,450 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S025 from participant BFI-0002871, disease HIV


2022-11-28 23:21:15,871 - generate_simulation_datasets.ipynb - INFO - Added specimen M124-S036 from participant BFI-0002862, disease Healthy/Background


2022-11-28 23:21:16,828 - generate_simulation_datasets.ipynb - INFO - Added specimen M124-S038 from participant BFI-0002866, disease Healthy/Background


2022-11-28 23:21:18,152 - generate_simulation_datasets.ipynb - INFO - Added specimen M124-S039 from participant BFI-0002867, disease Healthy/Background










2022-11-28 23:21:19,672 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S094 from participant BFI-0009047, disease Covid19










2022-11-28 23:21:20,685 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S162 from participant BFI-0009093, disease Covid19


2022-11-28 23:21:21,055 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S193 from participant BFI-0009121, disease Covid19


  return anndata.AnnData(



  return anndata.AnnData(



2022-11-28 23:21:21,990 - generate_simulation_datasets.ipynb - INFO - Fold 1-test, GeneLocus.BCR -> /users/maximz/code/boyd-immune-repertoire-classification/tests/snapshot/scaled_anndatas_dir/BCR/fold.1.test.h5ad


2022-11-28 23:21:22,046 - generate_simulation_datasets.ipynb - INFO - Fold 1-test, GeneLocus.TCR -> /users/maximz/code/boyd-immune-repertoire-classification/tests/snapshot/scaled_anndatas_dir/TCR/fold.1.test.h5ad


2022-11-28 23:21:22,524 - malid.external.scratch_cache - INFO - Reading network file from local machine cache: /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/embedded/unirep_fine_tuned/anndatas_scaled/BCR/fold.2.train_smaller.h5ad -> /srv/scratch/maximz/cache/6cb12e16fcf6e035e853ef2ae1704883bab4d1035687b0c4b8812038.2.train_smaller.h5ad


Only considering the two last: ['.train_smaller', '.h5ad'].


Only considering the two last: ['.train_smaller', '.h5ad'].


2022-11-28 23:24:09,698 - malid.external.scratch_cache - INFO - Reading network file from local machine cache: /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/embedded/unirep_fine_tuned/anndatas_scaled/TCR/fold.2.train_smaller.h5ad -> /srv/scratch/maximz/cache/7b61a432c6210704d4a9089df1411d38098eb8285b72b51c681038cd.2.train_smaller.h5ad


Only considering the two last: ['.train_smaller', '.h5ad'].


Only considering the two last: ['.train_smaller', '.h5ad'].


2022-11-28 23:28:36,959 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S037 from participant BFI-0000254, disease HIV


2022-11-28 23:28:38,827 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S055 from participant BFI-0000258, disease HIV




2022-11-28 23:28:39,598 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S042 from participant BFI-0002854, disease HIV


2022-11-28 23:28:55,050 - generate_simulation_datasets.ipynb - INFO - Added specimen M124-S038 from participant BFI-0002866, disease Healthy/Background


2022-11-28 23:28:56,442 - generate_simulation_datasets.ipynb - INFO - Added specimen M124-S039 from participant BFI-0002867, disease Healthy/Background


2022-11-28 23:28:57,694 - generate_simulation_datasets.ipynb - INFO - Added specimen M124-S040 from participant BFI-0002868, disease Healthy/Background






2022-11-28 23:28:58,975 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S231 from participant BFI-0009036, disease Covid19


2022-11-28 23:28:59,317 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S094 from participant BFI-0009047, disease Covid19






2022-11-28 23:29:00,038 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S162 from participant BFI-0009093, disease Covid19


  return anndata.AnnData(



  return anndata.AnnData(



2022-11-28 23:29:01,196 - generate_simulation_datasets.ipynb - INFO - Fold 2-train_smaller, GeneLocus.BCR -> /users/maximz/code/boyd-immune-repertoire-classification/tests/snapshot/scaled_anndatas_dir/BCR/fold.2.train_smaller.h5ad


2022-11-28 23:29:01,416 - generate_simulation_datasets.ipynb - INFO - Fold 2-train_smaller, GeneLocus.TCR -> /users/maximz/code/boyd-immune-repertoire-classification/tests/snapshot/scaled_anndatas_dir/TCR/fold.2.train_smaller.h5ad


2022-11-28 23:29:02,119 - malid.external.scratch_cache - INFO - Reading network file from local machine cache: /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/embedded/unirep_fine_tuned/anndatas_scaled/BCR/fold.2.validation.h5ad -> /srv/scratch/maximz/cache/ed9c53a0f9da5f5d4f54e8c2c768b6a9f5aa1c8643b6c29a7fd00eac.2.validation.h5ad


Only considering the two last: ['.validation', '.h5ad'].


Only considering the two last: ['.validation', '.h5ad'].


2022-11-28 23:29:55,206 - malid.external.scratch_cache - INFO - Reading network file from local machine cache: /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/embedded/unirep_fine_tuned/anndatas_scaled/TCR/fold.2.validation.h5ad -> /srv/scratch/maximz/cache/1babce0e79ce10e337c1e8d6dc6cb7a471eb2549e1e5448015ea4da1.2.validation.h5ad


Only considering the two last: ['.validation', '.h5ad'].


Only considering the two last: ['.validation', '.h5ad'].


2022-11-28 23:31:56,870 - generate_simulation_datasets.ipynb - INFO - Added specimen M124-S037 from participant BFI-0002861, disease Healthy/Background


2022-11-28 23:31:58,250 - generate_simulation_datasets.ipynb - INFO - Added specimen M124-S036 from participant BFI-0002862, disease Healthy/Background


2022-11-28 23:31:59,483 - generate_simulation_datasets.ipynb - INFO - Added specimen M124-S035 from participant BFI-0002863, disease Healthy/Background


2022-11-28 23:32:02,887 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S036 from participant BFI-0002877, disease HIV


2022-11-28 23:32:03,844 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S039 from participant BFI-0002879, disease HIV


2022-11-28 23:32:05,255 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S030 from participant BFI-0003454, disease HIV














2022-11-28 23:32:06,973 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S193 from participant BFI-0009121, disease Covid19


2022-11-28 23:32:07,326 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S001 from participant BFI-0009122, disease Covid19


2022-11-28 23:32:07,992 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S016 from participant BFI-0009134, disease Covid19


  return anndata.AnnData(



  return anndata.AnnData(



2022-11-28 23:32:08,756 - generate_simulation_datasets.ipynb - INFO - Fold 2-validation, GeneLocus.BCR -> /users/maximz/code/boyd-immune-repertoire-classification/tests/snapshot/scaled_anndatas_dir/BCR/fold.2.validation.h5ad


2022-11-28 23:32:08,813 - generate_simulation_datasets.ipynb - INFO - Fold 2-validation, GeneLocus.TCR -> /users/maximz/code/boyd-immune-repertoire-classification/tests/snapshot/scaled_anndatas_dir/TCR/fold.2.validation.h5ad


2022-11-28 23:32:09,230 - malid.external.scratch_cache - INFO - Reading network file from local machine cache: /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/embedded/unirep_fine_tuned/anndatas_scaled/BCR/fold.2.test.h5ad -> /srv/scratch/maximz/cache/9644d4b01b5a585916488e1c26d179a5f1b88ac46036cef4312ff617.2.test.h5ad


Only considering the two last: ['.test', '.h5ad'].


Only considering the two last: ['.test', '.h5ad'].


2022-11-28 23:35:14,747 - malid.external.scratch_cache - INFO - Reading network file from local machine cache: /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/embedded/unirep_fine_tuned/anndatas_scaled/TCR/fold.2.test.h5ad -> /srv/scratch/maximz/cache/8e9809b501f65af9ef0def9f3976fe040007b72d29524f6b1f34dc74.2.test.h5ad


Only considering the two last: ['.test', '.h5ad'].


Only considering the two last: ['.test', '.h5ad'].


2022-11-28 23:39:44,600 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S033 from participant BFI-0000255, disease HIV


2022-11-28 23:39:46,163 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S038 from participant BFI-0000256, disease HIV


2022-11-28 23:39:47,280 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S018 from participant BFI-0002857, disease HIV


2022-11-28 23:40:00,332 - generate_simulation_datasets.ipynb - INFO - Added specimen M64-002 from participant BFI-0003051, disease Healthy/Background


2022-11-28 23:40:02,962 - generate_simulation_datasets.ipynb - INFO - Added specimen M64-019 from participant BFI-0003068, disease Healthy/Background


2022-11-28 23:40:05,457 - generate_simulation_datasets.ipynb - INFO - Added specimen M64-029 from participant BFI-0003078, disease Healthy/Background














2022-11-28 23:40:08,009 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S095 from participant BFI-0009048, disease Covid19


















2022-11-28 23:40:09,995 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S192 from participant BFI-0009120, disease Covid19


2022-11-28 23:40:10,692 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S013 from participant BFI-0009131, disease Covid19


  return anndata.AnnData(



  return anndata.AnnData(



2022-11-28 23:40:12,225 - generate_simulation_datasets.ipynb - INFO - Fold 2-test, GeneLocus.BCR -> /users/maximz/code/boyd-immune-repertoire-classification/tests/snapshot/scaled_anndatas_dir/BCR/fold.2.test.h5ad


2022-11-28 23:40:12,290 - generate_simulation_datasets.ipynb - INFO - Fold 2-test, GeneLocus.TCR -> /users/maximz/code/boyd-immune-repertoire-classification/tests/snapshot/scaled_anndatas_dir/TCR/fold.2.test.h5ad


2022-11-28 23:40:12,921 - malid.external.scratch_cache - INFO - Reading network file from local machine cache: /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/embedded/unirep_fine_tuned/anndatas_scaled/BCR/fold.-1.train_smaller.h5ad -> /srv/scratch/maximz/cache/983a3eb687524fb93125e06d4db2efff1e050e6994b6702f703ebe30.-1.train_smaller.h5ad


Only considering the two last: ['.train_smaller', '.h5ad'].


Only considering the two last: ['.train_smaller', '.h5ad'].


2022-11-28 23:44:42,287 - malid.external.scratch_cache - INFO - Reading network file from local machine cache: /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/embedded/unirep_fine_tuned/anndatas_scaled/TCR/fold.-1.train_smaller.h5ad -> /srv/scratch/maximz/cache/f0039420f21c370254f78d4488b4eba3476c46018a5757fa96e259d7.-1.train_smaller.h5ad


Only considering the two last: ['.train_smaller', '.h5ad'].


Only considering the two last: ['.train_smaller', '.h5ad'].


2022-11-29 00:00:23,978 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S033 from participant BFI-0000255, disease HIV


2022-11-29 00:00:25,476 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S038 from participant BFI-0000256, disease HIV


2022-11-29 00:00:26,769 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S043 from participant BFI-0002856, disease HIV


2022-11-29 00:00:45,255 - generate_simulation_datasets.ipynb - INFO - Added specimen M124-S036 from participant BFI-0002862, disease Healthy/Background


2022-11-29 00:00:46,352 - generate_simulation_datasets.ipynb - INFO - Added specimen M124-S038 from participant BFI-0002866, disease Healthy/Background


2022-11-29 00:00:47,830 - generate_simulation_datasets.ipynb - INFO - Added specimen M124-S039 from participant BFI-0002867, disease Healthy/Background






















2022-11-29 00:00:51,435 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S094 from participant BFI-0009047, disease Covid19


2022-11-29 00:00:51,982 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S095 from participant BFI-0009048, disease Covid19


















2022-11-29 00:00:53,676 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S162 from participant BFI-0009093, disease Covid19


  return anndata.AnnData(



  return anndata.AnnData(



2022-11-29 00:00:55,018 - generate_simulation_datasets.ipynb - INFO - Fold -1-train_smaller, GeneLocus.BCR -> /users/maximz/code/boyd-immune-repertoire-classification/tests/snapshot/scaled_anndatas_dir/BCR/fold.-1.train_smaller.h5ad


2022-11-29 00:00:55,251 - generate_simulation_datasets.ipynb - INFO - Fold -1-train_smaller, GeneLocus.TCR -> /users/maximz/code/boyd-immune-repertoire-classification/tests/snapshot/scaled_anndatas_dir/TCR/fold.-1.train_smaller.h5ad


2022-11-29 00:00:56,359 - malid.external.scratch_cache - INFO - Reading network file from local machine cache: /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/embedded/unirep_fine_tuned/anndatas_scaled/BCR/fold.-1.validation.h5ad -> /srv/scratch/maximz/cache/2919100142102324f2257538f64cd6727f40a9578b8e11b6d722857d.-1.validation.h5ad


Only considering the two last: ['.validation', '.h5ad'].


Only considering the two last: ['.validation', '.h5ad'].


2022-11-29 00:02:31,719 - malid.external.scratch_cache - INFO - Reading network file from local machine cache: /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221117/embedded/unirep_fine_tuned/anndatas_scaled/TCR/fold.-1.validation.h5ad -> /srv/scratch/maximz/cache/c602d5100519931f6b73a9229008a0cbfb2e29a23d2a14e45479c787.-1.validation.h5ad


Only considering the two last: ['.validation', '.h5ad'].


Only considering the two last: ['.validation', '.h5ad'].


2022-11-29 00:04:45,182 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S037 from participant BFI-0000254, disease HIV


2022-11-29 00:04:47,198 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S055 from participant BFI-0000258, disease HIV




2022-11-29 00:04:47,967 - generate_simulation_datasets.ipynb - INFO - Added specimen M111-S042 from participant BFI-0002854, disease HIV


2022-11-29 00:04:57,386 - generate_simulation_datasets.ipynb - INFO - Added specimen M124-S037 from participant BFI-0002861, disease Healthy/Background


2022-11-29 00:04:58,507 - generate_simulation_datasets.ipynb - INFO - Added specimen M124-S035 from participant BFI-0002863, disease Healthy/Background


2022-11-29 00:04:59,432 - generate_simulation_datasets.ipynb - INFO - Added specimen M64-008 from participant BFI-0003057, disease Healthy/Background






2022-11-29 00:05:00,539 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S231 from participant BFI-0009036, disease Covid19


2022-11-29 00:05:01,057 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S163 from participant BFI-0009094, disease Covid19






2022-11-29 00:05:01,521 - generate_simulation_datasets.ipynb - INFO - Added specimen M418-S007 from participant BFI-0009127, disease Covid19


  return anndata.AnnData(



  return anndata.AnnData(



2022-11-29 00:05:02,671 - generate_simulation_datasets.ipynb - INFO - Fold -1-validation, GeneLocus.BCR -> /users/maximz/code/boyd-immune-repertoire-classification/tests/snapshot/scaled_anndatas_dir/BCR/fold.-1.validation.h5ad


2022-11-29 00:05:02,733 - generate_simulation_datasets.ipynb - INFO - Fold -1-validation, GeneLocus.TCR -> /users/maximz/code/boyd-immune-repertoire-classification/tests/snapshot/scaled_anndatas_dir/TCR/fold.-1.validation.h5ad


# Generate simulation datasets



In [9]:
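# NOTE(review): every line of this cell is commented out, so running the notebook
# as-is does not regenerate the simulated datasets. Presumably disabled on purpose
# (e.g. to avoid overwriting existing outputs) — TODO confirm before re-enabling.
# The copy_metadata call below is double-commented, so uncommenting the cell one
# level will still leave it disabled.
# # copy_metadata(destination_dir=config.paths.simulated_data_dir / "metadata")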

# for fraction_disease_specific in [0.25, 0.5, 0.75]:
#     output_dir_anndatas = (
#         config.paths.simulated_data_dir
#         / f"scaled_anndatas_{fraction_disease_specific:0.2f}"
#     )
#     print(output_dir_anndatas)

#     run(
#         output_dir_anndatas=output_dir_anndatas,
#         gene_loci=config.gene_loci_used,
#         # in each fold:
#         n_specimens_per_disease=10,
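#         # NOTE(review): the notebook intro states 1000 sequences per isotype, but 100 is passed here — confirm which is intended.
#         n_sequences_per_patient_per_isotype=100,  # this is the required number of sequences _after_ filtering by V genes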
#         fraction_disease_specific=fraction_disease_specific,  # signal to noise ratio
#         scale_data=True,
#         store_raw_pre_scaling=True,
#         pca_n_comps=10,  # or set to None to prevent dimensionality reduction
#         diseases_kept=[healthy_label, "HIV", "Covid19"],
#         # in an effort to generate more convergent sequence clusters, let's filter down V genes
#         # v_genes_important_to_disease (if disease not listed here, it won't be filtered):
#         v_genes_kept=v_gene_filter,
#         write_csvs=True,
#         include_global_fold=False,
#     )