In [1]:
import torch
from torch import nn, optim
import torch.nn.functional as F

from torch.utils.data import DataLoader
import torchvision
from torchvision import datasets, transforms
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import scanpy as sc
import plotly.express as px
import plotly.io as pio
import sklearn.preprocessing
import sklearn.model_selection





torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7f9c643aa7f0>

In [2]:
import platform

def get_device_and_gmount():
    # Get the operating system and version
    os = platform.system()
    version = platform.release()

    # Get the machine's architecture
    arch = platform.machine()

    # Set the default renderer based on the operating system
    if os == 'Darwin':
        pio.renderers.default = 'notebook'
        print("Using Apple MPS on Macbook Pro")
    
    elif os == 'Linux' and version == '18.04':
        pio.renderers.default = 'colab'
        print("Using Colab on Linux")
        from google.colab import drive
        drive.mount('/content/drive')
        path = '/content/drive/My Drive/Colab Notebooks/Experiments/'

    # Set the device based on the machine's architecture
    if arch == 'x86_64':
        device = torch.device('mps') if os == 'Darwin' else torch.device('cuda')
        gmount = True if os == 'Linux' else False
    else:
        device = torch.device('cpu')
        gmount = False

    print("Using device:", device)
    
    return device, gmount


In [3]:
device, gmount = get_device_and_gmount()


Using Apple MPS on Macbook Pro
Using device: mps


In [4]:
if gmount:
    scdata = sc.read_h5ad("/content/gdrive/MyDrive/scintegration/GEX.h5ad")
    
scdata = sc.read_h5ad("/Users/eamonmcandrew/Desktop/Single_cell_integration/Data/Multi-ome/GEX.h5ad")

In [5]:
scdata

AnnData object with n_obs × n_vars = 69249 × 13431
    obs: 'GEX_pct_counts_mt', 'GEX_n_counts', 'GEX_n_genes', 'GEX_size_factors', 'GEX_phase', 'ATAC_nCount_peaks', 'ATAC_atac_fragments', 'ATAC_reads_in_peaks_frac', 'ATAC_blacklist_fraction', 'ATAC_nucleosome_signal', 'cell_type', 'batch', 'ATAC_pseudotime_order', 'GEX_pseudotime_order', 'Samplename', 'Site', 'DonorNumber', 'Modality', 'VendorLot', 'DonorID', 'DonorAge', 'DonorBMI', 'DonorBloodType', 'DonorRace', 'Ethnicity', 'DonorGender', 'QCMeds', 'DonorSmoker'
    var: 'feature_types', 'gene_id'
    uns: 'ATAC_gene_activity_var_names', 'dataset_id', 'genome', 'organism'
    obsm: 'ATAC_gene_activity', 'ATAC_lsi_full', 'ATAC_lsi_red', 'ATAC_umap', 'GEX_X_pca', 'GEX_X_umap'
    layers: 'counts'

In [6]:
platform.system()

'Darwin'

In [7]:
def stratified_split(data, test_size, random_state, split_criteria):
    """
    Splits the data into train and test sets stratified by the batch column
    """
    train = []
    test = []
    for batch in data.obs[split_criteria].unique():
        batch_data = data[data.obs[split_criteria] == batch]
        batch_train, batch_test = sklearn.model_selection.train_test_split(batch_data, test_size=test_size, random_state=random_state)
        batch_train, batch_test = list(batch_train.obs.index), list(batch_test.obs.index)
        train.extend(batch_train)
        test.extend(batch_test)
        
    return train, test


In [8]:
train, test = stratified_split(scdata, 0.2, 9000, split_criteria='cell_type')

In [9]:
train_data = scdata[train]
test_data = scdata[test]

len(train_data), len(test_data)

(55392, 13857)

In [10]:
if gmount == True:
    from google.colab import drive
    drive.mount('/content/drive')
    path = '/content/drive/My Drive/Colab Notebooks/Experiments/' 
    scdata = sc.read_h5ad("/content/gdrive/MyDrive/scintegration/GEX.h5ad")

In [11]:
import wandb
wandb.login()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33meamomc[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [12]:
# wandb.init(project="Single Cell Omics integration", entity="scintegration")

In [13]:
class GEX_Dataset(torch.utils.data.Dataset):
    def __init__(self, data, scaler=None, cat_var=None, label_encoder=None):
        self.data = data
        self.values = np.asarray(data.X.todense())
        self.cat_var = cat_var

        label_encoder_functions = {
            "numeric": lambda: torch.tensor(sklearn.preprocessing.LabelEncoder().fit_transform(self.data.obs[self.cat_var]), dtype=torch.long),
            "range_map": lambda: sklearn.preprocessing.LabelEncoder().fit_transform(self.data.obs[self.cat_var]).reshape(-1, 1),
            "one_hot": lambda: sklearn.preprocessing.OneHotEncoder().fit_transform(sklearn.preprocessing.LabelEncoder().fit_transform(self.data.obs[self.cat_var]).reshape(-1, 1)).toarray()
        }

        if label_encoder in label_encoder_functions:
            cat_var_data = label_encoder_functions[label_encoder]()
            if label_encoder == "range_map":
                cat_var_data = torch.tensor(sklearn.preprocessing.MinMaxScaler().fit_transform(cat_var_data), dtype=torch.float32)
            elif label_encoder == "one_hot":
                cat_var_data = torch.tensor(cat_var_data, dtype=torch.float32)
        else:
            cat_var_data = None
        self.cat_var_data = cat_var_data

        scaler_functions = {
            "Standard": lambda: sklearn.preprocessing.StandardScaler().fit_transform(self.values),
            "MinMax": lambda: sklearn.preprocessing.MinMaxScaler().fit_transform(self.values)
        }

        if scaler in scaler_functions:
            self.scaled_values = torch.tensor(scaler_functions[scaler](), dtype=torch.float32)
        else:
            self.scaled_values = torch.tensor(self.values, dtype=torch.float32)

    @property
    def n_features(self):
        return self.values.shape[1]

    @property
    def n_catagories(self):
        return self.cat_var_data.shape[1] if self.cat_var_data is not None else 0

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.scaled_values[idx], self.cat_var_data[idx]


In [14]:
class classifier(nn.Module):
    def __init__(self, input_size, dropout, hidden_sizes, output_size):
        super(classifier, self).__init__()
        # Create a list of Linear layers with the specified hidden sizes
        self.hidden_layers = [nn.Linear(input_size, hidden_size) for hidden_size in hidden_sizes]
        # Create a BatchNorm1d layer for each hidden layer
        self.batch_norm_layers = [nn.BatchNorm1d(hidden_size) for hidden_size in hidden_sizes]
        # Create a Dropout layer for each hidden layer
        self.dropout_layers = [nn.Dropout(dropout) for _ in hidden_sizes]
        # Create a Linear layer for the output
        self.output_layer = nn.Linear(hidden_sizes[-1], output_size)
        
    def forward(self, x):
        # Loop through the hidden layers
        for hidden_layer, batch_norm_layer, dropout_layer in zip(self.hidden_layers, self.batch_norm_layers, self.dropout_layers):
            # Apply the hidden layer, batch norm layer, and dropout layer
            x = hidden_layer(x)
            x = batch_norm_layer(x)
            x = F.relu(x)
            x = dropout_layer(x)
        # Apply the output layer and return the output
        x = self.output_layer(x)
        x = F.softmax(x, dim = 1)
        return x

In [15]:
def train_one_epoch(epoch, GEX_dataloader_train, model, optimizer, criterion):
    # Set the model to train mode
    model.train()
    
    # Initialize lists to store the losses and accuracies for each batch
    epoch_loss_list = []
    epoch_accuracy_list = []
    
    # Iterate over the batches in the dataloader
    for batch_idx, (data, target) in enumerate(GEX_dataloader_train):
        # Move the data and target tensors to the specified device (GPU)
        data, target = data.to(device), target.to(device)
        # Clear the gradients of all optimized parameters
        optimizer.zero_grad()
        # Feed the data through the model and get the output
        output = model(data)
        # Calculate the loss using the specified loss function
        loss = criterion(output, target)
        # Calculate the accuracy by comparing the model's predictions to the ground truth labels
        accuracy = (output.argmax(1) == target.argmax(1)).type(torch.float).mean().item()
        # Backpropagate the loss to update the model's parameters
        loss.backward()
        # Update the model's parameters using the optimizer
        optimizer.step()
        # Append the loss and accuracy for this batch to the corresponding lists
        epoch_loss_list.append(loss.item())
        epoch_accuracy_list.append(accuracy)
        
    # Calculate the mean loss and accuracy for the entire epoch
    epoch_loss = np.mean(epoch_loss_list)
    epoch_accuracy = np.mean(epoch_accuracy_list)
    
    # Return the epoch loss and accuracy
    return epoch_loss, epoch_accuracy

            
        

In [16]:
def evaluate_one_epoch(epoch, GEX_Dataset_test, model, optimizer, criterion, confusion = False):
    # Set the model to eval mode
    model.eval()
    # Tell PyTorch not to track gradients while evaluating the model
    with torch.no_grad():
        # Initialize lists to store the losses and accuracies for each batch
        epoch_loss_list = []
        epoch_accuracy_list = []
        
        # Iterate over the batches in the dataloader
        for batch_idx, (data, target) in enumerate(GEX_Dataset_test):
            # Move the data and target tensors to the specified device (GPU)
            data, target = data.to(device), target.to(device)
            # Feed the data through the model and get the output
            output = model(data)
            # Calculate the loss using the specified loss function
            loss = criterion(output, target)
            # Calculate the accuracy by comparing the model's predictions to the ground truth labels
            accuracy = (output.argmax(1) == target.argmax(1)).type(torch.float).mean().item()
            # Append the loss and accuracy for this batch to the corresponding lists
            epoch_loss_list.append(loss.item())
            epoch_accuracy_list.append(accuracy)
            
            if confusion: 
            # Calculate and log the confusion matrix for this batch
                ground_truth_class_ids = target.argmax(1).cpu().numpy()
                predicted_class_ids = output.argmax(1).cpu().numpy()
                wandb.log({"conf_mat" : wandb.plot.confusion_matrix(probs=None, y_true=ground_truth_class_ids, preds=predicted_class_ids, class_names=scdata.obs["batch"].unique())})
        
        # Calculate the mean loss and accuracy for the entire epoch
        epoch_loss = np.mean(epoch_loss_list)
        epoch_accuracy = np.mean(epoch_accuracy_list)
        
    # Return the epoch loss and accuracy
    return epoch_loss, epoch_accuracy


In [17]:
sweep_configuration = {
    'method': 'grid',
    'name': 'sweep',
    'metric': {
        'goal': 'maximize', 
        'name': 'val_acc'
		},
    'parameters': {
        'batch_size': {'value': 512},
        'epochs': {'value' : 500},
        'lr': {'value' : 0.00361},
        'random_seed': {'value': 9000},
        'dropout': {'values': [0,0.1, 0.2, 0.]},
        'hidden_sizes': {'values': [[256, 128, 64], [256, 128, 64], [256, 128, 64], [256, 128, 64]]},
        'split_criteria': {'value': 'cell_type'},
        'eval_size_percentage' : {'value': 0.2}
        
        
}}

sweep_id = wandb.sweep(sweep_configuration, project="Single Cell Omics integration", entity="scintegration")

Create sweep with ID: 6gdsiwyr
Sweep URL: https://wandb.ai/scintegration/Single%20Cell%20Omics%20integration/sweeps/6gdsiwyr


In [18]:
# config = wandb.config = {
#   "lr" : 0.00361,
#   "epochs": 100,
#   "batch_size": 1024,
#   "dropout": 0.2,
#   "hidden_size": 20,
#   "random_seed": 9000,
#   "split_criteria": "batch",
#   "eval_size_percentage" : 0.2,
  
# }

In [19]:
def train_func(config = None):
  
  
    if config is not None:
        config = wandb.config
        run = wandb.init(project="Single Cell Omics integration", entity="scintegration", config=config)
    else:
      run = wandb.init(project="Single Cell Omics integration", entity="scintegration")

    # Load the learning rate, batch size, epochs, random seed, dropout, and hidden size from the wandb configuration
    lr = wandb.config.lr
    batch_size = wandb.config.batch_size
    epochs = wandb.config.epochs
    random_seed = wandb.config.random_seed
    dropout = wandb.config.dropout
    # hidden_size = wandb.config.hidden_size
    split_criteria = wandb.config.split_criteria
    eval_size_percentage = wandb.config.eval_size_percentage
    hidden_sizes = wandb.config.hidden_sizes
    
    # train test split
    train, test = stratified_split(scdata, 0.2, wandb.config.random_seed, split_criteria=split_criteria)
    train_data = scdata[train]
    test_data = scdata[test]
    
    # create datasets
    GEX_Dataset_train = GEX_Dataset(train_data, scaler = "Standard", cat_var = "batch", label_encoder = "one_hot")
    GEX_Dataset_test = GEX_Dataset(test_data, scaler = "Standard", cat_var = "batch", label_encoder = "one_hot")
    
    # Create dataloaders for the training and eval datasets
    GEX_dataloader_train = torch.utils.data.DataLoader(GEX_Dataset_train, batch_size = wandb.config.batch_size, shuffle = True)
    GEX_dataloader_test = torch.utils.data.DataLoader(GEX_Dataset_test, batch_size = wandb.config.batch_size, shuffle = True)
    
    input_size = GEX_Dataset_train.n_features
    output_size = GEX_Dataset_train.n_catagories
    

    # Instantiate the model, optimizer, and criterion outside the for loop
    model = classifier(input_size=input_size, dropout=dropout, hidden_sizes=hidden_sizes, output_size=output_size)
    

    # Move the model to the specified device (e.g. Nvidia GPU, Apple MPS, CPU)
    model.to(device)

    # Instantiate the Adam optimizer with the specified learning rate
    optimizer = optim.Adam(model.parameters(), lr=wandb.config.lr)

    # Instantiate the Cross Entropy loss function
    criterion = nn.CrossEntropyLoss()
    
    best_val_acc = 0
    for epoch in range(1, epochs + 1):
        # Train the model for one epoch
        train_loss, train_acc = train_one_epoch(epoch, GEX_dataloader_train, model, optimizer, criterion)

        # Evaluate the model on the eval dataset
        val_loss, val_acc = evaluate_one_epoch(epoch, GEX_dataloader_test, model, optimizer, criterion)
        
        # If the validation accuracy is the best seen so far, save the model's weights and biases to wandb
        if val_acc > best_val_acc:
          best_val_acc = val_acc
          wandb.save('model_best_val_acc.h5')


        # Log the epoch, train accuracy, train loss, validation accuracy, and validation loss to wandb
        wandb.log({
          'epoch': epoch, 
          'train_acc': train_acc,
          'train_loss': train_loss, 
          'val_acc': val_acc, 
          'val_loss': val_loss
        })


In [20]:
wandb.agent(sweep_id, train_func, count=5)


[34m[1mwandb[0m: Agent Starting Run: evm45n22 with config:
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	epochs: 500
[34m[1mwandb[0m: 	eval_size_percentage: 0.2
[34m[1mwandb[0m: 	hidden_sizes: [256, 128, 64]
[34m[1mwandb[0m: 	lr: 0.00361
[34m[1mwandb[0m: 	random_seed: 9000
[34m[1mwandb[0m: 	split_criteria: cell_type
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33meamomc[0m ([33mscintegration[0m). Use [1m`wandb login --relogin`[0m to force relogin


Run evm45n22 errored: RuntimeError('Placeholder storage has not been allocated on MPS device!')
[34m[1mwandb[0m: [32m[41mERROR[0m Run evm45n22 errored: RuntimeError('Placeholder storage has not been allocated on MPS device!')
[34m[1mwandb[0m: Agent Starting Run: 78ex0eiy with config:
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	epochs: 500
[34m[1mwandb[0m: 	eval_size_percentage: 0.2
[34m[1mwandb[0m: 	hidden_sizes: [256, 128, 64]
[34m[1mwandb[0m: 	lr: 0.00361
[34m[1mwandb[0m: 	random_seed: 9000
[34m[1mwandb[0m: 	split_criteria: cell_type
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='0.001 MB of 0.005 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.214218…

Run 78ex0eiy errored: RuntimeError('Placeholder storage has not been allocated on MPS device!')
[34m[1mwandb[0m: [32m[41mERROR[0m Run 78ex0eiy errored: RuntimeError('Placeholder storage has not been allocated on MPS device!')
[34m[1mwandb[0m: Agent Starting Run: 5226p0v8 with config:
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	epochs: 500
[34m[1mwandb[0m: 	eval_size_percentage: 0.2
[34m[1mwandb[0m: 	hidden_sizes: [256, 128, 64]
[34m[1mwandb[0m: 	lr: 0.00361
[34m[1mwandb[0m: 	random_seed: 9000
[34m[1mwandb[0m: 	split_criteria: cell_type
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='0.001 MB of 0.005 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.214366…

Run 5226p0v8 errored: RuntimeError('Placeholder storage has not been allocated on MPS device!')
[34m[1mwandb[0m: [32m[41mERROR[0m Run 5226p0v8 errored: RuntimeError('Placeholder storage has not been allocated on MPS device!')
[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.
