In [1]:
import torch
from torch import nn, optim
import torch.nn.functional as F

from torch.utils.data import DataLoader
import torchvision
from torchvision import datasets, transforms
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import scanpy as sc
import plotly.express as px
import plotly.io as pio
import sklearn.preprocessing
import sklearn.model_selection




torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7f966007cf70>

In [2]:
import platform
if platform.platform() == 'macOS-10.16-x86_64-i386-64bit':
    pio.renderers.default = 'notebook'
    device = torch.device('mps')
    print("Using Apple MPS on Macbook Pro")
    gmount = False
    
elif platform.platform() == 'Linux-5.10.133+-x86_64-with-Ubuntu-18.04-bionic':
    pio.renderers.default = 'colab'
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print("Using CUDA GPU on Colab")
        gmount = True

Using Apple MPS on Macbook Pro


In [3]:
scdata = sc.read_h5ad("/Users/eamonmcandrew/Desktop/Single_cell_integration/Data/Multi-ome/GEX.h5ad")

In [4]:
import wandb
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33meamomc[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
config = wandb.config = {
  "lr" : 0.005818,
  "epochs": 200,
  "batch_size": 256,
  "dropout": 0.2,
  "hidden_size": 30,
  "random_seed": 9000,
  
}

In [6]:
def stratified_split(data, test_size, random_state, split_criteria):
    """
    Splits the data into train and test sets stratified by the batch column
    """
    train = []
    test = []
    for batch in data.obs[split_criteria].unique():
        batch_data = data[data.obs[split_criteria] == batch]
        batch_train, batch_test = sklearn.model_selection.train_test_split(batch_data, test_size=test_size, random_state=random_state)
        batch_train, batch_test = list(batch_train.obs.index), list(batch_test.obs.index)
        train.extend(batch_train)
        test.extend(batch_test)
        
    return train, test

In [7]:
if gmount == True:
    from google.colab import drive
    drive.mount('/content/drive')
    path = '/content/drive/My Drive/Colab Notebooks/Experiments/' 
    scdata = sc.read_h5ad("/content/gdrive/MyDrive/scintegration/GEX.h5ad")

In [8]:
# Use own weights and biases account by adding the Auth token when prompted, can also use key = 'offline' to use offline

import wandb
wandb.login()


True

In [9]:
class GEX_Dataset(torch.utils.data.Dataset):
    def __init__(self, data, scaler=None, cat_var=None, label_encoder=None):
        self.data = data
        self.values = np.asarray(data.X.todense())
        self.cat_var = cat_var

        label_encoder_functions = {
            "numeric": lambda: torch.tensor(sklearn.preprocessing.LabelEncoder().fit_transform(self.data.obs[self.cat_var]), dtype=torch.long),
            "range_map": lambda: sklearn.preprocessing.LabelEncoder().fit_transform(self.data.obs[self.cat_var]).reshape(-1, 1),
            "one_hot": lambda: sklearn.preprocessing.OneHotEncoder().fit_transform(sklearn.preprocessing.LabelEncoder().fit_transform(self.data.obs[self.cat_var]).reshape(-1, 1)).toarray()
        }

        if label_encoder in label_encoder_functions:
            cat_var_data = label_encoder_functions[label_encoder]()
            if label_encoder == "range_map":
                cat_var_data = torch.tensor(sklearn.preprocessing.MinMaxScaler().fit_transform(cat_var_data), dtype=torch.float32)
            elif label_encoder == "one_hot":
                cat_var_data = torch.tensor(cat_var_data, dtype=torch.float32)
        else:
            cat_var_data = None
        self.cat_var_data = cat_var_data

        scaler_functions = {
            "Standard": lambda: sklearn.preprocessing.StandardScaler().fit_transform(self.values),
            "MinMax": lambda: sklearn.preprocessing.MinMaxScaler().fit_transform(self.values)
        }

        if scaler in scaler_functions:
            self.scaled_values = torch.tensor(scaler_functions[scaler](), dtype=torch.float32)
        else:
            self.scaled_values = torch.tensor(self.values, dtype=torch.float32)

    @property
    def n_features(self):
        return self.values.shape[1]

    @property
    def n_catagories(self):
        return self.cat_var_data.shape[1] if self.cat_var_data is not None else 0

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.scaled_values[idx], self.cat_var_data[idx]


In [10]:
class classifier(nn.Module):
    def __init__(self, input_size, dropout, hidden_size, output_size):
        super(classifier, self).__init__()
        self.cfc1 = nn.Linear(input_size, hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.cfc2 = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        x = self.cfc1(x)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.cfc2(x)
        x = F.softmax(x, dim = 1)
        return(x)

In [11]:
def train_one_epoch(epoch, GEX_dataloader_train, model, optimizer, criterion):
    # Set the model to train mode
    model.train()
    
    # Initialize lists to store the losses and accuracies for each batch
    epoch_loss_list = []
    epoch_accuracy_list = []
    
    # Iterate over the batches in the dataloader
    for batch_idx, (data, target) in enumerate(GEX_dataloader_train):
        # Move the data and target tensors to the specified device (GPU)
        data, target = data.to(device), target.to(device)
        # Clear the gradients of all optimized parameters
        optimizer.zero_grad()
        # Feed the data through the model and get the output
        output = model(data)
        # Calculate the loss using the specified loss function
        loss = criterion(output, target)
        # Calculate the accuracy by comparing the model's predictions to the ground truth labels
        accuracy = (output.argmax(1) == target.argmax(1)).type(torch.float).mean().item()
        # Backpropagate the loss to update the model's parameters
        loss.backward()
        # Update the model's parameters using the optimizer
        optimizer.step()
        # Append the loss and accuracy for this batch to the corresponding lists
        epoch_loss_list.append(loss.item())
        epoch_accuracy_list.append(accuracy)
        
    # Calculate the mean loss and accuracy for the entire epoch
    epoch_loss = np.mean(epoch_loss_list)
    epoch_accuracy = np.mean(epoch_accuracy_list)
    
    # Return the epoch loss and accuracy
    return epoch_loss, epoch_accuracy

            
        

In [12]:
def evaluate_one_epoch(epoch, GEX_Dataset_test, model, optimizer, criterion):
    # Set the model to eval mode
    model.eval()
    # Tell PyTorch not to track gradients while evaluating the model
    with torch.no_grad():
        # Initialize lists to store the losses and accuracies for each batch
        epoch_loss_list = []
        epoch_accuracy_list = []
        
        # Iterate over the batches in the dataloader
        for batch_idx, (data, target) in enumerate(GEX_Dataset_test):
            # Move the data and target tensors to the specified device (GPU)
            data, target = data.to(device), target.to(device)
            # Feed the data through the model and get the output
            output = model(data)
            # Calculate the loss using the specified loss function
            loss = criterion(output, target)
            # Calculate the accuracy by comparing the model's predictions to the ground truth labels
            accuracy = (output.argmax(1) == target.argmax(1)).type(torch.float).mean().item()
            # Append the loss and accuracy for this batch to the corresponding lists
            epoch_loss_list.append(loss.item())
            epoch_accuracy_list.append(accuracy)
            # Log the batch loss and accuracy
            
            # Calculate and log the confusion matrix for this batch
            # ground_truth_class_ids = target.argmax(1).cpu().numpy()
            # predicted_class_ids = output.argmax(1).cpu().numpy()
            # wandb.log({"conf_mat" : wandb.plot.confusion_matrix(probs=None, y_true=ground_truth_class_ids, preds=predicted_class_ids, class_names=scdata.obs["batch"].unique())})
        
        # Calculate the mean loss and accuracy for the entire epoch
        epoch_loss = np.mean(epoch_loss_list)
        epoch_accuracy = np.mean(epoch_accuracy_list)
        
    # Return the epoch loss and accuracy
    return epoch_loss, epoch_accuracy


In [17]:
def train_func(config):
    # Initialize a new run in Weights & Biases (wandb)
    run = wandb.init(project="Single Cell Omics integration", entity="scintegration", notes="Fine_tune_best_classifier", config=config)

    # Load the learning rate, batch size, epochs, random seed, dropout, and hidden size from the wandb configuration
    lr = wandb.config.lr
    batch_size = wandb.config.batch_size
    epochs = wandb.config.epochs
    random_seed = wandb.config.random_seed
    dropout = wandb.config.dropout
    hidden_size = wandb.config.hidden_size
    
    
    # train test split
    train, test = stratified_split(scdata, 0.2, wandb.config.random_seed, split_criteria='cell_type')
    train_data = scdata[train]
    test_data = scdata[test]
    
    # create datasets
    GEX_Dataset_train = GEX_Dataset(train_data, scaler = "Standard", cat_var = "batch", label_encoder = "one_hot")
    GEX_Dataset_test = GEX_Dataset(test_data, scaler = "Standard", cat_var = "batch", label_encoder = "one_hot")
    
    # Create dataloaders for the training and test datasets
    GEX_dataloader_train = torch.utils.data.DataLoader(GEX_Dataset_train, batch_size = wandb.config.batch_size, shuffle = True)
    GEX_dataloader_test = torch.utils.data.DataLoader(GEX_Dataset_test, batch_size = wandb.config.batch_size, shuffle = True)
    
    input_size = GEX_Dataset_train.n_features
    output_size = GEX_Dataset_train.n_catagories
    

    # Instantiate the model, optimizer, and criterion outside the for loop
    model = classifier(input_size=input_size, dropout=dropout, hidden_size=hidden_size, output_size=output_size)

    # Move the model to the specified device (e.g. GPU)
    model.to(device)

    # Instantiate the Adam optimizer with the specified learning rate
    optimizer = optim.Adam(model.parameters(), lr=wandb.config.lr)

    # Instantiate the Cross Entropy loss function
    criterion = nn.CrossEntropyLoss()
    

    for epoch in range(1, epochs + 1):
        # Train the model for one epoch
        train_loss, train_acc = train_one_epoch(epoch, GEX_dataloader_train, model, optimizer, criterion)

        # Evaluate the model on the test dataset
        val_loss, val_acc = evaluate_one_epoch(epoch, GEX_dataloader_test, model, optimizer, criterion)

        # Log the epoch, train accuracy, train loss, validation accuracy, and validation loss to wandb
        wandb.log({
          'epoch': epoch, 
          'train_acc': train_acc,
          'train_loss': train_loss, 
          'val_acc': val_acc, 
          'val_loss': val_loss
        })


In [14]:
train_func()

[34m[1mwandb[0m: Currently logged in as: [33meamomc[0m ([33mscintegration[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [18]:
config2 = wandb.config = {
  "lr" : 0.00361,
  "epochs": 100,
  "batch_size": 512,
  "dropout": 0.2,
  "hidden_size": 20,
  "random_seed": 9000,
  
}

In [19]:
train_func(config2)

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train epoch accuracy,▁▅▆▆▆▆▆▇▇▇▆▇▇▇▇▆▇▇▇▇▇▇▇▇▇█▇██▆▇██▇▇▇█▇▇█
Train epoch loss,█▄▃▃▃▃▃▂▂▂▃▂▂▂▂▃▂▂▂▂▂▂▂▂▂▁▂▁▁▃▂▁▁▂▂▂▁▂▂▁
Val loss,▇█▇▅▇▆▇▁▆█▄▆▅▃▃▃▅▄▅▂▆▂▄▄▅▃▅▃▃▇▅▃▃▄▅▅▂▆▄▅
Validation epoch accuracy,▃▃▄▄▆▃▅▇▅▆▅▆▆▆▆▅▅▆▆▇▇▆▆▇▆▆▆▆█▁▇▆▆▅▆▆▇▇▆▇
Validation epoch loss,▆▆▅▅▃▆▄▂▄▃▄▃▃▃▃▄▄▃▃▂▂▃▃▂▃▃▃▃▁█▂▃▃▄▃▃▂▂▃▂
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_acc,▁▅▆▆▆▆▆▇▇▇▆▇▇▇▇▆▇▇▇▇▇▇▇▇▇█▇██▆▇██▇▇▇█▇▇█
train_loss,█▄▃▃▃▃▃▂▂▂▃▂▂▂▂▃▂▂▂▂▂▂▂▂▂▁▂▁▁▃▂▁▁▂▂▂▁▂▂▁
val accuracy,▂▁▂▄▂▃▂█▃▁▅▃▄▆▆▆▄▅▄▇▃▇▅▅▄▆▄▆▆▂▄▆▆▅▄▄▇▃▅▄
val_acc,▃▃▄▄▆▃▅▇▅▆▅▆▆▆▆▅▅▆▆▇▇▆▆▇▆▆▆▆█▁▇▆▆▅▆▆▇▇▆▇

0,1
Train epoch accuracy,0.78836
Train epoch loss,1.90069
Val loss,1.84061
Validation epoch accuracy,0.77196
Validation epoch loss,1.91708
epoch,200.0
train_acc,0.78836
train_loss,1.90069
val accuracy,0.84848
val_acc,0.77196


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016721261116663298, max=1.0…

In [20]:
config3 = wandb.config = {
  "lr" : 0.00361,
  "epochs": 100,
  "batch_size": 256,
  "dropout": 0.2,
  "hidden_size": 20,
  "random_seed": 9000,
  
}

In [21]:
train_func(config3)

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train epoch accuracy,▁▆▇▇████████████████████████████████████
Train epoch loss,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Val loss,▇▇▅▆▅▄▃▅▄▅▆▄▄▅▅▅▅▅▅▃▅▄█▅▅▇▆▆▅▄▅▄▅▄▅▄▅▅▁▅
Validation epoch accuracy,▁▄█████▇███▇█████▇▇█▇█████████████████▇█
Validation epoch loss,█▅▁▁▁▁▁▂▁▁▁▂▁▁▁▁▁▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_acc,▁▆▇▇████████████████████████████████████
train_loss,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val accuracy,▂▂▄▃▄▄▅▄▄▄▃▄▅▃▄▄▄▄▄▆▄▄▁▄▄▂▃▃▃▄▄▄▄▄▄▅▄▄█▃
val_acc,▁▄█████▇███▇█████▇▇█▇█████████████████▇█

0,1
Train epoch accuracy,0.88941
Train epoch loss,1.79958
Val loss,1.90121
Validation epoch accuracy,0.84713
Validation epoch loss,1.84166
epoch,100.0
train_acc,0.88941
train_loss,1.79958
val accuracy,0.78788
val_acc,0.84713


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016670984033339664, max=1.0…