In [1]:
import os

os.chdir('../')

print(os.getcwd())

c:\Users\tolayi1\Documents\GitHub\Project


In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset

import time, random, datasets, evaluate 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from src.classifier import ModelTrainer

# 1. Load Data

In [3]:
## load data
data = pd.read_csv('./data/data.csv')

# Calculate the maximum length for the sequences
max_antigen_len = max([len(x) for x in data['antigen']])
max_TCR_len = max([len(x) for x in data['TCR']])

# Apply a lambda function to insert spaces between characters
data['antigen'] = data['antigen'].apply(lambda x: ' '.join(list(x)))
data['TCR'] = data['TCR'].apply(lambda x: ' '.join(list(x)))

print(data.shape)
data.head()

(130471, 3)


Unnamed: 0,antigen,TCR,interaction
0,A A G I G I L T V,C A I S E V G V G Q P Q H F,1
1,A A G I G I L T V,C A S S L S F G T E A F F,1
2,A A R A V F L A L,C A S L G A Q N N E Q F,1
3,A A R A V F L A L,C A S S Y S T G D E Q Y F,1
4,A I M D K N I I L,C A S S V D G G S Q P Q H F,1


In [4]:
max_antigen_len, max_TCR_len

(11, 20)

# 2. Tokenize Data

### Method 1:

In [5]:
from transformers import AutoTokenizer
from datasets import Dataset as HFDataset  # Importing Hugging Face Dataset as HFDataset to avoid confusion with PyTorch Dataset
from src.model import BERT_CONFIG

## Tokenizer data
config = BERT_CONFIG
tokenizer = AutoTokenizer.from_pretrained("antigen", config=config)
tokenizer.model_max_length = 64

# Put into Hugging Face dataset
dataset = HFDataset.from_pandas(data)
#dataset = dataset.train_test_split(test_size=0.2)

column_names = data.columns.tolist()

print(f"column names: {column_names}")

def tokenize_function(examples):
    return tokenizer(examples[column_names[0]], examples[column_names[1]], return_special_tokens_mask=False,
                     padding='longest', truncation='longest_first', return_tensors="pt")

tokenized_datasets = dataset.map(
            tokenize_function,
            batched=True,
            #remove_columns=column_names[:2],
            desc="Running tokenizer on dataset"
        )


column names: ['antigen', 'TCR', 'interaction']


Running tokenizer on dataset:   0%|          | 0/130471 [00:00<?, ? examples/s]

In [6]:
tokenized_datasets

Dataset({
    features: ['antigen', 'TCR', 'interaction', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 130471
})

In [7]:
idx = 0
print(tokenized_datasets[idx]['input_ids'])
print(tokenized_datasets[idx]['attention_mask'])
print(tokenized_datasets[idx]['interaction'])
print(len(tokenized_datasets[idx]['input_ids']))

print(f"{tokenized_datasets[idx]['antigen'], tokenized_datasets[idx]['TCR'], tokenized_datasets[idx]['interaction']}")

[1, 14, 14, 5, 12, 5, 12, 16, 6, 23, 2, 7, 14, 12, 17, 11, 23, 5, 23, 5, 8, 24, 8, 21, 20, 2, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
1
33
('A A G I G I L T V', 'C A I S E V G V G Q P Q H F', 1)


In [8]:
tokenizer.decode(tokenized_datasets[idx]["input_ids"])

'[CLS] A A G I G I L T V [SEP] C A I S E V G V G Q P Q H F [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

## Method 2:

In [16]:
## Tokenizer data
config = BERT_CONFIG
tokenizer = AutoTokenizer.from_pretrained("antigen", config=config)
tokenizer.model_max_length = 64

# Put into Hugging Face dataset
dataset = HFDataset.from_pandas(data)
#dataset = dataset.train_test_split(test_size=0.2)

column_names = data.columns.tolist()

print(f"column names: {column_names}")

# 3 for [CLS], [SEP], [PAD] and spaces between characters in the sequences
max_len = max_antigen_len + max_TCR_len + 3
print(f"max_len: {max_len}")
def tokenize_function(examples):
    return tokenizer(examples[column_names[0]], examples[column_names[1]], max_length=max_len, padding='max_length', truncation=True, return_tensors="pt")

tokenized_datasets = dataset.map(
            tokenize_function,
            batched=True,
            #remove_columns=column_names[:2],
            desc="Running tokenizer on dataset"
        )


column names: ['antigen', 'TCR', 'interaction']
max_len: 34


Running tokenizer on dataset:   0%|          | 0/130471 [00:00<?, ? examples/s]

In [17]:
tokenized_datasets

Dataset({
    features: ['antigen', 'TCR', 'interaction', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 130471
})

In [18]:
idx = 50
print(tokenized_datasets[idx]['input_ids'])
print(tokenized_datasets[idx]['attention_mask'])
print(tokenized_datasets[idx]['interaction'])
print(len(tokenized_datasets[idx]['input_ids']))

print(f"{tokenized_datasets[idx]['antigen'], tokenized_datasets[idx]['TCR'], tokenized_datasets[idx]['interaction']}")

[1, 14, 16, 13, 5, 17, 23, 24, 23, 16, 2, 7, 14, 17, 17, 8, 5, 5, 5, 5, 6, 19, 6, 8, 13, 20, 2, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
1
34
('A L Y G S V P V L', 'C A S S Q G G G G T D T Q Y F', 1)


In [19]:
tokenizer.decode(tokenized_datasets[idx]["input_ids"])

'[CLS] A L Y G S V P V L [SEP] C A S S Q G G G G T D T Q Y F [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

# Method 3:

In [20]:
from src.dataPreprocessing import AntigenTCRDataset

In [21]:
dataset = AntigenTCRDataset('./data/data.csv')
print(len(dataset))

130471


In [22]:
idx = 50
print(dataset[idx]['input_ids'])
print(dataset[idx]['attention_mask'])
print(dataset[idx]['interaction'])
print(len(dataset[idx]['input_ids']), len(dataset[idx]['attention_mask']))
print(dataset[idx]['antigen'], dataset[idx]['tcr'])
print(dataset[idx]['combined_sequences'])

tensor([ 1, 14, 16, 13,  5, 17, 23, 24, 23, 16,  2,  7, 14, 17, 17,  8,  5,  5,
         5,  5,  6, 19,  6,  8, 13, 20,  2,  0,  0,  0,  0,  0,  0,  0])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 0, 0, 0, 0, 0, 0, 0])
tensor(1)
34 34
ALYGSVPVL CASSQGGGGTDTQYF
[CLS] A L Y G S V P V L [SEP] C A S S Q G G G G T D T Q Y F [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [23]:
train_dataset, test_dataset = dataset.split_data(test_size=0.2, state=48)
len(train_dataset), len(test_dataset)

(104376, 26095)

In [24]:
len(train_dataset[0]['input_ids']), len(train_dataset[0]['attention_mask']), (train_dataset[0]['interaction'])

(34, 34, tensor(1))

In [25]:
import torch
from torch.utils.data import Dataset, DataLoader

# # Step 3: Create a DataLoader
dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Iterate Over the DataLoader
for batch in dataloader:
    batch_input_ids = batch['input_ids']
    batch_attention_mask = batch['attention_mask']
    batch_label = batch['interaction']
    print(batch_input_ids)
    print([len(item) for item in batch_input_ids])
    print(batch_attention_mask)
    print([len(item) for item in batch_attention_mask])
    print(batch_label)
    print(f'Length: @input_ids - {len(batch_input_ids)}, @attention_mask - {len(batch_attention_mask)}, @interaction - {len(batch_label)}')
    break


tensor([[ 1, 11, 16,  ...,  0,  0,  0],
        [ 1, 16, 16,  ...,  0,  0,  0],
        [ 1, 20, 15,  ...,  0,  0,  0],
        ...,
        [ 1, 20, 16,  ...,  0,  0,  0],
        [ 1, 22, 16,  ...,  0,  0,  0],
        [ 1,  9, 16,  ...,  0,  0,  0]])
[34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34]
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
[34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34]
tensor([0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 0, 0, 0, 1])
Length: @input_ids - 32, @attention_mask - 32, @interaction - 32


In [26]:
len(dataloader.sampler)

104376

[CLS]antigen[SEP]TCR[EOS]

# 3. Model

In [27]:
# from src.classifier import ModelTrainer
import sys
import numpy as np
import logging
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

from src.model import clf_loss_func, TCRModel # model, loss function

# """
#     package versions:
#         torch: 2.1.1+cu121
#         transformers: 4.35.2
#         sklearn: 1.3.0
#         logging: 0.5.1.2
# """

# key reference: 
#               https://github.com/aws-samples/amazon-sagemaker-protein-classification/blob/main/code/train.py
#               https://medium.com/analytics-vidhya/bert-pre-training-fine-tuning-eb574be614f6
#               https://medium.com/dataseries/k-fold-cross-validation-with-pytorch-and-sklearn-d094aa00105f
class ModelTrainer(nn.Module):

    """
        ************** Train/Test the model using cross validation ************** 
        seed: seed for random number generator
        epochs: number of epochs to train
        lr: learning rate
        train: flag whether to train the model
        log_interval: how many batches to wait before logging training status
        model: takes input_ids: str, attention_mask: str, classification: bool
        
    """

    def __init__(self, model=TCRModel(), seed = 2023, lr=2e-5, epochs=1000, log_interval=10):
        super(ModelTrainer, self).__init__()
        self.seed = seed 
        self.epochs = epochs 
        self.lr = lr    
        self.log_interval = log_interval 
        self.device = torch.device("cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu"))
        self.model = model.to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
        self.loss_func = clf_loss_func #FocalLoss(gamma=3, alpha=0.25, no_agg=True)    

    def validate(self, val_loader, model, device, loss_func):
        """Evaluate the network on the entire validation (part of training data) set."""

        val_loss, val_accuracy = 0, 0
        all_labels = []
        all_predictions = []

        model.eval()
        with torch.no_grad():

            for data in val_loader:
                # get the inputs
                input_ids = data['input_ids'].to(device)
                input_mask = data['attention_mask'].to(device)
                labels = data['interaction'].to(device)
                # forward pass
                outputs = model(input_ids=input_ids, attention_mask=input_mask)
                # loss and accuracy
                loss = loss_func(input=outputs, target=labels)
                val_loss += loss.sum().item() * input_ids.size(0)
                scores, predictions = torch.max(outputs, dim=1)
                val_accuracy += (predictions == labels).sum().item()

                # Store predictions and labels for AUC calculation
                all_labels.extend(labels.cpu().numpy())
                all_predictions.extend(outputs.cpu().detach().numpy()[:, 1])
            
        # Compute AUC
        auc = roc_auc_score(all_labels, all_predictions)


        return val_loss, val_accuracy, auc

    def test(self, test_loader, model, loss_func, device):
        """Evaluate the network on the entire test set and calculate AUC."""

        model.eval()

        test_loss, test_accuracy = 0, 0
        all_labels = []
        all_predictions = []

        with torch.no_grad():
            for data in test_loader:
                # get the inputs
                input_ids = data['input_ids'].to(device)
                input_mask = data['attention_mask'].to(device)
                labels = data['interaction'].to(device)

                # forward pass
                outputs = model(input_ids=input_ids, attention_mask=input_mask)

                # loss and accuracy
                loss = loss_func(input=outputs, target=labels)
                test_loss += loss.sum().item() * input_ids.size(0)

                scores, predictions = torch.max(outputs, dim=1)
                test_accuracy += (predictions == labels).sum().item()

                # Store predictions and labels for AUC calculation
                all_labels.extend(labels.cpu().numpy())
                all_predictions.extend(outputs.cpu().detach().numpy()[:, 1])  # Assuming binary classification (1 is the positive class)

        # Compute AUC
        auc = roc_auc_score(all_labels, all_predictions)

        return test_loss, test_accuracy, auc                  
    

    def train(self, model, train_loader, loss_func, optimizer, device):
        """Train the network on the training set."""
        train_loss, train_accuracy = 0, 0
        all_labels = []
        all_predictions = []

        model.train()
        
        for data in train_loader:
            
            # get the inputs
            input_ids = data['input_ids'].to(device)   # amino acid index numbers
            input_mask = data['attention_mask'].to(device) # attention mask (1 for non-padding token and 0 for padding)
            labels = data['interaction'].to(device) # True for classification task
            # forward pass
            outputs = self.model(input_ids = input_ids, attention_mask = input_mask)
            # loss and backward pass
            loss = self.loss_func(input=outputs, target=labels)
            loss.mean().backward()
            optimizer.step()
            optimizer.zero_grad()
            # loss and accuracy
            train_loss += loss.sum().item() * input_ids.size(0)
            scores, predictions = torch.max(outputs, dim=1)
            train_accuracy += (predictions == labels).sum().item()

            # auc score
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(outputs.cpu().detach().numpy()[:, 1])

        # Compute AUC
        auc = roc_auc_score(all_labels, all_predictions)

        return train_loss, train_accuracy, auc

    def execute_run(self, train_loader, test_loader, fold = 3, batch_size = 32):
        '''Train, Test and Validate the network on the training set using cross validation.'''

        print(f"Training on: {self.device}")

        torch.manual_seed(self.seed) # set the seed for generating random numbers

        if torch.cuda.is_available():
            torch.cuda.manual_seed(self.seed)
        
        # split data for K-fold cross validation to avoid overfitting
        self.fold = fold
        indices = list(range(len(train_loader.dataset)))
        kf = KFold(n_splits=self.fold, shuffle=True)

        for cv_index, (train_indices, valid_indices) in enumerate(kf.split(indices)):

            train_sampler = SubsetRandomSampler(train_indices)
            valid_sampler = SubsetRandomSampler(valid_indices)

            train_loader = DataLoader(train_loader.dataset, batch_size=batch_size,
                                                       sampler=train_sampler,
                                                       shuffle=False, pin_memory=True)
            val_loader = DataLoader(train_loader.dataset, batch_size=batch_size,
                                                     sampler=valid_sampler,
                                                     shuffle=False, pin_memory=True)
            
            print("CV: {}".format(cv_index))

            self.history = {'train_loss': [], 'val_loss': [],'train_acc':[],'val_acc':[]}

            for epoch in range(0, self.epochs + 1):
                # Training
                epoch_train_loss, epoch_train_accuracy, auc_train = self.train(model=self.model, 
                                                                    train_loader=train_loader, loss_func=self.loss_func, 
                                                                    optimizer=self.optimizer, device=self.device)
                # Validation
                epoch_val_loss, epoch_val_accuracy, auc_val = self.validate(val_loader=val_loader, 
                                                                    model=self.model, loss_func=self.loss_func, 
                                                                    device=self.device)
                # 
                train_loss = epoch_train_loss / len(train_loader.sampler)
                train_accuracy = epoch_train_accuracy * 100 / len(train_loader.sampler)
                val_loss = epoch_val_loss / len(val_loader.sampler)
                val_accuracy = epoch_val_accuracy * 100/ len(val_loader.sampler)

                self.history['train_loss'].append(train_loss)    
                self.history['train_acc'].append(train_accuracy)
                self.history['val_loss'].append(val_loss)
                self.history['val_acc'].append(val_accuracy)

                # train & validation error after every epoch
                print("Epoch: {}/{}, Training Loss: {:.4f}, Training Accuracy: {:.2f} %, Train AUC score: {:.2f}, Validation Loss: {:.4f}, Validation Accuracy: {:.2f} %, Validation AUC score: {:.2f}".format(
                                epoch, self.epochs, train_loss, train_accuracy, auc_train, val_loss, val_accuracy, auc_val))
        
        # model testing
        print('Testing the model...')
        test_loss, test_accuracy, auc_test = self.test(test_loader=test_loader, model=self.model, loss_func=self.loss_func, device=self.device)
        test_loss_, test_accuracy_ = test_loss / len(test_loader.sampler), test_accuracy * 100 / len(test_loader.sampler)

        print("Test Loss: {:.4f}, Test Accuracy: {:.2f} %, Test AUC score".format(test_loss_, test_accuracy_, auc_test))
        print('Finished training & testing the model.')

    def save(self, path):
        """Save the model to the path specified."""
        # save model
        self.model.save(f"{path}.pt")
        # save history
        avg_train_loss = np.mean(self.history['train_loss'])
        avg_val_loss = np.mean(self.history['val_loss'])
        avg_train_acc = np.mean(self.history['train_acc'])
        avg_val_acc = np.mean(self.history['val_acc'])

        print('Performance of {} fold cross validation'.format(self.fold))
        print("Average Training Loss: {:.4f} \t Average Val Loss: {:.4f} \t Average Training Acc: {:.3f} \t Average Val Acc: {:.3f}".format(avg_train_loss, avg_val_loss,avg_train_acc,avg_val_acc))  
        
        np.save(f'{path}_history.npy', self.history)


In [28]:
# timer
start_time = time.time()
# dataset
batch_size = 128
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
# model

epoch=1
Model = ModelTrainer(epochs=epoch, lr=1e-3)
Model.execute_run(train_loader=train_loader, test_loader=test_loader, batch_size=batch_size, fold=3)

end_time = time.time()

print(f"Total time taken: {round((end_time - start_time)/60, 2)} mins")

Training on: cuda
CV: 0
Epoch: 0/1, Training Loss: 5.3285, Training Accuracy: 49.49 %, Train AUC score: 0.50, Validation Loss: 4.1913, Validation Accuracy: 74.97 %, Train AUC score: 0.49
Epoch: 1/1, Training Loss: 4.2680, Training Accuracy: 50.04 %, Train AUC score: 0.50, Validation Loss: 4.1612, Validation Accuracy: 25.03 %, Train AUC score: 0.50
CV: 1
Epoch: 0/1, Training Loss: 4.2135, Training Accuracy: 50.26 %, Train AUC score: 0.50, Validation Loss: 4.1460, Validation Accuracy: 24.58 %, Train AUC score: 0.50
Epoch: 1/1, Training Loss: 4.2034, Training Accuracy: 49.23 %, Train AUC score: 0.50, Validation Loss: 4.1361, Validation Accuracy: 24.58 %, Train AUC score: 0.50
CV: 2
Epoch: 0/1, Training Loss: 4.1744, Training Accuracy: 51.00 %, Train AUC score: 0.50, Validation Loss: 4.1764, Validation Accuracy: 25.32 %, Train AUC score: 0.50
Epoch: 1/1, Training Loss: 4.1758, Training Accuracy: 50.58 %, Train AUC score: 0.50, Validation Loss: 4.1748, Validation Accuracy: 25.32 %, Train AU