<a href="https://colab.research.google.com/github/FaroukChalghoumi/IndabaX/blob/Transformers/NewTrainingLoop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
#@title Download data from GCP bucket
import sys

if 'google.colab' in sys.modules:
  !gsutil -m cp -r gs://indaba-data .
else:
  !mkdir -p indaba-data/train
  !wget -P indaba-data/train https://storage.googleapis.com/indaba-data/train/train.csv --continue
  !wget -P indaba-data/train https://storage.googleapis.com/indaba-data/train/train_mut.pt --continue
  !wget -P indaba-data/train https://storage.googleapis.com/indaba-data/train/train_wt.pt --continue

  !mkdir -p indaba-data/test
  !wget -P indaba-data/test https://storage.googleapis.com/indaba-data/test/test.csv --continue
  !wget -P indaba-data/test https://storage.googleapis.com/indaba-data/test/test_mut.pt --continue
  !wget -P indaba-data/test https://storage.googleapis.com/indaba-data/test/test_wt.pt --continue

Copying gs://indaba-data/test/test_mut.pt...
/ [0 files][    0.0 B/  9.3 MiB]                                                Copying gs://indaba-data/README.txt...
Copying gs://indaba-data/test/test.csv...
/ [0 files][    0.0 B/  9.3 MiB]                                                / [0 files][    0.0 B/  9.6 MiB]                                                Copying gs://indaba-data/test/test_wt.pt...
Copying gs://indaba-data/train/train_mut.pt...
/ [0/9 files][    0.0 B/  3.3 GiB]   0% Done                                    / [0/9 files][    0.0 B/  3.3 GiB]   0% Done                                    Copying gs://indaba-data/train/train.csv...
/ [0/9 files][    0.0 B/  3.3 GiB]   0% Done                                    Copying gs://indaba-data/train/train_wt.pt...
/ [0/9 files][    0.0 B/  3.3 GiB]   0% Done                                    ==> NOTE: You are downloading one or more large file(s), which would
run significantly faster if you enabled sliced object dow

In [18]:
#@title Imports and moving to working directory
import torch 
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import torch.nn as nn
import numpy as np
from sklearn.preprocessing import StandardScaler
from torch.nn.utils.rnn import pad_sequence
import matplotlib.pyplot as plt


# move to data folder
%cd /content/indaba-data

/content/indaba-data


In [4]:
# Load Embedding tensors & Traing csv
# Embeddings were calculated using the ESM 650M pretrained model 
# Tensor shape of embedded data:  [data_len,1280] 
# There are no sequences in the Embedding tensors as we've performed an average of it (torch.mean(embed, dim=1))
# More details in https://huggingface.co/facebook/esm2_t33_650M_UR50D

wt_emb = torch.load("train/train_wt.pt")
mut_emb = torch.load("train/train_mut.pt")
df = pd.read_csv("train/train.csv")

In [5]:
# [Recommended] Split data into train and validation 
#TODO




# Load the dataset
train_data = pd.read_csv('train/train.csv')

# 1. Extract relevant columns: ID, ddg
relevant_columns = ['ID', 'pdb_id', 'mutation', 'wt_seq', 'mut_seq', 'ddg']
train_data = train_data[relevant_columns]

# 2. Convert amino acid sequences (wt_seq and mut_seq) to numerical representations
# Define the amino acid alphabet
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'

# Convert amino acid sequence to numerical representation
def seq_to_numerical(sequence):
    return np.array([amino_acids.index(aa) for aa in sequence], dtype=np.int64)

# Apply the conversion to wt_seq and mut_seq columns
train_data['wt_seq'] = train_data['wt_seq'].apply(seq_to_numerical)
train_data['mut_seq'] = train_data['mut_seq'].apply(seq_to_numerical)

# 3. Normalize the input data if necessary
# Assuming you want to normalize the ddg column, you can use StandardScaler from sklearn
scaler = StandardScaler()
train_data['ddg'] = scaler.fit_transform(train_data[['ddg']].astype(np.float64))

# 4. Handle missing values, if any
# Assuming you want to drop rows with missing values
train_data.dropna(inplace=True)

# 5. Any other preprocessing steps specific to your use case

# Convert data into PyTorch tensors
#train_mut_tensors = torch.tensor(train_data['mutation'].values)
# Convert data into PyTorch tensors

# Convert data into PyTorch tensors
train_wt_tensors = [torch.tensor(seq) for seq in train_data['wt_seq']]
train_ddg_tensors = torch.tensor(train_data['ddg'].values, dtype=torch.float32)

# Pad sequences to ensure equal length
train_wt_tensors = pad_sequence(train_wt_tensors, batch_first=True)
train_ddg_tensors = train_ddg_tensors.unsqueeze(1)

# Split the data into train and validation sets
train_wt_tensors, val_wt_tensors, train_ddg_tensors, val_ddg_tensors = train_test_split(
    train_wt_tensors, train_ddg_tensors, test_size=0.2, random_state=42)

# Create DataLoader instances for training and validation sets
train_dataset = torch.utils.data.TensorDataset(train_wt_tensors, train_ddg_tensors)
val_dataset = torch.utils.data.TensorDataset(val_wt_tensors, val_ddg_tensors)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)

# Now you have train and validation dataloaders with the preprocessed data
# You can pass these dataloaders to your model for training and evaluation

# Example usage:
# for epoch in range(num_epochs):
#     # Training loop
#     for batch_mut, batch_wt, batch_ddg in train_dataloader:
#         # Training steps

#     # Validation loop
#     for batch_mut, batch_wt, batch_ddg in val_dataloader:
#         # Validation steps



# # Load embedding tensors
# train_mut_embeddings = torch.load('train/train_mut.pt')
# train_wt_embeddings = torch.load('train/train_wt.pt')

# # Load training CSV
# train_data = pd.read_csv('train/train.csv')

# # Splitting the data into train and validation sets
# train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

# # Printing the number of samples in each set
# print("Number of samples in the training set:", len(train_data))
# print("Number of samples in the validation set:", len(val_data))

In [6]:
# Building the dataset class
class EmbeddingDataset(torch.utils.data.Dataset):
  def __init__(self,mut_pt, wt_pt, data_df):
    self.pt_mut = mut_pt
    self.pt_wt = wt_pt
    self.df = data_df
  
  def __len__(self):
      return self.pt_mut.shape[0]

  def __getitem__(self, index):
    o1=self.pt_mut[index,:]
    o2=self.pt_wt[index,:]
    if "ddg" in self.df:
      df_out=torch.Tensor([self.df["ddg"][index]])
    else:
      df_out=torch.Tensor([self.df["ID"][index]])
    return  self.pt_mut[index,:],self.pt_wt[index,:],df_out 

In [11]:
# creating training dataset and dataloader
train_dataset = EmbeddingDataset(wt_emb, mut_emb, df)
# preparing a dataloader for the training
train_dataloader = torch.utils.data.dataloader.DataLoader(
        train_dataset,
        batch_size=64,
        shuffle=True,
        num_workers=2,
    )
# [Recommended] Use Data validation loader too


In [12]:
# # Building a simple pytorch model
# # A dummy model (2-param) that demonstrates the usage of the dataset

# class StabilityModel(torch.nn.Module):
#   def __init__(self):
#     super(StabilityModel, self).__init__()
#     self.lin = torch.nn.Linear(1,1)

#   def forward(self, x, y):
#     # run the forward pass
#     # output should be the stability estimation [batch,estim]
#     return self.lin(torch.mean(x-y,dim=1).reshape(-1,1)) 

# class ProteinModel(nn.Module):
#     def __init__(self, input_dim):
#         super(ProteinModel, self).__init__()
#         self.fc1 = nn.Linear(input_dim * 2, 256)
#         self.fc2 = nn.Linear(256, 128)
#         self.fc3 = nn.Linear(128, 1)
#         self.relu = nn.ReLU()
        
#     def forward(self, mut_emb, wt_emb):
#         x = torch.cat((mut_emb, wt_emb), dim=1)
#         x = self.fc1(x)
#         x = self.relu(x)
#         x = self.fc2(x)
#         x = self.relu(x)
#         output = self.fc3(x)
#         return output.squeeze(1)

class ProteinModel(nn.Module):
    def __init__(self, input_dim):
        super(ProteinModel, self).__init__()
        self.fc1 = nn.Linear(input_dim * 2, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, 1)
        self.relu = nn.ReLU()

    def forward(self, mut_emb, wt_emb):
        x = torch.cat((mut_emb, wt_emb), dim=1)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        x = self.relu(x)
        x = self.fc4(x)
        x = self.relu(x)
        output = self.fc5(x)
        return output.squeeze(1)


# Instantiate the model
input_dim = 1280  # Update with the actual input dimension
model = ProteinModel(input_dim)

# Set the model to evaluation mode
model.eval()


ProteinModel(
  (fc1): Linear(in_features=2560, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=128, bias=True)
  (fc4): Linear(in_features=128, out_features=64, bias=True)
  (fc5): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
)

In [16]:
# # Example of training script
# device = torch.device("cuda")
# model =  StabilityModel().to(device)
# optimizer = torch.optim.Adadelta(model.parameters(), lr=0.0001)
# criterion = torch.nn.MSELoss()
# epoch_loss = 0
# for i in range(1):
#   epoch_loss = 0
#   for batch_idx, (data_mut,data_wt , target) in tqdm(enumerate(train_dataloader)):
#       # extract input from datallader
#       x1 = data_wt.to(device)
#       x2 = data_mut.to(device)
#       y = target.to(device)
#       # make prediction
#       y_pred = model(x1,x2)
#       # calculate loss and run optimizer
#       loss = torch.sqrt(criterion(y, y_pred))
#       loss.backward()
#       optimizer.step()
#       epoch_loss += loss
#   print("epoch_",i," = ", epoch_loss/len(train_dataloader))
# #   # [Recommended] Save trained models to select best checkpoint for prediction (or add prediction in the epochs loop)

# # Set device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



# # Define hyperparameters
# learning_rate = 0.001
# batch_size = 32
# num_epochs = 10

# # Create instances of your model and move them to the device
# model = YourModel().to(device)

# # Define the loss function
# loss_fn = nn.MSELoss()

# # Define the optimizer
# learning_rate = 0.001

# model = ProteinModel(input_dim)  # Instantiate the model with the appropriate input dimension
# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


# # Create DataLoader instances for your training and validation datasets
# train_dataset = EmbeddingDataset(train_mut_embeddings, train_wt_embeddings, train_data)
# train_loader = tqdm(train_dataset, batch_size=batch_size, shuffle=True)

# val_dataset = EmbeddingDataset(train_mut_embeddings, train_wt_embeddings, val_data)
# val_loader = DataLoader(val_dataset, batch_size=batch_size)

# # Training loop
# for epoch in range(num_epochs):
#     # Set model to training mode
#     model.train()

#     for batch in train_loader:
#         # Move batch to device
#         batch = [item.to(device) for item in batch]

#         # Forward pass
#         output = model(batch[0], batch[1])

#         # Compute loss
#         loss = loss_fn(output, batch[2])

#         # Backward pass and optimization
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#     # Set model to evaluation mode
#     model.eval()

#     # Validation
#     with torch.no_grad():
#         val_loss = 0.0
#         num_val_samples = 0

#         for batch in val_loader:
#             # Move batch to device
#             batch = [item.to(device) for item in batch]

#             # Forward pass
#             output = model(batch[0], batch[1])

#             # Compute loss
#             val_loss += loss_fn(output, batch[2]).item() * batch[0].size(0)
#             num_val_samples += batch[0].size(0)

#         val_loss /= num_val_samples

#     # Print training and validation loss for each epoch
#     print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {loss.item():.4f}, Validation Loss: {val_loss:.4f}")

# # Save the trained model
# torch.save(model.state_dict(), "trained_model.pt")

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# model = ProteinModel(input_dim=1280).to(device)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# criterion = torch.nn.MSELoss()

# num_epochs = 10

# train_losses = []
# val_losses = []

# for epoch in range(num_epochs):
#     model.train()
#     train_loss = 0.0
    
#     for batch_idx, (data_mut, data_wt, target) in tqdm(enumerate(train_dataloader), total=len(train_dataloader), desc=f"Epoch {epoch+1}"):
#         data_mut = data_mut.to(device)
#         data_wt = data_wt.to(device)
#         target = target.to(device)
        
#         optimizer.zero_grad()
        
#         # Forward pass
#         output = model(data_mut, data_wt)
        
#         # Compute loss
#         loss = torch.sqrt(criterion(output, target))
        
#         # Backward pass
#         loss.backward()
#         optimizer.step()
        
#         train_loss += loss.item()
    
#     avg_train_loss = train_loss / len(train_dataloader)
#     train_losses.append(avg_train_loss)
    
#     model.eval()
#     val_loss = 0.0
    
#     with torch.no_grad():
#         for batch_idx, (data_mut, data_wt, target) in tqdm(enumerate(test_dataloader), total=len(test_dataloader), desc="Validation"):
#             data_mut = data_mut.to(device)
#             data_wt = data_wt.to(device)
#             target = target.to(device)
            
#             output = model(data_mut, data_wt)
#             loss = torch.sqrt(criterion(output, target))
#             val_loss += loss.item()
        
#     avg_val_loss = val_loss / len(test_dataloader)
#     val_losses.append(avg_val_loss)
    
#     print(f"Epoch {epoch+1}: Train Loss = {avg_train_loss}, Validation Loss = {avg_val_loss}")

# # Save the trained model
# torch.save(model.state_dict(), "trained_model.pth")













#####################################################################
# # Set device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Initialize your improved model
# model = ProteinModel(input_dim=1280).to(device)

# # Define loss function (MSE)
# criterion = nn.MSELoss()

# # Define optimizer with a lower learning rate
# optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# # Training loop
# num_epochs = 20  # Increase the number of epochs for better training
# best_val_loss = float('inf')
# skip_epochs = 0
# prev_train_loss = None
# prev_val_loss = None

# for epoch in range(num_epochs):
#     train_loss = 0.0
#     val_loss = 0.0
    
#     # Training phase
#     model.train()
#     for data_mut, data_wt, target in train_dataloader:
#         data_mut = data_mut.to(device)
#         data_wt = data_wt.to(device)
#         target = target.to(device)
        
#         optimizer.zero_grad()
#         output = model(data_mut, data_wt)
#         loss = criterion(output, target)
#         loss.backward()
#         optimizer.step()
        
#         train_loss += loss.item() * data_mut.size(0)
    
#     train_loss /= len(train_dataloader.dataset)
    
#     # Validation phase
#     model.eval()
#     with torch.no_grad():
#         for data_mut, data_wt, target in train_dataloader:
#             data_mut = data_mut.to(device)
#             data_wt = data_wt.to(device)
#             target = target.to(device)
            
#             output = model(data_mut, data_wt)
#             loss = criterion(output, target)
            
#             val_loss += loss.item() * data_mut.size(0)
    
#     val_loss /= len(train_dataloader.dataset)
    
#     print(f"Epoch {epoch+1}: Training Loss = {train_loss}, Validation Loss = {val_loss}")
    
#     # Check if training loss and validation loss are the same as the previous epoch
#     if train_loss == prev_train_loss and val_loss == prev_val_loss:
#         skip_epochs += 1
#         if skip_epochs >= 3:
#             print("Training loss and validation loss are not improving. Stopping training.")
#             break
#     else:
#         skip_epochs = 0
    
#     prev_train_loss = train_loss
#     prev_val_loss = val_loss
    
#     # Save the model if validation loss is improved
#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#         torch.save(model.state_dict(), "best_model.pt")



# Instantiate the model
input_dim = 1280  # Update with the actual input dimension
model = ProteinModel(input_dim).to(device)

# Define the loss function
criterion = nn.MSELoss()

# Define the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Define the training loop
num_epochs = 10
best_val_loss = float('inf')

for epoch in range(num_epochs):
    train_loss = 0.0
    val_loss = 0.0
    
    # Training phase
    model.train()
    for data_mut, data_wt, target in train_dataloader:
        data_mut = data_mut.to(device)
        data_wt = data_wt.to(device)
        target = target.to(device)
        
        optimizer.zero_grad()
        output = model(data_mut, data_wt)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item() * data_mut.size(0)
    
    train_loss /= len(train_dataloader.dataset)
    
    # Validation phase
    model.eval()
    with torch.no_grad():
        for data_mut, data_wt, target in val_dataloader:
            data_mut = data_mut.to(device)
            data_wt = data_wt.to(device)
            target = target.to(device)
            
            output = model(data_mut, data_wt)
            loss = criterion(output, target)
            
            val_loss += loss.item() * data_mut.size(0)
    
    val_loss /= len(val_dataloader.dataset)
    
    print(f"Epoch {epoch+1}: Training Loss = {train_loss}, Validation Loss = {val_loss}")
    
    # Save the model if validation loss is improved
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_model.pt")




  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


ValueError: ignored

In [None]:
# Define the learning rate scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2)

# Define the training loop
num_epochs = 10
batch_size = 64
accumulation_steps = 8
best_val_loss = float('inf')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_loss_history = []
val_loss_history = []

model.train()
for epoch in range(num_epochs):
    train_loss = 0.0
    val_loss = 0.0
    
    for batch_idx, (data_mut, data_wt, target) in enumerate(train_dataloader):
        data_mut = data_mut.to(device)
        data_wt = data_wt.to(device)
        target = target.to(device)
        
        optimizer.zero_grad()
        output = model(data_mut, data_wt)
        loss = criterion(output, target)
        loss.backward()
        
        if (batch_idx + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        
        train_loss += loss.item() * data_mut.size(0)
    
    train_loss /=len(train_dataloader.dataset)
    # Validation phase
model.eval()
with torch.no_grad():
    for data_mut, data_wt, target in val_dataloader:
        data_mut = data_mut.to(device)
        data_wt = data_wt.to(device)
        target = target.to(device)
        
        output = model(data_mut, data_wt)
        loss = criterion(output, target)
        
        val_loss += loss.item() * data_mut.size(0)

        val_loss /= len(val_dataloader.dataset)

        # Update learning rate scheduler
        scheduler.step(val_loss)

        # Save the model if validation loss is improved
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), "best_model.pt")

        # Print epoch statistics
        print(f"Epoch {epoch+1}: Training Loss = {train_loss:.4f}, Validation Loss = {val_loss:.4f}")

        # Keep track of loss history
        train_loss_history.append(train_loss)
        val_loss_history.append(val_loss)

        # Check early stopping condition
        if epoch > 0 and val_loss_history[-1] >= val_loss_history[-2]:
            print("Validation loss did not improve. Stopping early.")
        break 

        plt.plot(train_loss_history, label='Training Loss')
        plt.plot(val_loss_history, label='Validation Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        plt.show()


## Prediction & submission

In [15]:
import pandas as pd

df_result = pd.DataFrame()
with torch.no_grad():
    for batch_idx, (data_mut, data_wt, target) in tqdm(enumerate(train_dataloader)):
        data_mut = data_mut.to(device)
        data_wt = data_wt.to(device)
        id = target.to(device)
        # Make prediction
        y_pred = model(data_mut, data_wt)
        df_batch = pd.DataFrame({"ID": id.squeeze().cpu().numpy().astype(int), "DDG": y_pred.squeeze().cpu().numpy()})
        df_result = pd.concat([df_result, df_batch])

# Save the DataFrame to a CSV file
df_result.to_csv("predictionsnewwwwww.csv", index=False)

5310it [01:12, 73.60it/s]


In [34]:
# load embedding tensors & traing csv
wt_test_emb = torch.load("test/test_wt.pt")
mut_test_emb = torch.load("test/test_mut.pt")
df_test = pd.read_csv("test/test.csv")

In [35]:
# creating testing dataset and loading the embedding
test_dataset = EmbeddingDataset(wt_test_emb,mut_test_emb,df_test)
# preparing a dataloader for the testing
test_dataloader = torch.utils.data.dataloader.DataLoader(
        test_dataset,
        batch_size=32,
        shuffle=False,
        num_workers=2,
    )

In [36]:
df_result = pd.DataFrame()
with torch.no_grad():
  for batch_idx, (data_mut,data_wt , target) in tqdm(enumerate(test_dataloader)):
    x1 = data_wt.to(device)
    x2 = data_mut.to(device)
    id = target.to(device)
    # make prediction
    y_pred = model(x1,x2)
    df_result = pd.concat([df_result, pd.DataFrame({"ID":id.squeeze().cpu().numpy().astype(int) , "ddg" : y_pred.squeeze().cpu().numpy()})])

60it [00:00, 88.32it/s] 


In [37]:
df_result.to_csv("submission.csv",index=False)