In [None]:
# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline

# Imports & Packages

In [None]:
!pip install -U ipywidgets

In [None]:
import os
import re

import numpy as np
import pandas as pd
import ray
import torch
import torch.nn.functional as F
import torch.optim as optim
from ray import train as ray_train
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Constants

In [None]:
# Dataset locations (Kaggle input directory; DATA_PATH keeps its trailing slash)

DATA_PATH = "/kaggle/input/deep-learning-for-msc-202324/"
labels_train_path = f"{DATA_PATH}labels_train.csv"
sample_path = f"{DATA_PATH}sample.csv"
seqs_test_path = f"{DATA_PATH}seqs_test.csv"
seqs_train_path = f"{DATA_PATH}seqs_train.csv"
train_path = f"{DATA_PATH}train"
test_path = f"{DATA_PATH}test"

# Amino-acid letter -> integer code; a letter's position in the string is its code.
amino_acid_mapping = {
    letter: code
    for code, letter in enumerate(
        "ACDEFGHIKLMNPQRSTVWY"  # 0-19: the twenty standard amino acids
        "X"  # 20: typically used for unknown amino acids
        "B"  # 21: asparagine or aspartic acid
        "Z"  # 22: glutamine or glutamic acid
        "J"  # 23: leucine or isoleucine
        "-"  # 24: gap or padding
    )
}

# Secondary-structure letter -> class index used as the training target.
sec_struct_mapping = dict(zip("HEC", range(3)))

# Dataset Class & Utils

In [None]:
class ProteinDataset(Dataset):
    """Per-protein samples built from a sequence table and a directory of per-protein CSV files.

    Indexing yields ``(protein_id, one_hot_sequence, normalized_pssm)`` and, when a
    label file was supplied, a fourth element with the integer-coded secondary
    structure of every residue.
    """

    def __init__(self, csv_file, train_dir, label_file=None, normalize_method='min-max'):
        """Read every input table into memory.

        Args:
            csv_file: CSV providing the ``PDB_ID`` and ``SEQUENCE`` columns.
            train_dir: Directory of per-protein ``.csv`` files. The part of each
                file name before ``_train`` / ``_test`` is used as the protein id.
            label_file: Optional CSV with a ``SEC_STRUCT`` column. Row ``i`` of
                this file is paired with row ``i`` of ``csv_file``.
            normalize_method: ``'min-max'`` or ``'z-score'``; any other value
                leaves the numeric values unscaled.
        """
        self.seqs = pd.read_csv(csv_file)

        # protein id -> table read from that protein's file (non-CSV files are ignored)
        self.protein_data = {
            re.split(r'_train|_test', name)[0]: pd.read_csv(os.path.join(train_dir, name))
            for name in os.listdir(train_dir)
            if name.endswith(".csv")
        }

        self.labels = pd.read_csv(label_file) if label_file else None

        self.amino_acid_mapping = amino_acid_mapping
        self.normalize_method = normalize_method

    def encode_sequence(self, sequence):
        """One-hot encode ``sequence`` into an integer array of shape (len(sequence), n_symbols).

        Characters missing from the mapping are encoded as ``'X'``.
        """
        table = self.amino_acid_mapping
        one_hot = np.zeros((len(sequence), len(table)), dtype=int)
        hot_columns = np.array(
            [table.get(residue, table['X']) for residue in sequence],
            dtype=np.intp,
        )
        # One fancy-indexed assignment sets the single hot entry of every row.
        one_hot[np.arange(len(sequence)), hot_columns] = 1
        return one_hot

    def normalize_pssm(self, pssm):
        """Cast the numeric part of ``pssm`` to float32 and scale it column-wise.

        The first two columns are dropped before the conversion. Constant
        columns are divided by 1 instead of 0.

        Raises:
            ValueError: If the remaining columns cannot be converted to float.
        """
        raw_columns = pssm[:, 2:]

        try:
            values = raw_columns.astype(np.float32)
        except ValueError as e:
            raise ValueError(f"Error converting PSSM to float: {e}")

        method = self.normalize_method
        if method == 'min-max':
            lowest = values.min(axis=0)
            span = values.max(axis=0) - lowest
            return (values - lowest) / np.where(span == 0, 1, span)
        if method == 'z-score':
            center = values.mean(axis=0)
            spread = values.std(axis=0)
            return (values - center) / np.where(spread == 0, 1, spread)
        # Unrecognised method: hand back the unscaled float values.
        return values

    def __len__(self):
        """Number of rows in the sequence table."""
        return len(self.seqs)

    def __getitem__(self, idx):
        """Build the tensors for the protein stored at row ``idx`` of the sequence table."""
        row = self.seqs.iloc[idx]
        protein_id = row['PDB_ID']
        one_hot = self.encode_sequence(row['SEQUENCE'])
        scaled_pssm = self.normalize_pssm(self.protein_data[protein_id].values)

        label_tensor = None
        if self.labels is not None:
            structure = self.labels.iloc[idx]['SEC_STRUCT']
            label_tensor = torch.tensor(
                [sec_struct_mapping[state] for state in structure],
                dtype=torch.long,
            )

        sample = (
            protein_id,
            torch.tensor(one_hot, dtype=torch.float32),
            torch.tensor(scaled_pssm, dtype=torch.float32),
        )
        return sample if label_tensor is None else sample + (label_tensor,)

In [None]:
def collate_fn(batch):
    """Pad a list of dataset samples into batch tensors.

    Args:
        batch: Samples shaped ``(protein_id, sequence, pssm)`` or
            ``(protein_id, sequence, pssm, labels)``. The protein ids are dropped.

    Returns:
        ``(sequences_padded, pssms_padded, labels_padded)`` with the batch
        dimension first. ``labels_padded`` is ``None`` when the samples carry
        no labels.

    Note:
        Everything is padded with 0. For labels, 0 is also the index of class
        'H' in ``sec_struct_mapping``, so consumers have to exclude padded
        positions (e.g. the all-zero rows of the one-hot sequence tensor) when
        computing losses or metrics.
    """
    columns = list(zip(*batch))  # transpose: one tuple per sample field
    sequences, pssms = columns[1], columns[2]
    # Unlabeled samples are 3-tuples; unpacking four fields unconditionally
    # would fail on them before the "labels exist" check could ever run.
    labels_list = columns[3] if len(columns) > 3 else None

    # pad_sequence allocates a fresh tensor, so the inputs need no defensive copy.
    sequences_padded = pad_sequence(list(sequences), batch_first=True)
    pssms_padded = pad_sequence(list(pssms), batch_first=True)

    if labels_list is not None and labels_list[0] is not None:
        labels_padded = pad_sequence(list(labels_list), batch_first=True)
    else:
        labels_padded = None

    return sequences_padded, pssms_padded, labels_padded


# Fully Convolutional Networks (FCNs)

In [None]:
class FullyConvolutionalProteinModel(nn.Module):
    """Stack of 1-D convolutions that emits one class score vector per sequence position.

    Input is ``[batch, input_channels, seq_len]``; output is
    ``[batch, seq_len, num_classes]`` raw scores (no softmax is applied).
    """

    def __init__(
            self,
            num_classes,
            input_channels,
            hidden_layers_number,
            dropout_rate
    ):
        """Build the network.

        Args:
            num_classes: Number of output channels of the final 1x1 convolution.
            input_channels: Number of feature channels per sequence position.
            hidden_layers_number: Depth preset, looked up in
                ``get_hidden_layers_size`` (1 to 5).
            dropout_rate: Dropout probability applied after every hidden layer.
        """
        super().__init__()

        self.hidden_layers = self.get_hidden_layers_size(hidden_layers_number)
        self.dropout_rate = dropout_rate

        # Consecutive pairs of this list are the (in, out) channels of each hidden layer.
        channel_sizes = [input_channels] + list(self.hidden_layers)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=fan_in, out_channels=fan_out, kernel_size=3, padding=1)
            for fan_in, fan_out in zip(channel_sizes, channel_sizes[1:])
        ])

        self.dropout = nn.Dropout(dropout_rate)

        # Kernel size 1: a per-position projection from the last hidden width to the classes.
        self.final_conv = nn.Conv1d(in_channels=channel_sizes[-1], out_channels=num_classes, kernel_size=1)

    def forward(self, x):
        """Return per-position class scores with the class dimension last."""
        hidden = x
        for layer in self.convs:
            # conv -> ReLU -> dropout for every hidden layer
            hidden = self.dropout(F.relu(layer(hidden)))

        # The final convolution yields [batch, classes, seq_len]; swap the last two axes.
        return self.final_conv(hidden).transpose(1, 2)

    def get_hidden_layers_size(self, number):
        """Return the channel widths for a preset depth: 64 doubled at each further layer.

        Raises:
            KeyError: If ``number`` is not one of 1, 2, 3, 4, 5.
        """
        presets = {
            depth: [64 * 2 ** level for level in range(depth)]
            for depth in range(1, 6)
        }
        return presets[number]

# Train, Validate & Test

In [None]:
def train_model(model, criterion, optimizer, train_dataloader, num_epochs, input_type, log_precision=False):
    """Train ``model`` and print its macro precision on the training data after every epoch.

    Args:
        model: Network mapping ``[batch, channels, seq_len]`` inputs to
            ``[batch, seq_len, num_classes]`` scores.
        criterion: Loss called with scores of shape ``[n_residues, num_classes]``
            and integer targets of shape ``[n_residues]``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        train_dataloader: Yields ``(sequences, pssms, labels)`` batches, padded
            with zeros and with the batch dimension first.
        num_epochs: Number of passes over ``train_dataloader``.
        input_type: ``"pssms"`` feeds the PSSM features; any other value feeds
            the one-hot sequences.
        log_precision: Currently unused; accepted so existing calls keep working.
            Precision is always computed and printed.
    """
    # Send batches wherever the weights live instead of relying on a global
    # device, so training and the per-epoch evaluation can never disagree.
    model_device = next(model.parameters()).device

    def prepare_batch(sequences, pssms, labels):
        """Return (inputs, labels, real_positions) on the model's device."""
        features = pssms if input_type == "pssms" else sequences
        inputs = features.permute(0, 2, 1).to(model_device)  # -> [batch, channels, seq_len]
        labels = labels.to(model_device)

        # Labels are padded with 0, which is also a valid class index, so the
        # padding cannot be recognised from the labels themselves. One-hot rows
        # of real residues contain exactly one 1 while padded rows are all zero,
        # which makes the sequence tensor a reliable padding mask.
        real_positions = sequences.sum(dim=-1) > 0
        if real_positions.shape != labels.shape:
            # Sequence and label lengths disagree: fall back to using every position.
            real_positions = torch.ones(labels.shape, dtype=torch.bool)
        return inputs, labels, real_positions.to(model_device)

    for epoch in range(num_epochs):
        model.train()  # Set model to training mode

        for sequences, pssms, labels in train_dataloader:
            inputs, labels, real_positions = prepare_batch(sequences, pssms, labels)

            optimizer.zero_grad()
            outputs = model(inputs)
            # Only real residues contribute to the loss; padded positions are dropped.
            loss = criterion(outputs[real_positions], labels[real_positions])
            loss.backward()
            optimizer.step()

        # After each epoch, measure precision on the training data using the
        # same input selection and device handling as the training step.
        all_predictions, all_labels = [], []
        model.eval()  # Set model to evaluation mode
        with torch.no_grad():
            for sequences, pssms, labels in train_dataloader:
                inputs, labels, real_positions = prepare_batch(sequences, pssms, labels)
                predicted = model(inputs).argmax(dim=2)
                all_predictions.extend(predicted[real_positions].cpu().numpy())
                all_labels.extend(labels[real_positions].cpu().numpy())

        epoch_precision = precision_score(all_labels, all_predictions, average='macro')
        print(f'Epoch {epoch+1}/{num_epochs}, Precision: {epoch_precision:.4f}')

In [None]:
def validate_model(model, criterion, val_dataloader, input_type):
    """Return the macro-averaged precision of ``model`` over ``val_dataloader``.

    Args:
        model: Network mapping ``[batch, channels, seq_len]`` inputs to
            ``[batch, seq_len, num_classes]`` scores.
        criterion: Unused; kept so existing calls keep working.
        val_dataloader: Yields ``(sequences, pssms, labels)`` batches, padded
            with zeros and with the batch dimension first.
        input_type: ``"pssms"`` feeds the PSSM features; any other value feeds
            the one-hot sequences.

    Returns:
        Macro precision computed over real (non-padded) residues only.
    """
    model.eval()  # Set model to evaluation mode
    # Batches follow the model, whichever device it was placed on.
    model_device = next(model.parameters()).device
    all_predictions, all_labels = [], []

    with torch.no_grad():
        for sequences, pssms, labels in val_dataloader:
            features = pssms if input_type == "pssms" else sequences
            inputs = features.permute(0, 2, 1).to(model_device)  # -> [batch, channels, seq_len]
            labels = labels.to(model_device)

            # Labels are padded with 0, which is also a valid class index, so
            # counting padded positions would distort the score. One-hot rows of
            # real residues contain exactly one 1 while padded rows are all
            # zero, so the sequence tensor tells the two apart.
            real_positions = sequences.sum(dim=-1) > 0
            if real_positions.shape != labels.shape:
                # Sequence and label lengths disagree: fall back to using every position.
                real_positions = torch.ones(labels.shape, dtype=torch.bool)
            real_positions = real_positions.to(model_device)

            predicted = model(inputs).argmax(dim=2)

            all_predictions.extend(predicted[real_positions].cpu().numpy())
            all_labels.extend(labels[real_positions].cpu().numpy())

    val_precision = precision_score(all_labels, all_predictions, average='macro')
    return val_precision

In [None]:
def test_model_direct(model, test_dataset, output_file, input_type):
    model.eval()  # Set the model to evaluation mode
    predictions = []

    with torch.no_grad():
        for i in range(len(test_dataset)):  # Iterate directly over the dataset
            pdb_id, sequence, pssm = test_dataset[i]  # Assuming the dataset returns PDB_ID, sequence, and PSSM

            # Prepare the input tensor; add an extra batch dimension using unsqueeze
            if input_type == "pssms":
                input = pssm.unsqueeze(0).permute(0, 2, 1)  # Adjust dimensions to [1, features, seq_len]
            else:
                input = sequence.unsqueeze(0).permute(0, 2, 1)  # Adjust dimensions to [1, features, seq_len]
            
            # Make a prediction
            outputs = model(input)
            _, predicted = torch.max(outputs, 2)  # Get the index of max log-probability

            # Process the predictions
            seq_len = pssm.shape[0]  # Assuming pssm is [features, seq_len]
            for j in range(seq_len):
                residue_id = f"{pdb_id}_{j + 1}"  # Construct the ID
                structure_label = ['H', 'E', 'C'][predicted[0, j].item()]  # Map numeric predictions to labels
                predictions.append([residue_id, structure_label])

    # Write predictions to CSV
    pd.DataFrame(predictions, columns=['ID', 'STRUCTURE']).to_csv(output_file, index=False)
    print(f'Submission file saved to {output_file}')

In [None]:
def get_optimizer(optimizer_type, model, lr, weight_decay):
    """Create the optimizer named by ``optimizer_type`` for ``model``'s parameters.

    Args:
        optimizer_type: ``"adam"``, ``"sgd"`` (momentum 0.9) or ``"rmsprop"``.
        model: Module whose parameters will be optimized.
        lr: Learning rate.
        weight_decay: Weight-decay coefficient passed to the optimizer.

    Raises:
        ValueError: If ``optimizer_type`` is none of the supported names.
    """
    shared_options = dict(lr=lr, weight_decay=weight_decay)

    if optimizer_type == "adam":
        return torch.optim.Adam(model.parameters(), **shared_options)
    if optimizer_type == "sgd":
        return torch.optim.SGD(model.parameters(), momentum=0.9, **shared_options)
    if optimizer_type == "rmsprop":
        return torch.optim.RMSprop(model.parameters(), **shared_options)

    raise ValueError("Unknown optimizer")

In [None]:
def train_validate_cv(
        num_folds,
        input_type,
        lr,
        batch_size,
        hidden_layers,
        dropout_rate,
        weight_decay,
        optimizer_type,
        normalization,
        num_epochs,
):
    """Run K-fold cross-validation on the training data and return the mean fold precision.

    A fresh model is trained for ``num_epochs`` on every fold's training split
    and scored with ``validate_model`` on the held-out split.

    Args:
        num_folds: Number of folds passed to ``KFold``.
        input_type: ``"pssms"`` (20 input channels) or anything else (25 channels).
        lr, weight_decay, optimizer_type: Forwarded to ``get_optimizer``.
        batch_size: Batch size of both data loaders.
        hidden_layers, dropout_rate: Forwarded to the model constructor.
        normalization: Normalization method handed to ``ProteinDataset``.
        num_epochs: Training epochs per fold.

    Returns:
        Sum of the per-fold precisions divided by ``num_folds``.
    """
    dataset = ProteinDataset(csv_file=seqs_train_path, train_dir=train_path, label_file=labels_train_path,
                             normalize_method=normalization)

    splitter = KFold(n_splits=num_folds)
    sample_indices = list(range(len(dataset)))

    def make_loader(indices):
        """Loader over ``dataset`` that draws only ``indices``, in random order."""
        return DataLoader(
            dataset,
            batch_size=batch_size,
            sampler=torch.utils.data.SubsetRandomSampler(indices),
            collate_fn=collate_fn,
        )

    fold_precisions = []

    for fold_number, (train_index, val_index) in enumerate(splitter.split(sample_indices), start=1):
        print(f"Fold {fold_number}/{num_folds}")

        train_loader = make_loader(train_index)
        val_loader = make_loader(val_index)

        # Every fold starts from a freshly initialised network.
        model = FullyConvolutionalProteinModel(
            num_classes=3,
            input_channels=20 if input_type == "pssms" else 25,
            hidden_layers_number=hidden_layers,
            dropout_rate=dropout_rate
        )

        gpu_count = torch.cuda.device_count()
        if gpu_count > 1:
            print(f"Let's use {gpu_count} GPUs!")
            model = torch.nn.DataParallel(model)

        model = model.to(device)

        criterion = nn.CrossEntropyLoss()
        optimizer = get_optimizer(optimizer_type, model, lr, weight_decay)

        train_model(model, criterion, optimizer, train_loader, num_epochs, input_type)
        fold_precisions.append(validate_model(model, criterion, val_loader, input_type))

    # Mean precision across the folds
    avg_precisions = sum(fold_precisions) / num_folds
    print(f"Average Precisions: {avg_precisions:.4f}")
    return avg_precisions

In [None]:
def train_test_full_data(
        input_type,
        lr,
        batch_size,
        hidden_layers,
        dropout_rate,
        weight_decay,
        optimizer_type,
        normalization,
        num_epochs,
        output_file
):
    """Train one model on the whole training set and write test-set predictions.

    Args:
        input_type: ``"pssms"`` (20 input channels) or anything else (25 channels).
        lr, weight_decay, optimizer_type: Forwarded to ``get_optimizer``.
        batch_size: Batch size of the training loader.
        hidden_layers, dropout_rate: Forwarded to the model constructor.
        normalization: Normalization method handed to ``ProteinDataset``.
        num_epochs: Number of training epochs.
        output_file: Path of the submission CSV written by ``test_model_direct``.
    """
    print("Reading data")
    train_dataset = ProteinDataset(csv_file=seqs_train_path, train_dir=train_path, label_file=labels_train_path,
                                   normalize_method=normalization)
    # Reshuffle every epoch, matching the randomly sampled batches used during
    # cross-validation; a fixed batch order would train under different conditions.
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    test_dataset = ProteinDataset(csv_file=seqs_test_path, train_dir=test_path, normalize_method=normalization)

    print("Initializing model")
    input_channels = 20 if input_type == "pssms" else 25

    model = FullyConvolutionalProteinModel(
        num_classes=3,
        input_channels=input_channels,
        hidden_layers_number=hidden_layers,
        dropout_rate=dropout_rate
    )

    # Train on the accelerator when one is available; weights and training
    # batches have to share a device. The optimizer is created afterwards so it
    # is bound to the parameters in their final location.
    model = model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = get_optimizer(optimizer_type, model, lr, weight_decay)

    print("Training model")
    train_model(model, criterion, optimizer, train_dataloader, num_epochs, input_type)

    print("Testing model")
    # The test dataset yields CPU tensors one protein at a time, so run
    # inference with the weights on the CPU as well.
    model = model.to("cpu")
    test_model_direct(model, test_dataset, output_file, input_type)

# Main

In [None]:
# train_validate_cv(
#     num_folds=3,
#     input_type="pssms",  # "sequence" or "pssms"
#     lr=0.001,
#     batch_size=4,
#     hidden_layers=3,
#     dropout_rate=0.233246,
#     weight_decay=0.0,
#     optimizer_type='rmsprop',
#     normalization='min-max',
#     num_epochs=10,
# )

In [None]:
# train_test_full_data(
#     input_type="pssms",  # "sequence" or "pssms"
#     lr=0.001,
#     batch_size=4,
#     hidden_layers=3,
#     dropout_rate=0.5,
#     weight_decay=0.0001,
#     optimizer_type='sgd',
#     normalization='min-max',
#     num_epochs=10,
#     output_file='./ax6/submission.csv'
# )

# Hyperparameter Tuning: ax_client

In [None]:
!pip install ax-platform

In [None]:
from ax.service.ax_client import AxClient
from ax.service.utils.instantiation import ObjectiveProperties

import os
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader
from torch._tensor import Tensor
from ax.service.ax_client import AxClient, ObjectiveProperties
from ax.service.utils.report_utils import exp_to_df
from ax.utils.notebook.plotting import init_notebook_plotting, render
from ax.utils.tutorials.cnn_utils import evaluate, load_mnist, train

In [None]:
# Same device selection as at the top of the notebook, repeated so the tuning
# section can run on its own: GPU when PyTorch can see one, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
def train_evaluate(parameterization):
    """Score one hyperparameter set with 3-fold cross-validation on PSSM inputs.

    Args:
        parameterization: Mapping of hyperparameter names to values. ``lr``,
            ``batch_size``, ``optimizer`` and ``normalization`` are required;
            ``hidden_layers`` (3), ``dropout_rate`` (0.5), ``weight_decay`` (0)
            and ``epochs`` (10) fall back to the defaults in parentheses.

    Returns:
        ``{"precision": <mean cross-validated precision>}``.
    """
    optional = parameterization.get
    cv_settings = dict(
        num_folds=3,
        input_type="pssms",  # "sequence" or "pssms"
        lr=parameterization["lr"],
        batch_size=parameterization["batch_size"],
        hidden_layers=optional("hidden_layers", 3),
        dropout_rate=optional("dropout_rate", 0.5),
        weight_decay=optional("weight_decay", 0),
        optimizer_type=parameterization["optimizer"],
        normalization=parameterization["normalization"],
        num_epochs=optional("epochs", 10),
    )
    return {"precision": train_validate_cv(**cv_settings)}

In [None]:
# Set up the Ax experiment: maximise cross-validated precision over the search space below.
ax_client = AxClient(enforce_sequential_optimization=False)
ax_client.create_experiment(
    name="protein_model_experiment",
    parameters=[
        {"name": "lr", "type": "range", "bounds": [0.0001, 0.01], "log_scale": True},
        {"name": "batch_size", "type": "fixed", "value": 4},
        {"name": "hidden_layers", "type": "choice", "values": [4, 5]},
        {"name": "dropout_rate", "type": "range", "bounds": [0.0, 0.7]},
        {"name": "weight_decay", "type": "range", "bounds": [0.0, 0.1]},
        {"name": "epochs", "type": "fixed", "value": 10},  # Fixed for all trials, can be changed as needed
        {"name": "optimizer", "type": "fixed", "value": "rmsprop"},  # Fixed here; a choice in the wider space below
        {"name": "normalization", "type": "fixed", "value": "min-max"},  # Fixed here; a choice in the wider space below

    ],
    # Wider search space, kept for reference:
#     parameters=[
#         {"name": "lr", "type": "range", "bounds": [0.0001, 0.01], "log_scale": True},
#         {"name": "batch_size", "type": "choice", "values": [4, 16, 32, 64, 128]},
#         {"name": "hidden_layers", "type": "choice", "values": [2, 3, 4, 5]},
#         {"name": "dropout_rate", "type": "range", "bounds": [0.0, 0.7]},
#         {"name": "weight_decay", "type": "range", "bounds": [0.0, 0.1]},
#         {"name": "epochs", "type": "fixed", "value": 10},  # Fixed for all trials, can be changed as needed
#         {"name": "optimizer", "type": "choice", "values": ["adam", "sgd", "rmsprop"]},  # Optimizer as a choice
#         {"name": "normalization", "type": "choice", "values": ["min-max", "z-score"]},  # Normalization as a choice

#     ],
    objectives={"precision": ObjectiveProperties(minimize=False)}
)

# Running the trials: ask Ax for a candidate, evaluate it, report the result back.
for i in range(20):  # Number of iterations
    params, trial_index = ax_client.get_next_trial()
    trial_metrics = train_evaluate(params)
    ax_client.complete_trial(trial_index=trial_index, raw_data=trial_metrics)

# Fetch the best parameters.
# get_best_parameters() returns None when nothing can be recommended, otherwise
# (parameters, values) where values is either None or a (means, covariances)
# pair keyed by metric name. It is not a dict with an 'objectives' entry.
best_result = ax_client.get_best_parameters()
best_parameters, metrics = best_result if best_result is not None else (None, None)
print(f'Best Parameters: {best_parameters}')
print(metrics)
best_metrics = metrics[0]["precision"] if metrics is not None else None
print(f'Best Precision: {best_metrics}')

# Feature Selection