In [11]:
# Import necessary libraries

import os
import re

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, random_split

# Define store file paths
DATA_PATH = "./data/"
labels_train_path = DATA_PATH + "labels_train.csv"
sample_path = DATA_PATH + "sample.csv"
seqs_test_path = DATA_PATH + "seqs_test.csv"
seqs_train_path = DATA_PATH + "seqs_train.csv"
train_path = DATA_PATH + "train"
test_path = DATA_PATH + "test"

# Define a mapping from amino acid characters to integers
amino_acid_mapping = {
    'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4,
    'G': 5, 'H': 6, 'I': 7, 'K': 8, 'L': 9,
    'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14,
    'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19,
    'X': 20,  # Typically used for unknown amino acids
    'B': 21,  # Asparagine or Aspartic acid
    'Z': 22,  # Glutamine or Glutamic acid
    'J': 23,  # Leucine or Isoleucine
    '-': 24,  # Gap or padding
}

sec_struct_mapping = {'H': 0, 'E': 1, 'C': 2}  # Add more mappings if there are more labels

# Check if a CUDA-enabled GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

class ProteinDataset(Dataset):
    def __init__(self, csv_file, train_dir, label_file=None, normalize_method='min-max'):

        # Load the sequences
        self.seqs = pd.read_csv(csv_file)

        # Load the protein data from the directory
        self.protein_data = {}
        for filename in os.listdir(train_dir):
            if filename.endswith(".csv"):  # Check if the file is a CSV
                protein_id = re.split(r'_train|_test', filename)[0]
                self.protein_data[protein_id] = pd.read_csv(os.path.join(train_dir, filename))

        # Load the labels, if provided
        if label_file:
            self.labels = pd.read_csv(label_file)
        else:
            self.labels = None

        # Amino acid mapping
        self.amino_acid_mapping = amino_acid_mapping
        self.normalize_method = normalize_method

    def encode_sequence(self, sequence):
        # Convert each amino acid in the sequence to a one-hot encoded vector
        encoded_sequence = np.zeros((len(sequence), len(self.amino_acid_mapping)), dtype=int)
        for i, amino_acid in enumerate(sequence):
            # Default to 'X' for unknown amino acids
            index = self.amino_acid_mapping.get(amino_acid, self.amino_acid_mapping['X'])
            encoded_sequence[i, index] = 1
        return encoded_sequence

    def normalize_pssm(self, pssm):
        # Assuming the first two columns are non-numeric; adjust as necessary based on your actual data format
        numeric_columns = pssm[:, 2:]  # Adjust this if your numeric data starts from a different column

        # Convert to floats
        try:
            pssm_numeric = numeric_columns.astype(np.float32)
        except ValueError as e:
            # Handle or log the error if needed
            raise ValueError(f"Error converting PSSM to float: {e}")

        if self.normalize_method == 'min-max':
            # Min-Max normalization
            pssm_min = pssm_numeric.min(axis=0)
            pssm_max = pssm_numeric.max(axis=0)
            # Ensure no division by zero
            pssm_range = np.where(pssm_max - pssm_min == 0, 1, pssm_max - pssm_min)
            normalized_pssm = (pssm_numeric - pssm_min) / pssm_range
        elif self.normalize_method == 'z-score':
            # Z-Score normalization
            pssm_mean = pssm_numeric.mean(axis=0)
            pssm_std = pssm_numeric.std(axis=0)
            # Avoid division by zero
            pssm_std = np.where(pssm_std == 0, 1, pssm_std)
            normalized_pssm = (pssm_numeric - pssm_mean) / pssm_std
        else:
            # If no normalization method provided, return the original PSSM
            normalized_pssm = pssm_numeric

        return normalized_pssm

    def __len__(self):
        return len(self.seqs)

    def __getitem__(self, idx):
        protein_id = self.seqs.iloc[idx]['PDB_ID']
        sequence = self.seqs.iloc[idx]['SEQUENCE']
        encoded_sequence = self.encode_sequence(sequence)  # Encode the sequence
        pssm = self.protein_data[protein_id].values  # Assuming you will process PSSM separately
        normalized_pssm = self.normalize_pssm(pssm)  # Ensure this is uncommented to use normalized PSSM

        if self.labels is not None:
            label_seq = self.labels.iloc[idx]['SEC_STRUCT']
            label_numeric = [sec_struct_mapping[char] for char in label_seq]
            label_tensor = torch.tensor(label_numeric, dtype=torch.long)
            return (
                protein_id,
                torch.tensor(encoded_sequence, dtype=torch.float32),
                torch.tensor(normalized_pssm, dtype=torch.float32),
                label_tensor
            )

        return (
            protein_id,
            torch.tensor(encoded_sequence, dtype=torch.float32),
            torch.tensor(normalized_pssm, dtype=torch.float32)
        )


def collate_fn_without_labels(batch):
    id, sequences, pssms = zip(*batch)  # Unzip the batch

    # Pad sequences and PSSMs
    sequences_padded = pad_sequence([seq.clone().detach() for seq in sequences], batch_first=True)
    # sequences_padded =  torch.tensor([seq.clone().detach() for seq in sequences])
    pssms_padded = pad_sequence([pssm.clone().detach() for pssm in pssms], batch_first=True)

#     pssms_padded = torch.tensor(pssms)
    # pssms_padded = pad_sequence([pssm.clone().detach() for pssm in pssms], batch_first=True)

    return id, sequences_padded, pssms_padded


def collate_fn(batch):
    _, sequences, pssms, labels_list = zip(*batch)  # Unzip the batch

    # Pad sequences and PSSMs
    sequences_padded = pad_sequence([seq.clone().detach() for seq in sequences], batch_first=True)

    pssms_padded = pad_sequence([pssm.clone().detach() for pssm in pssms], batch_first=True)

    # Handling labels correctly
    if labels_list[0] is not None:  # Check if labels exist
        labels_padded = pad_sequence([label.clone().detach() for label in labels_list], batch_first=True)

    else:
        labels_padded = None

    # Create a mask based on the original sequence lengths
    mask = [torch.ones(len(label), dtype=torch.uint8) for label in labels_list]
    mask_padded = pad_sequence(mask, batch_first=True, padding_value=0)  # Assuming padding_value for labels is 0
    return sequences_padded, pssms_padded, labels_padded, mask_padded


class FullyConvolutionalProteinModel(nn.Module):
    def __init__(self, num_classes=3, input_channels=20):  # 20 for amino acid one-hot, adjust if using PSSM
        super(FullyConvolutionalProteinModel, self).__init__()

        # Define convolutional layers
        self.conv1 = nn.Conv1d(in_channels=input_channels, out_channels=64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1)

        # Final layer that maps to the number of classes
        self.final_conv = nn.Conv1d(in_channels=256, out_channels=num_classes, kernel_size=1)

    def forward(self, x):
        # Apply convolutional layers with activation functions
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))

        # Apply final convolutional layer - no activation, as CrossEntropyLoss includes it
        x = self.final_conv(x)

        # No softmax here, as nn.CrossEntropyLoss applies it internally.
        # Transpose the output to match [batch_size, sequence_length, num_classes]
        # This makes it easier to calculate loss later
        x = x.transpose(1, 2)

        return x

class ProteinModelTrainer:
    def __init__(self, model, criterion, optimizer, train_dataset, val_dataset, test_dataset, batch_size=64):
        self.model = model.to(device)  # Move the model to the device
        self.criterion = criterion.to(device)  # Move the loss function to the device
        self.optimizer = optimizer
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.test_dataset = test_dataset
        self.batch_size = batch_size

        self.train_loader = DataLoader(self.train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
        self.val_loader = DataLoader(self.val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
        self.test_loader = DataLoader(self.test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn_without_labels)

    def train_model(self, num_epochs):
        for epoch in range(num_epochs):
            self.model.train()  # Set model to training mode
            running_loss = 0.0
            correct_preds = 0
            total_preds = 0

            for sequences, pssms, labels, _ in self.train_loader:
                inputs = pssms.permute(0, 2, 1).to(device)  # Move input to device
                labels = labels.to(device)  # Move labels to device

                self.optimizer.zero_grad()

                outputs = self.model(inputs)
                loss = self.criterion(outputs.transpose(1, 2), labels)

                loss.backward()
                self.optimizer.step()

                running_loss += loss.item() * inputs.size(0)

                # Calculate training accuracy
                _, predicted = torch.max(outputs, 2)  # Get the index of the max log-probability
                correct_preds += (predicted == labels).sum().item()
                total_preds += labels.numel()

            epoch_loss = running_loss / len(self.train_loader.dataset)
            epoch_acc = correct_preds / total_preds
            print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}')

    def validate_model(self):
        self.model.eval()  # Set model to evaluation mode
        running_loss = 0.0
        correct_preds = 0
        total_preds = 0

        with torch.no_grad():
            for sequences, pssms, labels, _ in self.val_loader:
                inputs = pssms.permute(0, 2, 1).to(device)  # Move input to device
                labels = labels.to(device)  # Move labels to device

                outputs = self.model(inputs)
                loss = self.criterion(outputs.transpose(1, 2), labels)

                running_loss += loss.item() * inputs.size(0)

                # Calculate accuracy
                _, predicted = torch.max(outputs, 2)
                correct_preds += (predicted == labels).sum().item()
                total_preds += labels.numel()

        val_loss = running_loss / len(self.val_loader.dataset)
        val_acc = correct_preds / total_preds
        print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}')

    def test_model(self):
        self.model.eval()  # Set model to evaluation mode
        running_loss = 0.0
        correct_preds = 0
        total_preds = 0

        with torch.no_grad():
            for sequences, pssms, labels, *_ in self.test_loader:
                inputs = pssms.permute(0, 2, 1).to(device)  # Move input to device
                labels = labels.to(device)  # Move labels to device

                outputs = self.model(inputs)
                loss = self.criterion(outputs.transpose(1, 2), labels)

                running_loss += loss.item() * inputs.size(0)

                # Calculate accuracy
                _, predicted = torch.max(outputs, 2)
                correct_preds += (predicted == labels).sum().item()
                total_preds += labels.numel()

        test_loss = running_loss / len(self.test_loader.dataset)
        test_acc = correct_preds / total_preds
        print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}')

    def test_model_direct(self, output_file='./submission.csv'):
        self.model.eval()  # Set the model to evaluation mode
        predictions = []

        with torch.no_grad():
            for i in range(len(self.test_dataset)):  # Iterate directly over the dataset
                pdb_id, _, pssm = self.test_dataset[i]  # Assuming the dataset returns PDB_ID, sequence, and PSSM

                # Prepare the input tensor; add an extra batch dimension using unsqueeze
                input_pssm = pssm.unsqueeze(0).permute(0, 2, 1).to(device)  # Move input to device

                # Make a prediction
                outputs = self.model(input_pssm)
                _, predicted = torch.max(outputs, 2)  # Get the index of max log-probability

                # Process the predictions
                seq_len = pssm.shape[0]  # Assuming pssm is [features, seq_len]
                for j in range(seq_len):
                    residue_id = f"{pdb_id}_{j + 1}"  # Construct the ID
                    structure_label = ['H', 'E', 'C'][predicted[0, j].item()]  # Map numeric predictions to labels
                    predictions.append([residue_id, structure_label])

        # Write predictions to CSV
        pd.DataFrame(predictions, columns=['ID', 'STRUCTURE']).to_csv(output_file, index=False)
        print(f'Submission file saved to {output_file}')

# Create datasets
dataset = ProteinDataset(csv_file=seqs_train_path, train_dir=train_path, label_file=labels_train_path)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_subset, val_subset = random_split(dataset, [train_size, val_size])

test_dataset = ProteinDataset(csv_file=seqs_test_path, train_dir=test_path)

# Create model, loss function, and optimizer
model = FullyConvolutionalProteinModel()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001, weight_decay=0.0)

# Create trainer instance
trainer = ProteinModelTrainer(model, criterion, optimizer, train_subset, val_subset, test_dataset, batch_size=64)

# Train and evaluate the model
num_epochs = 25
trainer.train_model(num_epochs)
trainer.validate_model()
# trainer.test_model()
trainer.test_model_direct()

Using device: cuda
Epoch 1/25, Loss: 0.2478, Accuracy: 0.9011
Epoch 2/25, Loss: 0.2023, Accuracy: 0.9192
Epoch 3/25, Loss: 0.1982, Accuracy: 0.9215
Epoch 4/25, Loss: 0.1857, Accuracy: 0.9263
Epoch 5/25, Loss: 0.1801, Accuracy: 0.9284
Epoch 6/25, Loss: 0.1736, Accuracy: 0.9309
Epoch 7/25, Loss: 0.1766, Accuracy: 0.9306
Epoch 8/25, Loss: 0.1759, Accuracy: 0.9310
Epoch 9/25, Loss: 0.1739, Accuracy: 0.9313
Epoch 10/25, Loss: 0.1768, Accuracy: 0.9310
Epoch 11/25, Loss: 0.1724, Accuracy: 0.9324
Epoch 12/25, Loss: 0.1717, Accuracy: 0.9324
Epoch 13/25, Loss: 0.1693, Accuracy: 0.9335
Epoch 14/25, Loss: 0.1654, Accuracy: 0.9350
Epoch 15/25, Loss: 0.1662, Accuracy: 0.9349
Epoch 16/25, Loss: 0.1667, Accuracy: 0.9347
Epoch 17/25, Loss: 0.1642, Accuracy: 0.9356
Epoch 18/25, Loss: 0.1593, Accuracy: 0.9371
Epoch 19/25, Loss: 0.1604, Accuracy: 0.9365
Epoch 20/25, Loss: 0.1641, Accuracy: 0.9359
Epoch 21/25, Loss: 0.1615, Accuracy: 0.9368
Epoch 22/25, Loss: 0.1612, Accuracy: 0.9372
Epoch 23/25, Loss: 0.1

In [None]:
import os
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader
from ax.service.ax_client import AxClient
from ax.service.utils.instantiation import ObjectiveProperties
from ax.utils.tutorials.cnn_utils import evaluate, load_mnist, train
from sklearn.model_selection import KFold

def train_validate_cv(model, x_train, y_train, params, n_folds=5):
  """
  Performs K-Fold cross-validation to evaluate model performance.

  Args:
      model: ProteinModelTrainer object representing the protein structure prediction model.
      x_train: Training data (protein sequences).
      y_train: Training labels (protein structures).
      params: Dictionary containing hyperparameter values for the model.
      n_folds: Number of folds for cross-validation (default: 5).

  Returns:
      Average precision score across all folds.
  """

  # Define KFold object
  kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

  # Initialize variables for tracking performance
  total_precision = 0

  # Perform cross-validation
  for train_index, val_index in kf.split(x_train):
    x_train_fold, x_val_fold = x_train[train_index], x_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Train model on the current fold with specified hyperparameters
    model.train(x_train_fold, y_train_fold, params=params)

    # Evaluate model precision on the validation fold
    precision = model.evaluate(x_val_fold, y_val_fold)["precision"]

    # Accumulate precision across folds
    total_precision += precision

  # Calculate average precision
  avg_precision = total_precision / n_folds

  return avg_precision


def train_evaluate(parameterization):
    avg_precision = train_validate_cv(
        num_folds=3,
        input_type="pssms",  # "sequence" or "pssms"
        lr=parameterization["lr"],
        batch_size=parameterization["batch_size"],
        hidden_layers=parameterization.get("hidden_layers", 3),
        dropout_rate=parameterization.get("dropout_rate", 0.5),
        weight_decay=parameterization.get("weight_decay", 0),
        optimizer_type=parameterization["optimizer"],
        normalization=parameterization["normalization"],
        num_epochs=parameterization.get("epochs", 10),
    )
    return {"precision": avg_precision}

# Set up the Ax experiment
ax_client = AxClient(enforce_sequential_optimization=False)
ax_client.create_experiment(
    name="protein_model_experiment",
    parameters=[
        {"name": "lr", "type": "range", "bounds": [0.0001, 0.01], "log_scale": True},
        {"name": "batch_size", "type": "fixed", "value": 4},
        {"name": "hidden_layers", "type": "choice", "values": [4, 5]},
        {"name": "dropout_rate", "type": "range", "bounds": [0.0, 0.7]},
        {"name": "weight_decay", "type": "range", "bounds": [0.0, 0.1]},
        {"name": "epochs", "type": "fixed", "value": 10},  # Fixed for all trials, can be changed as needed
        {"name": "optimizer", "type": "fixed", "value": "rmsprop"},  # Add optimizer as a choice
        {"name": "normalization", "type": "fixed", "value": "min-max"},
    ],
    objectives={"precision": ObjectiveProperties(minimize=False)}
)

# Running the trials
for i in range(20):  # Number of iterations
    params, trial_index = ax_client.get_next_trial()
    metrics = train_evaluate(params)
    ax_client.complete_trial(trial_index=trial_index, raw_data=metrics)

# Fetch the best parameters
best_parameters, metrics = ax_client.get_best_parameters()
print(f'Best Parameters: {best_parameters}')
print(metrics)
best_metrics = metrics['objectives']
print(f'Best Precision: {best_metrics}')

# From base 

In [None]:
# seqs, protein_data = load_protein_data(seqs_train_path, train_path)
# train_dataset = [prepare_data_point(idx, seqs, protein_data, labels_train_path) for idx in range(len(seqs))]
# train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
# test_dataset = [prepare_data_point(idx, seqs, protein_data, label_file=None) for idx in range(len(seqs))]

# # Model definition and training...
# model = FullyConvolutionalProteinModel()
# criterion = torch.nn.CrossEntropyLoss()
# optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001, weight_decay=0.0)
# num_epochs = 25

# # Train and Test model on test dataset and create submission file
# train_model(model, criterion, optimizer, train_dataloader, num_epochs)
# test_model_direct(model, test_dataset, output_file='./data/submission.csv')


In [None]:
# def load_protein_data(csv_file, train_dir):
#     """Loads protein data from CSV and directory."""
#     seqs = pd.read_csv(csv_file)
#     protein_data = {}
#     for filename in os.listdir(train_dir):
#         if filename.endswith(".csv"):
#             protein_id = re.split(r'_train|_test', filename)[0]
#             protein_data[protein_id] = pd.read_csv(os.path.join(train_dir, filename))
#     return seqs, protein_data

# def load_labels(label_file):
#     """Loads labels from a CSV file."""
#     if label_file:
#         return pd.read_csv(label_file)
#     return None

In [None]:
# def encode_sequence(sequence):
#     """Encodes a protein sequence using one-hot encoding."""
#     encoded_sequence = np.zeros((len(sequence), len(amino_acid_mapping)), dtype=int)
#     for i, amino_acid in enumerate(sequence):
#         index = amino_acid_mapping.get(amino_acid, amino_acid_mapping['X'])
#         encoded_sequence[i, index] = 1
#     return encoded_sequence

# def normalize_pssm(pssm, normalize_method='min-max'):
#     """Normalizes a PSSM using the specified method."""
#     # Assuming the first two columns are non-numeric; adjust as necessary based on your actual data format
#     numeric_columns = pssm[:, 2:]  # Adjust this if your numeric data starts from a different column

#     # Convert to floats & handle any errors
#     try:
#         pssm_numeric = numeric_columns.astype(np.float32)
#     except ValueError as e:
#         raise ValueError(f"Error converting PSSM to float: {e}")

#     if normalize_method == 'min-max':
#         # Min-Max normalization
#         pssm_min = pssm_numeric.min(axis=0)
#         pssm_max = pssm_numeric.max(axis=0)
#         # Ensure no division by zero
#         pssm_range = np.where(pssm_max - pssm_min == 0, 1, pssm_max - pssm_min)
#         normalized_pssm = (pssm_numeric - pssm_min) / pssm_range
#     elif normalize_method == 'z-score':
#         # Z-Score normalization
#         pssm_mean = pssm_numeric.mean(axis=0)
#         pssm_std = pssm_numeric.std(axis=0)
#         # Avoid division by zero
#         pssm_std = np.where(pssm_std == 0, 1, pssm_std)
#         normalized_pssm = (pssm_numeric - pssm_mean) / pssm_std
#     else:
#         # If no normalization method provided, return the original PSSM
#         normalized_pssm = pssm_numeric

#     return normalized_pssm


# def prepare_data_point(idx, seqs, protein_data, label_file=None):
#     """Prepares a protein sample for training or inference."""
#     labels = load_labels(label_file)
#     protein_id = seqs.iloc[idx]['PDB_ID']
#     sequence = seqs.iloc[idx]['SEQUENCE']
#     encoded_sequence = encode_sequence(sequence)  # Encode the sequence
#     pssm = protein_data[protein_id].values  # Assuming you will process PSSM separately
#     normalized_pssm = normalize_pssm(pssm)  # Ensure this is uncommented to use normalized PSSM

#     if labels is not None:
#         label_seq = labels.iloc[idx]['SEC_STRUCT']
#         label_numeric = [sec_struct_mapping[char] for char in label_seq]
#         label_tensor = torch.tensor(label_numeric, dtype=torch.long)
#         return (
#             protein_id,
#             torch.tensor(encoded_sequence, dtype=torch.float32),
#             torch.tensor(normalized_pssm, dtype=torch.float32),
#             label_tensor
#         )

#     return (
#         protein_id,
#         torch.tensor(encoded_sequence, dtype=torch.float32),
#         torch.tensor(normalized_pssm, dtype=torch.float32)
#     )

In [None]:
# def encode_sequence(sequence, amino_acid_mapping):
#     # Convert each amino acid in the sequence to a one-hot encoded vector
#     encoded_sequence = np.zeros((len(sequence), len(amino_acid_mapping)), dtype=int)
#     for i, amino_acid in enumerate(sequence):
#         # Default to 'X' for unknown amino acids
#         index = amino_acid_mapping.get(amino_acid, amino_acid_mapping['X'])
#         encoded_sequence[i, index] = 1
#     return encoded_sequence

# def normalize_pssm(pssm, normalize_method='min-max'):
#     # Assuming the first two columns are non-numeric; adjust as necessary based on your actual data format
#     numeric_columns = pssm[:, 2:]  # Adjust this if your numeric data starts from a different column

#     # Convert to floats
#     try:
#         pssm_numeric = numeric_columns.astype(np.float32)
#     except ValueError as e:
#         # Handle or log the error if needed
#         raise ValueError(f"Error converting PSSM to float: {e}")

#     if normalize_method == 'min-max':
#         # Min-Max normalization
#         pssm_min = pssm_numeric.min(axis=0)
#         pssm_max = pssm_numeric.max(axis=0)
#         # Ensure no division by zero
#         pssm_range = np.where(pssm_max - pssm_min == 0, 1, pssm_max - pssm_min)
#         normalized_pssm = (pssm_numeric - pssm_min) / pssm_range
#     elif normalize_method == 'z-score':
#         # Z-Score normalization
#         pssm_mean = pssm_numeric.mean(axis=0)
#         pssm_std = pssm_numeric.std(axis=0)
#         # Avoid division by zero
#         pssm_std = np.where(pssm_std == 0, 1, pssm_std)
#         normalized_pssm = (pssm_numeric - pssm_mean) / pssm_std
#     else:
#         # If no normalization method provided, return the original PSSM
#         normalized_pssm = pssm_numeric

#     return normalized_pssm

# def protein_dataset(csv_file, train_dir, label_file=None, normalize_method='min-max'):
#     # Load the sequences
#     seqs = pd.read_csv(csv_file)

#     # Load the protein data from the directory
#     protein_data = {}
#     for filename in os.listdir(train_dir):
#         if filename.endswith(".csv"):  # Check if the file is a CSV
#             protein_id = re.split(r'_train|_test', filename)[0]
#             protein_data[protein_id] = pd.read_csv(os.path.join(train_dir, filename))

#     # Load the labels, if provided
#     if label_file:
#         labels = pd.read_csv(label_file)
#     else:
#         labels = None

#     # Amino acid mapping
#     amino_acid_mapping = {
#         'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4,
#         'G': 5, 'H': 6, 'I': 7, 'K': 8, 'L': 9,
#         'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14,
#         'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19,
#         'X': 20,  # Typically used for unknown amino acids
#         'B': 21,  # Asparagine or Aspartic acid
#         'Z': 22,  # Glutamine or Glutamic acid
#         'J': 23,  # Leucine or Isoleucine
#         '-': 24,  # Gap or padding
#     }

# def get_item(idx):
#     protein_id = seqs.iloc[idx]['PDB_ID']
#     sequence = seqs.iloc[idx]['SEQUENCE']
#     encoded_sequence = encode_sequence(sequence, amino_acid_mapping)  # Encode the sequence
#     pssm = protein_data[protein_id].values  # Assuming you will process PSSM separately
#     normalized_pssm = normalize_pssm(pssm, normalize_method)  # Ensure this is uncommented to use normalized PSSM

#     if labels is not None:
#         label_seq = labels.iloc[idx]['SEC_STRUCT']
#         label_numeric = [sec_struct_mapping[char] for char in label_seq]
#         label_tensor = torch.tensor(label_numeric, dtype=torch.long)
#         return (
#             protein_id,
#             torch.tensor(encoded_sequence, dtype=torch.float32),
#             torch.tensor(normalized_pssm, dtype=torch.float32),
#             label_tensor
#         )

#     return (
#         protein_id,
#         torch.tensor(encoded_sequence, dtype=torch.float32),
#         torch.tensor(normalized_pssm, dtype=torch.float32)
#     )



In [None]:
# def get_optimizer(optimizer_type, model, lr, weight_decay):
#     # Choose the optimizer based on the parameterization
#     if optimizer_type == "adam":
#         optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
#     elif optimizer_type == "sgd":
#         optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=weight_decay)
#     elif optimizer_type == "rmsprop":
#         optimizer = torch.optim.RMSprop(model.parameters(), lr=lr, weight_decay=weight_decay)
#     else:
#         raise ValueError("Unknown optimizer")

#     return optimizer


# def train_with_params(
#         lr=0.001,
#         batch_size=4,
#         hidden_layers=5,
#         dropout_rate=0.233246,
#         weight_decay=0.0,
#         optimizer='rmsprop',
#         normalization='min-max',
#         num_epochs=10,
#         output_file='submission.csv'
# ):
#     train_dataset = ProteinDataset(csv_file=seqs_train_path, train_dir=train_path, label_file=labels_train_path,
#                                    normalize_method=normalization)
#     train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

#     test_dataset = ProteinDataset(csv_file=seqs_test_path, train_dir=test_path, normalize_method=normalization)

#     # Splitting train_dataset into train and validation sets (adjust sizes as needed)
#     train_size = int(0.8 * len(train_dataset))
#     val_size = len(train_dataset) - train_size
#     train_subset, val_subset = random_split(train_dataset, [train_size, val_size])
#     val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

#     model = FullyConvolutionalProteinModel(hidden_layers_number=hidden_layers, dropout_rate=dropout_rate)
#     criterion = nn.CrossEntropyLoss()
#     optimizer = get_optimizer(optimizer, model, lr, weight_decay)

#     train_model(model, criterion, optimizer, train_dataloader, num_epochs)
#     validate_model(model, criterion, val_loader)
#     test_model_direct(model, test_dataset, output_file)


In [None]:
# # Load training data
# seqs_train, protein_data_train = load_protein_data(seqs_train_path, train_path)
# labels_train = load_labels(labels_train_path)

# # Prepare training samples (assuming collate_fn handles batching and shuffling)
# train_samples = []
# for idx in range(len(seqs_train)):
#     protein_id = seqs_train.iloc[idx]['PDB_ID']
#     sequence = seqs_train.iloc[idx]['SEQUENCE']
#     pssm = protein_data_train[protein_id].values
#     sample = prepare_protein_sample(
#         protein_id, sequence, pssm, labels_train and labels_train.iloc[idx],  # Include label if exists
#         amino_acid_mapping, normalize_method, sec_struct_mapping
#     )
#     train_samples.append(sample)

# # Load testing data
# seqs_test, protein_data_test = load_protein_data(seqs_test_path, test_path)

# # Prepare testing samples (assuming collate_fn handles batching)
# test_samples = []
# for idx in range(len(seqs_test)):
#     protein_id = seqs_test.iloc[idx]['PDB_ID']
#     sequence = seqs_test.iloc[idx]['SEQUENCE']
#     pssm = protein_data_test[protein_id].values
#     sample = prepare_protein_sample(
#         protein_id, sequence, pssm, labels_train.empty and None or labels_train.iloc[idx],  # Include label if exists
#         amino_acid_mapping, normalize_method, sec_struct_mapping
#     )

#     test_samples.append(sample)

# # Create dataloaders (assuming collate_fn remains the same)
# train_dataloader = DataLoader(train_samples, batch_size=4, collate_fn=collate_fn)
# test_dataloader = DataLoader(test_samples, batch_size=4, collate_fn=collate_fn)


In [None]:
# seqs, protein_data = load_protein_data(seqs_train_path, train_path)
# train_dataset = [prepare_data_point(idx, seqs, protein_data, labels_train_path) for idx in range(len(seqs))]
# train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
# test_dataset = [prepare_data_point(idx, seqs, protein_data, label_file=None) for idx in range(len(seqs))]

# # Model definition and training...
# model = FullyConvolutionalProteinModel()
# criterion = torch.nn.CrossEntropyLoss()
# optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001, weight_decay=0.0)
# num_epochs = 25

# # Train and Test model on test dataset and create submission file
# train_model(model, criterion, optimizer, train_dataloader, num_epochs)
# test_model_direct(model, test_dataset, output_file='./data/submission.csv')


# until here

In [None]:
# import os
# import re

# import numpy as np
# import pandas as pd
# import torch
# from torch.utils.data import Dataset, DataLoader
# from torch.nn.utils.rnn import pad_sequence

# # Define store file paths
# DATA_PATH = "./data/"
# labels_train_path = DATA_PATH + "labels_train.csv"
# sample_path = DATA_PATH + "sample.csv"
# seqs_test_path = DATA_PATH + "seqs_test.csv"
# seqs_train_path = DATA_PATH + "seqs_train.csv"
# train_path = DATA_PATH + "train"
# test_path = DATA_PATH + "test"

# # Define a mapping from amino acid characters to integers
# amino_acid_mapping = {
#     'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4,
#     'G': 5, 'H': 6, 'I': 7, 'K': 8, 'L': 9,
#     'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14,
#     'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19,
#     'X': 20,  # Typically used for unknown amino acids
#     'B': 21,  # Asparagine or Aspartic acid
#     'Z': 22,  # Glutamine or Glutamic acid
#     'J': 23,  # Leucine or Isoleucine
#     '-': 24,  # Gap or padding
# }

# sec_struct_mapping = {'H': 0, 'E': 1, 'C': 2}  # Add more mappings if there are more labels


# def load_data(csv_file, train_dir):
#     # Load the sequences
#     seqs = pd.read_csv(csv_file)

#     # Load the protein data from the directory
#     protein_data = {}
#     for filename in os.listdir(train_dir):
#         if filename.endswith(".csv"):  # Check if the file is a CSV
#             protein_id = re.split(r'_train|_test', filename)[0]
#             protein_data[protein_id] = pd.read_csv(os.path.join(train_dir, filename))

#     return seqs, protein_data


# def encode_sequence(sequence):
#     # Convert each amino acid in the sequence to a one-hot encoded vector
#     encoded_sequence = np.zeros((len(sequence), len(amino_acid_mapping)), dtype=int)
#     for i, amino_acid in enumerate(sequence):
#         # Default to 'X' for unknown amino acids
#         index = amino_acid_mapping.get(amino_acid, amino_acid_mapping['X'])
#         encoded_sequence[i, index] = 1
#     return encoded_sequence


# def normalize_pssm(pssm, normalize_method='min-max'):
#     # Assuming the first two columns are non-numeric; adjust as necessary based on your actual data format
#     numeric_columns = pssm[:, 2:]  # Adjust this if your numeric data starts from a different column

#     # Convert to floats
#     try:
#         pssm_numeric = numeric_columns.astype(np.float32)
#     except ValueError as e:
#         # Handle or log the error if needed
#         raise ValueError(f"Error converting PSSM to float: {e}")

#     if normalize_method == 'min-max':
#         # Min-Max normalization
#         pssm_min = pssm_numeric.min(axis=0)
#         pssm_max = pssm_numeric.max(axis=0)
#         # Ensure no division by zero
#         pssm_range = np.where(pssm_max - pssm_min == 0, 1, pssm_max - pssm_min)
#         normalized_pssm = (pssm_numeric - pssm_min) / pssm_range
#     elif normalize_method == 'z-score':
#         # Z-Score normalization
#         pssm_mean = pssm_numeric.mean(axis=0)
#         pssm_std = pssm_numeric.std(axis=0)
#         # Avoid division by zero
#         pssm_std = np.where(pssm_std == 0, 1, pssm_std)
#         normalized_pssm = (pssm_numeric - pssm_mean) / pssm_std
#     else:
#         # If no normalization method provided, return the original PSSM
#         normalized_pssm = pssm_numeric

#     return normalized_pssm


# def prepare_data_point(idx, seqs, protein_data, label_file=None):
#     # Load the labels, if provided
#     if label_file:
#         labels = pd.read_csv(label_file)
#     else:
#         labels = None

#     protein_id = seqs.iloc[idx]['PDB_ID']
#     sequence = seqs.iloc[idx]['SEQUENCE']
#     encoded_sequence = encode_sequence(sequence)  # Encode the sequence
#     pssm = protein_data[protein_id].values  # Assuming you will process PSSM separately
#     normalized_pssm = normalize_pssm(pssm)  # Ensure this is uncommented to use normalized PSSM

#     if labels is not None:
#         label_seq = labels.iloc[idx]['SEC_STRUCT']
#         label_numeric = [sec_struct_mapping[char] for char in label_seq]
#         label_tensor = torch.tensor(label_numeric, dtype=torch.long)
#         return (
#             protein_id,
#             torch.tensor(encoded_sequence, dtype=torch.float32),
#             torch.tensor(normalized_pssm, dtype=torch.float32),
#             label_tensor
#         )

#     return (
#         protein_id,
#         torch.tensor(encoded_sequence, dtype=torch.float32),
#         torch.tensor(normalized_pssm, dtype=torch.float32)
#     )


# def create_dataloader(csv_file, train_dir, label_file=None, batch_size=4):
#     seqs, protein_data = load_data(csv_file, train_dir)
#     data = [prepare_data_point(idx, seqs, protein_data, label_file) for idx in range(len(seqs))]
#     return DataLoader(data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


# def collate_fn(batch):
#     _, sequences, pssms, labels_list = zip(*batch)  # Unzip the batch

#     # Pad sequences and PSSMs
#     sequences_padded = pad_sequence([seq.clone().detach() for seq in sequences], batch_first=True)

#     pssms_padded = pad_sequence([pssm.clone().detach() for pssm in pssms], batch_first=True)

#     # Handling labels correctly
#     if labels_list[0] is not None:  # Check if labels exist
#         labels_padded = pad_sequence([label.clone().detach() for label in labels_list], batch_first=True)

#     else:
#         labels_padded = None

#     # Create a mask based on the original sequence lengths
#     mask = [torch.ones(len(label), dtype=torch.uint8) for label in labels_list]
#     mask_padded = pad_sequence(mask, batch_first=True, padding_value=0)  # Assuming padding_value for labels is 0
#     return sequences_padded, pssms_padded, labels_padded, mask_padded


# class FullyConvolutionalProteinModel(torch.nn.Module):
#     def __init__(self, num_classes=3, input_channels=20):  # 20 for amino acid one-hot, adjust if using PSSM
#         super(FullyConvolutionalProteinModel, self).__init__()

#         # Define convolutional layers
#         self.conv1 = torch.nn.Conv1d(in_channels=input_channels, out_channels=64, kernel_size=3, padding=1)
#         self.conv2 = torch.nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
#         self.conv3 = torch.nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1)

#         # Final layer that maps to the number of classes
#         self.final_conv = torch.nn.Conv1d(in_channels=256, out_channels=num_classes, kernel_size=1)

#     def forward(self, x):
#         # Apply convolutional layers with activation functions
#         x = torch.nn.functional.relu(self.conv1(x))
#         x = torch.nn.functional.relu(self.conv2(x))
#         x = torch.nn.functional.relu(self.conv3(x))

#         # Apply final convolutional layer - no activation, as CrossEntropyLoss includes it
#         x = self.final_conv(x)

#         # No softmax here, as nn.CrossEntropyLoss applies it internally.
#         # Transpose the output to match [batch_size, sequence_length, num_classes]
#         # This makes it easier to calculate loss later
#         x = x.transpose(1, 2)

#         return x


# def train_model(model, criterion, optimizer, train_dataloader, num_epochs=10):
#     for epoch in range(num_epochs):
#         model.train()  # Set model to training mode
#         running_loss = 0.0
#         correct_preds = 0
#         total_preds = 0

#         for sequences, pssms, labels, _ in train_dataloader:
#             inputs = pssms.permute(0, 2, 1)  # Adjust for PSSM data

#             optimizer.zero_grad()

#             outputs = model(inputs)
#             loss = criterion(outputs.transpose(1, 2), labels)

#             loss.backward()
#             optimizer.step()

#             running_loss += loss.item() * inputs.size(0)

#             # Calculate training accuracy
#             _, predicted = torch.max(outputs, 2)  # Get the index of the max log-probability
#             correct_preds += (predicted == labels).sum().item()
#             total_preds += labels.numel()

#         epoch_loss = running_loss / len(train_dataloader.dataset)
#         epoch_acc = correct_preds / total_preds
#         print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}')


# def test_model_direct(model, test_dataset, output_file='./submission.csv'):
#     model.eval()  # Set the model to evaluation mode
#     predictions = []

#     with torch.no_grad():
#         for i in range(len(test_dataset)):  # Iterate directly over the dataset
#             pdb_id, _, pssm = test_dataset[i]  # Assuming the dataset returns PDB_ID, sequence, and PSSM

#             # Prepare the input tensor; add an extra batch dimension using unsqueeze
#             input_pssm = pssm.unsqueeze(0).permute(0, 2, 1)  # Adjust dimensions to [1, features, seq_len]

#             # Make a prediction
#             outputs = model(input_pssm)
#             _, predicted = torch.max(outputs, 2)  # Get the index of max log-probability

#             # Process the predictions
#             seq_len = pssm.shape[0]  # Assuming pssm is [features, seq_len]
#             for j in range(seq_len):
#                 residue_id = f"{pdb_id}_{j + 1}"  # Construct the ID
#                 structure_label = ['H', 'E', 'C'][predicted[0, j].item()]  # Map numeric predictions to labels
#                 predictions.append([residue_id, structure_label])

#     # Write predictions to CSV
#     pd.DataFrame(predictions, columns=['ID', 'STRUCTURE']).to_csv(output_file, index=False)
#     print(f'Submission file saved to {output_file}')


# if __name__ == "__main__":
#     train_dataloader = create_dataloader(seqs_train_path, train_path, label_file=labels_train_path, batch_size=4)

#     # Model definition and training...
#     model = FullyConvolutionalProteinModel()
#     criterion = torch.nn.CrossEntropyLoss()
#     optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001, weight_decay=0.0)
#     num_epochs = 10

#     train_model(model, criterion, optimizer, train_dataloader)

#     # Test model on test dataset and create submission file
#     test_dataset = create_dataloader(seqs_test_path, test_path)
#     test_model_direct(model, test_dataset)


RuntimeError: stack expects each tensor to be equal size, but got [307, 25] at entry 0 and [142, 25] at entry 1