In [None]:
!pip install evaluate
!pip install transformers
!pip install datasets
!pip install optuna

Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.4-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.4
Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.2-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.7/242.7 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9

In [None]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import EarlyStoppingCallback
import evaluate
import optuna

In [None]:
# Read the PSS (protein secondary structure) table from the Colab filesystem
pss_csv_path = '/content/subset_file_50000_100000.csv'
pss_data = pd.read_csv(pss_csv_path)

# Preview the leading rows as the cell output
pss_data.head()

Unnamed: 0,pdb_id,chain_code,seq,sst8,sst3,len,has_nonstd_aa,Exptl.,resolution,R-factor,FreeRvalue
0,1FV1,F,NPVVHFFKNIVTPRTPPPSQ,CCCCCBCCCCCCCCCCCCCC,CCCCCECCCCCCCCCCCCCC,20,False,XRAY,1.9,0.23,0.27
1,1LM8,H,DLDLEMLAPYIPMDDDFQLR,CCCCCCCCCBCCSCCCEECC,CCCCCCCCCECCCCCCEECC,20,False,XRAY,1.85,0.2,0.24
2,1O06,A,EEDPDLKAAIQESLREAEEA,CCCHHHHHHHHHHHHHHHTC,CCCHHHHHHHHHHHHHHHCC,20,False,XRAY,1.45,0.19,0.22
3,1QOW,D,CTFTLPGGGGVCTLTSECI*,CCTTSCTTCSSTTSSTTCCC,CCCCCCCCCCCCCCCCCCCC,20,True,XRAY,1.06,0.14,1.0
4,1RDQ,I,TTYADFIASGRTGRRNAIHD,CHHHHHHTSSCSSCCCCEEC,CHHHHHHCCCCCCCCCCEEC,20,False,XRAY,1.26,0.13,0.16


In [None]:
# Data augmentation by oversampling the minority classes.
#
# `sst3` stores one label per residue (e.g. "CCCHHHEEC"), so comparing the whole
# string with 'C' / 'E' / 'H' never matches a multi-residue sequence and yields
# empty frames. Each sequence is therefore assigned to its most frequent
# residue class before the classes are balanced.
SST3_CLASSES = ('C', 'E', 'H')


def dominant_sst3_class(sst3_string):
    """Return the most frequent of C/E/H in a per-residue label string.

    Ties are resolved in the order C, E, H. Returns None when the value is not
    a string (e.g. NaN) or contains none of the three labels.
    """
    if not isinstance(sst3_string, str):
        return None
    counts = {cls: sst3_string.count(cls) for cls in SST3_CLASSES}
    winner = max(SST3_CLASSES, key=counts.get)
    return winner if counts[winner] > 0 else None


def oversample_to(class_frame, target_size):
    """Resample `class_frame` with replacement up to `target_size` rows.

    Empty frames and frames that already have `target_size` rows are returned
    unchanged, so the majority class keeps every original row exactly once.
    """
    if class_frame.empty or class_frame.shape[0] == target_size:
        return class_frame
    return class_frame.sample(target_size, replace=True, random_state=42)


# Per-sequence class used only for grouping; it is not added to the frame
sequence_class = pss_data['sst3'].map(dominant_sst3_class)

# Separate the dataset into the three classes
coil_data = pss_data[sequence_class == 'C']
sheet_data = pss_data[sequence_class == 'E']
helix_data = pss_data[sequence_class == 'H']

# Determine the maximum count among the classes (the majority class)
max_class_size = max(coil_data.shape[0], sheet_data.shape[0], helix_data.shape[0])

# Oversample every class that is smaller than the majority class
coil_data_oversampled = oversample_to(coil_data, max_class_size)
sheet_data_oversampled = oversample_to(sheet_data, max_class_size)
helix_data_oversampled = oversample_to(helix_data, max_class_size)

# Combine the data back into one balanced dataset
balanced_data = pd.concat([coil_data_oversampled, sheet_data_oversampled, helix_data_oversampled])

# Shuffle the balanced dataset
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Display the new class distribution (sequences per dominant class)
balanced_distribution = balanced_data['sst3'].map(dominant_sst3_class).value_counts()
print(balanced_distribution)

Series([], Name: count, dtype: int64)


In [None]:
# Build the model inputs and targets.
# Rows without an 'sst3' annotation cannot be used as training targets.
pss_data_cleaned = pss_data.dropna(subset=['sst3'])

# X: amino-acid sequences, y: per-residue secondary-structure strings (C, E, H)
X, y = pss_data_cleaned['seq'], pss_data_cleaned['sst3']

# 80 / 10 / 10 split into train / validation / test.
# No stratification: every label string is (practically) unique.
train_texts, holdout_texts, train_labels, holdout_labels = train_test_split(
    X, y, test_size=0.2, random_state=42
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    holdout_texts, holdout_labels, test_size=0.5, random_state=42
)

In [None]:
# Checkpoint identifier shared by the tokenizer below and by every model
# instantiated later in the notebook.
model_name = "facebook/esm2_t30_150M_UR50D"  # ESM2 checkpoint on the Hugging Face Hub

# Tokenizer matching the checkpoint above (fetched on first use)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization helper
def tokenize_data(texts, tokenizer, max_length=128):
    """Tokenize a collection of sequences into padded PyTorch tensors.

    Args:
        texts: Iterable of sequence strings; materialised into a list first.
        tokenizer: Callable tokenizer accepting the keyword options below.
        max_length: Upper bound on the tokenized length; longer inputs are
            truncated.

    Returns:
        Whatever `tokenizer` returns for the batch (padded to the longest
        sequence in the batch, as PyTorch tensors).
    """
    tokenizer_options = {
        "max_length": max_length,
        "truncation": True,
        "padding": True,
        "return_tensors": "pt",  # PyTorch tensors
    }
    sequences = list(texts)
    return tokenizer(sequences, **tokenizer_options)

# Encode the train / validation / test sequences (in that order)
train_encodings, val_encodings, test_encodings = [
    tokenize_data(split_texts, tokenizer)
    for split_texts in (train_texts, val_texts, test_texts)
]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/93.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
from datasets import Dataset
import torch

# Integer class id for each per-residue secondary-structure character
label_map = {'C': 0, 'E': 1, 'H': 2} # Coil (C) -> 0, Sheet (E) -> 1, Helix (H) -> 2

# Convert per-residue label strings to padded integer rows aligned with the tokens
def prepare_labels(label_sequences, label_map, max_length, pad_value=-100,
                   num_prefix_tokens=1, num_suffix_tokens=1):
    """Turn label strings into fixed-length integer rows aligned with token ids.

    The ESM tokenizer wraps every sequence as ``<cls> residues... <eos>``, so
    the label of residue ``i`` belongs at token position ``i + 1``. Starting
    the labels at position 0 (the previous behaviour) shifts every target one
    token to the left of its residue.

    Args:
        label_sequences: Iterable of strings, one character per residue.
        label_map: Mapping from label character to integer class id.
            Characters missing from the mapping become ``pad_value``.
        max_length: Length of every returned row (the padded token length).
        pad_value: Value written at special-token and padding positions.
        num_prefix_tokens: Special tokens the tokenizer puts before the
            residues (1 for ``<cls>``). Pass 0 for the unshifted layout.
        num_suffix_tokens: Special tokens the tokenizer puts after the
            residues (1 for ``<eos>``). Reserved so a truncated sequence
            never labels the end-of-sequence token.

    Returns:
        A list of lists of ints, each exactly ``max_length`` long.
    """
    # Number of residue positions left once the special tokens are reserved
    residue_budget = max(0, max_length - num_prefix_tokens - num_suffix_tokens)

    numeric_labels = []
    for seq in label_sequences:
        residue_labels = [label_map.get(char, pad_value) for char in seq[:residue_budget]]
        row = ([pad_value] * num_prefix_tokens + residue_labels)[:max_length]
        # Everything after the residues (suffix token and padding) is ignored
        row += [pad_value] * (max_length - len(row))
        numeric_labels.append(row)
    return numeric_labels

# Padded token length of each split, read from the tokenizer output
train_max_length, val_max_length, test_max_length = (
    split_encodings["input_ids"].shape[1]
    for split_encodings in (train_encodings, val_encodings, test_encodings)
)

# Integer label rows, padded to the token length of their own split
train_labels_numeric_padded, val_labels_numeric_padded, test_labels_numeric_padded = (
    prepare_labels(split_labels, label_map, split_length)
    for split_labels, split_length in (
        (train_labels, train_max_length),
        (val_labels, val_max_length),
        (test_labels, test_max_length),
    )
)


# Wrap each split in a Hugging Face Dataset built from torch tensors
def _as_hf_dataset(encodings, padded_labels):
    """Bundle tokenizer output and padded numeric labels into one Dataset."""
    return Dataset.from_dict({
        "input_ids": encodings["input_ids"].clone().detach(),
        "attention_mask": encodings["attention_mask"].clone().detach(),
        "labels": torch.tensor(padded_labels, dtype=torch.long),  # padded numeric labels
    })


train_data = _as_hf_dataset(train_encodings, train_labels_numeric_padded)
val_data = _as_hf_dataset(val_encodings, val_labels_numeric_padded)
test_data = _as_hf_dataset(test_encodings, test_labels_numeric_padded)

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments # Added import here

# Factory used by both the Optuna objective and the final training run
def create_model(params):
    """Build a fresh ESM2 token-classification model and its TrainingArguments.

    Args:
        params: Mapping with the keys 'learning_rate', 'batch_size' and
            'epochs' (the hyperparameters being tuned).

    Returns:
        Tuple ``(model, training_args)``.
    """
    # Token-classification head with 3 classes (C, E, H)
    token_classifier = AutoModelForTokenClassification.from_pretrained(
        model_name, num_labels=3
    )

    argument_values = dict(
        output_dir="./esm2_token_output",
        eval_strategy="epoch",
        # Tuned hyperparameters
        learning_rate=params['learning_rate'],
        per_device_train_batch_size=params['batch_size'],
        num_train_epochs=params['epochs'],
        # Fixed settings
        per_device_eval_batch_size=32,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        # Checkpoint every epoch so the best epoch (by F1) can be restored
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        report_to="none",
    )
    return token_classifier, TrainingArguments(**argument_values)

In [None]:
# Fetch the metric implementations used by compute_metrics
f1_metric, precision_metric, recall_metric, accuracy_metric = (
    evaluate.load(metric_id)
    for metric_id in ("f1", "precision", "recall", "accuracy")
)

# Metric callback handed to the Trainer: accuracy, precision, recall, F1 and Q3
def compute_metrics(eval_pred):
    """Compute per-residue metrics while ignoring padded positions.

    Args:
        eval_pred: Pair ``(logits, labels)``. The class axis of ``logits`` is
            the last one; ``labels`` holds class ids with -100 at positions
            to ignore. (Presumably logits are (batch, seq_len, num_labels) --
            confirm against the Trainer.)

    Returns:
        Dict with 'accuracy', 'precision', 'recall', 'f1' (precision, recall
        and F1 are weighted averages) and 'q3_accuracy'.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Flatten the predictions and labels and remove padding (label -100)
    flat_predictions = predictions.flatten()
    flat_labels = labels.flatten()

    mask = flat_labels != -100
    filtered_predictions = flat_predictions[mask]
    filtered_labels = flat_labels[mask]

    # Accuracy plus weighted precision / recall / F1 on the real residues only
    accuracy = accuracy_metric.compute(predictions=filtered_predictions, references=filtered_labels)
    precision = precision_metric.compute(predictions=filtered_predictions, references=filtered_labels, average="weighted")
    recall = recall_metric.compute(predictions=filtered_predictions, references=filtered_labels, average="weighted")
    f1 = f1_metric.compute(predictions=filtered_predictions, references=filtered_labels, average="weighted")

    # Q3 accuracy: fraction of non-padded residues predicted correctly.
    # Counted with NumPy instead of the builtin sum(), which would iterate
    # over the array element by element in Python.
    residue_count = filtered_labels.size
    if residue_count > 0:
        q3_correct = int(np.count_nonzero(filtered_predictions == filtered_labels))
        q3_accuracy = q3_correct / residue_count
    else:
        q3_accuracy = 0.0

    return {
        "accuracy": accuracy["accuracy"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "f1": f1["f1"],
        "q3_accuracy": q3_accuracy
    }

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# Optuna objective for hyperparameter optimization
def objective(trial):
    """Train one model with trial-suggested hyperparameters.

    Args:
        trial: Optuna trial used to sample 'learning_rate', 'batch_size' and
            'epochs'.

    Returns:
        The 'eval_f1' value reported by ``trainer.evaluate`` on ``val_data``.
    """
    # Define hyperparameters to tune.
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
    # and samples from the same log-uniform range.
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 6e-5, log=True),
        'batch_size': trial.suggest_int('batch_size', 8, 32),
        'epochs': trial.suggest_int('epochs', 3, 10)
    }

    # Create model and training args with hyperparameters
    model, training_args = create_model(params)

    # Trainer setup. `processing_class` is the current name of the deprecated
    # `tokenizer` argument.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=val_data,
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Early stopping
    )

    # Train the model
    trainer.train()

    # Evaluate on the validation split; metrics come from compute_metrics
    eval_results = trainer.evaluate(val_data)

    # The study maximizes this F1 value
    return eval_results["eval_f1"]

In [None]:
# Set up the Optuna study and run the hyperparameter search.
# The sampler is seeded (42, like the data splits) so that a re-run proposes
# the same hyperparameter candidates.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=2)

# Output the best hyperparameters
print("Best hyperparameters: ", study.best_trial.params)

[I 2025-07-04 08:38:00,644] A new study created in memory with name: no-name-1bfbd122-2ff1-48b5-9e80-5c57417a1338
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 6e-5),
Some weights of EsmForTokenClassification were not initialized from the model checkpoint at facebook/esm2_t30_150M_UR50D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Q3 Accuracy
1,0.4083,0.42399,0.83119,0.833083,0.83119,0.831244,0.83119
2,0.3732,0.415591,0.835477,0.836709,0.835477,0.835662,0.835477
3,0.2653,0.434204,0.834041,0.835558,0.834041,0.834054,0.834041
4,0.2401,0.464562,0.831067,0.831618,0.831067,0.831148,0.831067


[I 2025-07-04 09:04:15,197] Trial 0 finished with value: 0.8356616384613541 and parameters: {'learning_rate': 4.574671250364365e-05, 'batch_size': 20, 'epochs': 5}. Best is trial 0 with value: 0.8356616384613541.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 6e-5),
Some weights of EsmForTokenClassification were not initialized from the model checkpoint at facebook/esm2_t30_150M_UR50D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Q3 Accuracy
1,0.4114,0.431453,0.828481,0.829871,0.828481,0.828626,0.828481
2,0.3666,0.418253,0.833796,0.83506,0.833796,0.833855,0.833796
3,0.3282,0.437556,0.832701,0.834896,0.832701,0.83277,0.832701
4,0.2488,0.465107,0.829576,0.829761,0.829576,0.829543,0.829576


[I 2025-07-04 09:30:50,473] Trial 1 finished with value: 0.8338550784734576 and parameters: {'learning_rate': 4.020880533239521e-05, 'batch_size': 23, 'epochs': 8}. Best is trial 0 with value: 0.8356616384613541.


Best hyperparameters:  {'learning_rate': 4.574671250364365e-05, 'batch_size': 20, 'epochs': 5}


In [None]:
# Train the final model with the best hyperparameters found by Optuna
best_params = study.best_trial.params

# Fresh model and training arguments built from the best hyperparameters
model, training_args = create_model(best_params)

# `processing_class` is the current name of the deprecated `tokenizer`
# argument.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Run the training; the returned TrainOutput is shown as the cell output
trainer.train()

Some weights of EsmForTokenClassification were not initialized from the model checkpoint at facebook/esm2_t30_150M_UR50D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Q3 Accuracy
1,0.4083,0.42399,0.83119,0.833083,0.83119,0.831244,0.83119
2,0.3732,0.415591,0.835477,0.836709,0.835477,0.835662,0.835477
3,0.2653,0.434204,0.834041,0.835558,0.834041,0.834054,0.834041
4,0.2401,0.464562,0.831067,0.831618,0.831067,0.831148,0.831067
5,0.1712,0.501419,0.827027,0.827842,0.827027,0.827088,0.827027


TrainOutput(global_step=1820, training_loss=0.31430873379602536, metrics={'train_runtime': 2160.5799, 'train_samples_per_second': 16.806, 'train_steps_per_second': 0.842, 'total_flos': 4119036256696320.0, 'train_loss': 0.31430873379602536, 'epoch': 5.0})

In [None]:
from sklearn.metrics import classification_report

# Run inference on the test set once. `predict` returns the raw predictions,
# the label ids and the metrics computed by compute_metrics, so a separate
# `trainer.evaluate(test_data)` pass is not needed. The "eval" prefix keeps
# the metric keys identical to those produced by `trainer.evaluate`.
predictions = trainer.predict(test_data, metric_key_prefix="eval")
eval_results = predictions.metrics

# Headline metrics. The F1 value is stored as `weighted_f1` so it does not
# shadow the `f1_score` function imported from sklearn.metrics.
q3_accuracy = eval_results["eval_q3_accuracy"]
weighted_f1 = eval_results["eval_f1"]

# Per-token predicted class ids and the matching reference labels
predicted_labels = np.argmax(predictions.predictions, axis=-1)
true_labels = predictions.label_ids

# Flatten and filter out padding (-100)
flat_predicted_labels = predicted_labels.flatten()
flat_true_labels = true_labels.flatten()

mask = flat_true_labels != -100
filtered_predicted_labels = flat_predicted_labels[mask]
filtered_true_labels = flat_true_labels[mask]

# Print Q3 accuracy and F1 score
print("Q3 Accuracy: {:.4f}".format(q3_accuracy))
print("Weighted F1 Score: {:.4f}".format(weighted_f1))

# Class names in label_map order: 0 = Coil, 1 = Sheet, 2 = Helix
target_names = ["Coil", "Sheet", "Helix"]
# Explicit label ids so all three classes appear even if one has no support
report_labels = [0, 1, 2]

print("\nClassification Report:")
print(classification_report(filtered_true_labels, filtered_predicted_labels, labels=report_labels, target_names=target_names))

Q3 Accuracy: 0.8300
Weighted F1 Score: 0.8300

Classification Report:
              precision    recall  f1-score   support

        Coil       0.81      0.84      0.83     47087
       Sheet       0.82      0.79      0.81     24198
       Helix       0.86      0.84      0.85     36067

    accuracy                           0.83    107352
   macro avg       0.83      0.82      0.83    107352
weighted avg       0.83      0.83      0.83    107352



In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so the model can be saved there below
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Destination directory on the mounted Google Drive.
# NOTE(review): presumably the trainer creates the directory if it is missing -- confirm.
save_path = "/content/drive/MyDrive/esm2large_protein_secondary_structure_model"

# Persist the trained model through the trainer
# (the tokenizer passed to the trainer is presumably saved alongside -- confirm)
trainer.save_model(save_path)

# Echo the destination so it is visible in the cell output
print(f"Model saved to {save_path}")

Model saved to /content/drive/MyDrive/esm2large_protein_secondary_structure_model
