In [1]:
!pip install unsloth peft evaluate trl transformers datasets accelerate xformers
!pip install torch transformers peft bitsandbytes accelerate datasets

[0m

In [1]:
import os
import torch
import evaluate
import pandas as pd
import torch.nn.functional as F
from torch.utils.data import DataLoader,Dataset
from sklearn.model_selection import StratifiedShuffleSplit
from transformers import DataCollatorWithPadding,DataCollatorForSeq2Seq
from transformers import AutoTokenizer, GPT2LMHeadModel,TrainingArguments, Trainer,GPT2Config,EarlyStoppingCallback
from peft import LoraConfig, get_peft_model, TaskType

In [2]:
#Use GPU else specify '-1' for CPU
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
#Load training data
data=pd.read_csv('Hydroxylation (P)/training.csv')
data
#Check the positive and negative labels
data['Label'].value_counts()
#Remove \n and - characters from the sequence
data['Seq']=data['Seq'].str.replace('-','')
data['Seq']=data['Seq'].str.replace('\n','')
data
#Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained('nferruz/ProtGPT2',bos_token='<startoftext>',eos_token='<endoftext>',pad_token='<PAD>')
#Add custom tokens
tokenizer.add_tokens(['SEQUENCE:','LABEL:','POSITIVE','NEGATIVE'])
tokenizer.special_tokens_map
#Map positive/negative labels and prepare prompt for training
class SequenceClassificationDataset(Dataset):
    def __init__(self, sequences, labels, tokenizer,dtype='Train'):
        self.sequences = sequences
        self.labels = labels
        self.tokenizer = tokenizer
        self.map_label={1:'POSITIVE',0:'NEGATIVE'}
        self.dtype='Train'
    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        label = self.labels[idx]
        prep_txt1= f'<startoftext>SEQUENCE:{sequence}\nLABEL:{self.map_label[label]}<endoftext>'
        encoding1 = self.tokenizer(prep_txt1,return_tensors='pt')
        return {
            'input_ids': encoding1['input_ids'].squeeze(), 
            'attention_mask': encoding1['attention_mask'].squeeze(), 
            'labels': encoding1['input_ids'].squeeze()
        }
train_texts=data['Seq'].reset_index(drop=True)
train_labels=data['Label'].reset_index(drop=True)
train_dataset=SequenceClassificationDataset(train_texts,train_labels,tokenizer,'Train')

config.json:   0%|          | 0.00/850 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/655k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/314k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

In [3]:
# Load the pre-trained model
model_config = GPT2Config.from_pretrained('nferruz/ProtGPT2')
model = GPT2LMHeadModel.from_pretrained('nferruz/ProtGPT2', config=model_config, ignore_mismatched_sizes=True)

# Resize token embeddings to include new tokens
model.resize_token_embeddings(len(tokenizer))

# Define LoRA configuration
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,   # GPT-2 is a causal LM
    inference_mode=False,           # Training mode
    r=8,                            # Rank of the low-rank matrix
    lora_alpha=16,                  # Scaling factor
    lora_dropout=0.1,               # Dropout for regularization
    bias="none",                    # Train only LoRA weights, not biases
    target_modules=["lm_head"]
)

# Apply PEFT (LoRA) to the model
model = get_peft_model(model, peft_config)

# Optional: Print trainable parameters to verify PEFT is applied
model.print_trainable_parameters()

pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


trainable params: 412,352 || all params: 774,451,392 || trainable%: 0.0532




In [7]:
training_args = TrainingArguments(
    output_dir="/teamspace/studios/this_studio/results_checkpoints2",
    #num_train_epochs=200,
    num_train_epochs=800,
    per_device_train_batch_size=512,
    per_device_eval_batch_size=512,
    warmup_steps=500,
    #warmup_steps=50,
    weight_decay=0.01,
    logging_dir='logs/',
    save_steps=500,
    #save_steps=50,
    logging_steps=500,
    #logging_steps=50,
    save_total_limit=10,
    gradient_checkpointing=True,
    report_to="none",
    learning_rate=1e-03,
    #max_steps=200
)

In [9]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding='longest'),
    #label_names=[]  # This will now work
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [20]:
trainer.train()



Step,Training Loss
500,4.3221
1000,3.5327
1500,3.2952
2000,3.1739
2500,3.1088
3000,3.0667
3500,3.0404
4000,3.0213
4500,3.0068
5000,2.9963




TrainOutput(global_step=6600, training_loss=3.1901889407995974, metrics={'train_runtime': 1178.617, 'train_samples_per_second': 892.232, 'train_steps_per_second': 5.6, 'total_flos': 6.766891107714662e+16, 'train_loss': 3.1901889407995974, 'epoch': 200.0})

In [6]:
trainer.train(resume_from_checkpoint="/teamspace/studios/this_studio/results_checkpoints2/checkpoint-6600")

	per_device_train_batch_size: 256 (from args) != 160 (from trainer_state.json)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
7000,2.9863
7500,2.975
8000,2.9646
8500,2.9566
9000,2.9479
9500,2.9426
10000,2.9363
10500,2.9322
11000,2.9264
11500,2.9215




KeyboardInterrupt: 

In [10]:
trainer.train(resume_from_checkpoint="/teamspace/studios/this_studio/results_checkpoints2/checkpoint-13500")

	per_device_train_batch_size: 512 (from args) != 160 (from trainer_state.json)




Step,Training Loss
14000,2.906
14500,2.9048
15000,2.9026
15500,2.9006
16000,2.8993
16500,2.8974
17000,2.8952
17500,2.8929
18000,2.8914
18500,2.8906




KeyboardInterrupt: 

In [None]:
# Install required packages (run once)
#!pip install transformers peft pandas scikit-learn torch

import torch
import pandas as pd
from transformers import AutoTokenizer, GPT2LMHeadModel, GPT2Config
from peft import PeftModel
from sklearn.metrics import (
    f1_score,
    matthews_corrcoef,
    precision_score,
    recall_score
)
#from google.colab import files

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"üöÄ Using device: {device}")
print(f"GPU Available: {torch.cuda.is_available()}")
print(f"GPU Name: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'N/A'}")

# Path to your trained model
model_path = "/teamspace/studios/this_studio/results_checkpoints2/checkpoint-24000"

# Load base model config with weight tying disabled
config = GPT2Config.from_pretrained("nferruz/ProtGPT2", tie_word_embeddings=False)

# Load base model and tokenizer
base_model_name = "nferruz/ProtGPT2"
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    bos_token='<startoftext>',
    eos_token='<endoftext>',
    pad_token='<PAD>'
)

# Add special tokens used during training
special_tokens = ['SEQUENCE:', 'LABEL:', 'POSITIVE', 'NEGATIVE']
tokenizer.add_tokens(special_tokens)

# Load base model
base_model = GPT2LMHeadModel.from_pretrained(base_model_name, config=config)
base_model.resize_token_embeddings(len(tokenizer))

# Apply PEFT (LoRA) configuration and load trained adapter
model = PeftModel.from_pretrained(base_model, model_path)
model = model.to(device)
model.eval()  # Set model to evaluation mode

# Debug info
print("\nüîç Tokenizer Info:")
print("Special tokens:", tokenizer.special_tokens_map)
print("Vocabulary size:", len(tokenizer))
print("Model embeddings:", model.get_input_embeddings().weight.shape)

# Track unknown predictions
unknown_count = 0

def preprocess_sequence(seq):
    """Preprocess the input sequence as done during training."""
    return seq.replace('-', '').replace('\n', '')

def predict_label(input_sequence):
    """
    Predict whether the given protein sequence is POSITIVE or NEGATIVE.
    
    Args:
        input_sequence (str): Raw protein sequence string.
        
    Returns:
        str: Predicted label ('POSITIVE' or 'NEGATIVE')
    """
    global unknown_count
    
    # Clean and format input
    clean_seq = preprocess_sequence(input_sequence)
    prompt = f"<startoftext>SEQUENCE:{clean_seq}\nLABEL:"

    # Tokenize prompt
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=20,
            num_beams=5,
            early_stopping=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            #temperature=0.7,  # Add randomness for better exploration
            top_k=50,         # Limit to top 50 tokens
            do_sample=True    # Enable sampling
        )

    # Decode generated output
    full_output = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # Extract the predicted label from the generated text
    try:
        # Get everything after "LABEL:"
        label_part = full_output.split("LABEL:")[-1].strip()
        
        # Remove special tokens like <endoftext> and extra whitespace
        label_part = label_part.replace("<endoftext>", "").strip()
        
        # Take only the first word (label)
        predicted_tokens = label_part.split()
        predicted_label = None
        
        # Strict match for POSITIVE/NEGATIVE
        for token in predicted_tokens:
            if token == "POSITIVE":
                predicted_label = "POSITIVE"
                break
            elif token == "NEGATIVE":
                predicted_label = "NEGATIVE"
                break
        
        if predicted_label is None:
            predicted_label = "NEGATIVE"
            unknown_count += 1
            
    except Exception as e:
        predicted_label = "NEGATIVE"
        unknown_count += 1

    return predicted_label

# --- Benchmarking Code ---

print("\nüìÅ Upload your test CSV file (must contain 'Seq' and 'Label' columns):")
#uploaded = files.upload()
#test_file = next(iter(uploaded))

# Load test data
test_df = pd.read_csv("/teamspace/studios/this_studio/Hydroxylation (P)/benchmark.csv")

# Validate required columns
if 'Seq' not in test_df.columns or 'Label' not in test_df.columns:
    raise ValueError("Test CSV must contain 'Seq' and 'Label' columns")

# Map labels to strings
label_map = {1: 'POSITIVE', 0: 'NEGATIVE'}

# Run predictions
true_labels = []
predicted_labels = []
sequence_list = []

print(f"\nüß† Running predictions on {len(test_df)} samples...")

for idx, row in test_df.iterrows():
    seq = row['Seq']
    true_label = label_map.get(row['Label'], 'UNKNOWN')
    
    pred = predict_label(seq)
    
    true_labels.append(true_label)
    predicted_labels.append(pred)
    sequence_list.append(seq)
    
    # Progress indicator + sample output
    if idx < 10:  # Show first 10 samples
        print(f"\nSample {idx+1}:")
        print(f"Sequence: {seq[:30]}...")  # Truncate long sequences
        print(f"True:     {true_label}")
        print(f"Predicted: {pred}")

# Create DataFrame with all predictions
results_df = pd.DataFrame({
    'Sequence': sequence_list,
    'True_Label': true_labels,
    'Predicted_Label': predicted_labels
})

# Save results to CSV
results_df.to_csv('/content/predictions.csv', index=False)

# Compute metrics
try:
    f1 = f1_score(true_labels, predicted_labels, pos_label='POSITIVE', average='binary')
except:
    f1 = 0.0
    
try:
    mcc = matthews_corrcoef(true_labels, predicted_labels)
except:
    mcc = 0.0
    
try:
    precision = precision_score(true_labels, predicted_labels, pos_label='POSITIVE', average='binary')
except:
    precision = 0.0
    
try:
    recall = recall_score(true_labels, predicted_labels, pos_label='POSITIVE', average='binary')
except:
    recall = 0.0
    
try:
    avg_f1 = f1_score(true_labels, predicted_labels, average='macro')
except:
    avg_f1 = 0.0

# Output in requested format
output_line = f"{model_path},{f1:.4f},{mcc:.4f},{precision:.4f},{recall:.4f},{avg_f1:.4f}"

# Print results
print("\n\nüìã Final Benchmark Results (One-line format):")
print(output_line)

print(f"\n‚ö†Ô∏è Unknown predictions: {unknown_count} / {len(test_df)}")
print("These were defaulted to 'NEGATIVE' for metric calculation.")

print("\nüìé Predictions saved to /content/predictions.csv")
print("You can download this file from the Colab sidebar or run:")
print("!download predictions.csv")

üöÄ Using device: cuda
GPU Available: True
GPU Name: NVIDIA L40S


The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`



üîç Tokenizer Info:
Special tokens: {'bos_token': '<startoftext>', 'eos_token': '<endoftext>', 'unk_token': '<|endoftext|>', 'pad_token': '<PAD>'}
Vocabulary size: 50264
Model embeddings: torch.Size([50264, 1280])

üìÅ Upload your test CSV file (must contain 'Seq' and 'Label' columns):

üß† Running predictions on 535 samples...

Sample 1:
Sequence: MIEDIGESDSPIPLPNVTSTI...
True:     NEGATIVE
Predicted: NEGATIVE

Sample 2:
Sequence: CLLLFPLTAVPMDGDQPADRP...
True:     NEGATIVE
Predicted: NEGATIVE

Sample 3:
Sequence: GWTSNPEELDPIRLALLGKSG...
True:     NEGATIVE
Predicted: NEGATIVE

Sample 4:
Sequence: SSADRAANRLPGFGVITNIIN...
True:     NEGATIVE
Predicted: NEGATIVE

Sample 5:
Sequence: EVEVDPITTFPLKGLTPLTEY...
True:     NEGATIVE
Predicted: NEGATIVE

Sample 6:
Sequence: TDVTGRVLQPPSILYGGRNKA...
True:     NEGATIVE
Predicted: NEGATIVE

Sample 7:
Sequence: GGVEEVPLAQPESKRDILFLF...
True:     NEGATIVE
Predicted: NEGATIVE

Sample 8:
Sequence: AKQSGEYWIDPNQGSVEDAIK...
True:     NEGATIVE
Predic

# Model Setup Differences  

In my implementation, I utilized Parameter-Efficient Fine-Tuning (PEFT)  with low-rank adapters, where only a small subset of parameters (adapter weights + lm_head) were trained, while the majority of the base model remained frozen. This choice prioritized computational efficiency but restricted the model‚Äôs ability to deeply adapt its architecture to the task. In contrast, the paper employed full fine-tuning , updating all model parameters‚Äîincluding attention mechanisms, feed-forward networks, and embeddings‚Äîto enable comprehensive adaptation to task-specific patterns. Additionally, the paper did'nt use adapters, allowing greater expressiveness, whereas my implementation relied on extremely low-rank adapters (e.g., rank=8), further limiting the model‚Äôs capacity to capture complex relationships. Notably, neither the paper nor my implementation explicitly addressed class imbalance  in the dataset. Both approaches worked with a dataset containing a large number of negative classes . However, the paper‚Äôs full fine-tuning likely mitigated this imbalance implicitly through its ability to adapt contextual features across all layers, while my PEFT setup exacerbated the issue due to shallow training and limited parameter updates. While the paper‚Äôs exact training duration is unspecified, I trained my model for an extensive 728.27 epochs. This suggests my model was trained extensively, yet performance remains suboptimal‚Äîa critical observation that shifts the focus from training duration to structural limitations in my PEFT setup. This is likely due to the nature of Unsupervised Fine-tuning. 

# Performance Metrics Comparison  

Despite running my implementation for 728.27 epochs , the performance gap compared to the paper‚Äôs results remains stark. The paper reported an F1-Score of 92.30% , MCC of 89.89% , and Recall of 88.73% , reflecting a well-balanced model that effectively identifies both positive and negative cases. In contrast, my implementation achieved an F1-Score of 22.50% , MCC of 31.04% , and Recall of 12.68% , indicating severe degradation in capturing positive instances. While my model‚Äôs Precision of 100%  suggests it avoids false positives, this comes at the cost of missing most true positives‚Äîa direct consequence of shallow training and low-rank adapters, compounded by the dataset‚Äôs inherent class imbalance. These metrics highlight the limitations of PEFT in resource-constrained scenarios, where frozen layers and low-rank adapters hinder the model‚Äôs ability to learn nuanced task-specific patterns‚Äîeven after hundreds of epochs. Although there is the fact that the paper may have run even more epochs

# Root Causes of Discrepancies  

The primary driver of the performance gap lies in limited parameter adaptation . Full fine-tuning allows the model to reconfigure all layers for the task, whereas my PEFT setup restricted updates to low-rank adapters, compressing information into a narrow subspace. For instance, a rank-8 adapter reduces a weight matrix from 1024√ó1024 to two smaller matrices (1024√ó8 and 8√ó1024), discarding critical information. Additionally, training only the lm_head forced my model to rely on pre-trained contextual features, which may not align with the task‚Äôs requirements. The large number of negative classes in the dataset further exacerbated the issue, as my model prioritized minimizing false positives, leading to poor recall. Crucially, even though I trained for 728.27 epochs , the performance stagnation indicates that training duration alone cannot compensate for structural limitations  like low-rank adapters or frozen layers. My model simply may have lacked the capacity to learn meaningful patterns due to constrained parameter updates, regardless of how long it trained. Meanwhile, the paper‚Äôs full fine-tuning likely leveraged its ability to adapt all layers‚Äîincluding attention mechanisms and FFN modules‚Äîto implicitly balance the dataset‚Äôs skewed distribution, even without explicit mitigation strategies.  

In [7]:
trainer.train(resume_from_checkpoint="/teamspace/studios/this_studio/results_checkpoints/checkpoint-12500")

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
13000,4.5157
13500,4.5266
14000,4.5309
14500,4.5193
15000,4.5218
15500,4.5281
16000,4.5164
16500,4.5184
17000,4.5298
17500,4.5139




KeyboardInterrupt: 

In [None]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,4.5482
1000,4.5533
1500,4.5706
2000,4.5517
2500,4.5497
3000,4.5568
3500,4.5461
4000,4.5448
4500,4.5462
5000,4.5434


