In [1]:
!pip install datasets


Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [2]:
# Import dependencies
import wandb                   # Experiment tracking: logs training and evaluation metrics
import torch                   # For tensor computations and model training
import datasets                # For working with HuggingFace datasets
import argparse                # For parsing command line arguments (if needed)
import numpy as np             # For numerical operations
import transformers            # HuggingFace Transformers library

# Additional imports for our implementation
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModel, default_data_collator
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import math

In [3]:
# Define the model checkpoint and the dataset to use
MODEL_NAME = "ibm/MoLFormer-XL-both-10pct"  # Same pre-trained model as in the first task
DATASET_PATH = "scikit-fingerprints/MoleculeNet_Lipophilicity"  # Same dataset as in the first task (without the external dataset)

# Define the regression model with a regression head

This section adapts the pre-trained transformer to a regression task by attaching a regression head: a linear layer that takes the final hidden state of the first token (treated as the [CLS] token) and maps it to a single scalar output.

In [4]:
class MoLFormerWithRegressionHead(nn.Module):
    """Pre-trained encoder followed by a single-output linear regression head.

    The encoder is obtained from ``AutoModel.from_pretrained`` and the head maps
    the hidden state at sequence position 0 to one scalar per input sequence.
    """

    def __init__(self, model_name):
        """Load the encoder and build the linear head.

        Args:
            model_name: Identifier passed to ``AutoModel.from_pretrained``.
        """
        super().__init__()
        # trust_remote_code=True allows the modelling code shipped with the
        # checkpoint to be executed. deterministic_eval is forwarded as-is;
        # presumably a MoLFormer config option -- confirm against the model card.
        encoder = AutoModel.from_pretrained(
            model_name,
            deterministic_eval=True,
            trust_remote_code=True,
        )
        # Register the encoder first and the head second (keeps the original
        # submodule order).
        self.base_model = encoder
        # The head's input width is read from the encoder configuration.
        self.regression_head = nn.Linear(encoder.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask=None):
        """Return one prediction per sequence.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Optional mask forwarded to the encoder.

        Returns:
            The head's output with its trailing dimension squeezed away.
        """
        encoded = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state is used as the sequence summary
        # (assumed to be the [CLS]-style token).
        first_token_state = encoded.last_hidden_state[:, 0, :]
        return self.regression_head(first_token_state).squeeze(-1)

# Tokenization function for SMILES strings

The function `tokenize_function` processes SMILES strings so that they can be fed into the model. Tokenization converts the raw SMILES strings into a numerical format (token IDs, plus an attention mask) that the model can work with.

In [5]:
def tokenize_function(examples):
    """Tokenize the "SMILES" entry of a batch with the module-level ``tokenizer``.

    Args:
        examples: Mapping whose "SMILES" entry holds the strings to encode.

    Returns:
        Whatever ``tokenizer`` returns for those strings when asked to pad and
        truncate to 128 tokens and to include attention masks.
    """
    smiles = examples["SMILES"]
    encode_options = {
        "padding": "max_length",        # pad every sequence up to max_length
        "truncation": True,             # cut sequences longer than max_length
        "max_length": 128,              # fixed sequence length
        "return_attention_mask": True,  # also return the attention masks
    }
    return tokenizer(smiles, **encode_options)

# Entry point

In [6]:
if __name__ == "__main__":
    # Seed PyTorch's global RNG before anything random happens, so that the
    # regression-head initialisation and the shuffled training batch order are
    # repeatable from one run to the next. The same value seeds the data split.
    SEED = 42
    torch.manual_seed(SEED)

    # Initialize wandb for experiment tracking (optional).
    wandb.init(project="bitfit_finetuning", name="bitfit_molformer_regression")

    # Set the device: GPU if available, else CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # -----------------------------
    # Load and Prepare the Dataset
    # -----------------------------

    # Load the Lipophilicity dataset from HuggingFace.
    raw_dataset = load_dataset(DATASET_PATH)

    # Check if a 'test' split exists; if not, create one from the 'train' split.
    if "test" not in raw_dataset:
        # Create an 80/20 train/test split from the original training data.
        split_dataset = raw_dataset["train"].train_test_split(test_size=0.2, seed=SEED)
        # Construct a new DatasetDict with 'train' and 'test' splits.
        dataset = DatasetDict({
            "train": split_dataset["train"],
            "test": split_dataset["test"]
        })
    else:
        dataset = raw_dataset

    # Load the tokenizer associated with our MoLFormer model.
    # NOTE: trust_remote_code=True runs tokenizer code downloaded from the model
    # repository; consider pinning a `revision` so the executed code cannot change.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

    # Tokenize the dataset using our tokenization function.
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # Rename the label column to "labels" for consistency with our training loop.
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

    # Set the format of the dataset to PyTorch tensors.
    tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

    # Create DataLoaders for the training and test sets.
    BATCH_SIZE = 16  # Adjust based on your memory constraints.
    train_loader = DataLoader(tokenized_datasets["train"], batch_size=BATCH_SIZE, shuffle=True, collate_fn=default_data_collator)
    test_loader = DataLoader(tokenized_datasets["test"], batch_size=BATCH_SIZE, shuffle=False, collate_fn=default_data_collator)

    # -----------------------------
    # Initialize the Regression Model
    # -----------------------------

    # Create an instance of the regression model and move it to the chosen device.
    model = MoLFormerWithRegressionHead(MODEL_NAME).to(device)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33momarfajjalstudy[0m ([33momarfajjalstudy-saarland-university-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

lipophilicity.csv:   0%|          | 0.00/223k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4200 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenization_molformer_fast.py:   0%|          | 0.00/6.50k [00:00<?, ?B/s]

tokenization_molformer.py:   0%|          | 0.00/9.48k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- tokenization_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- tokenization_molformer_fast.py
- tokenization_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


vocab.json:   0%|          | 0.00/41.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/54.0k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Map:   0%|          | 0/3360 [00:00<?, ? examples/s]

Map:   0%|          | 0/840 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

configuration_molformer.py:   0%|          | 0.00/7.60k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- configuration_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_molformer.py:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- modeling_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/187M [00:00<?, ?B/s]

# Apply BitFit Fine-Tuning Strategy

In [7]:
    # BitFit: inside the base model, switch off gradients for every parameter
    # whose name does not contain "bias"; bias terms are left untouched.
    for param_name, parameter in model.base_model.named_parameters():
        if "bias" in param_name:
            continue
        parameter.requires_grad = False

    # Collect and show the names of all parameters that still require gradients
    # (this walks the whole model, so the regression head is included).
    trainable_params = []
    for param_name, parameter in model.named_parameters():
        if parameter.requires_grad:
            trainable_params.append(param_name)
    print("Trainable parameters:", trainable_params)

Trainable parameters: ['base_model.encoder.layer.0.attention.self.query.bias', 'base_model.encoder.layer.0.attention.self.key.bias', 'base_model.encoder.layer.0.attention.self.value.bias', 'base_model.encoder.layer.0.attention.output.dense.bias', 'base_model.encoder.layer.0.attention.output.LayerNorm.bias', 'base_model.encoder.layer.0.intermediate.dense.bias', 'base_model.encoder.layer.0.output.dense.bias', 'base_model.encoder.layer.0.output.LayerNorm.bias', 'base_model.encoder.layer.1.attention.self.query.bias', 'base_model.encoder.layer.1.attention.self.key.bias', 'base_model.encoder.layer.1.attention.self.value.bias', 'base_model.encoder.layer.1.attention.output.dense.bias', 'base_model.encoder.layer.1.attention.output.LayerNorm.bias', 'base_model.encoder.layer.1.intermediate.dense.bias', 'base_model.encoder.layer.1.output.dense.bias', 'base_model.encoder.layer.1.output.LayerNorm.bias', 'base_model.encoder.layer.2.attention.self.query.bias', 'base_model.encoder.layer.2.attention.sel

# Fine-Tune on the Regression Task

In [8]:
    # Mean squared error is the training objective for the scalar target.
    criterion = nn.MSELoss()
    # Hand Adam only the parameters that still require gradients.
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adam(trainable, lr=1e-4)
    num_epochs = 10  # Adjust the number of epochs as needed.

    model.train()  # Training mode for the whole fine-tuning run.
    num_batches = len(train_loader)
    for epoch in range(num_epochs):
        running_loss = 0.0
        for batch in train_loader:
            # Bring the batch onto the compute device; labels are cast to float
            # to match the model's floating-point predictions.
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device).float()

            # One optimisation step: clear gradients, forward, loss, backward, update.
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Report the mean of the per-batch losses for this epoch.
        avg_loss = running_loss / num_batches
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")
        wandb.log({"epoch": epoch+1, "loss": avg_loss})

Epoch [1/10], Loss: 1.8113
Epoch [2/10], Loss: 1.2798
Epoch [3/10], Loss: 1.2021
Epoch [4/10], Loss: 1.1280
Epoch [5/10], Loss: 1.0759
Epoch [6/10], Loss: 1.0144
Epoch [7/10], Loss: 0.9745
Epoch [8/10], Loss: 0.9400
Epoch [9/10], Loss: 0.9146
Epoch [10/10], Loss: 0.8840


# Save the Fine-Tuned Model

In [9]:
BITFIT_MODEL_SAVE_PATH = "bitfit_molformer_regression.pth"
# Write the model's full state dict (encoder and regression head) to disk.
torch.save(model.state_dict(), BITFIT_MODEL_SAVE_PATH)
print("BitFit fine-tuned model saved to {}".format(BITFIT_MODEL_SAVE_PATH))

BitFit fine-tuned model saved to bitfit_molformer_regression.pth


# Evaluate the Model on the Test Set

In [10]:
    # Switch to evaluation mode before scoring the test split.
    model.eval()
    predictions, actuals = [], []

    # Gradients are not needed for scoring.
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device).float()

            # The model returns one predicted value per molecule in the batch.
            outputs = model(input_ids, attention_mask=attention_mask)

            # Accumulate per-sample predictions and targets on the CPU.
            predictions.extend(outputs.cpu().numpy())
            actuals.extend(labels.cpu().numpy())

    predictions = np.array(predictions)
    actuals = np.array(actuals)
    # Mean squared error over every test sample.
    squared_errors = np.square(predictions - actuals)
    mse = squared_errors.mean()
    print(f"Test Mean Squared Error (MSE): {mse:.4f}")
    wandb.log({"test_mse": mse})

    # Finish the wandb run.
    wandb.finish()

Test Mean Squared Error (MSE): 0.8616


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▄▃▃▂▂▂▁▁▁
test_mse,▁

0,1
epoch,10.0
loss,0.88397
test_mse,0.86165
