In [1]:
# Import dependencies
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset, random_split
from datasets import Dataset
import torch.nn.functional as F
from datasets import load_dataset
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dataset and model paths
DATASET_PATH = "scikit-fingerprints/MoleculeNet_Lipophilicity"
MODEL_NAME = "ibm/MoLFormer-XL-both-10pct"
EXTERNAL_DATASET_PATH = "C:/Users/jadej/OneDrive/Desktop/Studies/NNTI 11th Feb/Project/Project_Files/tasks/External-Dataset_for_Task2.csv"

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# load the dataset from HuggingFace
dataset = load_dataset(DATASET_PATH)

In [5]:
# Calculate the maximum length of SMILES strings in the dataset
max_length = max(len(smile) for smile in dataset['train']['SMILES'])
print(f"Max SMILES length: {max_length}")



# define a PyTorch Dataset class for handling SMILES strings and targets

# TODO: your code goes here
# class SMILESDataset(Dataset):


class SMILEDATASET(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe.rename(columns={'label': 'Label'})  # Standardize key names

    def __getitem__(self, idx):
        sample = self.data.iloc[idx]
        return {
            'SMILES': sample['SMILES'],
            'Label': sample['Label']  # Ensure correct key mapping
        }

Max SMILES length: 267


In [6]:
# Load external dataset
ext_data = pd.read_csv(EXTERNAL_DATASET_PATH)
# Load the pre-trained model and tokenizer
# model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# ext_data = ext_data1.head(16)

In [7]:
ext_data.head()

Unnamed: 0,SMILES,Label
0,CCCCN1Cc2c(nc3cc(-c4ccco4)nn3c2O)C1=O,1.548
1,Cc1cc(C)c2c(n1)sc1c2ncnc1N1CCN(C)CC1,2.568
2,COC(=O)[C@H]1[C@H]2CC[C@H](C[C@@H]1OC(=O)c1ccc...,0.102
3,Nc1nonc1/C(=N/O)Nc1ccc(F)c(Cl)c1,2.45
4,Cc1c[nH]c(/C=C2/C(=O)Nc3ccccc32)c1CCC(=O)O,1.04


In [8]:

# Load pre-trained model and tokenizer

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, trust_remote_code=True).to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model.eval()


Some weights of MolformerForSequenceClassification were not initialized from the model checkpoint at ibm/MoLFormer-XL-both-10pct and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.dense2.bias', 'classifier.dense2.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MolformerForSequenceClassification(
  (molformer): MolformerModel(
    (embeddings): MolformerEmbeddings(
      (word_embeddings): Embedding(2362, 768, padding_idx=2)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (encoder): MolformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x MolformerLayer(
          (attention): MolformerAttention(
            (self): MolformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (rotary_embeddings): MolformerRotaryEmbedding()
              (feature_map): MolformerFeatureMap(
                (kernel): ReLU()
              )
            )
            (output): MolformerSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [9]:
def compute_gradients(model, smiles, targets):
    """Compute gradients of loss with respect to model parameters."""
    model.zero_grad()

    # ðŸ”¹ Ensure smiles is a string, not a tensor/dict
    if isinstance(smiles, torch.Tensor):
        smiles = smiles.item()  # Convert single tensor to string
    elif isinstance(smiles, list) and isinstance(smiles[0], torch.Tensor):
        smiles = [s.item() for s in smiles]  # Convert list of tensors to strings
    
    inputs = smiles
    # ðŸ”¹ Ensure targets are in tensor format
    # targets = torch.tensor([targets], dtype=torch.float32)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)
    targets = targets.to(device)

    
    # ðŸ”¹ Forward pass
    # outputs = model(**inputs)
    outputs = model(input_ids, attention_mask)

    # ðŸ”¹ Compute loss
    loss_fn = torch.nn.MSELoss()
    loss = loss_fn(outputs.logits.squeeze(), targets)

    # ðŸ”¹ Compute gradients
    loss.backward()
    return [p.grad for p in model.parameters() if p.grad is not None]

In [10]:
def lissa_approximation(model, test_grad, train_data, iters=100, damping=0.01):
    ihvp = [torch.zeros_like(tg) for tg in test_grad]

    for _ in range(iters):
        random_idx = random.randint(0, len(train_data) - 1)
        train_sample = train_data[random_idx]  # Get one sample

        # Debug print to check data
        # print(f"Sample retrieved: {train_sample}")

        smiles_input = train_sample.get('SMILES', None)
        label_value = train_sample.get('label', None)  # Change from 'Label' to 'label'
        # print("smiles lissa ", smiles_input)
        inputs = tokenizer(train_sample['SMILES'], return_tensors="pt")
        label_value = torch.tensor([train_sample['label']])
        # Safety check: If label or SMILES is missing, raise an error
        if smiles_input is None or label_value is None:
            raise KeyError(f"Missing data in sample: {train_sample}")

        train_grad = compute_gradients(model, inputs, label_value)
        ihvp = [g - damping * ih for g, ih in zip(train_grad, ihvp)]
    
    return ihvp

In [11]:
def compute_influence_scores(model, external_data, train_data):
    """Compute influence scores for each external data point."""
    influence_scores = []
    for _, row in tqdm(external_data.iterrows(), total=len(external_data)):
        # Tokenize input molecule SMILES string
        inputs = tokenizer(row['SMILES'], return_tensors="pt")
        target = torch.tensor([row['Label']])
        # print("done")
        
        # Compute test gradient
        test_grad = compute_gradients(model, inputs, target)

        # Approximate inverse Hessian-vector product
        ihvp = lissa_approximation(model, test_grad, train_data)

        # Compute influence score (dot product of test gradient and ihvp)
        influence = sum(torch.dot(tg.flatten(), ih.flatten()) for tg, ih in zip(test_grad, ihvp))
        influence_scores.append((row['SMILES'], row['Label'], influence.item()))

    return sorted(influence_scores, key=lambda x: x[1], reverse=True)

In [12]:
# Load the training split of the Lipophilicity dataset (SMILEDATASET)
SMILEDATASET = load_dataset("scikit-fingerprints/MoleculeNet_Lipophilicity", split="train")

# Assign SMILEDATASET to train_data
train_data = SMILEDATASET

In [13]:
# Compute influence scores
influence_scores = compute_influence_scores(model, ext_data, train_data)

# Select top-48 influential samples
selected_samples = influence_scores[:48]

  return F.mse_loss(input, target, reduction=self.reduction)
  0%|                                                                                          | 0/300 [00:05<?, ?it/s]


KeyboardInterrupt: 

In [None]:
inf_df = pd.DataFrame(influence_scores, columns=["Smiles", "Label", "Influence"])
inf_df.to_csv("Influence.csv", index=False)

In [None]:
len(influence_scores)

In [None]:
# selected_samples

In [32]:
# Load dataset
influence_df = pd.read_csv("Influence.csv")  # Ensure the correct file path

# Select the top 100 SMILES strings and corresponding Labels based on Influence score
top_100 = influence_df.nlargest(100, 'Influence')[["Smiles", "Label"]]  # Extract only required columns

# Convert to lists
top_100_smiles = top_100["Smiles"].tolist()
top_100_labels = top_100["Label"].tolist()



class TopInfluenceDataset(torch.utils.data.Dataset):
    def __init__(self, smiles_list, labels_list, tokenizer, max_length=268):
        self.smiles_list = smiles_list
        self.labels_list = labels_list  # Store labels
        self.tokenizer = tokenizer
        self.max_length = max_length  # Set max length for consistent padding

    def __getitem__(self, idx):
        smiles = self.smiles_list[idx]
        label = torch.tensor(self.labels_list[idx], dtype=torch.float32)  # Convert label to tensor

        encoded = self.tokenizer(
            smiles,
            padding="max_length",  # Ensure all sequences have the same length
            truncation=True,  # Truncate if sequence exceeds max_length
            max_length=self.max_length,
            return_tensors="pt",
        )
        tokenized_data = {key: val.squeeze(0) for key, val in encoded.items()}  # Remove batch dim
        tokenized_data["label"] = label  # Add label to dictionary

        return tokenized_data

    def __len__(self):
        return len(self.smiles_list)


# Create dataset instance
top_100_data_loader = torch.utils.data.DataLoader(
    TopInfluenceDataset(top_100_smiles, top_100_labels, tokenizer),  # Pass only the SMILES list
    batch_size=16,
    shuffle=True,
    # collate_fn=collate_fn  # Ensure proper batching
)

# Define batch size
BATCH_SIZE = 16


# Split train_data into train (80%) and test (20%) sets
train_size = int(0.8 * len(train_data))
test_size = len(train_data) - train_size

train_dataset, test_dataset = random_split(train_data, [train_size, test_size])

# Create DataLoaders for train and test data
train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Print dataset sizes
# print(f"Train samples: {len(train_dataset)}, Test samples: {len(test_dataset)}, Top 100 dataset: {len(top_100_dataset)}")

In [28]:
test_dataset_prepared

<__main__.TopInfluenceDataset at 0x1bc166cf8c0>

In [21]:
# Define Loss and Optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training Loop
num_epochs = 5  # Adjust if needed
best_loss = float('inf')  # Initialize best loss to a large value

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    progress_bar = tqdm(top_100_data_loader, desc=f"Fine-Tuning Regression (MLM) Epoch {epoch+1}/{num_epochs}", leave=False)
    # print(top_100_data_loader)
    for batch in top_100_data_loader:
        # print(batch)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        # print(input_id)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
       
       # Extract the predicted values corresponding to the true labels
        outputs_reshaped = outputs.logits[:, 0] # Flatten logits
        loss = criterion(outputs_reshaped, labels.float())  # Ensure labels are float for MSELoss

        # loss = criterion(outputs.logits, labels)

        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

        progress_bar.set_postfix({"Training Loss": loss.item()})

    avg_epoch_loss = epoch_loss / len(top_100_data_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_epoch_loss:.4f}")


                                                                                                                       [A

<torch.utils.data.dataloader.DataLoader object at 0x000001BB33753320>



Fine-Tuning Regression (MLM) Epoch 1/7:   0%|                                | 0/7 [00:07<?, ?it/s, Training Loss=1.07][A
Fine-Tuning Regression (MLM) Epoch 1/7:   0%|                               | 0/7 [00:13<?, ?it/s, Training Loss=0.821][A
Fine-Tuning Regression (MLM) Epoch 1/7:   0%|                               | 0/7 [00:20<?, ?it/s, Training Loss=0.404][A
Fine-Tuning Regression (MLM) Epoch 1/7:   0%|                               | 0/7 [00:27<?, ?it/s, Training Loss=0.258][A
Fine-Tuning Regression (MLM) Epoch 1/7:   0%|                               | 0/7 [00:34<?, ?it/s, Training Loss=0.567][A
Fine-Tuning Regression (MLM) Epoch 1/7:   0%|                                | 0/7 [00:41<?, ?it/s, Training Loss=0.36][A
Fine-Tuning Regression (MLM) Epoch 1/7:   0%|                               | 0/7 [00:43<?, ?it/s, Training Loss=0.588][A

Epoch 1/7, Average Loss: 0.5815


Fine-Tuning Regression (MLM) Epoch 2/7:   0%|                                                    | 0/7 [00:00<?, ?it/s]
                                                                                                                       [A

<torch.utils.data.dataloader.DataLoader object at 0x000001BB33753320>


Fine-Tuning Regression (MLM) Epoch 2/7:   0%|                               | 0/7 [00:43<?, ?it/s, Training Loss=0.323]

Epoch 2/7, Average Loss: 0.3182



                                                                                                                       [A

<torch.utils.data.dataloader.DataLoader object at 0x000001BB33753320>



Fine-Tuning Regression (MLM) Epoch 3/7:   0%|                               | 0/7 [00:06<?, ?it/s, Training Loss=0.117][A
Fine-Tuning Regression (MLM) Epoch 3/7:   0%|                               | 0/7 [00:13<?, ?it/s, Training Loss=0.424][A
Fine-Tuning Regression (MLM) Epoch 3/7:   0%|                               | 0/7 [00:20<?, ?it/s, Training Loss=0.231][A
Fine-Tuning Regression (MLM) Epoch 3/7:   0%|                               | 0/7 [00:27<?, ?it/s, Training Loss=0.178][A
Fine-Tuning Regression (MLM) Epoch 3/7:   0%|                               | 0/7 [00:34<?, ?it/s, Training Loss=0.126][A
Fine-Tuning Regression (MLM) Epoch 3/7:   0%|                                | 0/7 [00:41<?, ?it/s, Training Loss=0.08][A
Fine-Tuning Regression (MLM) Epoch 3/7:   0%|                               | 0/7 [00:43<?, ?it/s, Training Loss=0.233][A

Epoch 3/7, Average Loss: 0.1984


Fine-Tuning Regression (MLM) Epoch 4/7:   0%|                                                    | 0/7 [00:00<?, ?it/s]
                                                                                                                       [A

<torch.utils.data.dataloader.DataLoader object at 0x000001BB33753320>


Fine-Tuning Regression (MLM) Epoch 4/7:   0%|                              | 0/7 [00:43<?, ?it/s, Training Loss=0.0497]

Epoch 4/7, Average Loss: 0.1057



                                                                                                                       [A

<torch.utils.data.dataloader.DataLoader object at 0x000001BB33753320>



Fine-Tuning Regression (MLM) Epoch 5/7:   0%|                               | 0/7 [00:06<?, ?it/s, Training Loss=0.136][A
Fine-Tuning Regression (MLM) Epoch 5/7:   0%|                              | 0/7 [00:13<?, ?it/s, Training Loss=0.0448][A
Fine-Tuning Regression (MLM) Epoch 5/7:   0%|                              | 0/7 [00:20<?, ?it/s, Training Loss=0.0295][A
Fine-Tuning Regression (MLM) Epoch 5/7:   0%|                              | 0/7 [00:27<?, ?it/s, Training Loss=0.0346][A
Fine-Tuning Regression (MLM) Epoch 5/7:   0%|                               | 0/7 [00:34<?, ?it/s, Training Loss=0.039][A
Fine-Tuning Regression (MLM) Epoch 5/7:   0%|                              | 0/7 [00:40<?, ?it/s, Training Loss=0.0755][A
Fine-Tuning Regression (MLM) Epoch 5/7:   0%|                               | 0/7 [00:43<?, ?it/s, Training Loss=0.144][A

Epoch 5/7, Average Loss: 0.0719


Fine-Tuning Regression (MLM) Epoch 6/7:   0%|                                                    | 0/7 [00:00<?, ?it/s]
                                                                                                                       [A

<torch.utils.data.dataloader.DataLoader object at 0x000001BB33753320>


Fine-Tuning Regression (MLM) Epoch 6/7:   0%|                               | 0/7 [00:43<?, ?it/s, Training Loss=0.031]

Epoch 6/7, Average Loss: 0.0478



                                                                                                                       [A

<torch.utils.data.dataloader.DataLoader object at 0x000001BB33753320>



Fine-Tuning Regression (MLM) Epoch 7/7:   0%|                              | 0/7 [00:06<?, ?it/s, Training Loss=0.0325][A
Fine-Tuning Regression (MLM) Epoch 7/7:   0%|                              | 0/7 [00:13<?, ?it/s, Training Loss=0.0501][A
Fine-Tuning Regression (MLM) Epoch 7/7:   0%|                              | 0/7 [00:20<?, ?it/s, Training Loss=0.0466][A
Fine-Tuning Regression (MLM) Epoch 7/7:   0%|                              | 0/7 [00:27<?, ?it/s, Training Loss=0.0556][A
Fine-Tuning Regression (MLM) Epoch 7/7:   0%|                              | 0/7 [00:34<?, ?it/s, Training Loss=0.0474][A
Fine-Tuning Regression (MLM) Epoch 7/7:   0%|                              | 0/7 [00:40<?, ?it/s, Training Loss=0.0276][A
Fine-Tuning Regression (MLM) Epoch 7/7:   0%|                              | 0/7 [00:43<?, ?it/s, Training Loss=0.0621][A

Epoch 7/7, Average Loss: 0.0460


In [29]:
test_data_loader

<torch.utils.data.dataloader.DataLoader at 0x1bb634c9ca0>

In [37]:
# Define Loss and Optimizer for eval
    # === EVALUATION PHASE ===
model.eval()
eval_loss = 0.0
with torch.no_grad():  # Disable gradient calculations
    for batch in test_data_loader:  # Use test data
        # print(batch)
        # input_ids = batch['input_ids'].to(device)
        # attention_mask = batch['attention_mask'].to(device)
        # labels = batch['label'].to(device)

        smiles_list = batch["SMILES"]  # Extract SMILES strings
        labels = batch["label"].to(torch.float).to(device)  # Convert labels to tensors
        
        encoding = tokenizer(
            smiles_list,
            padding="max_length",
            truncation=True,
            max_length=128,  # Adjust max_length as needed
            return_tensors="pt",
        )
        input_ids = encoding["input_ids"].to(device)
        attention_mask = encoding["attention_mask"].to(device)
        
        outputs = model(input_ids, attention_mask)
        outputs_reshaped = outputs.logits[:, 0] # Flatten logits
        loss = criterion(outputs_reshaped, labels)
        eval_loss += loss.item()

avg_eval_loss = eval_loss / len(test_data_loader)

# Track Best Model
if avg_eval_loss < best_loss:
    best_loss = avg_eval_loss
    torch.save(model.state_dict(), "best_model.pth")  # Save best model

print(f"Epoch {epoch+1}/{num_epochs} | Eval Loss: {avg_eval_loss:.4f}")

print(f"Best Evaluation Loss: {best_loss:.4f}")

Epoch 7/7 | Eval Loss: 1.5043
Best Evaluation Loss: 1.4986


In [36]:

print(f"Epoch {epoch+1}/{num_epochs} | Eval Loss: {avg_eval_loss:.4f}")

print(f"Best Evaluation Loss: {best_loss:.4f}")

Epoch 7/7 | Eval Loss: 1.4986
Best Evaluation Loss: 1.4986
