In [3]:
import wandb
wandb.login(key="890ed8f47f5a7fee963464f6935383bd0cd6ede7")


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33meslamhelala6[0m ([33meslamhelala6-saarland-informatics-campus[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
import pandas as pd


In [5]:
import os
import torch
import pandas as pd
import argparse
import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
import wandb

dataset = pd.read_csv("/kaggle/input/external-dataset-for-task2-csv/External-Dataset_for_Task2.csv")

# Enable CUDA if available, otherwise fallback to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()

# Define model and dataset paths
MODEL_NAME = "ibm/MoLFormer-XL-both-10pct"
DATA_PATH = "/kaggle/input/external-dataset-for-task2-csv/External-Dataset_for_Task2.csv"

# Load dataset and handle potential errors
try:
    dataset = pd.read_csv(DATA_PATH)
except Exception as e:
    print(f"Error loading dataset: {e}")
    exit(1)

# Convert labels to binary using the mean value as a threshold
threshold = dataset["Label"].mean()
dataset["Label"] = (dataset["Label"] >= threshold).astype(int)

# Load tokenizer from the pre-trained model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Convert dataset to Hugging Face format and rename the text column
dataset = Dataset.from_pandas(dataset).rename_columns({"SMILES": "text"})

# Process labels to match model requirements
def preprocess_labels(example):
    return {"labels": int(example["Label"])}

dataset = dataset.map(preprocess_labels)

# Tokenize dataset using max length and truncation settings
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

dataset = dataset.map(tokenize_function, batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Load pre-trained model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=2, trust_remote_code=True
).to(device)

# Shuffle and split dataset into training (80%) and evaluation (20%) sets
dataset = dataset.shuffle(seed=42)
split_idx = int(0.8 * len(dataset))
train_dataset = dataset.select(range(split_idx))
eval_dataset = dataset.select(range(split_idx, len(dataset)))

# Fine-tuning function supporting multiple strategies
def fine_tune_model(model, train_dataset, eval_dataset, strategy="lora"):
    # Ensure WandB does not create duplicate runs
    if wandb.run is not None:
        wandb.finish()

    # Initialize WandB for tracking experiments
    wandb.init(project="NNTI-Task3", name=f"{strategy}-training", sync_tensorboard=True)

    # Define training parameters based on strategy
    num_epochs = 30 if strategy == "bitfit" else 20
    learning_rate = 2e-5 if strategy == "bitfit" else 1e-5 if strategy == "lora" else 5e-6

    training_args = TrainingArguments(
        output_dir=f"/kaggle/working/models/fine_tuned_{strategy}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_steps=10,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=num_epochs,
        learning_rate=learning_rate,
        weight_decay=0.01,
        fp16=False,
        remove_unused_columns=False,
        report_to="wandb",
    )

    # Apply fine-tuning strategy
    if strategy == "bitfit":
        # Enable training only on bias parameters
        for name, param in model.named_parameters():
            if "bias" not in name:
                param.requires_grad = False

    elif strategy == "lora":
        from peft import get_peft_model, LoraConfig
        peft_config = LoraConfig(r=8, lora_alpha=16, target_modules=["query", "value"], lora_dropout=0.1)
        model = get_peft_model(model, peft_config)

    elif strategy == "ia3":
        from peft import get_peft_model, IA3Config

        # Retrieve available module names for debugging
        available_modules = [name for name, _ in model.named_modules()]
        print(f"Available Modules: {available_modules[:20]}")  # Print first 20 modules for reference

        # Define target and feedforward modules based on model architecture
        target_modules = [m for m in ["query", "value", "dense", "dense2"] if any(m in name for name in available_modules)]
        feedforward_modules = [m for m in ["dense", "dense2"] if m in target_modules]  # Feedforward must be subset of target

        if not target_modules:
            raise ValueError(f"No valid IA3 target modules found! Available: {available_modules[:20]}")

        print(f"IA3 Target Modules: {target_modules}")
        print(f"IA3 Feedforward Modules: {feedforward_modules}")

        peft_config = IA3Config(target_modules=target_modules, feedforward_modules=feedforward_modules)
        model = get_peft_model(model, peft_config)

    # Initialize Trainer for model fine-tuning
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    print(f"Starting training for {strategy}...")
    trainer.train()
    
    # Evaluate model performance after training
    final_metrics = trainer.evaluate()
    wandb.log(final_metrics)
    wandb.finish()
    
    return model

# Main execution block
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--strategy", type=str, choices=["bitfit", "lora", "ia3"], default="lora")
    args, unknown = parser.parse_known_args()

    # Train model based on selected fine-tuning strategy
    fine_tuned_model = fine_tune_model(model, train_dataset, eval_dataset, strategy=args.strategy)

    # Save model with a unique timestamp for reproducibility
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    save_path = f"/kaggle/working/models/fine_tuned_{args.strategy}_{timestamp}"

    os.makedirs(save_path, exist_ok=True)
    fine_tuned_model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    print(f"Model saved at: {save_path}")


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenization_molformer_fast.py:   0%|          | 0.00/6.50k [00:00<?, ?B/s]

tokenization_molformer.py:   0%|          | 0.00/9.48k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- tokenization_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- tokenization_molformer_fast.py
- tokenization_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


vocab.json:   0%|          | 0.00/41.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/54.0k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

configuration_molformer.py:   0%|          | 0.00/7.60k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- configuration_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_molformer.py:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- modeling_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/187M [00:00<?, ?B/s]

Some weights of MolformerForSequenceClassification were not initialized from the model checkpoint at ibm/MoLFormer-XL-both-10pct and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.dense2.bias', 'classifier.dense2.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Starting training for lora...


Epoch,Training Loss,Validation Loss
1,0.7554,No log
2,0.7026,No log
3,0.7206,No log
4,0.6814,No log
5,0.7634,No log
6,0.7634,No log
7,0.7028,No log
8,0.7006,No log
9,0.7126,No log
10,0.7097,No log


0,1
epoch,▁
eval/runtime,▂▆▄▃▃▂▁▃▂▂▁▂▁▂▄▇▅█▁▂▂
eval/samples_per_second,▇▃▅▆▆▇█▆▇▇█▇█▇▅▂▄▁█▇▇
eval/steps_per_second,▇▃▅▆▆▇█▆▇▇█▇█▇▅▂▄▁█▇▇
eval_runtime,▁
eval_samples_per_second,▁
eval_steps_per_second,▁
train/epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇█
train/grad_norm,▃▃▂▂▃▁▂▁▁▃▅▂▆▁▂▁▂▂▂▄▂█▅▂▅▂▂▂▅▃▂▂█▂▂▆▂▂▄▂

0,1
epoch,20.0
eval/runtime,0.2421
eval/samples_per_second,247.801
eval/steps_per_second,33.04
eval_runtime,0.2421
eval_samples_per_second,247.801
eval_steps_per_second,33.04
total_flos,162344108851200.0
train/epoch,20.0
train/global_step,600.0


Model saved at: /kaggle/working/models/fine_tuned_lora_20250310_152356


In [6]:
import os
import torch
import pandas as pd
import argparse
import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
import wandb

# Enable CUDA if available, otherwise fallback to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()

# Define model and dataset paths
MODEL_NAME = "ibm/MoLFormer-XL-both-10pct"
DATA_PATH = "/kaggle/input/external-dataset-for-task2-csv/External-Dataset_for_Task2.csv"

# Load dataset and handle potential errors
try:
    dataset = pd.read_csv(DATA_PATH)
except Exception as e:
    print(f"Error loading dataset: {e}")
    exit(1)

# Convert labels to binary using the mean value as a threshold
threshold = dataset["Label"].mean()
dataset["Label"] = (dataset["Label"] >= threshold).astype(int)

# Load tokenizer from the pre-trained model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Convert dataset to Hugging Face format and rename the text column
dataset = Dataset.from_pandas(dataset).rename_columns({"SMILES": "text"})

# Process labels to match model requirements
def preprocess_labels(example):
    return {"labels": int(example["Label"])}

dataset = dataset.map(preprocess_labels)

# Tokenize dataset using max length and truncation settings
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

dataset = dataset.map(tokenize_function, batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Load pre-trained model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=2, trust_remote_code=True
).to(device)

# Shuffle and split dataset into training (80%) and evaluation (20%) sets
dataset = dataset.shuffle(seed=42)
split_idx = int(0.8 * len(dataset))
train_dataset = dataset.select(range(split_idx))
eval_dataset = dataset.select(range(split_idx, len(dataset)))

# Fine-tuning function supporting multiple strategies
def fine_tune_model(model, train_dataset, eval_dataset, strategy="bitfit"):
    # Ensure WandB does not create duplicate runs
    if wandb.run is not None:
        wandb.finish()

    # Initialize WandB for tracking experiments
    wandb.init(project="NNTI-Task3", name=f"{strategy}-training", sync_tensorboard=True)

    # Define training parameters based on strategy
    num_epochs = 30 if strategy == "bitfit" else 20
    learning_rate = 2e-5 if strategy == "bitfit" else 1e-5 if strategy == "lora" else 5e-6

    training_args = TrainingArguments(
        output_dir=f"/kaggle/working/models/fine_tuned_{strategy}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_steps=10,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=num_epochs,
        learning_rate=learning_rate,
        weight_decay=0.01,
        fp16=False,
        remove_unused_columns=False,
        report_to="wandb",
    )

    # Apply fine-tuning strategy
    if strategy == "bitfit":
        # Enable training only on bias parameters
        for name, param in model.named_parameters():
            if "bias" not in name:
                param.requires_grad = False

    elif strategy == "lora":
        from peft import get_peft_model, LoraConfig
        peft_config = LoraConfig(r=8, lora_alpha=16, target_modules=["query", "value"], lora_dropout=0.1)
        model = get_peft_model(model, peft_config)

    elif strategy == "ia3":
        from peft import get_peft_model, IA3Config

        # Retrieve available module names for debugging
        available_modules = [name for name, _ in model.named_modules()]
        print(f"Available Modules: {available_modules[:20]}")  # Print first 20 modules for reference

        # Define target and feedforward modules based on model architecture
        target_modules = [m for m in ["query", "value", "dense", "dense2"] if any(m in name for name in available_modules)]
        feedforward_modules = [m for m in ["dense", "dense2"] if m in target_modules]  # Feedforward must be subset of target

        if not target_modules:
            raise ValueError(f"No valid IA3 target modules found! Available: {available_modules[:20]}")

        print(f"IA3 Target Modules: {target_modules}")
        print(f"IA3 Feedforward Modules: {feedforward_modules}")

        peft_config = IA3Config(target_modules=target_modules, feedforward_modules=feedforward_modules)
        model = get_peft_model(model, peft_config)

    # Initialize Trainer for model fine-tuning
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    print(f"Starting training for {strategy}...")
    trainer.train()
    
    # Evaluate model performance after training
    final_metrics = trainer.evaluate()
    wandb.log(final_metrics)
    wandb.finish()
    
    return model

# Main execution block
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--strategy", type=str, choices=["bitfit", "lora", "ia3"], default="bitfit")
    args, unknown = parser.parse_known_args()

    # Train model based on selected fine-tuning strategy
    fine_tuned_model = fine_tune_model(model, train_dataset, eval_dataset, strategy=args.strategy)

    # Save model with a unique timestamp for reproducibility
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    save_path = f"/kaggle/working/models/fine_tuned_{args.strategy}_{timestamp}"

    os.makedirs(save_path, exist_ok=True)
    fine_tuned_model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    print(f"Model saved at: {save_path}")


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Some weights of MolformerForSequenceClassification were not initialized from the model checkpoint at ibm/MoLFormer-XL-both-10pct and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.dense2.bias', 'classifier.dense2.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Starting training for bitfit...


Epoch,Training Loss,Validation Loss
1,0.7451,0.713842
2,0.75,0.695506
3,0.7447,0.689087
4,0.7001,0.680335
5,0.6981,0.66913
6,0.7201,0.669266
7,0.725,0.672656
8,0.7048,0.670189
9,0.6817,0.674717
10,0.6909,0.664269


0,1
epoch,▁
eval/loss,█▆▅▄▃▃▃▃▃▂▁▁▂▂▂▁▂▂▂▁▂▂▁▂▁▁▁▂▂▁▃
eval/runtime,▂▁▁▂▂▂▃█▂▃▁▁▁▁▁▁▃▁▁▂▁▂▂▃▂▁▂▁▁▂▂
eval/samples_per_second,▇██▆▇▇▆▁▇▆██████▅██▇█▇▇▆▇█▇██▇▇
eval/steps_per_second,▇██▆▇▇▆▁▇▆██████▅██▇█▇▇▆▇█▇██▇▇
eval_loss,▁
eval_runtime,▁
eval_samples_per_second,▁
eval_steps_per_second,▁
train/epoch,▁▁▁▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████

0,1
epoch,30.0
eval/loss,0.66985
eval/runtime,0.2261
eval/samples_per_second,265.339
eval/steps_per_second,35.378
eval_loss,0.66985
eval_runtime,0.2261
eval_samples_per_second,265.339
eval_steps_per_second,35.378
total_flos,241885417881600.0


Model saved at: /kaggle/working/models/fine_tuned_bitfit_20250310_152549


In [7]:
import os
import torch
import pandas as pd
import argparse
import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
import wandb

# Enable CUDA if available, otherwise fallback to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()

# Define model and dataset paths
MODEL_NAME = "ibm/MoLFormer-XL-both-10pct"
DATA_PATH = "/kaggle/input/external-dataset-for-task2-csv/External-Dataset_for_Task2.csv"

# Load dataset and handle potential errors
try:
    dataset = pd.read_csv(DATA_PATH)
except Exception as e:
    print(f"Error loading dataset: {e}")
    exit(1)

# Convert labels to binary using the mean value as a threshold
threshold = dataset["Label"].mean()
dataset["Label"] = (dataset["Label"] >= threshold).astype(int)

# Load tokenizer from the pre-trained model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Convert dataset to Hugging Face format and rename the text column
dataset = Dataset.from_pandas(dataset).rename_columns({"SMILES": "text"})

# Process labels to match model requirements
def preprocess_labels(example):
    return {"labels": int(example["Label"])}

dataset = dataset.map(preprocess_labels)

# Tokenize dataset using max length and truncation settings
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

dataset = dataset.map(tokenize_function, batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Load pre-trained model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=2, trust_remote_code=True
).to(device)

# Shuffle and split dataset into training (80%) and evaluation (20%) sets
dataset = dataset.shuffle(seed=42)
split_idx = int(0.8 * len(dataset))
train_dataset = dataset.select(range(split_idx))
eval_dataset = dataset.select(range(split_idx, len(dataset)))

# Fine-tuning function supporting multiple strategies
def fine_tune_model(model, train_dataset, eval_dataset, strategy="ia3"):
    # Ensure WandB does not create duplicate runs
    if wandb.run is not None:
        wandb.finish()

    # Initialize WandB for tracking experiments
    wandb.init(project="NNTI-Task3", name=f"{strategy}-training", sync_tensorboard=True)

    # Define training parameters based on strategy
    num_epochs = 30 if strategy == "bitfit" else 20
    learning_rate = 2e-5 if strategy == "bitfit" else 1e-5 if strategy == "lora" else 5e-6

    training_args = TrainingArguments(
        output_dir=f"/kaggle/working/models/fine_tuned_{strategy}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_steps=10,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=num_epochs,
        learning_rate=learning_rate,
        weight_decay=0.01,
        fp16=False,
        remove_unused_columns=False,
        report_to="wandb",
    )

    # Apply fine-tuning strategy
    if strategy == "bitfit":
        # Enable training only on bias parameters
        for name, param in model.named_parameters():
            if "bias" not in name:
                param.requires_grad = False

    elif strategy == "lora":
        from peft import get_peft_model, LoraConfig
        peft_config = LoraConfig(r=8, lora_alpha=16, target_modules=["query", "value"], lora_dropout=0.1)
        model = get_peft_model(model, peft_config)

    elif strategy == "ia3":
        from peft import get_peft_model, IA3Config

        # Retrieve available module names for debugging
        available_modules = [name for name, _ in model.named_modules()]
        print(f"Available Modules: {available_modules[:20]}")  # Print first 20 modules for reference

        # Define target and feedforward modules based on model architecture
        target_modules = [m for m in ["query", "value", "dense", "dense2"] if any(m in name for name in available_modules)]
        feedforward_modules = [m for m in ["dense", "dense2"] if m in target_modules]  # Feedforward must be subset of target

        if not target_modules:
            raise ValueError(f"No valid IA3 target modules found! Available: {available_modules[:20]}")

        print(f"IA3 Target Modules: {target_modules}")
        print(f"IA3 Feedforward Modules: {feedforward_modules}")

        peft_config = IA3Config(target_modules=target_modules, feedforward_modules=feedforward_modules)
        model = get_peft_model(model, peft_config)

    # Initialize Trainer for model fine-tuning
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    print(f"Starting training for {strategy}...")
    trainer.train()
    
    # Evaluate model performance after training
    final_metrics = trainer.evaluate()
    wandb.log(final_metrics)
    wandb.finish()
    
    return model

# Main execution block
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--strategy", type=str, choices=["bitfit", "lora", "ia3"], default="ia3")
    args, unknown = parser.parse_known_args()

    # Train model based on selected fine-tuning strategy
    fine_tuned_model = fine_tune_model(model, train_dataset, eval_dataset, strategy=args.strategy)

    # Save model with a unique timestamp for reproducibility
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    save_path = f"/kaggle/working/models/fine_tuned_{args.strategy}_{timestamp}"

    os.makedirs(save_path, exist_ok=True)
    fine_tuned_model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    print(f"Model saved at: {save_path}")


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Some weights of MolformerForSequenceClassification were not initialized from the model checkpoint at ibm/MoLFormer-XL-both-10pct and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.dense2.bias', 'classifier.dense2.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Available Modules: ['', 'molformer', 'molformer.embeddings', 'molformer.embeddings.word_embeddings', 'molformer.embeddings.dropout', 'molformer.encoder', 'molformer.encoder.layer', 'molformer.encoder.layer.0', 'molformer.encoder.layer.0.attention', 'molformer.encoder.layer.0.attention.self', 'molformer.encoder.layer.0.attention.self.query', 'molformer.encoder.layer.0.attention.self.key', 'molformer.encoder.layer.0.attention.self.value', 'molformer.encoder.layer.0.attention.self.rotary_embeddings', 'molformer.encoder.layer.0.attention.self.feature_map', 'molformer.encoder.layer.0.attention.self.feature_map.kernel', 'molformer.encoder.layer.0.attention.output', 'molformer.encoder.layer.0.attention.output.dense', 'molformer.encoder.layer.0.attention.output.LayerNorm', 'molformer.encoder.layer.0.attention.output.dropout']
IA3 Target Modules: ['query', 'value', 'dense', 'dense2']
IA3 Feedforward Modules: ['dense', 'dense2']
Starting training for ia3...


Epoch,Training Loss,Validation Loss
1,0.7442,No log
2,0.708,No log
3,0.7414,No log
4,0.7212,No log
5,0.7261,No log
6,0.7326,No log
7,0.7179,No log
8,0.716,No log
9,0.7407,No log
10,0.7557,No log


0,1
epoch,▁
eval/runtime,▄▃▅▁▅▃▂▂▃█▁▃▂▇▂▃▂▂▂▃▂
eval/samples_per_second,▅▆▄█▄▆▇▆▆▁█▅▇▂▇▆▇▆▇▆▇
eval/steps_per_second,▅▆▄█▄▆▇▆▆▁█▅▇▂▇▆▇▆▇▆▇
eval_runtime,▁
eval_samples_per_second,▁
eval_steps_per_second,▁
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇█████
train/grad_norm,▅▅▄▄▁▃▂▁▃▂▂▆▁▂▂▂▁▃▇▂▅█▅▁▃▂▂▂▂▄▂▂▄▂▅▃▁▄▂▂

0,1
epoch,20.0
eval/runtime,0.2372
eval/samples_per_second,252.937
eval/steps_per_second,33.725
eval_runtime,0.2372
eval_samples_per_second,252.937
eval_steps_per_second,33.725
total_flos,161432476876800.0
train/epoch,20.0
train/global_step,600.0


Model saved at: /kaggle/working/models/fine_tuned_ia3_20250310_152655
