In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/MyDrive/llm_finetuning/llm_finetuning/notebooks'
curr_filename = "vanilla_rte_baseline"
!pip install -q transformers accelerate bitsandbytes datasets

Mounted at /content/drive
[Errno 2] No such file or directory: '/content/drive/MyDrive/llm_finetuning/llm_finetuning/notebooks'
/content
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the so

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, Trainer, TrainingArguments, AdamW
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import torch
import warnings
warnings.simplefilter("ignore")
import time

# Start the timer
start_time = time.time()

# Set seed for reproducibility
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

# Load RTE dataset
rte_dataset = load_dataset("glue", "rte")

# Load HANS dataset
hans_data = load_dataset("hans", split="validation")

# Define tokenizer and model
model_name = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name, num_labels=2)
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

# Preprocess RTE data
# def preprocess_rte(examples):
#     return tokenizer(
#         examples["sentence1"],
#         examples["sentence2"],
#         truncation=True,
#         padding="max_length",
#         max_length=128
#     )

# # Preprocess HANS data
# def preprocess_hans(examples):
#     return tokenizer(
#         examples["premise"], examples["hypothesis"],  # Premise and hypothesis
#         truncation=True, padding="max_length", max_length=128
#     )

def manipulate_inputs_rte(batch):
    encoding = tokenizer(
        [f'Given the statement "{sentence1}", does it necessarily follow that "{sentence2}" is true?'
         for sentence1, sentence2 in zip(batch["sentence1"], batch["sentence2"])],
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors='pt'
    )
    batch["input_ids"] = encoding["input_ids"].squeeze()
    batch["attention_mask"] = encoding["attention_mask"].squeeze()
    return batch

# Prepare the inputs with the fixed context for HANS
def manipulate_inputs_hans(batch):
    fixed_context = "Given the premise, does the hypothesis hold true? "
    encoding = tokenizer(
        [f'{fixed_context} Premise: {premise} Hypothesis: {hypothesis}'
         for premise, hypothesis in zip(batch["premise"], batch["hypothesis"])],
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors='pt'
    )
    batch["input_ids"] = encoding["input_ids"].squeeze()
    batch["attention_mask"] = encoding["attention_mask"].squeeze()
    return batch

rte_dataset = rte_dataset.map(manipulate_inputs_rte, batched=True)
hans_dataset = hans_data.map(manipulate_inputs_hans, batched=True)

# Compute metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

# Few-shot training
few_shot_sample_size = [2, 32, 128]
num_epochs = 40
batch_size = 32
learning_rate = 1e-5
num_runs = 10

results_df = pd.DataFrame(columns=["n", "run", "in_domain_accuracy", "out_of_domain_accuracy"])

for run_idx in range(num_runs):
  for n in few_shot_sample_size:

        # Reinitialize the model
        model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
        optimizer = AdamW(model.parameters(), lr=learning_rate)

        # Select n examples per class from RTE training data
        entailment_indices = np.where(np.array(rte_dataset["train"]["label"]) == 1)[0]
        non_entailment_indices = np.where(np.array(rte_dataset["train"]["label"]) == 0)[0]
        entailment_indices = np.random.choice(entailment_indices, n, replace=False)
        non_entailment_indices = np.random.choice(non_entailment_indices, n, replace=False)
        indices = np.concatenate([entailment_indices, non_entailment_indices])

        train_dataset = rte_dataset["train"].select(indices)

        # Training arguments
        training_args = TrainingArguments(
            output_dir="./results",
            overwrite_output_dir=True,
            num_train_epochs=num_epochs,
            per_device_train_batch_size=batch_size,
            learning_rate=learning_rate,
            weight_decay=0.0,
            save_steps=10_000,
            save_total_limit=2
        )

        # Trainer setup
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            compute_metrics=compute_metrics,
            optimizers=(optimizer, None),
        )

        # Train the model
        trainer.train()

        # Evaluate in-domain (RTE)
        print(f"Evaluating in-domain performance for n={n}...")
        in_domain_results = trainer.evaluate(eval_dataset=rte_dataset["validation"])
        in_domain_accuracy = in_domain_results["eval_accuracy"]

        # Evaluate out-of-domain (HANS)
        print(f"Evaluating out-of-domain performance for n={n}...")
        out_of_domain_results = trainer.evaluate(eval_dataset=hans_dataset)
        out_of_domain_accuracy = out_of_domain_results["eval_accuracy"]

        print(f"n={n}, run={run_idx}, In-domain Accuracy: {in_domain_accuracy}, Out-of-domain Accuracy: {out_of_domain_accuracy}")

        # Save results
        new_row = pd.DataFrame({
            "n": [n],
            "run": [run_idx],
            "in_domain_accuracy": [in_domain_accuracy],
            "out_of_domain_accuracy": [out_of_domain_accuracy],
            "in_domain_loss": [in_domain_results["eval_loss"]],
            "out_of_domain_loss": [out_of_domain_results["eval_loss"]],
            "in_domain_runtime": [in_domain_results["eval_runtime"]],
            "out_of_domain_runtime": [out_of_domain_results["eval_runtime"]],
            "in_domain_samples_per_second": [in_domain_results["eval_samples_per_second"]],
            "out_of_domain_samples_per_second": [out_of_domain_results["eval_samples_per_second"]],
            "in_domain_steps_per_second": [in_domain_results["eval_steps_per_second"]],
            "out_of_domain_steps_per_second": [out_of_domain_results["eval_steps_per_second"]],
        })

        print(new_row)
        results_df = pd.concat([results_df, new_row], ignore_index=True)


end_time = time.time()

# Calculate and print the elapsed time
elapsed_time = end_time - start_time
print(f"Total Runtime: {elapsed_time:.4f} seconds")

README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/584k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/69.0k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/621k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/277 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3000 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

hans.py:   0%|          | 0.00/5.22k [00:00<?, ?B/s]

The repository for hans contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/hans.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/15.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/15.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/30000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/30000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/251M [00:00<?, ?B/s]

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2490 [00:00<?, ? examples/s]

Map:   0%|          | 0/277 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss


Evaluating in-domain performance for n=2...


Evaluating out-of-domain performance for n=2...
n=2, run=0, In-domain Accuracy: 0.5487364620938628, Out-of-domain Accuracy: 0.4941333333333333
   n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  2    0            0.548736                0.494133        0.967875   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0             0.70655             1.8819               204.1095   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       147.192                            146.98   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                      18.598                          18.372  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=32...


Evaluating out-of-domain performance for n=32...
n=32, run=0, In-domain Accuracy: 0.5270758122743683, Out-of-domain Accuracy: 0.49883333333333335
    n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  32    0            0.527076                0.498833        2.322063   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            0.766501             1.9877               205.7661   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       139.358                           145.797   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                      17.608                          18.225  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=128...


Evaluating out-of-domain performance for n=128...
n=128, run=0, In-domain Accuracy: 0.5992779783393501, Out-of-domain Accuracy: 0.5
     n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  128    0            0.599278                     0.5        3.266377   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            1.028545             1.9465               205.7053   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       142.308                            145.84   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                      17.981                           18.23  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=2...


Evaluating out-of-domain performance for n=2...
n=2, run=1, In-domain Accuracy: 0.4981949458483754, Out-of-domain Accuracy: 0.49993333333333334
   n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  2    1            0.498195                0.499933        1.467148   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            0.769354             1.9824               206.2138   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       139.732                            145.48   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                      17.656                          18.185  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=32...


Evaluating out-of-domain performance for n=32...
n=32, run=1, In-domain Accuracy: 0.5270758122743683, Out-of-domain Accuracy: 0.49883333333333335
    n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  32    1            0.527076                0.498833        2.322063   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            0.766501              1.991               206.0302   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       139.123                            145.61   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                      17.579                          18.201  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=128...


Evaluating out-of-domain performance for n=128...
n=128, run=1, In-domain Accuracy: 0.5992779783393501, Out-of-domain Accuracy: 0.5
     n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  128    1            0.599278                     0.5        3.266377   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            1.028545             1.9414               205.9138   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       142.677                           145.692   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                      18.028                          18.212  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=2...


Evaluating out-of-domain performance for n=2...
n=2, run=2, In-domain Accuracy: 0.4981949458483754, Out-of-domain Accuracy: 0.49993333333333334
   n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  2    2            0.498195                0.499933        1.467148   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            0.769354             1.9653               205.5594   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       140.944                           145.943   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                      17.809                          18.243  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=32...


Evaluating out-of-domain performance for n=32...
n=32, run=2, In-domain Accuracy: 0.5270758122743683, Out-of-domain Accuracy: 0.49883333333333335
    n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  32    2            0.527076                0.498833        2.322063   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            0.766501             2.0172               205.6787   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       137.317                           145.859   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                       17.35                          18.232  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=128...


Evaluating out-of-domain performance for n=128...
n=128, run=2, In-domain Accuracy: 0.5992779783393501, Out-of-domain Accuracy: 0.5
     n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  128    2            0.599278                     0.5        3.266377   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            1.028545             1.9762               206.8927   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       140.168                           145.003   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                      17.711                          18.125  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=2...


Evaluating out-of-domain performance for n=2...
n=2, run=3, In-domain Accuracy: 0.4981949458483754, Out-of-domain Accuracy: 0.49993333333333334
   n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  2    3            0.498195                0.499933        1.467148   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            0.769354             1.9774               206.7062   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       140.081                           145.134   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                        17.7                          18.142  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=32...


Evaluating out-of-domain performance for n=32...
n=32, run=3, In-domain Accuracy: 0.5270758122743683, Out-of-domain Accuracy: 0.49883333333333335
    n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  32    3            0.527076                0.498833        2.322063   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            0.766501              1.938               206.0606   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       142.927                           145.588   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                      18.059                          18.199  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=128...


Evaluating out-of-domain performance for n=128...
n=128, run=3, In-domain Accuracy: 0.5992779783393501, Out-of-domain Accuracy: 0.5
     n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  128    3            0.599278                     0.5        3.266377   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            1.028545             1.9347               206.0599   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       143.173                           145.589   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                       18.09                          18.199  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=2...


Evaluating out-of-domain performance for n=2...
n=2, run=4, In-domain Accuracy: 0.4981949458483754, Out-of-domain Accuracy: 0.49993333333333334
   n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  2    4            0.498195                0.499933        1.467148   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            0.769354             1.9885               206.4711   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       139.302                           145.299   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                      17.601                          18.162  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=32...


Evaluating out-of-domain performance for n=32...
n=32, run=4, In-domain Accuracy: 0.5270758122743683, Out-of-domain Accuracy: 0.49883333333333335
    n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  32    4            0.527076                0.498833        2.322063   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            0.766501             1.9287               205.9203   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       143.617                           145.687   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                      18.147                          18.211  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=128...


Evaluating out-of-domain performance for n=128...
n=128, run=4, In-domain Accuracy: 0.5992779783393501, Out-of-domain Accuracy: 0.5
     n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  128    4            0.599278                     0.5        3.266377   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            1.028545             1.9988               205.1713   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       138.585                           146.219   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                      17.511                          18.277  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=2...


Evaluating out-of-domain performance for n=2...
n=2, run=5, In-domain Accuracy: 0.4981949458483754, Out-of-domain Accuracy: 0.49993333333333334
   n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  2    5            0.498195                0.499933        1.467148   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            0.769354             1.9627               204.7563   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       141.133                           146.516   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                      17.833                          18.314  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=32...


Evaluating out-of-domain performance for n=32...
n=32, run=5, In-domain Accuracy: 0.5270758122743683, Out-of-domain Accuracy: 0.49883333333333335
    n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  32    5            0.527076                0.498833        2.322063   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            0.766501             1.9203               205.1938   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       144.247                           146.203   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                      18.226                          18.275  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=128...


Evaluating out-of-domain performance for n=128...
n=128, run=5, In-domain Accuracy: 0.5992779783393501, Out-of-domain Accuracy: 0.5
     n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  128    5            0.599278                     0.5        3.266377   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            1.028545             1.9478               205.2858   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                        142.21                           146.138   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                      17.969                          18.267  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=2...


Evaluating out-of-domain performance for n=2...
n=2, run=6, In-domain Accuracy: 0.4981949458483754, Out-of-domain Accuracy: 0.49993333333333334
   n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  2    6            0.498195                0.499933        1.467148   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            0.769354             1.9179               203.5736   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                        144.43                           147.367   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                      18.249                          18.421  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=32...


Evaluating out-of-domain performance for n=32...
n=32, run=6, In-domain Accuracy: 0.5270758122743683, Out-of-domain Accuracy: 0.49883333333333335
    n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  32    6            0.527076                0.498833        2.322063   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            0.766501             1.9609                207.585   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       141.259                           144.519   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                      17.849                          18.065  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=128...


Evaluating out-of-domain performance for n=128...
n=128, run=6, In-domain Accuracy: 0.5992779783393501, Out-of-domain Accuracy: 0.5
     n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  128    6            0.599278                     0.5        3.266377   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            1.028545             1.9668               206.7762   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       140.841                           145.084   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                      17.796                          18.136  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=2...


Evaluating out-of-domain performance for n=2...
n=2, run=7, In-domain Accuracy: 0.4981949458483754, Out-of-domain Accuracy: 0.49993333333333334
   n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  2    7            0.498195                0.499933        1.467148   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            0.769354             1.9756               207.6726   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       140.208                           144.458   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                      17.716                          18.057  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=32...


Evaluating out-of-domain performance for n=32...
n=32, run=7, In-domain Accuracy: 0.5270758122743683, Out-of-domain Accuracy: 0.49883333333333335
    n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  32    7            0.527076                0.498833        2.322063   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            0.766501             1.9853               206.6419   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       139.522                           145.179   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                      17.629                          18.147  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=128...


Evaluating out-of-domain performance for n=128...
n=128, run=7, In-domain Accuracy: 0.5992779783393501, Out-of-domain Accuracy: 0.5
     n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  128    7            0.599278                     0.5        3.266377   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            1.028545             2.0254                208.023   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       136.761                           144.215   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                       17.28                          18.027  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=2...


Evaluating out-of-domain performance for n=2...
n=2, run=8, In-domain Accuracy: 0.4981949458483754, Out-of-domain Accuracy: 0.49993333333333334
   n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  2    8            0.498195                0.499933        1.467148   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            0.769354              1.993                206.769   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       138.986                           145.089   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                      17.561                          18.136  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=32...


Evaluating out-of-domain performance for n=32...
n=32, run=8, In-domain Accuracy: 0.5270758122743683, Out-of-domain Accuracy: 0.49883333333333335
    n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  32    8            0.527076                0.498833        2.322063   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            0.766501             1.9763               206.7898   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       140.159                           145.075   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                       17.71                          18.134  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=128...


Evaluating out-of-domain performance for n=128...
n=128, run=8, In-domain Accuracy: 0.5992779783393501, Out-of-domain Accuracy: 0.5
     n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  128    8            0.599278                     0.5        3.266377   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            1.028545             1.9497               206.8188   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       142.072                           145.054   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                      17.951                          18.132  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=2...


Evaluating out-of-domain performance for n=2...
n=2, run=9, In-domain Accuracy: 0.4981949458483754, Out-of-domain Accuracy: 0.49993333333333334
   n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  2    9            0.498195                0.499933        1.467148   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            0.769354             1.9942               207.3209   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       138.903                           144.703   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                      17.551                          18.088  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=32...


Evaluating out-of-domain performance for n=32...
n=32, run=9, In-domain Accuracy: 0.5270758122743683, Out-of-domain Accuracy: 0.49883333333333335
    n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  32    9            0.527076                0.498833        2.322063   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            0.766501             1.9582                207.213   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       141.459                           144.779   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                      17.874                          18.097  


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=128...


Evaluating out-of-domain performance for n=128...
n=128, run=9, In-domain Accuracy: 0.5992779783393501, Out-of-domain Accuracy: 0.5
     n  run  in_domain_accuracy  out_of_domain_accuracy  in_domain_loss  \
0  128    9            0.599278                     0.5        3.266377   

   out_of_domain_loss  in_domain_runtime  out_of_domain_runtime  \
0            1.028545             1.9293               207.1046   

   in_domain_samples_per_second  out_of_domain_samples_per_second  \
0                       143.572                           144.854   

   in_domain_steps_per_second  out_of_domain_steps_per_second  
0                      18.141                          18.107  
Total Runtime: 9410.7152 seconds


In [3]:
print(results_df.to_html())

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>n</th>
      <th>run</th>
      <th>in_domain_accuracy</th>
      <th>out_of_domain_accuracy</th>
      <th>in_domain_loss</th>
      <th>out_of_domain_loss</th>
      <th>in_domain_runtime</th>
      <th>out_of_domain_runtime</th>
      <th>in_domain_samples_per_second</th>
      <th>out_of_domain_samples_per_second</th>
      <th>in_domain_steps_per_second</th>
      <th>out_of_domain_steps_per_second</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>2</td>
      <td>0</td>
      <td>0.548736</td>
      <td>0.494133</td>
      <td>0.967875</td>
      <td>0.706550</td>
      <td>1.8819</td>
      <td>204.1095</td>
      <td>147.192</td>
      <td>146.980</td>
      <td>18.598</td>
      <td>18.372</td>
    </tr>
    <tr>
      <th>1</th>
      <td>32</td>
      <td>0</td>
      <td>0.527076</td>
      <td>0.498833</td>
      <td>2.322063</td>
      <td>0.7

In [None]:
curr_filename = 'valina_rte_baseline'
%cd '/content/drive/MyDrive/llm_finetuning/llm_finetuning/notebooks'
# Save results to a CSV file
results_df.to_csv(curr_filename + ".csv", index=False)