In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/cs7643-group-project/notebooks

/content/drive/MyDrive/cs7643-group-project/notebooks


In [3]:
ls

few_shot_context_distillation_mnli.ipynb  [0m[01;34moffload_folder[0m/              [01;34mwandb[0m/
few_shot_context_distillation_rte.ipynb   [01;34mresults[0m/
few_shot_context_distillation_rts.ipynb   vanilla_cola_baseline.ipynb


In [4]:
!pip install -q transformers accelerate bitsandbytes datasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompati

# Dependency and Config

In [5]:
import torch
import numpy as np
from torch.nn.functional import kl_div, softmax, log_softmax
from torch.optim import AdamW
from datasets import load_dataset, ClassLabel
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
from tqdm import tqdm

In [6]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, AdamW, AutoConfig

import pandas as pd
from torch.nn import KLDivLoss

In [7]:
torch.cuda.empty_cache()

# for reproducibility
np.random.seed(42)

torch.manual_seed(42)

if torch.cuda.is_available():
  torch.cuda.manual_seed_all(42)

# Data Prep

In [8]:
#Below method is refenced from: https://github.com/uds-lsv/llmft/blob/main/notebooks/majority_baseline.ipynb
def binarize_mnli(dataset, remove_neutral=True):
    if remove_neutral:
        # neutral class has label 1
        dataset = dataset.filter(lambda example: example["label"] != 1)

    # change labels of contradiction examples from 2 to 1
    def change_label(example):
        # convert labels 2 into labels 1. this merges the neutral and contradiction class
        example["label"] = 1 if example["label"] == 2 else example["label"]
        return example

    # change labels
    dataset = dataset.map(change_label)

    # change features to reflect the new labels
    features = dataset["train"].features.copy()
    features["label"] = ClassLabel(num_classes=2, names=['entailment', 'contradiction'], id=None)
    dataset = dataset.cast(features)  # overwrite old features

    return dataset


In [11]:
##Prepare the inputs with the fixed context
def manipulate_inputs(batch):
    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
    # Define the fixed context
    fixed_context = "Given the premise, does the hypothesis hold true? "
    encoding = tokenizer([f'{fixed_context} Premise: {premise} Hypothesis: {hypothesis}'
                          for premise, hypothesis in zip(batch["premise"], batch["hypothesis"])],
                          truncation=True, padding="max_length", max_length=128, return_tensors='pt')
    batch["input_ids"] = encoding["input_ids"].squeeze()
    batch["attention_mask"] = encoding["attention_mask"].squeeze()
    return batch


In [12]:
data = load_dataset("glue", "mnli")
data = binarize_mnli(data, remove_neutral=True)
data = data.map(manipulate_inputs, batched=True)


Map:   0%|          | 0/261802 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

Map:   0%|          | 0/6692 [00:00<?, ? examples/s]

Map:   0%|          | 0/6703 [00:00<?, ? examples/s]

Map:   0%|          | 0/9796 [00:00<?, ? examples/s]

Map:   0%|          | 0/9847 [00:00<?, ? examples/s]

In [15]:
data

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 261802
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 6692
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 6703
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 9847
    })
})

In [13]:
hans_data = load_dataset("hans")
hans_data = hans_data.map(manipulate_inputs, batched=True)


README.md:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

hans.py:   0%|          | 0.00/5.22k [00:00<?, ?B/s]

The repository for hans contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/hans.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/15.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/15.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/30000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

In [16]:
hans_data

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'parse_premise', 'parse_hypothesis', 'binary_parse_premise', 'binary_parse_hypothesis', 'heuristic', 'subcase', 'template', 'input_ids', 'attention_mask'],
        num_rows: 30000
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label', 'parse_premise', 'parse_hypothesis', 'binary_parse_premise', 'binary_parse_hypothesis', 'heuristic', 'subcase', 'template', 'input_ids', 'attention_mask'],
        num_rows: 30000
    })
})

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# Training Prep

In [None]:
#function for computing accuracy
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

In [None]:
# Define a KL divergence loss function
def custom_loss(model_probs, original_model_probs):
    return kl_div(model_probs.log_softmax(dim=-1), original_model_probs.softmax(dim=-1), reduction='batchmean')

In [None]:
task_loss = CrossEntropyLoss()

In [None]:
# Config final result outcome
results_df = pd.DataFrame(columns=["n", "run", "in_domain_accuracy", "out_of_domain_accuracy"])

hyperparameter configs

In [None]:
# Change num_labels to 2 and drop-out hyperparam = 0.1
config = AutoConfig.from_pretrained("facebook/opt-125m", num_labels=2, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1) # paameter for initalization

n_values = [2, 32, 128]  # number of examples for each class

# Train

In [None]:
for n in n_values:
    for run in range(10):  # repeat 10 times for each n

        # Data selection code remains the same: select n random examples for each class from the original data
        indices_yes = np.where(np.array(data["train"]["label"]) == 0)[0]
        indices_no = np.where(np.array(data["train"]["label"]) == 1)[0]
        indices_yes = np.random.choice(indices_yes, n, replace=False)
        indices_no = np.random.choice(indices_no, n, replace=False)
        indices = np.concatenate([indices_yes, indices_no])
        train_dataset = data["train"].select(indices)
        print(train_dataset.shape)

        # Initialize both teacher and student models
        teacher_model = AutoModelForSequenceClassification.from_pretrained("facebook/opt-125m", config=config)
        teacher_model.to(device)
        teacher_model.eval()  # Ensure the teacher model does not train

        student_model = AutoModelForSequenceClassification.from_pretrained("facebook/opt-125m", config=config)
        student_model.to(device)


        # Training configuration
        total_steps = (len(train_dataset) // 32) * 40
        training_args = TrainingArguments(
            output_dir="./results",
            overwrite_output_dir=True,
            num_train_epochs=40,
            per_device_train_batch_size=32,
            learning_rate=1e-5,
            weight_decay=0.0,
            save_steps=10_000,
            save_total_limit=2,
            warmup_steps=int(0.1 * total_steps),
        )

        # Customize training loop to include KL divergence loss
        class CustomTrainer(Trainer):
            def compute_loss(self, model, inputs, return_outputs=False):
              # Getting the output from the model, which includes logits
              outputs = model(**inputs)
              student_logits = outputs.logits  # Correctly accessing the logits

              # Extract labels from the inputs for use in cross entropy loss
              labels = inputs["labels"].to(device)

              # Calculate the teacher model's logits without gradient updates
              with torch.no_grad():
                  teacher_outputs = teacher_model(**inputs)
                  teacher_logits = teacher_outputs.logits

              # Calculate the distillation loss using the custom KL divergence function
              distillation_loss = custom_loss(student_logits, teacher_logits)

              # Calculate the classification loss using cross entropy
              classification_loss = task_loss(student_logits, labels)

              # Combine the losses
              loss = 0.5 * distillation_loss + 0.5 * classification_loss
              return (loss, outputs) if return_outputs else loss

        # Define the trainer with the custom loss function
        trainer = CustomTrainer(
            model=student_model,
            args=training_args,
            train_dataset=train_dataset,
            compute_metrics=compute_metrics,
        )

        # Training, evaluation, and result collection code remains mostly unchanged
        trainer.train()

        print(f"Evaluating in-domain performance for n={n}...")
        eval_results = trainer.evaluate(eval_dataset=data["validation_matched"])
        in_domain_accuracy = eval_results["eval_accuracy"]
        for key, value in eval_results.items():
            print(f"In-domain {key}: {value}")

        print(f"Evaluating out-of-domain performance for n={n}...")
        eval_results = trainer.evaluate(eval_dataset=hans_data["validation"])
        out_of_domain_accuracy = eval_results["eval_accuracy"]
        for key, value in eval_results.items():
            print(f"Out-of-domain {key}: {value}")

        new_row = pd.DataFrame({
            "n": [n],
            "run": [run],
            "in_domain_accuracy": [in_domain_accuracy],
            "out_of_domain_accuracy": [out_of_domain_accuracy]
        })
        results_df = pd.concat([results_df, new_row], ignore_index=True)
        print(results_df)

Map:   0%|          | 0/261802 [00:00<?, ? examples/s]

Map:   0%|          | 0/6692 [00:00<?, ? examples/s]

Map:   0%|          | 0/6703 [00:00<?, ? examples/s]

Map:   0%|          | 0/9796 [00:00<?, ? examples/s]

Map:   0%|          | 0/9847 [00:00<?, ? examples/s]

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

(4, 6)


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: CustomTrainer.compute_loss() got an unexpected keyword argument 'num_items_in_batch'

In [None]:
!ls

In [None]:
results_df.to_csv("./few_shot_context_distillation_mnli_baseline_results.csv", index=False)

In [None]:
results_df