In [36]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [37]:
%cd /content/drive/MyDrive/cs7643-group-project/notebooks

/content/drive/MyDrive/cs7643-group-project/notebooks


In [38]:
ls

few_shot_context_distillation_mnli.ipynb                         [0m[01;34moffload_folder[0m/
few_shot_context_distillation_rte_baseline_results_opt-125m.csv  [01;34mresults[0m/
few_shot_context_distillation_rte.ipynb                          vanilla_cola_baseline.ipynb
few_shot_context_distillation_rts.ipynb                          [01;34mwandb[0m/


In [39]:
!pip install -q transformers accelerate bitsandbytes datasets

# Dependency and Config

In [40]:
import gc
import time

import numpy as np
import torch
from datasets import load_dataset, ClassLabel
from sklearn.metrics import accuracy_score
from torch.nn import CrossEntropyLoss
from torch.nn.functional import kl_div, softmax, log_softmax
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [41]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, AdamW, AutoConfig

import pandas as pd
from torch.nn import KLDivLoss

In [42]:
# One seed shared by every RNG that is seeded in this cell.
SEED = 42

# Hand cached, currently unused GPU blocks back to the driver before starting.
torch.cuda.empty_cache()

# Reproducibility: seed NumPy's global RNG and PyTorch (CPU, then all GPUs).
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

In [43]:
# Checkpoint name under the `facebook/` namespace on the Hugging Face Hub; it is
# interpolated into the tokenizer/model ids and into the results CSV filename.
# NOTE(review): the recorded run of the training cell ran out of GPU memory
# (14.75 GiB device) right after loading this checkpoint.
model_name_config = "opt-6.7b"

# Data Prep

In [44]:
##Prepare the inputs with the fixed context
# Tokenizers already loaded by `manipulate_inputs_rte`, keyed by model name, so
# that a batched `Dataset.map` does not reload the tokenizer for every batch.
_RTE_TOKENIZERS = {}


def manipulate_inputs_rte(batch, model_name = model_name_config):
    """Tokenize a batch of RTE sentence pairs wrapped in a fixed prompt.

    Args:
        batch: Mapping with parallel sequences under "sentence1" and
            "sentence2" (the shape `Dataset.map(..., batched=True)` passes in).
        model_name: Checkpoint name appended to "facebook/" to pick the
            tokenizer. The default is captured when this cell is executed.

    Returns:
        The same `batch`, with "input_ids" and "attention_mask" added. Both are
        tensors with one row per example, padded/truncated to 128 tokens.
    """
    tokenizer = _RTE_TOKENIZERS.get(model_name)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(f"facebook/{model_name}")
        _RTE_TOKENIZERS[model_name] = tokenizer

    prompts = [
        f'Given the statement "{sentence1}", does it necessarily follow that "{sentence2}" is true?'
        for sentence1, sentence2 in zip(batch["sentence1"], batch["sentence2"])
    ]
    encoding = tokenizer(
        prompts,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors='pt'
    )
    # Keep the (num_examples, 128) shape as-is. The previous `.squeeze()` was a
    # no-op for batches of two or more examples but collapsed a single-example
    # batch to shape (128,), which no longer lines up with the other columns.
    batch["input_ids"] = encoding["input_ids"]
    batch["attention_mask"] = encoding["attention_mask"]
    return batch

In [45]:
# Prepare the inputs with the fixed context for HANS
# Tokenizers already loaded by `manipulate_inputs_hans`, keyed by model name, so
# that a batched `Dataset.map` does not reload the tokenizer for every batch.
_HANS_TOKENIZERS = {}


def manipulate_inputs_hans(batch, model_name = model_name_config):
    """Tokenize a batch of HANS premise/hypothesis pairs behind a fixed prompt.

    Args:
        batch: Mapping with parallel sequences under "premise" and
            "hypothesis" (the shape `Dataset.map(..., batched=True)` passes in).
        model_name: Checkpoint name appended to "facebook/" to pick the
            tokenizer. The default is captured when this cell is executed.

    Returns:
        The same `batch`, with "input_ids" and "attention_mask" added. Both are
        tensors with one row per example, padded/truncated to 128 tokens.
    """
    tokenizer = _HANS_TOKENIZERS.get(model_name)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(f"facebook/{model_name}")
        _HANS_TOKENIZERS[model_name] = tokenizer

    # NOTE(review): `fixed_context` ends with a space and the template adds
    # another one, so the prompt contains a double space before "Premise:".
    # Left unchanged so the prompts stay identical to earlier runs.
    fixed_context = "Given the premise, does the hypothesis hold true? "
    prompts = [
        f'{fixed_context} Premise: {premise} Hypothesis: {hypothesis}'
        for premise, hypothesis in zip(batch["premise"], batch["hypothesis"])
    ]
    encoding = tokenizer(
        prompts,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors='pt'
    )
    # Keep the (num_examples, 128) shape as-is. The previous `.squeeze()` was a
    # no-op for batches of two or more examples but collapsed a single-example
    # batch to shape (128,), which no longer lines up with the other columns.
    batch["input_ids"] = encoding["input_ids"]
    batch["attention_mask"] = encoding["attention_mask"]
    return batch

In [46]:
# Load GLUE/RTE and add the prompt-tokenized columns to every split in one pass.
data = load_dataset("glue", "rte").map(manipulate_inputs_rte, batched=True)


Map:   0%|          | 0/2490 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

Map:   0%|          | 0/277 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [47]:
data

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 2490
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 277
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 3000
    })
})

In [48]:
# Load HANS (the out-of-domain evaluation set) and add the prompt-tokenized columns.
hans_data = load_dataset("hans").map(manipulate_inputs_hans, batched=True)


Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

In [49]:
hans_data

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'parse_premise', 'parse_hypothesis', 'binary_parse_premise', 'binary_parse_hypothesis', 'heuristic', 'subcase', 'template', 'input_ids', 'attention_mask'],
        num_rows: 30000
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label', 'parse_premise', 'parse_hypothesis', 'binary_parse_premise', 'binary_parse_hypothesis', 'heuristic', 'subcase', 'template', 'input_ids', 'attention_mask'],
        num_rows: 30000
    })
})

In [50]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

# Training Prep

In [51]:
#function for computing accuracy
def compute_metrics(eval_pred):
    """Score arg-max predictions against the gold labels.

    Args:
        eval_pred: A pair that unpacks into (logits, labels); the predicted
            class of each example is the arg-max over the last logits axis.

    Returns:
        A dict with a single "accuracy" entry.
    """
    scores, gold_labels = eval_pred
    predicted_labels = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(gold_labels, predicted_labels)
    return {"accuracy": accuracy}

In [52]:
# Define a KL divergence loss function
def custom_loss(model_probs, original_model_probs):
    """Batch-mean KL divergence between two sets of class scores.

    Despite the parameter names, both arguments are normalized here, so callers
    pass raw logits rather than probabilities.

    Args:
        model_probs: Scores of the model being trained; turned into
            log-probabilities with a log-softmax over the last axis.
        original_model_probs: Scores of the reference model; turned into
            probabilities with a softmax over the last axis (the KL target).

    Returns:
        Scalar tensor: KL(reference || model) summed over all elements and
        divided by the batch size (`reduction='batchmean'`).
    """
    student_log_probs = log_softmax(model_probs, dim=-1)
    reference_probs = softmax(original_model_probs, dim=-1)
    return kl_div(student_log_probs, reference_probs, reduction='batchmean')

In [53]:
# Cross-entropy criterion with default settings; the training cell applies it
# to the student logits and the gold labels as the classification term.
task_loss = CrossEntropyLoss()

In [54]:
# Results table, initially empty: the training cell appends one row per (n, run)
# with the in-domain accuracy, the out-of-domain accuracy and the run's
# wall-clock time in seconds.
results_df = pd.DataFrame(columns=["n", "run", "in_domain_accuracy", "out_of_domain_accuracy", "execution_time"])

## Hyperparameter configs

In [55]:
# Model config used to initialise every teacher/student: a 2-way classification
# head and dropout of 0.1.
# OPT's config calls its dropout settings `dropout` and `attention_dropout`.
# The BERT-style names used before (`hidden_dropout_prob`,
# `attention_probs_dropout_prob`) are not OPT config fields, so they had no
# effect on the model and attention dropout stayed at its default.
config = AutoConfig.from_pretrained(
    f"facebook/{model_name_config}",
    num_labels=2,
    dropout=0.1,
    attention_dropout=0.1,
)

n_values = [2, 32, 128]  # number of examples for each class

# Train

In [56]:
class CustomTrainer(Trainer):
    """Trainer whose loss is 0.5 * KL(teacher || student) + 0.5 * cross-entropy.

    Declared once, ahead of the sweep, and handed its frozen teacher explicitly
    instead of being re-declared on every run and reading a global.
    """

    def __init__(self, *args, teacher_model, **kwargs):
        super().__init__(*args, **kwargs)
        # Only ever called under `torch.no_grad()`; never passed to the optimizer.
        self.teacher_model = teacher_model

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the combined distillation + classification loss for one batch.

        `inputs` must contain "labels". When `return_outputs` is true the
        student's model outputs are returned alongside the loss.
        """
        outputs = model(**inputs)
        student_logits = outputs.logits

        # Labels must sit on the same device as the logits they are scored against.
        labels = inputs["labels"].to(student_logits.device)

        # The teacher only provides targets, so no graph is built for it.
        with torch.no_grad():
            teacher_logits = self.teacher_model(**inputs).logits

        distillation_loss = custom_loss(student_logits, teacher_logits)
        classification_loss = task_loss(student_logits, labels)

        loss = 0.5 * distillation_loss + 0.5 * classification_loss
        return (loss, outputs) if return_outputs else loss


SAMPLING_SEED = 42      # base seed for drawing the few-shot training examples
TRAIN_BATCH_SIZE = 32
NUM_TRAIN_EPOCHS = 40

# Loop-invariant: positions of the training examples of each class.
train_labels = np.array(data["train"]["label"])
all_indices_yes = np.where(train_labels == 0)[0]
all_indices_no = np.where(train_labels == 1)[0]

for n in n_values:
    for run in range(10):  # repeat 10 times for each n
        # Release the previous run's models and optimizer state *before* loading
        # new ones; otherwise both generations are alive on the GPU at once.
        # The objects of the final run stay bound after the loop, as before.
        trainer = teacher_model = student_model = None
        gc.collect()
        torch.cuda.empty_cache()

        start_time = time.time()  # Start timing

        # Select n random examples per class with a generator private to this
        # (n, run). `Trainer.__init__` calls `set_seed(args.seed)`, which
        # re-seeds NumPy's global RNG, so drawing from `np.random` here would
        # pick the same examples on every run.
        rng = np.random.default_rng([SAMPLING_SEED, n, run])
        indices_yes = rng.choice(all_indices_yes, n, replace=False)
        indices_no = rng.choice(all_indices_no, n, replace=False)
        indices = np.concatenate([indices_yes, indices_no])
        train_dataset = data["train"].select(indices)
        print(train_dataset.shape)

        # Initialize both teacher and student models.
        # NOTE(review): the teacher is the same pretrained checkpoint with a
        # freshly initialised classification head (see the "newly initialized:
        # ['score.weight']" warning) and receives exactly the same inputs as the
        # student, so the KL term targets an untrained head. Confirm this is
        # the intended distillation setup.
        # NOTE(review): two copies of the checkpoint are placed on the GPU; the
        # recorded run hit CUDA OOM with opt-6.7b on a 14.75 GiB device.
        teacher_model = AutoModelForSequenceClassification.from_pretrained(f"facebook/{model_name_config}", config=config)
        teacher_model.to(device)
        teacher_model.eval()  # Ensure the teacher model does not train

        student_model = AutoModelForSequenceClassification.from_pretrained(f"facebook/{model_name_config}", config=config)
        student_model.to(device)

        # Training configuration. Steps per epoch are rounded *up*: a partial
        # final batch is still an optimizer step, and flooring gave 0 total
        # steps (hence no warmup at all) for n=2, where the set has 4 examples.
        steps_per_epoch = (len(train_dataset) + TRAIN_BATCH_SIZE - 1) // TRAIN_BATCH_SIZE
        total_steps = steps_per_epoch * NUM_TRAIN_EPOCHS
        training_args = TrainingArguments(
            output_dir="./results",
            overwrite_output_dir=True,
            num_train_epochs=NUM_TRAIN_EPOCHS,
            per_device_train_batch_size=TRAIN_BATCH_SIZE,
            learning_rate=1e-5,
            weight_decay=0.0,
            save_steps=10_000,
            save_total_limit=2,
            warmup_steps=int(0.1 * total_steps),
        )

        trainer = CustomTrainer(
            model=student_model,
            teacher_model=teacher_model,
            args=training_args,
            train_dataset=train_dataset,
            compute_metrics=compute_metrics,
        )

        trainer.train()

        print(f"Evaluating in-domain performance for n={n}...")
        eval_results = trainer.evaluate(eval_dataset=data["validation"])
        in_domain_accuracy = eval_results["eval_accuracy"]
        for key, value in eval_results.items():
            print(f"In-domain {key}: {value}")

        print(f"Evaluating out-of-domain performance for n={n}...")
        eval_results = trainer.evaluate(eval_dataset=hans_data["validation"])
        out_of_domain_accuracy = eval_results["eval_accuracy"]
        for key, value in eval_results.items():
            print(f"Out-of-domain {key}: {value}")

        end_time = time.time()  # End timing
        execution_time = end_time - start_time  # Calculate execution time

        new_row = pd.DataFrame({
            "n": [n],
            "run": [run],
            "in_domain_accuracy": [in_domain_accuracy],
            "out_of_domain_accuracy": [out_of_domain_accuracy],
            "execution_time": [execution_time],
        })
        results_df = pd.concat([results_df, new_row], ignore_index=True)
        print(results_df)

(4, 6)


pytorch_model.bin.index.json:   0%|          | 0.00/41.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.36G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-6.7b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 9.06 MiB is free. Process 27803 has 14.74 GiB memory in use. Of the allocated memory 14.47 GiB is allocated by PyTorch, and 140.72 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
!ls

In [None]:
# Persist the per-run results next to the notebook, one CSV per model name.
results_csv_path = f"./few_shot_context_distillation_rte_baseline_results_{model_name_config}.csv"
results_df.to_csv(results_csv_path, index=False)

In [None]:
results_df