In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [76]:
%cd /content/drive/MyDrive/cs7643-group-project/notebooks

/content/drive/MyDrive/cs7643-group-project/notebooks


In [77]:
ls

few_shot_context_distillation_mnli.ipynb  ICL_rte.ipynb    vanilla_cola_baseline.ipynb
few_shot_context_distillation_rte.ipynb   [0m[01;34moffload_folder[0m/  [01;34mwandb[0m/
few_shot_context_distillation_rts.ipynb   [01;34mresults[0m/


In [None]:
!pip install -q transformers accelerate bitsandbytes datasets

# Dependency and Config

In [None]:
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
import logging
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np

import time
import pandas as pd

In [None]:
torch.cuda.empty_cache()

# for reproducibility
np.random.seed(42)

torch.manual_seed(42)

if torch.cuda.is_available():
  torch.cuda.manual_seed_all(42)

In [None]:
model_name_config = "opt-125M"

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# Data and In-Context Prep

In [79]:
task_to_keys = {
    # labels are: 0 (entailment), 1 (contradiction)
    "rte": ("sentence1", "sentence2"),
    "mnli": ("premise", "hypothesis"),
    "mnli-original": ("premise", "hypothesis"),
    "mnli-mismatched": ("premise", "hypothesis"),
    "hans": ("premise", "hypothesis"),

    # labels are: 0 (not_duplicate), 1 (duplicate)
    "qqp": ("question1", "question2"),
    "paws-qqp": ("sentence1", "sentence2"),

    # labels are: 0 (not acceptable), 1 (acceptable)
    "cola": ("sentence", None),
    "cola-ood": ("sentence", None),
}

In [80]:
def setup_logging():

    """
    Setup and Logging Function:
    This function sets up logging configuration to track the evaluation process.
    """

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
    )
    logger = logging.getLogger(__name__)
    return logger

In [81]:
def load_model_and_tokenizer(model_path = model_name_config):

    """
    Model Loading Function:
    - Loads model configuration, tokenizer, and the OPT model
    - Sets up FP16 precision for efficiency
    - Configures padding tokens
    """

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(f"facebook/{model_name_config}")
    # Load model
    config = AutoConfig.from_pretrained(
        f"facebook/{model_name_config}",
        num_labels=2,
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        f"facebook/{model_name_config}",
        config=config
    )
    return model, tokenizer, config

In [82]:
def load_datasets(task_name="rte", eval_task_name="hans"):

    """
    Dataset Loading Function:
    - RTE dataset as in-domain evaluation data
    - HANS dataset as out-of-domain evaluation data
    """

    # Load RTE (in-domain) and HANS (out-of-domain) datasets
    rte_dataset = load_dataset("glue", task_name)
    hans_dataset = load_dataset(eval_task_name)

    return rte_dataset, hans_dataset

In [83]:
def _select_subset_by_ids(dataset, indices):
    subset = dataset.select(indices)
    return subset

In [84]:
def get_balanced_subsets(dataset):
    subset_per_label = {}
    for label_idx, _ in enumerate(dataset.features["label"].names):
        subset_per_label[label_idx] = dataset.filter(
            lambda s: s["label"] == label_idx)
    return subset_per_label

In [85]:
def _select_random_subset(dataset, num_shots, balanced=False, seed=42):
    # fix seed
    np.random.seed(seed)

    if num_shots < 1:
        return [], []

    if balanced:
        assert num_shots % 2 == 0, "a balanced context requires at least one demonstartion per label"
        # select the same number of samples from every label
        indices = []  # we collect all indices here
        subset_per_label = get_balanced_subsets(dataset)

        for _, samples in subset_per_label.items():
            subset_indices = samples["idx"]
            # select num_shots // 2 samples
            subset_indices = np.random.choice(
                subset_indices, size=num_shots // 2, replace=False)
            indices += list(subset_indices)
        assert len(indices) == num_shots
    else:
        # just select a random subset of samples
        indices = np.random.choice(
            range(len(dataset)), size=num_shots, replace=False)

    # return _select_subset_by_ids(dataset, indices), indices
    return _select_subset_by_idx(dataset, indices), indices

In [86]:
def _select_subset_by_idx(dataset, indices):
    dataset = dataset.filter(
        lambda s: s["idx"] in indices)
    return dataset

In [87]:
def create_few_shot_context(
    dataset_name,
    dataset,
    num_shots,
    pattern,
    label_to_tokens,
    separate_shots_by=" ",
    description="",
    target_prefix="",
    from_indices=None,
    balanced=False,
    shuffle=False,
    seed=42
):
    assert pattern is not None
    assert label_to_tokens is not None

    # select samples from which the context will be constructed
    if from_indices is not None:
        demonstrations, indices = _select_subset_by_ids(dataset, from_indices)
    else:
        demonstrations, indices = _select_random_subset(
            dataset, num_shots, balanced, seed)

    if shuffle:
        if len(demonstrations) > 0:
            demonstrations = demonstrations.shuffle(seed)

    # create context
    context = "" if description == "" else f"{description}{separate_shots_by}"

    for sample in demonstrations:
        formated_sample = pattern.format(
            text1=sample[task_to_keys[dataset_name][0]],
            text2=sample[task_to_keys[dataset_name][1]
                         ] if task_to_keys[dataset_name][1] is not None else None
        )
        verbalized_label = label_to_tokens[sample["label"]]
        if verbalized_label.startswith("Ġ"):
            # we need to remove the leading whitespace from the target token in the context
            verbalized_label = verbalized_label[1:]

        elif verbalized_label.startswith("▁"):
            # we need to remove the leading whitespace from the target token in the context
            verbalized_label = verbalized_label[1:]

        context += f"{formated_sample}{target_prefix}{verbalized_label}{separate_shots_by}"

    return context, indices

In [92]:
def preprocess_function(examples, tokenizer, task_name, pattern, max_length, target_prefix):
    """
    Formats inputs in GPT-3 style using a specific pattern
    Tokenizes the formatted inputs
    Adds labels for evaluation
    """

    # Get the appropriate keys based on the task
    if task_name == "rte":
      text1 = examples["sentence1"]
      text2 = examples["sentence2"]
    elif task_name in ["hans", "mnli"]:
      text1 = examples["premise"]
      text2 = examples["hypothesis"]
    else:
      raise ValueError(f"Unsupported task: {task_name}")

    # Format inputs
    formatted_inputs = [
        pattern.format(
            text1=text1[i],
            text2=text2[i]
        )
        for i in range(len(text1))
    ]

    # Tokenize inputs
    tokenized = tokenizer(
        formatted_inputs,
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_tensors="pt"
    )

    # Add labels
    if "label" in examples:
        tokenized["labels"] = torch.tensor(examples["label"])

    return tokenized

In [93]:
def evaluate_model(model, dataset, tokenizer, task_name, batch_size, pattern, max_length, target_prefix):
    model.eval()
    all_predictions = []
    all_labels = []

    # Create DataLoader
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False
    )

    with torch.no_grad():
        for batch in dataloader:
            inputs = preprocess_function(batch, tokenizer, task_name, pattern, max_length, target_prefix)
            outputs = model(**inputs)
            predictions = outputs.logits.argmax(dim=-1)

            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(inputs["labels"].cpu().numpy())

    # Calculate accuracy
    accuracy = sum(p == l for p, l in zip(all_predictions, all_labels)) / len(all_labels)
    return accuracy

In [95]:
def main():
    logger = setup_logging()

    # Configuration
    batch_size = 32
    pattern = "{text1} question: {text2} Yes or No?"
    target_prefix = " answer: "
    target_tokens = ["Yes", "No"]
    id_to_target_token = {idx: t for idx, t in enumerate(target_tokens)}
    num_shots = [2, 32, 128]  # Number of few-shot examples

    # Load model and tokenizer
    logger.info("Loading model and tokenizer...")
    model, tokenizer, config = load_model_and_tokenizer()

    # Load datasets
    logger.info("Loading datasets...")
    rte_dataset, hans_dataset = load_datasets()

    # Results dictionary
    results = {}

    # Evaluate for each batch size
    for n in num_shots:
        logger.info(f"Evaluating with num of shots {n}")

        # Create few-shot context from RTE training set
        context = create_few_shot_context(
            dataset_name="rte",
            dataset=rte_dataset["train"],
            num_shots=n,
            pattern=pattern,
            label_to_tokens=id_to_target_token,
            target_prefix=target_prefix,
            balanced=True,
            shuffle=True
        )

        # Evaluate on RTE validation set (in-domain): model, dataset, tokenizer, task_name, batch_size, pattern, max_length, target_prefix
        rte_accuracy = evaluate_model(
            model=model,
            dataset=rte_dataset["validation"],
            tokenizer=tokenizer,
            task_name="rte",
            batch_size=batch_size,
            pattern=pattern,
            max_length = 128,
            target_prefix=target_prefix
        )

        # Evaluate on HANS (out-of-domain)
        hans_accuracy = evaluate_model(
            model=model,
            dataset=hans_dataset["validation"],
            tokenizer=tokenizer,
            task_name="hans",
            batch_size=batch_size,
            pattern=pattern,
            max_length = 128,
            target_prefix=target_prefix
        )

        results[n] = {
            "rte_accuracy": rte_accuracy,
            "hans_accuracy": hans_accuracy
        }

        logger.info(f"num of shots {n} results:")
        logger.info(f"RTE accuracy: {rte_accuracy:.4f}")
        logger.info(f"HANS accuracy: {hans_accuracy:.4f}")

    return results

In [None]:
if __name__ == "__main__":
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Set random seeds for reproducibility
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)

    results = main()

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  tokenized["labels"] = torch.tensor(examples["label"])


In [None]:
results.to_csv(f"./few_shot_ICL_rte_baseline_results_{model_name_config}.csv")