<a href="https://colab.research.google.com/github/Duncanswilson/llm-finetuning/blob/main/hellaswag_structured_gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install datasets
!pip install torch

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch
import numpy as np

# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())

# Load model and tokenizer
model_name = "gpt2"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prefer the first GPU, but fall back to the CPU so this cell still runs on a
# CPU-only runtime instead of failing inside model.to().
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Load HellaSwag validation dataset (first 1% of the validation split)
dataset = load_dataset("hellaswag", split="validation[:1%]")

def compute_log_likelihood(model, tokenizer, context, ending):
    """Return the mean per-token log-likelihood of ``context + " " + ending``.

    The joined text is tokenized and passed through ``model`` once. Every
    token after the first is scored against the logits produced at the
    previous position; the summed log-probability is then divided by the
    number of scored tokens (sequence length minus one).

    Args:
        model: Module called with the tokenizer output as keyword arguments;
            its result must expose a ``logits`` tensor indexed as
            (batch, position, vocabulary). The inputs are moved to the device
            of the model's first parameter.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt")``
            whose result supports ``.to(device)``, ``**`` unpacking and
            ``["input_ids"]`` lookup.
        context: Prefix text.
        ending: Candidate continuation, appended after a single space.

    Returns:
        float: Length-normalized log-likelihood (higher is more likely), or
        ``float("-inf")`` when the text yields fewer than two tokens and
        there is therefore nothing to score.
    """
    input_text = context + " " + ending
    inputs = tokenizer(input_text, return_tensors="pt")

    # Follow the model's own device instead of relying on a notebook-level
    # `device` global, which may be stale or undefined when this is reused.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = inputs.to(first_param.device)

    input_ids = inputs["input_ids"]
    num_scored = input_ids.size(1) - 1
    if num_scored < 1:
        # A single-token sequence has no next-token predictions; avoid 0 / 0.
        return float("-inf")

    with torch.no_grad():
        logits = model(**inputs).logits

        # Shift so that the logits at position t are compared with token t + 1.
        shift_logits = logits[:, :-1, :].contiguous()
        shift_labels = input_ids[:, 1:].contiguous()

        # Summed cross-entropy is the negative log-likelihood of the sequence.
        loss_fct = torch.nn.CrossEntropyLoss(reduction='sum')
        loss = loss_fct(
            shift_logits.view(-1, shift_logits.size(-1)),
            shift_labels.view(-1),
        )

    # Normalize by the number of predicted tokens.
    return -loss.item() / num_scored

# Evaluate: for each sample, pick the ending with the highest normalized
# log-likelihood and compare it with the gold label.
total_samples = len(dataset)
print(total_samples)

correct_predictions = 0
for sample in dataset:
    scores = [
        compute_log_likelihood(model, tokenizer, sample["ctx"], candidate)
        for candidate in sample["endings"]
    ]
    # The label is converted with int() before comparing to the argmax index.
    correct_predictions += int(np.argmax(scores) == int(sample["label"]))

accuracy = correct_predictions / total_samples
print(f"Accuracy: {accuracy:.2f}")

True


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


100
Accuracy: 0.28


In [4]:
!pip install outlines

Collecting outlines
  Downloading outlines-0.0.46-py3-none-any.whl (101 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/101.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.9/101.9 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting interegular (from outlines)
  Downloading interegular-0.3.3-py37-none-any.whl (23 kB)
Collecting lark (from outlines)
  Downloading lark-1.1.9-py3-none-any.whl (111 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.7/111.7 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting diskcache (from outlines)
  Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting pycountry (from outlines)
  Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 M

In [2]:
from datasets import load_dataset
import outlines
import torch

# HellaSwag: first 1% of the validation split.
HELLASWAG_SPLIT = "validation[:1%]"
dataset = load_dataset("hellaswag", split=HELLASWAG_SPLIT)

# Wrap GPT-2 as an Outlines model for constrained generation.
# Note: this rebinds `model`, replacing whatever it referred to before.
OUTLINES_MODEL_ID = "openai-community/gpt2"
model = outlines.models.transformers(OUTLINES_MODEL_ID)

def create_prompt(context, endings):
    """Build the multiple-choice prompt for one sample.

    Each ending is listed on its own line, numbered from 0 in the order
    given, so the number the model is asked for is the ending's index.

    Args:
        context: The passage the endings continue.
        endings: Non-empty sequence of candidate endings. Any length is
            accepted; for four endings the result is identical to the
            previous fixed four-option template.

    Returns:
        str: The prompt, ending with the instruction line and no trailing
        newline.

    Raises:
        ValueError: If ``endings`` is empty.
    """
    if len(endings) == 0:
        raise ValueError("endings must contain at least one candidate")

    numbered = "\n".join(
        f"{index}. {ending}" for index, ending in enumerate(endings)
    )
    return (
        "Given the following context and possible endings, "
        "choose the most likely continuation:\n"
        "\n"
        f"Context: {context}\n"
        "\n"
        "Possible endings:\n"
        f"{numbered}\n"
        "\n"
        "Choose the number of the most likely ending:"
    )

# Cache of constrained generators, keyed by (id(model), choices). Each value
# also keeps a reference to the model so its id cannot be reused while the
# entry is alive.
_CHOICE_GENERATORS = {}


def get_model_choice(model, prompt, choices=("0", "1", "2", "3")):
    """Ask ``model`` to pick one of ``choices`` for ``prompt``.

    The generator returned by ``outlines.generate.choice`` is built once per
    (model, choices) pair and reused on later calls, instead of being rebuilt
    for every prompt.

    Args:
        model: Model object accepted by ``outlines.generate.choice``.
        prompt: Prompt text passed to the generator.
        choices: Allowed answers. Each must be a string that ``int()`` can
            parse, because the selected answer is converted to ``int``.
            Defaults to the four labels "0".."3".

    Returns:
        int: The selected choice converted to an integer.
    """
    options = tuple(choices)
    key = (id(model), options)
    entry = _CHOICE_GENERATORS.get(key)
    if entry is None:
        entry = (model, outlines.generate.choice(model, list(options)))
        _CHOICE_GENERATORS[key] = entry
    generator = entry[1]
    return int(generator(prompt))

# Evaluate: prompt the model once per sample and count how often the number
# it picks matches the gold label.
total_samples = len(dataset)
print(total_samples)

correct_predictions = 0
for sample in dataset:
    question = create_prompt(sample["ctx"], sample["endings"])
    chosen = get_model_choice(model, question)
    # The label is converted with int() before comparing to the chosen index.
    correct_predictions += int(chosen == int(sample["label"]))

accuracy = correct_predictions / total_samples
print(f"Accuracy: {accuracy:.2f}")

100
Accuracy: 0.25
