In [1]:
import os
import sys
from collections import defaultdict

# Set environment variables before importing transformers
os.environ["HUGGINGFACE_HUB_CACHE"] = "/scratch/shareddata/dldata/huggingface-hub-cache/hub"  # Load local models
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"
# Inner quotes are single on purpose: reusing the outer quote type inside an
# f-string replacement field is a SyntaxError before Python 3.12.
# Raises KeyError if WRKDIR is not set in the environment.
os.environ["HF_HOME"] = f"{os.environ['WRKDIR']}/.cache/huggingface"  # Cache in work directory

# Add the parent of the current working directory to the import path
# (presumably so the `utils` imports that follow can be resolved).
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from utils.prompts import JUDGE_SYSTEM_PROMPT, JUDGE_TEMPLATE
from utils.helpers import parse_output

import pandas as pd
import transformers
from datasets import Dataset, load_dataset
import torch

In [2]:
# Path to the input CSV, relative to the notebook's working directory.
DEFAULT_DATA: str = "../../data/complete_dataset.csv"

# Model identifier handed to `transformers.pipeline` further down.
DEFAULT_MODEL: str = "mistralai/Mistral-7B-Instruct-v0.3"

In [3]:
def make_prompt(row, task_type):
    _, problem_description, example_solution, _, _, theme, topic, concept, *_ = row

    match task_type:
        case "judge":
                return (JUDGE_TEMPLATE.replace("$THEME$", theme)
                .replace("$TOPIC$", topic)
                .replace("$CONCEPT$", concept)
                .replace("$TEXT$", problem_description)
                .replace("$CODE$", example_solution)
            )
        case _:
            raise ValueError(f"Task type '{_}' not recognised as valid task type!")

In [4]:
# Task type forwarded to make_prompt; "judge" is the only value it handles.
task: str = "judge"

In [5]:
# Read the semicolon-separated CSV as the "train" split.
dataset = load_dataset("csv", data_files=DEFAULT_DATA, split="train", sep=";")

# Keep only the first two rows, and do it BEFORE mapping so prompts are built
# just for the rows that are kept. Clamp to the dataset length so a file with
# fewer than two rows does not raise an out-of-range error.
dataset = dataset.select(range(min(2, len(dataset))))

# Add a "prompt" column with the filled-in template for each remaining row.
dataset = dataset.map(lambda row: {"prompt": make_prompt(row, task)})

In [6]:
# Keyword arguments forwarded verbatim to `transformers.pipeline`.
params = dict(
    model=DEFAULT_MODEL,
    device_map=0,  # Force GPU
    max_new_tokens=500,
    temperature=0.3,
    dtype=torch.bfloat16,
)

pipe = transformers.pipeline("text-generation", **params)

# Reuse the end-of-sequence token as the padding token, on both the model
# config and the tokenizer.
pipe.model.config.pad_token_id = pipe.model.config.eos_token_id
pipe.tokenizer.pad_token = pipe.tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


In [7]:
system_prompt = JUDGE_SYSTEM_PROMPT

# Build one conversation per prompt: the shared system message followed by
# the row-specific user message.
conversations = []
for user_prompt in dataset["prompt"]:
    conversations.append(
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
    )

# NOTE(review): do_sample=False is passed here while the pipeline was built
# with a temperature; the recorded run warns that temperature may be ignored.
# Confirm which of the two is intended.
outputs = pipe(
    conversations,
    batch_size=8,
    return_full_text=False,
    do_sample=False,
)

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [16]:
# Parse the generated text of every pipeline output; each parse result is
# used as a mapping of field name -> value.
parsed_outputs = [parse_output(out[0]["generated_text"]) for out in outputs]

# Build one list per field over ALL outputs, in first-seen key order. An
# output that lacks a field contributes None, so every column has exactly one
# entry per row; ragged per-key lists would make `add_column` fail.
results = defaultdict(list)
field_names = dict.fromkeys(k for parsed in parsed_outputs for k in parsed)
for k in field_names:
    results[k] = [parsed.get(k) for parsed in parsed_outputs]

# Attach each parsed field to the dataset as a new column.
for k, v in results.items():
    dataset = dataset.add_column(k, v)