In [1]:
import os
import sys
from collections import defaultdict

# Set environment variables before importing transformers
os.environ["HUGGINGFACE_HUB_CACHE"] = "/scratch/shareddata/dldata/huggingface-hub-cache/hub" # Load local models
os.environ["TRANSFORMERS_OFFLINE"] = "1" 
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["HF_HOME"] = f"{os.environ["WRKDIR"]}/.cache/huggingface" # Cache in work directory

sys.path.append(
    os.path.abspath(os.path.join(os.getcwd(), ".."))
)  # Add parent directory to path

from utils.prompts import JUDGE_SYSTEM_PROMPT, JUDGE_TEMPLATE
from utils.helpers import parse_output

import pandas as pd
import transformers
from datasets import Dataset, load_dataset
import torch

In [2]:
DEFAULT_DATA = r"../../data/complete_dataset.csv"
DEFAULT_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"

In [26]:
def make_prompt(row, task_type):  
    match task_type:
        case "judge":
                return (JUDGE_TEMPLATE.replace("$THEME$", row["theme"])
                .replace("$TOPIC$", row["topic"])
                .replace("$CONCEPT$", row["concept"])
                .replace("$TEXT$", row["problemDescription"])
                .replace("$CODE$", row["exampleSolution"])
            )
        case _:
            raise ValueError(f"Task type '{_}' not recognised as valid task type!")

In [24]:
task = "judge"

In [28]:
dataset = load_dataset("csv", data_files=DEFAULT_DATA, split="train", sep=";")
dataset = dataset.map(lambda row: {"prompt": make_prompt(row, task)})
dataset = dataset.select(range(2))

dataset["prompt"]

Column(['You are evaluating a programming exercise.\n\nIntended theme: literature\nIntended topic: Agatha Christie\nIntended programming concept: conditional statements\n\nExercise description:\nAgatha Christie, the famous novelist, has a rating scale for her novels. The ratings are represented as numbers and are accompanied by the following textual descriptions:\n<table>\n<tr>\n<th>Rating</th>\n<th>Description</th>\n</tr>\n<tr>\n<th>5</th>\n<th>Masterpiece</th>\n</tr>\n<tr>\n<th>4</th>\n<th>Excellent</th>\n</tr>\n<tr>\n<th>3</th>\n<th>Good</th>\n</tr>\n<tr>\n<th>2</th>\n<th>Fair</th>\n</tr>\n<tr>\n<th>1</th>\n<th>Below Average</th>\n</tr>\n</table>\nWrite a program that asks the user for a number and prints the textual description related to that number. If the user enters any other number, the program should print the message <code>Invalid rating!</code>.\n\nBelow is an example of the expected operation of the program.\n\n<pre>\nWhat rating?\n<b>&lt; 3</b>\nGood\n</pre>\n\nAnother ex

In [6]:
params = {
    "model": DEFAULT_MODEL,
    "device_map": 0,  # Force GPU
    "max_new_tokens": 500,
    "temperature": 0.3,
    "dtype": torch.bfloat16
}

#pipe = transformers.pipeline("text-generation", **params)
pipe.tokenizer.pad_token = pipe.tokenizer.eos_token
pipe.model.config.pad_token_id = pipe.model.config.eos_token_id

NameError: name 'pipe' is not defined

In [None]:
system_prompt = JUDGE_SYSTEM_PROMPT
outputs = pipe(
    [
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": x},
        ] for x in dataset["prompt"]
    ],
    batch_size=8,
    return_full_text=False,
    do_sample=False,
)

In [None]:
results = defaultdict(list)

for out in outputs:
    text = out[0]["generated_text"]
    parsed = parse_output(text)
    for k, v in parsed.items():
        results[k].append(v)

for k, v in results.items():
    dataset = dataset.add_column(k, v)