In [1]:
import os
import sys

# Set environment variables before importing transformers
os.environ["HUGGINGFACE_HUB_CACHE"] = "/scratch/shareddata/dldata/huggingface-hub-cache/hub" # Load local models
os.environ["TRANSFORMERS_OFFLINE"] = "1" 
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["HF_HOME"] = f"{os.environ["WRKDIR"]}/.cache/huggingface" # Cache in work directory

sys.path.append(
    os.path.abspath(os.path.join(os.getcwd(), ".."))
)  # Add parent directory to path

import argparse
import json
import pandas as pd
import transformers
from datasets import Dataset

from utils.prompts import (
    JUDGE_SYSTEM_PROMPT,
    JUDGE_TEMPLATE,
    GENERATE_EXERCISES_SYSTEM_PROMPT,
    GENERATE_EXERCISES_TEMPLATE_EXPLICIT,
    GENERATE_EXERCISES_TEMPLATE_FEWSHOT,
    GENERATE_EXERCISES_TEMPLATE_IMPLICIT,
    GENERATE_EXERCISES_TEMPLATE_ZEROSHOT,
)

In [2]:
DEFAULT_DATA = r"../../data/complete_dataset.csv"
DEFAULT_MODEL = "Qwen/Qwen2.5-14B-Instruct"
DEFAULT_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"

system_prompts = {
    "judge": JUDGE_SYSTEM_PROMPT,
    "zeroshot": GENERATE_EXERCISES_SYSTEM_PROMPT,
    "fewshot": GENERATE_EXERCISES_SYSTEM_PROMPT,
    "explicit": GENERATE_EXERCISES_SYSTEM_PROMPT,
    "implicit": GENERATE_EXERCISES_SYSTEM_PROMPT,
}

In [3]:
def make_prompt(row, task_type):
    _, problem_description, example_solution, _, _, theme, topic, concept, *_ = row

    match task_type:
        case "judge":
                return (JUDGE_TEMPLATE.replace("$THEME$", theme)
                .replace("$TOPIC$", topic)
                .replace("$CONCEPT$", concept)
                .replace("$TEXT$", problem_description)
                .replace("$CODE$", example_solution)
            )
        case "zeroshot":
            return GENERATE_EXERCISES_TEMPLATE_ZEROSHOT
        case "fewshot":
            return GENERATE_EXERCISES_TEMPLATE_FEWSHOT
        case "explicit":
            return GENERATE_EXERCISES_TEMPLATE_EXPLICIT
        case "implicit":
            return GENERATE_EXERCISES_TEMPLATE_IMPLICIT
        case _:
            raise ValueError(f"Task type '{_}' not recognised as valid task type!")


def run_model(pipe, data, task_type):
    system_prompt = system_prompts.get(task_type, None)

    if system_prompt is None:
        raise ValueError(f"Task type '{task_type}' not recognised as valid task type!")

    response = pipe(
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": data["prompt"]},
        ],
        return_full_text=False,
        max_new_tokens=500,
    )

    result = response[0]["generated_text"]
    result_dict = json.loads(result)

    for k, v in result_dict.items():
        data[k] = v

    return data

In [4]:
task = "judge"

In [5]:
params = {
    "model": DEFAULT_MODEL,
    "device_map": 0,  # Force GPU
    "max_new_tokens": 500,
    "temperature": 0.3,
}

pipeline = transformers.pipeline("text-generation", **params)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


In [6]:
df = pd.read_csv(DEFAULT_DATA, sep=";")
n_rows = 3

eval_df = df.loc[0:n_rows - 1]
eval_df["prompt"] = eval_df.apply(lambda row: make_prompt(row, task), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eval_df["prompt"] = eval_df.apply(lambda row: make_prompt(row, task), axis=1)


In [7]:
result = eval_df.apply(lambda row: run_model(pipeline, row, task), axis=1)
print("Done")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Done


In [9]:
for idx, row in result.iterrows():
    print("Evaluation: ", row["Correct"])
    print("Reasoning: ", row["Explanation"])
    print("")

Evaluation:  yes
Reasoning:  The exercise is about creating a program that takes a user input, a novel rating, and prints the corresponding textual description. It aligns with the theme (literature), topic (Agatha Christie), and programming concept (conditional statements). The provided solution correctly implements the required functionality.

Evaluation:  yes
Reasoning:  The exercise is well-structured and aligns with the intended theme (handicrafts), topic (painting), and programming concept (conditional statements). The provided solution correctly asks for user input, parses it as an integer, and uses if-else statements to print the corresponding description based on the input. The only improvement could be adding more error handling for non-numeric input.

Evaluation:  yes
Reasoning:  This exercise is about writing a simple Dart program that asks for user input and prints a message based on that input. The program aligns with the theme (food) and topic (Lingonberry sauce). The pro