In [1]:
import os
import sys

# Set environment variables before importing transformers
os.environ["HUGGINGFACE_HUB_CACHE"] = "/scratch/shareddata/dldata/huggingface-hub-cache/hub"  # Load local models
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"

# Cache in the work directory. The lookup is done outside the f-string because
# reusing the same quote type inside an f-string is a SyntaxError before
# Python 3.12. When WRKDIR is not set, fall back to the home directory instead
# of failing with a bare KeyError.
_work_dir = os.environ.get("WRKDIR") or os.path.expanduser("~")
os.environ["HF_HOME"] = f"{_work_dir}/.cache/huggingface"

# Add parent directory to path so the local `utils` package can be imported.
# The membership check keeps sys.path free of duplicates when the cell is re-run.
_parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

import argparse
import json
import pandas as pd
import transformers
from datasets import Dataset

from utils.prompts import (
    JUDGE_SYSTEM_PROMPT,
    JUDGE_TEMPLATE,
    GENERATE_EXERCISES_SYSTEM_PROMPT,
    GENERATE_EXERCISES_TEMPLATE_EXPLICIT,
    GENERATE_EXERCISES_TEMPLATE_FEWSHOT,
    GENERATE_EXERCISES_TEMPLATE_IMPLICIT,
    GENERATE_EXERCISES_TEMPLATE_ZEROSHOT,
)

In [2]:
# Dataset path (read with pandas) and model identifier (passed to the
# transformers pipeline) used by this notebook.
DEFAULT_DATA = r"../../data/complete_dataset.csv"
DEFAULT_MODEL = "google/gemma-3-12b-it"

# System prompt per task type: judging has its own prompt, while every
# exercise-generation variant shares the same one.
system_prompts = {"judge": JUDGE_SYSTEM_PROMPT}
system_prompts.update(
    dict.fromkeys(
        ("zeroshot", "fewshot", "explicit", "implicit"),
        GENERATE_EXERCISES_SYSTEM_PROMPT,
    )
)

In [3]:
def make_prompt(row, task_type):
    _, problem_description, example_solution, _, _, theme, topic, concept, *_ = row

    match task_type:
        case "judge":
                return (JUDGE_TEMPLATE.replace("$THEME$", theme)
                .replace("$TOPIC$", topic)
                .replace("$CONCEPT$", concept)
                .replace("$TEXT$", problem_description)
                .replace("$CODE$", example_solution)
            )
        case "zeroshot":
            return GENERATE_EXERCISES_TEMPLATE_ZEROSHOT
        case "fewshot":
            return GENERATE_EXERCISES_TEMPLATE_FEWSHOT
        case "explicit":
            return GENERATE_EXERCISES_TEMPLATE_EXPLICIT
        case "implicit":
            return GENERATE_EXERCISES_TEMPLATE_IMPLICIT
        case _:
            raise ValueError(f"Task type '{_}' not recognised as valid task type!")


def run_model(pipe, data, task_type, max_new_tokens=500):
    """Run one chat completion for a row and merge the JSON answer into it.

    Args:
        pipe: Callable text-generation pipeline. It is called with a list of
            chat messages plus `return_full_text` and `max_new_tokens`, and
            must return a sequence whose first item has a "generated_text"
            string.
        data: Mapping-like row (dict or pandas Series) with a "prompt" entry.
            It must support `.copy()` and item assignment.
        task_type: Key into `system_prompts` that selects the system prompt.
        max_new_tokens: Generation budget forwarded to the pipeline. Defaults
            to 500, the value that was previously hard-coded.

    Returns:
        A copy of `data` extended with every key/value pair of the JSON object
        produced by the model. The input row is left unmodified.

    Raises:
        ValueError: If task_type has no system prompt, or if the model output
            parses to something other than a JSON object.
        json.JSONDecodeError: If no JSON could be parsed from the model output.
    """
    system_prompt = system_prompts.get(task_type)

    if system_prompt is None:
        raise ValueError(f"Task type '{task_type}' not recognised as valid task type!")

    response = pipe(
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": data["prompt"]},
        ],
        return_full_text=False,
        max_new_tokens=max_new_tokens,
    )

    raw_text = response[0]["generated_text"]
    # Strip Markdown code fences the model may wrap around its JSON answer.
    cleaned = raw_text.replace("```json", "").replace("```", "")

    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        # The model sometimes adds prose around the JSON; retry on the span
        # between the first "{" and the last "}" before giving up.
        start, end = cleaned.find("{"), cleaned.rfind("}")
        if start == -1 or end <= start:
            raise
        parsed = json.loads(cleaned[start : end + 1])

    if not isinstance(parsed, dict):
        raise ValueError(
            f"Expected a JSON object from the model, got {type(parsed).__name__}"
        )

    # Work on a copy: mutating the row handed in by DataFrame.apply is not
    # supported by pandas and would also alter a caller-owned dict.
    enriched = data.copy()
    for key, value in parsed.items():
        enriched[key] = value

    return enriched

In [4]:
# Task type for this run. It selects both the prompt template (make_prompt)
# and the system prompt (run_model), so it must be a key of `system_prompts`.
task = "judge"

In [5]:
# Fixed seed so that a Restart & Run All reproduces the same generations
# whenever the model samples (a temperature is configured below).
SEED = 42

# Keyword arguments forwarded to transformers.pipeline.
params = {
    "model": DEFAULT_MODEL,
    "device_map": 0,  # Force GPU
    "max_new_tokens": 500,
    "temperature": 0.3,
}

# Seed the RNGs before the model is loaded and before any text is generated.
transformers.set_seed(SEED)

pipeline = transformers.pipeline("text-generation", **params)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Device set to use cuda:0


In [6]:
# Raw dataset; the file is semicolon-separated.
df = pd.read_csv(DEFAULT_DATA, sep=";")

# Size of the optional smoke-test subset: replace `df` with
# `df.loc[0:n_rows - 1]` below to evaluate only the first rows.
n_rows = 3

# Build the evaluation frame as a NEW object. Assigning the column through an
# alias (`eval_df = df`) would silently add "prompt" to the raw `df` as well.
eval_df = df.assign(prompt=df.apply(lambda row: make_prompt(row, task), axis=1))

In [7]:
columns = df.columns

# Column 11 holds the human rating of whether the exercise description matched
# the selected theme; keep only the exercises rated "no".
theme_match_column = columns[11]
theme_mismatch = eval_df[theme_match_column] == "no"
eval_df = eval_df.loc[theme_mismatch]

eval_df

Unnamed: 0,title,problemDescription,exampleSolution,starterCode,tests,theme,topic,concept,difficulty,id,The exercise description was clear (Yes/Partially/No),The exercise description matched the selected theme (Yes/Partially/No),The exercise description matched the selected topic (Yes/Partially/No),The exercise description matched the selected concept (Yes/No),Included concepts that were too advanced (Yes/No)\n,The exercise difficulty matched the selected difficulty (Too easy/Okay/Too difficult),Shallow vs deep personalization (Deep/Shallow/Unsure),Open field,prompt
83,Berry Picking!,Write a program that asks the user for their f...,"{'code': ""import 'dart:io';main() { print('Wh...","{'code': ""import 'dart:io';main() {}""}","{'testCode': ""import 'package:test/test.dart';...",outdoor activities,berry picking,user input,normal,368826100000000.0,yes,no,no,yes,no,okay,shallow,,You are evaluating a programming exercise.\n\n...
121,Pass the Parcel!,Write a program that asks the user for their n...,"{'code': ""import 'dart:io';main() { print('Wh...","{'code': ""import 'dart:io';main() {}""}","{'testCode': ""import 'package:test/test.dart';...",party games,Pass the Parcel,user input,advanced,979604400000000.0,yes,no,no,yes,no,too easy,shallow,,You are evaluating a programming exercise.\n\n...
158,Pass the Parcel!,Write a program that asks the user for their n...,"{'code': ""import 'dart:io';main() { print('Wh...","{'code': ""import 'dart:io';main() {}""}","{'testCode': ""import 'package:test/test.dart';...",party games,Pass the Parcel,program output,normal,606832000000000.0,yes,no,yes,yes,no,too difficult,unsure,,You are evaluating a programming exercise.\n\n...
220,Holiday Parade Attendance,Write a program that asks the user if they wil...,"{'code': ""import 'dart:io'; main() { print(...","{'code': ""import 'dart:io'; main() { }""}","{'testCode': ""import 'package:test/test.dart';...",Christmas,attending holiday parades,program output,normal,-83749760000000.0,yes,no,yes,yes,yes,too difficult,unsure,,You are evaluating a programming exercise.\n\n...


In [8]:
# Query the model once per remaining exercise; each returned row carries the
# original columns plus the fields parsed from the model's JSON answer.
result = eval_df.apply(
    lambda exercise: run_model(pipeline, exercise, task),
    axis=1,
)
print("Done")

Done


In [9]:
# (label, result column) pairs, printed in this order for every judged row.
report_fields = (
    ("Theme: ", "ThemeCorrect"),
    ("Topic: ", "TopicCorrect"),
    ("Concept: ", "ConceptCorrect"),
    ("Reasoning: ", "Explanation"),
)

for _idx, verdict in result.iterrows():
    for label, column in report_fields:
        print(label, verdict[column])
    print("")

Theme:  yes
Topic:  yes
Concept:  yes
Reasoning:  The exercise aligns perfectly with the intended theme (outdoor activities - berry picking) and topic (berry picking). It directly asks the user about their favorite berry, which fits the theme well. The core programming concept being taught is user input – specifically, getting input from the user using `stdin.readLineSync()`. The example solution demonstrates this concept effectively. The exercise is clear, concise, and appropriate for introductory programming students.

Theme:  yes
Topic:  yes
Concept:  yes
Reasoning:  Let's break this down. 

**Step 1: What's the exercise about?** The exercise is about getting information (name and favorite color) from the user and then using that information to create a personalized greeting.

**Step 2: Theme (Party Games):** The theme is 'party games'. While this exercise isn't a full-blown party game, it's a simple interaction that *could* be part of a larger game. Asking for names and preferences