In [None]:
import pandas as pd

In [2]:
import argparse
import json
import os
import re

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import precision_score, recall_score, f1_score
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

# Assume these constants are defined somewhere in your project configuration.
# They are repeated here for clarity.
TRAIN_TEST_DATA_FILE_NAMES = {
    "train": "codeforces_train_data.csv",
    "test": "codeforces_test_data.csv",
    "zero_shot": "codeforces_zero_shot_data.csv"
}
# Make sure this path is set to the proper directory.
TRAIN_SPLIT_DATA_DIR = ""


# The features used to build prompts.
TEXT_FEATURE_COLUMNS = ['problem_description', 'input_specification', 'output_specification', 'problem_notes']
CODE_FEATURE_COLUMNS = ['solution_code']

# The tags of interest.
SELECTED_TAGS = [
    'math',
    'graphs',
    'strings',
    'number theory',
    'trees',
    'geometry',
    'games',
    'probabilities'
]



DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_path = os.path.join(TRAIN_SPLIT_DATA_DIR, TRAIN_TEST_DATA_FILE_NAMES["train"])
test_path = os.path.join(TRAIN_SPLIT_DATA_DIR, TRAIN_TEST_DATA_FILE_NAMES["test"])
zero_shot_path = os.path.join(TRAIN_SPLIT_DATA_DIR, TRAIN_TEST_DATA_FILE_NAMES["zero_shot"])

model_name = "Qwen/Qwen2.5-7B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)

model.to(DEVICE)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_name)

prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant and a highly capable problem analyzer."},
    {"role": "user", "content": prompt}
]

text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]



RuntimeError: You can't move a model that has some modules offloaded to cpu or disk.

In [3]:
# Display the device picked by the torch.cuda.is_available() check in the configuration above.
DEVICE

device(type='cpu')

In [None]:
import pandas as pd

def generate_prompt(row: pd.Series) -> str:
    """
    Build the tag-classification prompt for one problem.

    The prompt has three parts: fixed instructions listing SELECTED_TAGS and the
    required <output>{...}</output> JSON format, one "Name: value" line for every
    non-missing column in TEXT_FEATURE_COLUMNS and then CODE_FEATURE_COLUMNS, and
    a closing example of the expected response format.

    Args:
        row: A DataFrame row holding the problem's feature columns.

    Returns:
        The prompt as a single newline-joined string.
    """
    quoted_tags = ', '.join(f'"{tag}"' for tag in SELECTED_TAGS)

    instructions = [
        "You are given a programming problem with various details. Your task is to analyze the problem and determine which tags apply from the following list:",
        ', '.join(SELECTED_TAGS) + ".",
        "",
        "For each tag, output 1 if the tag applies to the problem or 0 if it does not.",
        "It is important that you consider that none of the tags might apply, and that multiple tags may apply simultaneously.",
        "IMPORTANT: Your output must be a single valid JSON object, and it must be wrapped between <output> and </output>.",
        f"The JSON object must only contain the following keys: {quoted_tags}.",
        "Each key must have either the value 0 or 1. Do not include any additional text or formatting inside the <output> tags.",
        "",
        "Below are the problem details:"
    ]

    # Text features first, then code features; columns that are missing or NaN are skipped.
    # The label is the column name with underscores replaced by spaces, capitalized.
    details = [
        f"{column.replace('_', ' ').capitalize()}: {row[column]}"
        for column in (*TEXT_FEATURE_COLUMNS, *CODE_FEATURE_COLUMNS)
        if pd.notna(row.get(column))
    ]

    # Closing example of the response format.
    closing = [
        "",
        "Respond with:",
        "<output>{\"math\": 1, \"graphs\": 0, \"strings\": 0, \"number theory\": 1, \"trees\": 0, \"geometry\": 0, \"games\": 0, \"probabilities\": 1}</output>",
    ]

    return "\n".join(instructions + details + closing)

# Example usage:
# Suppose you have a DataFrame `df` and you want to add a new column `prompt` for each row.
def add_prompt_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a "prompt" column holding generate_prompt(row) for every row.

    The column is written onto the DataFrame that was passed in (no copy is
    made), and that same object is returned.
    """
    prompts = df.apply(generate_prompt, axis=1)
    df["prompt"] = prompts
    return df




In [None]:


class ZeroShotDataset(Dataset):
    """
    Dataset over a problems DataFrame that yields (prompt, labels) pairs.

    The prompt comes from generate_prompt(row). The labels are an integer
    numpy array with one entry per tag in SELECTED_TAGS, in that order; a tag
    that has no column in the row counts as 0.
    """

    def __init__(self, df: pd.DataFrame):
        # The frame is kept by reference; rows are read lazily in __getitem__.
        self.df = df

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx: int):
        # Positional lookup, so the frame's index labels do not matter.
        record = self.df.iloc[idx]
        prompt_text = generate_prompt(record)

        tag_flags = []
        for tag in SELECTED_TAGS:
            tag_flags.append(int(record.get(tag, 0)))

        return prompt_text, np.array(tag_flags)

def parse_model_output(output_text: str, tags=None) -> np.ndarray:
    """
    Extract the predicted tag vector from a model response.

    The response is expected to contain a JSON object wrapped as
    <output>{...}</output>. If several such blocks are present, the LAST one is
    used: when the decoded text also echoes the prompt, the prompt's own example
    block comes first and must not be mistaken for the model's answer.

    Args:
        output_text: Decoded model response.
        tags: Tag names to read from the JSON object, in output order.
            Defaults to SELECTED_TAGS.

    Returns:
        Integer numpy array with one entry per tag. A tag missing from the JSON
        counts as 0. If no block is found, the JSON is invalid, or a value
        cannot be converted to int, a message is printed and all zeros are
        returned.
    """
    tags = SELECTED_TAGS if tags is None else list(tags)
    try:
        # Non-greedy match across newlines; the JSON object is assumed to be flat.
        candidates = re.findall(r"<output>\s*(\{.*?\})\s*</output>", output_text, re.DOTALL)
        if not candidates:
            raise ValueError("No <output>...</output> found.")
        json_data = json.loads(candidates[-1])
        predictions = [int(json_data.get(tag, 0)) for tag in tags]
    except (ValueError, TypeError) as e:
        # Only malformed-output errors are absorbed here (json.JSONDecodeError is
        # a ValueError). Programming errors such as a NameError must surface
        # rather than silently turn every prediction into zeros.
        print(f"Error parsing model output: {e}")
        predictions = [0] * len(tags)
    return np.array(predictions)

def evaluate_predictions(y_true_all, y_pred_all):
    """
    Score multi-label tag predictions.

    Args:
        y_true_all: Ground-truth labels, one row per problem and one column per
            tag in SELECTED_TAGS order (array or nested sequence).
        y_pred_all: Predicted labels with the same layout.

    Returns:
        A tuple (per_tag_metrics, macro_f1, extra_stats):
          - per_tag_metrics: {tag: {"precision", "recall", "f1_score"}}, each
            computed with zero_division=0.
          - macro_f1: unweighted mean of the per-tag F1 scores (0.0 if there
            are no tags).
          - extra_stats: counts of problems with
              "perfect_match_count": every tag predicted exactly,
              "at_least_one_correct": at least one true tag predicted,
              "no_wrong_and_some_correct": at least one true tag predicted and
                  no tag predicted outside the true set.

        With no problems at all, every metric and count is zero.
    """
    y_true_all = np.asarray(y_true_all)
    y_pred_all = np.asarray(y_pred_all)

    # An empty evaluation set arrives as a 1-D empty array, which cannot be
    # sliced by column; report zeros instead of failing.
    if y_true_all.size == 0:
        zero_metrics = {
            tag: {"precision": 0.0, "recall": 0.0, "f1_score": 0.0}
            for tag in SELECTED_TAGS
        }
        zero_stats = {
            "perfect_match_count": 0,
            "at_least_one_correct": 0,
            "no_wrong_and_some_correct": 0
        }
        return zero_metrics, 0.0, zero_stats

    per_tag_metrics = {}
    f1_scores = []
    for column, tag in enumerate(SELECTED_TAGS):
        y_true = y_true_all[:, column]
        y_pred = y_pred_all[:, column]
        f1 = f1_score(y_true, y_pred, zero_division=0)
        per_tag_metrics[tag] = {
            "precision": precision_score(y_true, y_pred, zero_division=0),
            "recall": recall_score(y_true, y_pred, zero_division=0),
            "f1_score": f1
        }
        f1_scores.append(f1)

    macro_f1 = np.mean(f1_scores) if f1_scores else 0.0

    # Per-problem statistics, computed row-wise over the whole matrix.
    exact_rows = (y_true_all == y_pred_all).all(axis=1)
    hit_rows = ((y_true_all == 1) & (y_pred_all == 1)).any(axis=1)
    # A row is "clean" when every position is either predicted 0 or truly 1.
    clean_rows = ((y_pred_all == 0) | (y_true_all == 1)).all(axis=1)

    extra_stats = {
        "perfect_match_count": int(exact_rows.sum()),
        "at_least_one_correct": int(hit_rows.sum()),
        "no_wrong_and_some_correct": int((clean_rows & hit_rows).sum())
    }

    return per_tag_metrics, macro_f1, extra_stats


def generate_prediction(prompt: str) -> str:
    """
    Generate the model's reply to a single prompt.

    The prompt is sent as one user message rendered through the tokenizer's chat
    template, the same way the smoke-test generation in this notebook does it.
    Decoding is greedy (do_sample=False) and capped at 256 new tokens.

    Args:
        prompt: The full instruction text for one problem.

    Returns:
        Only the newly generated text. The prompt tokens are stripped before
        decoding, so an example <output> block inside the prompt cannot be
        mistaken for the model's answer by downstream parsing.
    """
    chat_text = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([chat_text], return_tensors="pt").to(model.device)
    # You can adjust max_new_tokens as needed.
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=256,
        do_sample=False  # deterministic; use True for sampling/randomness
    )
    # For a causal LM, generate() returns the prompt tokens followed by the new
    # tokens, so drop the first prompt_length positions.
    prompt_length = model_inputs.input_ids.shape[1]
    completion_ids = generated_ids[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)

def main():
    """
    Run zero-shot tag prediction over the zero-shot CSV and print a report.

    Reads the CSV at zero_shot_path, generates and parses one model response
    per problem, echoes the first three examples, then prints per-tag
    precision/recall/F1, the macro F1 and the custom match counts.
    """
    problems = pd.read_csv(zero_shot_path)

    # One example per batch, in file order.
    loader = DataLoader(ZeroShotDataset(problems), batch_size=1, shuffle=False)

    truths, guesses = [], []

    print("Starting zero-shot tag prediction...")
    for example_no, (prompt_batch, label_batch) in enumerate(loader, start=1):
        # Each batch holds exactly one item; take it out of the batch wrapper.
        prompt = prompt_batch[0]
        y_true = label_batch.numpy()[0]

        output_text = generate_prediction(prompt)
        y_pred = parse_model_output(output_text)

        truths.append(y_true)
        guesses.append(y_pred)

        # Echo the first three examples for a quick sanity check.
        if example_no <= 3:
            print(f"Example {example_no}:")
            print("Prompt:", prompt[:200] + "...")
            print("Model output:", output_text)
            print("Parsed prediction:", y_pred)
            print("Ground truth:", y_true)
            print("---")

    per_tag_metrics, macro_f1, extra_stats = evaluate_predictions(
        np.array(truths), np.array(guesses)
    )

    print("\nEvaluation Metrics:")
    for tag, scores in per_tag_metrics.items():
        print(f"Tag: {tag}")
        print(f"  Precision: {scores['precision']:.3f}")
        print(f"  Recall:    {scores['recall']:.3f}")
        print(f"  F1 Score:  {scores['f1_score']:.3f}")
    print(f"\nMacro F1 Score: {macro_f1:.3f}")

    print("\nCustom Evaluation Stats:")
    print(f"Perfectly matched problems: {extra_stats['perfect_match_count']}")
    print(f"Problems with at least one correct tag: {extra_stats['at_least_one_correct']}")
    print(f"Problems with no wrong tags and at least one correct: {extra_stats['no_wrong_and_some_correct']}")


main()
