Define the seed for reproductible results

In [31]:
_SEED = 42

In [32]:
TOP_5_OUR_DATASET_TAGS = "['data structures', 'greedy', 'math', 'implementation', 'dp']"
TOP_10_OUR_DATASET_TAGS = "['greedy', 'dp', 'graphs', 'brute force', 'math', 'constructive algorithms', 'sortings', 'implementation', 'binary search', 'data structures']"
TOP_20_OUR_DATASET_TAGS = "['implementation', 'binary search', 'math', 'number theory', 'greedy', 'graphs', 'data structures', 'geometry', 'sortings', 'dp', 'brute force', 'combinatorics', 'dfs and similar', 'constructive algorithms', 'trees', 'strings', 'two pointers', 'dsu', 'bitmasks', 'divide and conquer']"

TOP_5_OUTSIDE_DATASET_TAGS = "['data structures', 'implementation', 'dp', 'math', 'greedy']"
TOP_10_OUTSIDE_DATASET_TAGS = "['greedy', 'math', 'constructive algorithms', 'implementation', 'brute force', 'data structures', 'dp', 'geometry', 'strings', 'dfs and similar']"
TOP_20_OUTSIDE_DATASET_TAGS = "['implementation', 'trees', 'math', 'greedy', 'binary search', 'geometry', 'two pointers', 'data structures', 'sortings', 'combinatorics', 'dp', 'bitmasks', 'brute force', 'number theory', 'dsu', 'divide and conquer', 'strings', 'graphs', 'dfs and similar', 'shortest paths']"


Read the dataset

In [33]:
############################################
# IMPORTANT: 
# 1. Change the dataset path to the testing dataset you want to use.
# 2. If the dataset is changed, always update the TOP_TAGS to the corresponding TOP_TAGS of the dataset.
# 3. If you want to use the dataset corresponding to Kim et al., use the TOP_TAGS of the outside dataset.
############################################

import pandas as pd


testing_df = pd.read_csv('../../../01_TASK_DATASETS/03_Task_Datasets/05_DATASETS_ENHANCED_WO_TAG_ENCODING/OUR_DATASET/top_20_testing_dataset_2025.csv')

testing_df.head(10)

TOP_TAGS = TOP_20_OUR_DATASET_TAGS

In [34]:
import unicodedata
import re

# mapping of the worst Windows-1252 characters → ASCII
def __clean_to_utf8(text: str) -> str:
    """
    replaces common Win-1252 smart quotes/dashes with ASCII
    removes other non-printable / control chars
    ensures the returned str can be .encode('utf-8') losslessly
    """
    
    _W1252_MAP = {
    "\u2018": "'",   # left single quotation
    "\u2019": "'",   # right single quotation
    "\u201c": '"',   # left double quotation
    "\u201d": '"',   # right double quotation
    "\u2013": "-",   # en dash
    "\u2014": "-",   # em dash
    "\u2026": "...", # ellipsis
    "\u00a0": " ",   # non-breaking space
    "\u2264": "<=",   # less-than or equal to
    "\u2260": "!=",   # not equal
    "\u2265": ">=",   # greater-than or equal to
    "\u2261": "==",   # identical to
    "\u222a": "U",   # union
    "\u2286": "U",   # subset of
    "\u2211": "sum", # summation
    "\u220f": "prod", # product
    "\u2208": "in",   # element of
    "\u2212": "-",   # minus sign
    "\u00b1": "+-",  # plus-minus sign
    "\u22c5": "*",   # dot operator
    "\u221e": "inf", # infinity
    "\u2227": "and",  # logical and
    "\u2228": "or",   # logical or
    "\u2295": "oplus", # circled plus
    "\u2296": "ominus", # circled minus
    "\u2308'": "lceil", # left ceiling
    "\u2309'": "rceil", # right ceiling
    "\u2205": "emptyset", # empty set
    "\u2203": "exists", # there exists
    "\u221a": "sqrt", # square root
    "\u222b": "int",  # integral
    "\u2192": "->",   # right arrow
    "\u2190": "<-",   # left arrow
    "\u03c6": "phi",  # phi
    "\u03c0": "pi",   # pi
    "\u2229": "cap",  # intersection
    "\u230a": "lceil", # left ceiling
    "\u230b": "rceil", # right ceiling
    "\u2219": "*",   # bullet operator
    "\u2206": "delta", # increment
    "\u2207": "nabla", # nabla
    "\u03a3": "Sigma", # summation
    "\u03a0": "Pi",   # product
    "\u22bb": "V",   # right angle
    "\u22c0": "bigcap", # big intersection
    "\u03c3": "sigma", # sigma
    }
    
    if not isinstance(text, str):
        text = str(text)

    # step 1: map known chars
    for bad, good in _W1252_MAP.items():
        text = text.replace(bad, good)

    # step 2: NFKC normalisation (e.g. full-width → ASCII)
    text = unicodedata.normalize("NFKC", text)

    # step 3: strip any remaining control chars except \n, \t
    text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]", "", text)

    # optional: collapse multiple spaces
    text = re.sub(r"[ \t]+", " ", text)

    # will raise if still not valid
    _ = text.encode("utf-8")
    return text
    

Define api request. Update the 'model' to desired gpt api

In [None]:
from openai import OpenAI

API_KEY = "API_KEY"

def calculate_relevance(problem_statement, problem_editorial, seed=None):

    CLASSIFICATION_PROMPT = (
        "You are a strict tag classifier.\n"
        "Return ONLY the applicable tags from this fixed list:\n"
        f"{TOP_TAGS}\n"
        """Provide the answer in valid JSON format as follows: "labels": "label1, label2, label3, ...". No other words.\n"""
        "Each input consists of:\n"
        " - A programming-problem statement.\n"
        " - An \"--- Editorial ---\" section that explains the intended solution.\n"
        "Use BOTH statement, and editorial to decide tags.\n"
        "### EXAMPLE 1 \n"
        "Problem: Given an array of n integers, answer q range minimum queries.\n"
        "Editorial: Preprocess with a sparse table in O(n log n), then answer in O(1).\n"
        "Answer: data structures\n"
        "### EXAMPLE 2 \n"
        "Problem: You have N coins and target sum S. Use each coin at most once.\n"
        "Editorial: Build dp[i][j] = reachable sums using first i coins.\n"
        "Answer: dp\n\n"
        "Now classify:"
    )

    request_prompt = (
        f"{CLASSIFICATION_PROMPT}\n"
        f"Problem: {problem_statement.strip()}\n"
        + "\n--- Editorial ---\n"
        f"{__clean_to_utf8(problem_editorial).strip()}\n"
    )

    client = OpenAI(api_key=API_KEY)  # Consider using environment variables for security
    
    response = client.chat.completions.create(
      # model="gpt-4o",
      # model="gpt-4o-mini",
      # model="o1-mini",
      model="o3-mini",
      top_p=1.0,
      messages=[
        {"role": "user", "content": request_prompt}
      ],
      seed=seed,
      n=1,
    )

    return response.choices[0].message.content

Call gpt api for the testing dataset

In [36]:
import pandas as pd

LOWER_BOUND = 0
UPPER_BOUND = 1500

# Initialize results_df as a DataFrame
results_df = pd.DataFrame()

for index, row in testing_df.iloc[LOWER_BOUND:UPPER_BOUND].iterrows():
    
    problem_statement = row['problem_statement']
    problem_editorial = row['problem_editorial']
    
    print("Row index: ", index)
            
    message_content = calculate_relevance(problem_statement, problem_editorial, seed=_SEED)
    
    # Convert the message content to a DataFrame and append it to results_df
    message_df = pd.DataFrame([message_content])
    results_df = pd.concat([results_df, message_df], ignore_index=True)

# Save the final results
results_df.to_csv('o3_mini_results_2025.csv', index=False)

Row index:  0
Row index:  1
Row index:  2
Row index:  3
Row index:  4
Row index:  5
Row index:  6
Row index:  7
Row index:  8
Row index:  9
Row index:  10
Row index:  11
Row index:  12
Row index:  13
Row index:  14
Row index:  15
Row index:  16
Row index:  17
Row index:  18
Row index:  19
Row index:  20
Row index:  21
Row index:  22
Row index:  23
Row index:  24
Row index:  25
Row index:  26
Row index:  27
Row index:  28
Row index:  29
Row index:  30
Row index:  31
Row index:  32
Row index:  33
Row index:  34
Row index:  35
Row index:  36
Row index:  37


In [37]:
import ast

def create_binary_vector(tag_list, unique_tags):
    unique_tags = ast.literal_eval(unique_tags) if isinstance(unique_tags, str) else unique_tags  # Convert string representation of list to actual list
    
    binary_vector = [0]*len(unique_tags)
            
    if 'nan' != str(tag_list):
        tag_list = ast.literal_eval(tag_list) if isinstance(tag_list, str) else tag_list  # Convert string representation of list to actual list
        for tag in tag_list:
            if tag in unique_tags:
                binary_vector[unique_tags.index(tag)] = 1
    
    return binary_vector

In [38]:
import pandas as pd
import re
from sklearn.metrics import f1_score, roc_auc_score

results_df = pd.read_csv('o3_mini_results_2025.csv')

# Function to transform the JSON string into a list of strings
def transform_labels(json_string):
    match = re.findall(r'"labels":\s*"([^"]+)"', json_string)
    if match:
        return match[0].split(', ')
    else:
        return []

results_df['0'] = results_df['0'].apply(transform_labels)

merged_df = pd.DataFrame({
    'truths': testing_df['problem_tags'],
    'predictions': results_df['0']
})

merged_df['truths'] = merged_df['truths'].apply(lambda x: create_binary_vector(x, TOP_TAGS))
merged_df['predictions'] = merged_df['predictions'].apply(lambda x: create_binary_vector(x, TOP_TAGS))

# Calculate F1 macro score
f1_macro = f1_score(merged_df['truths'].tolist(), merged_df['predictions'].tolist(), average='macro')

# Calculate AUROC
auroc = roc_auc_score(merged_df['truths'].tolist(), merged_df['predictions'].tolist(), average='macro')

print(f"F1 Macro Score: {f1_macro}")
print(f"AUROC: {auroc}")

F1 Macro Score: 0.5130450105450105
AUROC: nan


