In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Widen the pandas display limits first so long answers and big tables render in full.
pd.set_option("display.max_rows", 5000)
pd.set_option("display.max_colwidth", None)

# Model answers, one JSON object per line.
df = pd.read_json("defan_t1.0_n20.jsonl", lines=True)
# The originally sampled questions; they supply the id -> domain lookup used below.
df_orig = pd.read_json("sampled_from_defan_500.jsonl", lines=True)

# Rebuild each response's domain from the originally sampled questions,
# whose ids look like "q_064".

domain_lookup = dict(zip(df_orig["id"], df_orig["domain"]))

# "defan_00064_t1.0_ans00" -> "00064" -> 64
question_number = df["id"].str.extract(r"defan_(\d+)_", expand=False).astype(int)

# 64 -> "q_064" (zero-padded to at least three digits), then look up that id's domain
df["domain_fixed"] = (
    ("q_" + question_number.astype(str).str.zfill(3))
    .map(domain_lookup)
)

# "illogical" is not an informative correctness tag on its own,
# so fold it into "hallucination"

df["correctness_fixed"] = df["correctness"].replace("illogical", "hallucination")

# create labels: 0 = correct, 1 = not correct (hallucination or refusal)

df["label"] = df["correctness_fixed"].map({
    "correct": 0,
    "hallucination": 1,
    "refused": 1
})

# Series.map leaves NaN for any tag that is missing from the mapping, and NaN labels
# are silently skipped by the groupby count/mean used later on -- fail loudly instead.
unmapped = df.loc[df["label"].isna(), "correctness_fixed"].unique().tolist()
assert not unmapped, f"correctness tags without a label: {unmapped}"

# drop unnecessary columns by keeping only the ones listed here

cols_to_keep = [
    # identifiers
    "id", "prompt_id",
    # question, reference answer, model response and type
    "question", "reference_answer", "model_answer", "type",
    # cleaned-up fields built above
    "domain_fixed", "correctness_fixed", "label",
]

df = df.loc[:, cols_to_keep]

# One math question is ill-posed: it describes ducks and cows but asks for the number
# of buffaloes. Patch its reference answer and re-grade its responses by hand.
# NOTE(review): defan_00472_t1.0 asks the same buffalo question (with 22 in place of 16)
# and is not patched here -- confirm whether it needs the same treatment.

is_buffalo_prompt = df["prompt_id"] == "defan_00442_t1.0"

df.loc[is_buffalo_prompt, "reference_answer"] = (
    "There are no buffaloes in the group, so the answer is zero."
)

# every response to this prompt except ans09 is re-graded as correct
mask = is_buffalo_prompt & (df["id"] != "defan_00442_t1.0_ans09")

df.loc[mask, "label"] = 0
df.loc[mask, "correctness_fixed"] = "correct"

# show the patched rows
df.loc[is_buffalo_prompt]

Unnamed: 0,id,prompt_id,question,reference_answer,model_answer,type,domain_fixed,correctness_fixed,label
8840,defan_00442_t1.0_ans00,defan_00442_t1.0,"in a group of ducks and cows, the total number of legs are 16 more than twice the no. of heads. find the total no. of buffaloes.","There are no buffaloes in the group, so the answer is zero.",There is no buffalo in the group.,numeric,math,correct,0
8841,defan_00442_t1.0_ans01,defan_00442_t1.0,"in a group of ducks and cows, the total number of legs are 16 more than twice the no. of heads. find the total no. of buffaloes.","There are no buffaloes in the group, so the answer is zero.","There is no buffalo in this question, it is a group of ducks and cows.",numeric,math,correct,0
8842,defan_00442_t1.0_ans02,defan_00442_t1.0,"in a group of ducks and cows, the total number of legs are 16 more than twice the no. of heads. find the total no. of buffaloes.","There are no buffaloes in the group, so the answer is zero.",There is no mention of buffalo so there is no answer.,numeric,math,correct,0
8843,defan_00442_t1.0_ans03,defan_00442_t1.0,"in a group of ducks and cows, the total number of legs are 16 more than twice the no. of heads. find the total no. of buffaloes.","There are no buffaloes in the group, so the answer is zero.","There are no buffaloes in this problem, it only talks about ducks and cows.",numeric,math,correct,0
8844,defan_00442_t1.0_ans04,defan_00442_t1.0,"in a group of ducks and cows, the total number of legs are 16 more than twice the no. of heads. find the total no. of buffaloes.","There are no buffaloes in the group, so the answer is zero.",0,numeric,math,correct,0
8845,defan_00442_t1.0_ans05,defan_00442_t1.0,"in a group of ducks and cows, the total number of legs are 16 more than twice the no. of heads. find the total no. of buffaloes.","There are no buffaloes in the group, so the answer is zero.",Zero,numeric,math,correct,0
8846,defan_00442_t1.0_ans06,defan_00442_t1.0,"in a group of ducks and cows, the total number of legs are 16 more than twice the no. of heads. find the total no. of buffaloes.","There are no buffaloes in the group, so the answer is zero.",There is no mention of any buffaloes in the question.,numeric,math,correct,0
8847,defan_00442_t1.0_ans07,defan_00442_t1.0,"in a group of ducks and cows, the total number of legs are 16 more than twice the no. of heads. find the total no. of buffaloes.","There are no buffaloes in the group, so the answer is zero.",0 (there are no buffaloes),numeric,math,correct,0
8848,defan_00442_t1.0_ans08,defan_00442_t1.0,"in a group of ducks and cows, the total number of legs are 16 more than twice the no. of heads. find the total no. of buffaloes.","There are no buffaloes in the group, so the answer is zero.",There is no buffalo in the problem statement. The problem is related to ducks and cows.,numeric,math,correct,0
8849,defan_00442_t1.0_ans09,defan_00442_t1.0,"in a group of ducks and cows, the total number of legs are 16 more than twice the no. of heads. find the total no. of buffaloes.","There are no buffaloes in the group, so the answer is zero.",No answer provided as the question doesn't make sense.,numeric,math,refused,1


In [2]:
# how are the 0/1 labels distributed within each domain?
pd.crosstab(index=df["domain_fixed"], columns=df["label"])

label,0,1
domain_fixed,Unnamed: 1_level_1,Unnamed: 2_level_1
Census Australia,2,1258
Conference venue,190,1050
Nobel Prize,320,940
QS ranking,4,1236
Sports,497,763
World Organizations,234,1006
entertainment,526,734
math,151,1089


In [3]:
# Questions from one domain for which at least MIN_CORRECT_RATE of the model answers
# were graded correct (label == 0).
# - set DOMAIN to any other value of "domain_fixed" to see other domains
# - with 20 answers per question, MIN_CORRECT_RATE from 0.05 to 1.00 selects questions
#   with "at least one" up to "all 20" answers correct

DOMAIN = "math"
MIN_CORRECT_RATE = 0.25

math_df = df[df["domain_fixed"] == DOMAIN]

# share of correct answers per question (vectorised: mean of the boolean label == 0)
rates = (
    math_df["label"].eq(0)
    .groupby(math_df["prompt_id"])
    .mean()
    .reset_index(name="correct_rate")
)

good_prompts = rates[rates["correct_rate"] >= MIN_CORRECT_RATE]

# one row per qualifying question, best-answered first
result = (
    math_df[math_df["prompt_id"].isin(good_prompts["prompt_id"])]
    [["prompt_id", "question"]]
    .drop_duplicates()
    .merge(good_prompts, on="prompt_id")
    .sort_values("correct_rate", ascending=False)
)

result

Unnamed: 0,prompt_id,question,correct_rate
6,defan_00481_t1.0,"there are 7 players in a chess group, and each player plays each of the others once. given that each game is played by two players, how many total games will be played?",1.0
7,defan_00482_t1.0,85 % of the population of a city is 85000. the total population of the city is?,1.0
1,defan_00442_t1.0,"in a group of ducks and cows, the total number of legs are 16 more than twice the no. of heads. find the total no. of buffaloes.",0.95
4,defan_00452_t1.0,"if the remainder is 16 when the integer n is divided by 30, what is the remainder when 2 n is divided by 15?",0.65
10,defan_00497_t1.0,"if n is a positive integer and n ^ 2 is divisible by 72, then the largest positive integer t that must divide n is?",0.6
9,defan_00495_t1.0,the present population of a town is 240. population increase rate is 10 % p. a. find the population of town after 1 years?,0.5
0,defan_00438_t1.0,what least number must be subtracted from 427398 so that remaining number is divisible by 15,0.25
2,defan_00444_t1.0,"seller selling an apple for rs. 19, a seller loses 1 / 6 th of what it costs him. the cp of the apple is?",0.25
3,defan_00448_t1.0,"two family reunions are happening at the park avenue hotel, the oates reunion and the hall reunion. all 100 guests at the hotel attend at least one of the reunions. if 40 people attend the oates reunion and 70 people attend the hall reunion, how many people attend both reunions",0.25
5,defan_00472_t1.0,"in a group of ducks and cows, the total number of legs are 22 more than twice the no. of heads. find the total no. of buffaloes.",0.25


In [4]:
# An earlier scan showed many refusals in the QS ranking and Census Australia (QS+CA)
# domains -- basically the model doesn't have info about these domains.
# Tabulate their correctness tags to check how many responses were refusals.
in_qs_or_census = df["domain_fixed"].isin(["QS ranking", "Census Australia"])
pd.crosstab(df.loc[in_qs_or_census, "correctness_fixed"], columns="count")

col_0,count
correctness_fixed,Unnamed: 1_level_1
correct,6
hallucination,798
refused,1696


In [5]:
# the same tabulation for every domain other than QS ranking / Census Australia
outside_qs_and_census = ~df["domain_fixed"].isin(["QS ranking", "Census Australia"])
pd.crosstab(df.loc[outside_qs_and_census, "correctness_fixed"], columns="count")

col_0,count
correctness_fixed,Unnamed: 1_level_1
correct,1918
hallucination,5264
refused,318


In [6]:
# The QS ranking and Census Australia domains would muddy the results
# (see the refusal counts above), so remove their rows.
excluded_domains = ["QS ranking", "Census Australia"]
df = df.loc[~df["domain_fixed"].isin(excluded_domains)]

In [7]:
# distinct questions remaining after the drop -- the array's length is the count (375)
pd.unique(df["prompt_id"])

<ArrowStringArray>
['defan_00000_t1.0', 'defan_00001_t1.0', 'defan_00002_t1.0',
 'defan_00003_t1.0', 'defan_00004_t1.0', 'defan_00005_t1.0',
 'defan_00006_t1.0', 'defan_00007_t1.0', 'defan_00008_t1.0',
 'defan_00009_t1.0',
 ...
 'defan_00490_t1.0', 'defan_00491_t1.0', 'defan_00492_t1.0',
 'defan_00493_t1.0', 'defan_00494_t1.0', 'defan_00495_t1.0',
 'defan_00496_t1.0', 'defan_00497_t1.0', 'defan_00498_t1.0',
 'defan_00499_t1.0']
Length: 375, dtype: str

In [8]:
# this should still be enough data to do reasonable baseline data science...
# let's generate our question-level labels p_hat and y_i and check
#   p_hat = share of a question's responses with label == 1 (hallucinated or refused)
#   y_i   = 1 when p_hat > 0.5, else 0 (an exact 50/50 split therefore gets y_i = 0)

q_level = (
    df.groupby("prompt_id")["label"]
      .agg(n_responses="count", p_hat="mean")
      .reset_index()
)

q_level["y_i"] = (q_level["p_hat"] > 0.5).astype(int)

# Drop any p_hat / y_i left over from a previous run of this cell before merging;
# otherwise a re-run produces p_hat_x / p_hat_y and the plain column names disappear.
# q_level has one row per prompt_id, which validate="many_to_one" enforces.
df = (
    df.drop(columns=["p_hat", "y_i"], errors="ignore")
      .merge(
          q_level[["prompt_id", "p_hat", "y_i"]],
          on="prompt_id",
          how="left",
          validate="many_to_one",
      )
)

In [9]:
# add domains to the question level data
q_domain = (
    df[["prompt_id", "domain_fixed"]]
    .drop_duplicates()
)

# Drop a domain_fixed column left by an earlier run of this cell so that a re-run does
# not create domain_fixed_x / domain_fixed_y. validate="one_to_one" fails fast if a
# question is listed under more than one domain, which would duplicate q_level rows.
q_level = (
    q_level.drop(columns=["domain_fixed"], errors="ignore")
           .merge(q_domain, on="prompt_id", how="left", validate="one_to_one")
)

# examine spread of y_i across domains
pd.crosstab(q_level["domain_fixed"], q_level["y_i"])

# 92 majority-correct (y_i = 0) vs 283 majority-hallucinated (y_i = 1)
# note that math and conference venue questions were particularly difficult for the model, but some variance still survives

y_i,0,1
domain_fixed,Unnamed: 1_level_1,Unnamed: 2_level_1
Conference venue,8,54
Nobel Prize,14,49
Sports,27,36
World Organizations,12,50
entertainment,25,38
math,6,56


In [10]:
# now examine distribution of p_hat across domain

bins = np.linspace(0, 1, 6)  # 0–0.2, 0.2–0.4, ..., 0.8–1.0

# bins are closed on the right; include_lowest=True keeps p_hat == 0 in the first bin
q_level["p_hat_bin"] = pd.cut(q_level["p_hat"], bins=bins, include_lowest=True)

pd.crosstab(q_level["domain_fixed"], q_level["p_hat_bin"])

# again we see that most questions fall in the highest bin, however all domains contain questions
# across multiple bins -- there is variation within-domain

p_hat_bin,"(-0.001, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
domain_fixed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Conference venue,3,3,6,4,46
Nobel Prize,6,3,11,9,34
Sports,21,3,3,5,31
World Organizations,3,3,8,8,40
entertainment,21,3,4,7,28
math,3,2,1,8,48


In [11]:
# average p_hat (share of label-1 responses) per domain, smallest first
q_level["p_hat"].groupby(q_level["domain_fixed"]).mean().sort_values()

domain_fixed
entertainment          0.582540
Sports                 0.605556
Nobel Prize            0.746032
World Organizations    0.811290
Conference venue       0.846774
math                   0.878226
Name: p_hat, dtype: float64

In [12]:
# share of questions with y_i == 1 per domain, smallest first
q_level["y_i"].groupby(q_level["domain_fixed"]).mean().sort_values()

domain_fixed
Sports                 0.571429
entertainment          0.603175
Nobel Prize            0.777778
World Organizations    0.806452
Conference venue       0.870968
math                   0.903226
Name: y_i, dtype: float64

In [13]:
# spread (standard deviation) of p_hat within each domain, smallest first
q_level["p_hat"].groupby(q_level["domain_fixed"]).std().sort_values()

domain_fixed
math                   0.241345
Conference venue       0.259708
World Organizations    0.263398
Nobel Prize            0.294753
entertainment          0.420460
Sports                 0.430782
Name: p_hat, dtype: float64

In [14]:
# spread (standard deviation) of the binary y_i within each domain, smallest first
q_level["y_i"].groupby(q_level["domain_fixed"]).std().sort_values()

domain_fixed
math                   0.298064
Conference venue       0.337972
World Organizations    0.398304
Nobel Prize            0.419079
entertainment          0.493169
Sports                 0.498847
Name: y_i, dtype: float64

In [15]:
# Attach each question's p_hat bin to the response-level rows.
# A p_hat_bin column left by a previous run of this cell is dropped first so that a
# re-run does not produce p_hat_bin_x / p_hat_bin_y; validate="many_to_one" checks
# that q_level still has a single row per prompt_id.
df = df.drop(columns=["p_hat_bin"], errors="ignore").merge(
    q_level[["prompt_id", "p_hat_bin"]],
    on="prompt_id",
    how="left",
    validate="many_to_one",
)

In [16]:
# final df scan: display the response-level frame, which now also carries the
# question-level p_hat, y_i and p_hat_bin columns merged in above
df

Unnamed: 0,id,prompt_id,question,reference_answer,model_answer,type,domain_fixed,correctness_fixed,label,p_hat,y_i,p_hat_bin
0,defan_00000_t1.0_ans00,defan_00000_t1.0,What stadium was the 1930 FIFA World Cup final held in?,Estadio Centenario,Estadio Centenario,name,Sports,correct,0,0.15,0,"(-0.001, 0.2]"
1,defan_00000_t1.0_ans01,defan_00000_t1.0,What stadium was the 1930 FIFA World Cup final held in?,Estadio Centenario,Estadio Centenario,name,Sports,correct,0,0.15,0,"(-0.001, 0.2]"
2,defan_00000_t1.0_ans02,defan_00000_t1.0,What stadium was the 1930 FIFA World Cup final held in?,Estadio Centenario,Estadio Centenario,name,Sports,correct,0,0.15,0,"(-0.001, 0.2]"
3,defan_00000_t1.0_ans03,defan_00000_t1.0,What stadium was the 1930 FIFA World Cup final held in?,Estadio Centenario,Estadio Centenario,name,Sports,correct,0,0.15,0,"(-0.001, 0.2]"
4,defan_00000_t1.0_ans04,defan_00000_t1.0,What stadium was the 1930 FIFA World Cup final held in?,Estadio Centenario,Estadio Centenario,name,Sports,correct,0,0.15,0,"(-0.001, 0.2]"
...,...,...,...,...,...,...,...,...,...,...,...,...
7495,defan_00499_t1.0_ans15,defan_00499_t1.0,"working together, printer a and printer b would finish the task in 30 minutes. printer a alone would finish the task in 50 minutes. how many pages does the task contain if printer b prints 3 pages a minute more than printer a?",400,78 pages,numeric,math,hallucination,1,1.00,1,"(0.8, 1.0]"
7496,defan_00499_t1.0_ans16,defan_00499_t1.0,"working together, printer a and printer b would finish the task in 30 minutes. printer a alone would finish the task in 50 minutes. how many pages does the task contain if printer b prints 3 pages a minute more than printer a?",400,500,numeric,math,hallucination,1,1.00,1,"(0.8, 1.0]"
7497,defan_00499_t1.0_ans17,defan_00499_t1.0,"working together, printer a and printer b would finish the task in 30 minutes. printer a alone would finish the task in 50 minutes. how many pages does the task contain if printer b prints 3 pages a minute more than printer a?",400,390 pages (see solution),numeric,math,hallucination,1,1.00,1,"(0.8, 1.0]"
7498,defan_00499_t1.0_ans18,defan_00499_t1.0,"working together, printer a and printer b would finish the task in 30 minutes. printer a alone would finish the task in 50 minutes. how many pages does the task contain if printer b prints 3 pages a minute more than printer a?",400,120,numeric,math,hallucination,1,1.00,1,"(0.8, 1.0]"


In [17]:
# Persist the cleaned response-level frame (row index omitted) for later use.
# NOTE(review): p_hat_bin holds interval bins; after a CSV round trip it will
# presumably come back as plain text -- confirm downstream code expects that.
df.to_csv(path_or_buf="response_level_clean_v1.csv", index=False)