In [1]:
# goal of these notebooks is to compute the columns of the feature matrix X as outlined in Helmut's document
# we will use the DefAn QA dataset for our questions: https://github.com/ashikiut/defan
# these questions are designed to elicit definitive, concise, and informative answers,
# allowing us to more confidently rely on our LLM judge to determine response correctness.

# this first notebook is dedicated to producing a mindfully constructed input questions set.

import pandas as pd

# never truncate cell contents, so long question strings are shown in full
pd.set_option("display.max_colwidth", None)

# load the combined public DefAn file and take a first look at its structure
df = pd.read_json("DefAn/DefAn-Public Combined/DefAn_public_combined.json")
print(df.columns)
print(df.shape)
for label_col in ("domain", "type"):
    # distinct values of each label column
    print(df[label_col].unique())
overview = df.describe(include="all").T
display(overview)

Index(['questions', 'answer', 'type', 'domain'], dtype='str')
(70793, 4)
<ArrowStringArray>
[             'Sports',    'Census Australia',         'Nobel Prize',
       'entertainment', 'World Organizations',          'QS ranking',
    'Conference venue',                'math']
Length: 8, dtype: str
<ArrowStringArray>
['name', 'numeric', 'date', 'rank', 'city']
Length: 5, dtype: str


Unnamed: 0,count,unique,top,freq
questions,70793,70145,What is the birthdate (mm/dd/yyyy) of Katharine Hepburn?,6
answer,69713,4857,6,391
type,70793,5,numeric,24278
domain,70793,8,QS ranking,21495


In [2]:
# let's take a quick look inside the data
# (seeded, so the preview is the same on every Restart & Run All)
df.sample(20, random_state=0)

Unnamed: 0,questions,answer,type,domain
44366,What position did Tufts University attain in the QS rankings for 2023?,312,rank,QS ranking
66683,"a bowl of nuts is prepared for a party . brand p mixed nuts are 20 % almonds and brand q ' s deluxe nuts are 25 % almonds . if a bowl contains a total of 65 ounces of nuts , representing a mixture of both brands , and 15 ounces of the mixture are almonds , how many ounces of brand q ' s deluxe mixed nuts are used ?(Give me the numeric answer only)",40,numeric,math
12053,"Who secured the Nobel Prize for Peace in 1946? [first name + last name only] if multiple person, give one name only.",Emily Greene Balch;John Mott,name,Nobel Prize
31421,When was Ghana accepted as a member of the United Nations?,03/08/1957,date,World Organizations
42873,How did Universidad Autónoma de Madrid fare in the QS rankings for 2023?,215,rank,QS ranking
41140,QS ranking of Sungkyunkwan University(SKKU) in 2023?,99,rank,QS ranking
44202,What was Xi’an Jiaotong University's QS rating for 2023?,302,rank,QS ranking
40182,What was University of Toronto's QS rating for 2023?,34,rank,QS ranking
56536,"in a rectangular coordinate system , what is the area of a rectangle whose vertices have the coordinates ( - 7 , 1 ) , ( 1 , 1 ) , ( 1 , - 6 ) and ( - 7 , - 6 ) ?(Give me the numeric answer only)",56,numeric,math
65831,if ( 3 / 2 ) x - 3 = 15 what is the value of x ?(Give me the numeric answer only),12,numeric,math


In [3]:
# important for later: we need to remove certain extraneous instructions which appear in many of the questions in df

import re

# answer-format instructions that are appended to many questions and must be stripped
phrases = [
    "(Give me the numeric answer only)",
    "Give me the exact location only.",
    "give first name + last name only.",
    "give one name only.",
    "give the movie name only.",
    "[first name + last name only] if multiple person, give one name only.",
    "(Give me the exact number only)",
    "(Give me the name only)",
]

# build a case-insensitive regex that matches any of the phrases literally.
# regex alternatives are tried in order, so the longest phrases go first: a phrase
# can then never be shadowed by a shorter one that happens to be its prefix
pattern = (
    r"(?i)("
    + "|".join(re.escape(p) for p in sorted(phrases, key=len, reverse=True))
    + r")"
)

df["questions"] = (
    df["questions"].astype(str)
      .str.replace(pattern, "", regex=True)             # remove phrases
      .str.replace(r"\s+", " ", regex=True)             # collapse whitespace
      .str.replace(r"\s+([?.!,;:])", r"\1", regex=True) # fix space before punctuation
      .str.strip()
)

# seeded, so the preview is the same on every Restart & Run All
df.sample(20, random_state=0)

Unnamed: 0,questions,answer,type,domain
62682,"there are, in a certain league, 20 teams, and each team face another team for a total of 4 times. how many games are played in the season?",760,numeric,math
25308,"In 1945, for which movie did Billy Wilder win the Oscar?",The Lost Weekend,name,entertainment
57371,a train 150 m long is running with a speed of 60 km / hr. in what time will it pass a man who is running at 6 km / hr in the direction opposite to that in which the train is going?,8,numeric,math
7430,How many residents aged 80-84 were recorded in Northern Territory in 2021?,1876,numeric,Census Australia
42317,What was the QS ranking of Universitat Autònoma de Barcelona for 2023?,178,rank,QS ranking
3993,"In 2016, what was the number of people aged 45-49 residing in Queensland?",322982,numeric,Census Australia
34350,"What was the position of University of California, Santa Barbara (UCSB) in the QS rankings for 2022?",146,rank,QS ranking
27528,Who was the winner of the Academy Awards/Oscars for best director in 2007?,Joel Coen and Ethan Coen,name,entertainment
55331,What location accommodated ICCV in 2019?,"Seoul, Korea",city,Conference venue
68651,"when 1 / 10 percent of 3,000 is subtracted from 1 / 10 of 3,000, the difference is",397,numeric,math


In [4]:
# let's also check for missing values
# (one vectorised call; each count is labelled with its column name)
df.isna().sum()

0
1080
0
0


In [5]:
# how many answers are empty once surrounding whitespace is removed?
df["answer"].str.strip().eq("").sum()

np.int64(0)

In [6]:
# none are empty, so look at which domains the missing answers belong to
missing_mask = df["answer"].isna()
df.loc[missing_mask, "domain"].value_counts()

domain
Nobel Prize    1080
Name: count, dtype: int64

In [7]:
# peek at rows whose answer is missing (seeded, so the preview is reproducible)
df[missing_mask].sample(20, random_state=0)

Unnamed: 0,questions,answer,type,domain
10362,Who won the Nobel Prize for Economics in 1924?,,name,Nobel Prize
11059,"In 1932, who was the recipient of the Nobel Prize in Peace?",,name,Nobel Prize
10270,Who was honored with the Nobel Prize for Economics in 1923?,,name,Nobel Prize
12250,Who was honored with the Nobel Prize for Economics in 1948?,,name,Nobel Prize
12953,Who secured the Nobel Prize for Peace in 1956?,,name,Nobel Prize
10993,Who was granted the Nobel Prize for Economics in 1931?,,name,Nobel Prize
12162,Who won the Nobel Prize for Economics in 1947?,,name,Nobel Prize
11784,Who garnered the Nobel Prize for Peace in 1943?,,name,Nobel Prize
11621,Who was the Nobel Prize recipient for Economics in the year 1938?,,name,Nobel Prize
10453,Who was granted the Nobel Prize for Economics in 1925?,,name,Nobel Prize


In [8]:
# the rows without an answer seem to ask about Nobel Prizes that were not awarded in those years,
# so give the judge LLM an explicit reference answer that says so
df["answer"] = df["answer"].mask(missing_mask, "No such prize exists.")

# next, count the questions that repeat an earlier question verbatim
df.duplicated(subset=["questions"]).sum()

np.int64(648)

In [9]:
# there are quite a few repeated questions -- let's look at every copy of them

duplicates = df[df["questions"].duplicated(keep=False)]
duplicates = duplicates.sort_values("questions")

print(duplicates.shape)

# lift the row limit for this one display only, instead of changing the global
# option: later cells then cannot end up dumping all of df by accident
with pd.option_context("display.max_rows", None):
    display(duplicates)

(1224, 4)


Unnamed: 0,questions,answer,type,domain
30990,At what date did Congo attain membership in the United Nations?,09/20/1960,date,World Organizations
31110,At what date did Congo attain membership in the United Nations?,09/20/1960,date,World Organizations
30992,At what date did Congo gain entry into the United Nations?,09/20/1960,date,World Organizations
31112,At what date did Congo gain entry into the United Nations?,09/20/1960,date,World Organizations
31000,At what date did Congo receive membership in the United Nations?,09/20/1960,date,World Organizations
31120,At what date did Congo receive membership in the United Nations?,09/20/1960,date,World Organizations
30996,At what date did Congo secure its place in the United Nations?,09/20/1960,date,World Organizations
31116,At what date did Congo secure its place in the United Nations?,09/20/1960,date,World Organizations
30998,At what specific date did Congo enter the United Nations?,09/20/1960,date,World Organizations
31118,At what specific date did Congo enter the United Nations?,09/20/1960,date,World Organizations


In [10]:
# a quick scan shows that some of these duplicated questions even have more than one distinct answer
# e.g., it was Lawrence of Arabia that won best picture at the Oscars in 1962, not Mutiny on the Bounty...
# let's check whether this is an issue elsewhere too

# first restore the default row limit: showing all rows is dangerous because df is long
pd.reset_option("display.max_rows")

In [11]:
# collect every row whose question also occurs elsewhere in df.
# questions are compared on the same key the later de-duplication uses
# (whitespace collapsed, stripped, lower-cased), so every group of rows that step
# will merge is inspected here, while all copies are still present
question_key = (
    df["questions"]
        .astype(str)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
        .str.lower()
)
dup = df[question_key.duplicated(keep=False)].copy()

# clean whitespace so cosmetic differences don't count as different answers
dup["answer"] = (
    dup["answer"]
        .astype(str)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
)

# number of distinct answers per question key; answers are compared
# case-insensitively, but the displayed column keeps its original casing
dup_key = question_key.loc[dup.index]
answer_counts = dup["answer"].str.lower().groupby(dup_key).nunique()
bad_questions = answer_counts[answer_counts > 1].index

df_bad = dup[dup_key.isin(bad_questions)].sort_values(["questions", "answer"])

df_bad

Unnamed: 0,questions,answer,type,domain
24200,"In 1962, which film was honored with the Oscar for best picture?",Lawrence of Arabia,name,entertainment
24215,"In 1962, which film was honored with the Oscar for best picture?",Mutiny on the Bounty,name,entertainment
24205,"In 1962, which movie snagged the Oscar for best picture?",Lawrence of Arabia,name,entertainment
24220,"In 1962, which movie snagged the Oscar for best picture?",Mutiny on the Bounty,name,entertainment
24197,"In 1962, which movie took home the Oscar for best picture?",Lawrence of Arabia,name,entertainment
24212,"In 1962, which movie took home the Oscar for best picture?",Mutiny on the Bounty,name,entertainment
24208,What film was bestowed with the Oscar for best picture in 1962?,Lawrence of Arabia,name,entertainment
24223,What film was bestowed with the Oscar for best picture in 1962?,Mutiny on the Bounty,name,entertainment
24198,What film won the Oscar for best picture in 1962?,Lawrence of Arabia,name,entertainment
24213,What film won the Oscar for best picture in 1962?,Mutiny on the Bounty,name,entertainment


In [12]:
# that looks like a one-off (the end of the notebook runs one more contradiction check on the cleaned frame)
# next cleaning step: drop the erroneous 'mutiny on the bounty' rows attached to 1962 questions

names_mutiny = df["answer"].str.contains("Mutiny", case=False, na=False)
mentions_1962 = df["questions"].str.contains("1962", case=False, na=False)
mask = names_mutiny & mentions_1962

df = df.loc[~mask]

# list the rows that still answer 'Mutiny' after the removal
df.loc[df["answer"].str.contains("Mutiny", case=False, na=False)]

Unnamed: 0,questions,answer,type,domain
23790,What movie received the Oscar for best picture in 1935?,Mutiny on the Bounty,name,entertainment
23791,Which film was awarded the Oscar for best picture in 1935?,Mutiny on the Bounty,name,entertainment
23792,"In 1935, which movie took home the Oscar for best picture?",Mutiny on the Bounty,name,entertainment
23793,What film won the Oscar for best picture in 1935?,Mutiny on the Bounty,name,entertainment
23794,Which movie claimed the Oscar for best picture in 1935?,Mutiny on the Bounty,name,entertainment
23795,"In 1935, which film was honored with the Oscar for best picture?",Mutiny on the Bounty,name,entertainment
23796,What was the Oscar-winning film for best picture in 1935?,Mutiny on the Bounty,name,entertainment
23797,Which film secured the Oscar for best picture in 1935?,Mutiny on the Bounty,name,entertainment
23798,best picture in 1935 Oscar?,Mutiny on the Bounty,name,entertainment
23799,Which film emerged victorious in the Oscar race for best picture in 1935?,Mutiny on the Bounty,name,entertainment


In [13]:
# drop literal repeats of a question, keeping the first occurrence of each
# note: differently phrased questions with the same meaning are NOT treated as duplicates

# comparison key: the question text with whitespace collapsed, stripped and lower-cased
question_key = (
    df["questions"].astype(str)
                   .str.replace(r"\s+", " ", regex=True)
                   .str.strip()
                   .str.lower()
)

# keep only the first row per key; the key lives in its own Series, so df needs no helper column
df = df[~question_key.duplicated(keep="first")]


In [14]:
# prints False when no question text occurs more than once
print(not df["questions"].is_unique)

False


In [15]:
# last step: pick the 500 questions that will be used for data generation
# an even spread across domains is probably a good idea...
# there is no good way (known to me) to rule out semantic duplicates, so they are left in for now

# share of each domain in the cleaned data, in percent
domain_pct = df["domain"].value_counts(normalize=True).mul(100)
print(domain_pct)

domain
QS ranking             30.643667
math                   21.695060
entertainment          15.386699
Nobel Prize            13.963932
Census Australia       11.269513
World Organizations     3.891938
Sports                  1.844750
Conference venue        1.304441
Name: proportion, dtype: float64


In [16]:
# draw n_total questions, spread as evenly as possible over the domains
n_total = 500
domains = df["domain"].unique()
k = len(domains)

# equal share per domain; the first `remainder` domains take one extra question
base, remainder = divmod(n_total, k)

pieces = [
    df[df["domain"] == d].sample(
        n=base + 1 if i < remainder else base,
        random_state=0,
    )
    for i, d in enumerate(domains)
]

sampled = pd.concat(pieces, ignore_index=True)

In [17]:
# every domain should contribute 62 or 63 questions; enforce the even spread
# instead of only eyeballing the table
domain_counts = sampled["domain"].value_counts()
assert domain_counts.max() - domain_counts.min() <= 1, "domains are not evenly sampled"
domain_counts

domain
Sports                 63
Census Australia       63
Nobel Prize            63
entertainment          63
World Organizations    62
QS ranking             62
Conference venue       62
math                   62
Name: count, dtype: int64

In [18]:
# export: one JSON record per line, each tagged with a sequential id (q_000, q_001, ...)

sampled = sampled.reset_index(drop=True)
sampled["id"] = [f"q_{row_number:03d}" for row_number in range(len(sampled))]
sampled.to_json("sampled_from_defan_500.jsonl", orient="records", lines=True)

In [19]:
# sanity check for contradictory answers, like the lawrence-mutiny situation.
# NOTE: df was already de-duplicated on this same normalised question key, so every
# key occurs exactly once here and the result below is empty by construction;
# contradictions have to be looked for before the duplicates are dropped.

# normalised copies of both columns (so whitespace/case doesn't create fake differences),
# built on a temporary frame so that df itself gains no helper columns
checked = df.assign(
    questions_clean=(
        df["questions"].astype(str)
                       .str.replace(r"\s+", " ", regex=True)
                       .str.strip()
                       .str.lower()
    ),
    answer_clean=(
        df["answer"].astype(str)
                    .str.replace(r"\s+", " ", regex=True)
                    .str.strip()
                    .str.lower()
    ),
)

# find questions that map to >1 distinct answer
answer_counts = checked.groupby("questions_clean")["answer_clean"].nunique()
bad_questions = answer_counts[answer_counts > 1].index

# inspect all contradictory rows
contradictions = checked[checked["questions_clean"].isin(bad_questions)] \
    .sort_values(["questions_clean", "answer_clean"])

contradictions

Unnamed: 0,questions,answer,type,domain,questions_clean,answer_clean


In [20]:
# all good!