In [1]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
import os

In [2]:
GOLD_PATH   = "../datasets/exploration_datasets/gold/"

In [3]:
df = pd.read_excel("../datasets/exploration_datasets/gold/generated_qa_pairs.xlsx")

In [4]:
df.head()

Unnamed: 0,question,answer,is_impossible,context,title,category,country,newspaper,entities
0,¿Qué enfermedad padecía Gabriel García Márquez?,cáncer linfático,False,Se cumplen 10 años desde que Gabo falleció. El...,¿Qué enfermedad sufrió Gabriel García Márquez ...,cultura,Colombia,Semana,"['Gabo', 'mundo letras', 'Gabriel García Márqu..."
1,¿Dónde nació Gabriel García Márquez?,"Aracataca, Colombia",False,Se cumplen 10 años desde que Gabo falleció. El...,¿Qué enfermedad sufrió Gabriel García Márquez ...,cultura,Colombia,Semana,"['Gabo', 'mundo letras', 'Gabriel García Márqu..."
2,¿Qué obra maestra publicó García Márquez en 1967?,Cien años de soledad,False,Se cumplen 10 años desde que Gabo falleció. El...,¿Qué enfermedad sufrió Gabriel García Márquez ...,cultura,Colombia,Semana,"['Gabo', 'mundo letras', 'Gabriel García Márqu..."
3,¿Qué premio recibió García Márquez en 1982?,Premio Nobel de Literatura,False,Se cumplen 10 años desde que Gabo falleció. El...,¿Qué enfermedad sufrió Gabriel García Márquez ...,cultura,Colombia,Semana,"['Gabo', 'mundo letras', 'Gabriel García Márqu..."
4,¿Cuántos años luchó García Márquez contra su e...,15 años,False,Se cumplen 10 años desde que Gabo falleció. El...,¿Qué enfermedad sufrió Gabriel García Márquez ...,cultura,Colombia,Semana,"['Gabo', 'mundo letras', 'Gabriel García Márqu..."


In [5]:
# Check if 'is_impossible' exists in your df
if 'is_impossible' in df.columns:
    impossible_prop = df['is_impossible'].value_counts(normalize=True)
    print("Proportion of possible vs impossible answers:")
    print(impossible_prop)
else:
    print("No 'is_impossible' column found. Please provide the correct column name.")


Proportion of possible vs impossible answers:
is_impossible
False    0.715215
True     0.284785
Name: proportion, dtype: float64


In [6]:
# Clean up missing or stringified boolean values
df["is_impossible"] = df["is_impossible"].astype(bool)

In [7]:
def find_answer_span(context, answer):
    """Find start and end index of answer in context. Returns None, None if not found."""
    if not isinstance(answer, str) or not answer.strip():
        return None, None
    start = context.find(answer)
    if start == -1:
        return None, None
    return start, start + len(answer)

records = []

for idx, row in df.iterrows():
    context = row['context']
    answer_text = row['answer']
    impossible = row.get('is_impossible', False)  # boolean flag

    if impossible:
        # For impossible answers, treat NaN or empty string as valid "no answer"
        found = False
        if isinstance(answer_text, str) and answer_text.strip() == '':
            found = True  # empty string is fine
        elif pd.isna(answer_text):
            found = True  # NaN is fine
        records.append({
            'index': idx,
            'answer': answer_text,
            'is_impossible': True,
            'found': found
        })
    else:
        # For possible answers
        if not isinstance(answer_text, str):
            answer_text = ''  # safety fallback
        start, end = find_answer_span(context, answer_text)
        records.append({
            'index': idx,
            'answer': answer_text,
            'is_impossible': False,
            'found': start is not None,
            'start': start,
            'end': end
        })

records_df = pd.DataFrame(records)
print(records_df.head())

# Check problematic answers
not_found = records_df[~records_df['found']]
print(f"Total answers not found (including impossible answers with text): {len(not_found)}")

   index                      answer  is_impossible  found   start     end
0      0            cáncer linfático          False   True   679.0   695.0
1      1         Aracataca, Colombia          False   True  1520.0  1539.0
2      2        Cien años de soledad          False   True  2114.0  2134.0
3      3  Premio Nobel de Literatura          False   True  2929.0  2955.0
4      4                     15 años          False   True   655.0   662.0
Total answers not found (including impossible answers with text): 2447


In [8]:
# Keep only rows where answers were found
df_valid = df.loc[records_df['found']].reset_index(drop=True)

print(f"Original dataset size: {len(df)}")
print(f"Valid dataset size (answers found): {len(df_valid)}")

Original dataset size: 30286
Valid dataset size (answers found): 27839


In [9]:
# Check if 'is_impossible' exists in your df
if 'is_impossible' in df_valid.columns:
    impossible_prop = df_valid['is_impossible'].value_counts(normalize=True)
    print("Proportion of possible vs impossible answers:")
    print(impossible_prop)
else:
    print("No 'is_impossible' column found. Please provide the correct column name.")

Proportion of possible vs impossible answers:
is_impossible
False    0.693416
True     0.306584
Name: proportion, dtype: float64


In [10]:
# Create a combined stratification key
df_valid['stratify_key'] = df_valid['country'].astype(str) + "_" + df_valid['category'].astype(str)

# Perform stratified split
train_df, eval_df = train_test_split(
    df_valid,
    test_size=0.3,               # 30% for evaluation
    random_state=42,             # for reproducibility
    stratify=df_valid['stratify_key']  # stratification
)


# Drop the temporary column if you want
train_df = train_df.drop(columns=['stratify_key'])
eval_df = eval_df.drop(columns=['stratify_key'])

print(f"Train size: {len(train_df)}, Eval size: {len(eval_df)}")

Train size: 19487, Eval size: 8352


In [11]:
# Separate Colombian data
train_colombia = train_df[train_df['country'] == 'Colombia']
eval_colombia = eval_df[eval_df['country'] == 'Colombia']

# Separate Mexican data
train_mexico = train_df[train_df['country'] == 'Mexico']
eval_mexico = eval_df[eval_df['country'] == 'Mexico']

# Optional: check sizes
print(f"Train Colombia: {len(train_colombia)}, Eval Colombia: {len(eval_colombia)}")
print(f"Train Mexico: {len(train_mexico)}, Eval Mexico: {len(eval_mexico)}")

Train Colombia: 13648, Eval Colombia: 5849
Train Mexico: 5839, Eval Mexico: 2503


In [12]:
# Function to calculate proportions
def get_proportions(df):
    country_prop = df['country'].value_counts(normalize=True)
    category_prop = df['category'].value_counts(normalize=True)
    cc_prop = df.groupby(['country','category']).size() / len(df)
    return country_prop, category_prop, cc_prop

train_country, train_cat, train_cc = get_proportions(train_df)
eval_country, eval_cat, eval_cc = get_proportions(eval_df)

# Combine in a single DataFrame for comparison
country_compare = pd.concat([train_country, eval_country], axis=1)
country_compare.columns = ['train', 'eval']
category_compare = pd.concat([train_cat, eval_cat], axis=1)
category_compare.columns = ['train', 'eval']
cc_compare = pd.concat([train_cc, eval_cc], axis=1)
cc_compare.columns = ['train', 'eval']

print("--- Country Proportions ---")
print(country_compare)
print("\n--- Category Proportions ---")
print(category_compare)
print("\n--- Country + Category Proportions ---")
print(cc_compare)

--- Country Proportions ---
             train      eval
country                     
Colombia  0.700364  0.700311
Mexico    0.299636  0.299689

--- Category Proportions ---
                  train      eval
category                         
cultura        0.371838  0.371887
politica       0.272695  0.272629
internacional  0.181967  0.181873
salud          0.110997  0.110991
economia       0.062503  0.062620

--- Country + Category Proportions ---
                           train      eval
country  category                         
Colombia cultura        0.259660  0.259698
         economia       0.043054  0.043103
         internacional  0.128445  0.128352
         politica       0.191410  0.191331
         salud          0.077795  0.077826
Mexico   cultura        0.112177  0.112189
         economia       0.019449  0.019516
         internacional  0.053523  0.053520
         politica       0.081285  0.081298
         salud          0.033202  0.033166


In [13]:
# Function to compute category proportions within each country
def category_props_by_country(df):
    return df.groupby('country')['category'].value_counts(normalize=True).unstack(fill_value=0)

train_cat_country = category_props_by_country(train_df)
eval_cat_country = category_props_by_country(eval_df)

# Combine train and eval for easy comparison
comparison = pd.concat([train_cat_country.add_suffix('_train'),
                        eval_cat_country.add_suffix('_eval')], axis=1)

comparison

category,cultura_train,economia_train,internacional_train,politica_train,salud_train,cultura_eval,economia_eval,internacional_eval,politica_eval,salud_eval
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Colombia,0.37075,0.061474,0.183397,0.2733,0.111079,0.370833,0.061549,0.183279,0.273209,0.11113
Mexico,0.374379,0.064908,0.178626,0.271279,0.110807,0.374351,0.065122,0.178586,0.271274,0.110667


In [14]:
def find_answer_span(context, answer):
    """Find start index of answer in context."""
    if not isinstance(answer, str) or not answer.strip():
        return None, None
    start = context.find(answer)
    if start == -1:
        return None, None
    return start, start + len(answer)

records = []

In [15]:
def build_qa_records(df_subset):
    """Convert a DataFrame to QA-style records grouped by title+context."""
    records = []
    for (title, context), group in df_subset.groupby(["title", "context"]):
        qas = []
        for i, row in group.iterrows():
            if row.get("is_impossible", False):
                qas.append({
                    "id": f"qa-{i}",
                    "question": row["question"],
                    "is_impossible": True,
                    "answers": []
                })
            else:
                start, end = find_answer_span(context, row["answer"])
                if start is None:
                    continue  # skip answers not found
                qas.append({
                    "id": f"qa-{i}",
                    "question": row["question"],
                    "is_impossible": False,
                    "answers": [{"text": row["answer"], "answer_start": start}]
                })
        if qas:  # only include if there are valid QAs
            records.append({
                "title": title,
                "paragraphs": [{
                    "context": context,
                    "qas": qas
                }]
            })
    return records

# ---- Apply to train/eval subsets ----


# Build QA records for the main datasets
train_records = build_qa_records(train_df)
eval_records = build_qa_records(eval_df)
valid_records = build_qa_records(df_valid)


# Build QA records
train_colombia_records = build_qa_records(train_colombia)
eval_colombia_records = build_qa_records(eval_colombia)
train_mexico_records = build_qa_records(train_mexico)
eval_mexico_records = build_qa_records(eval_mexico)


# Optional: check sizes
print(f"Valid QA records: {len(valid_records)}")
print(f"Train QA records: {len(train_records)}")
print(f"Eval QA records: {len(eval_records)}")


# Optional: check lengths
print(f"Train Colombia: {len(train_colombia_records)} records")
print(f"Eval Colombia: {len(eval_colombia_records)} records")
print(f"Train Mexico: {len(train_mexico_records)} records")
print(f"Eval Mexico: {len(eval_mexico_records)} records")

Valid QA records: 4329
Train QA records: 4320
Eval QA records: 3846
Train Colombia: 3023 records
Eval Colombia: 2690 records
Train Mexico: 1297 records
Eval Mexico: 1156 records


In [16]:
def count_questions(records):
    """Count total number of questions in QA records."""
    total = 0
    for rec in records:
        for para in rec['paragraphs']:
            total += len(para['qas'])
    return total

print(f"Valid QA questions: {count_questions(valid_records)}")
print(f"Train QA questions: {count_questions(train_records)}")
print(f"Eval QA questions: {count_questions(eval_records)}")
print(f"Train Colombia QA questions: {count_questions(train_colombia_records)}")
print(f"Eval Colombia QA questions: {count_questions(eval_colombia_records)}")
print(f"Train Mexico QA questions: {count_questions(train_mexico_records)}")
print(f"Eval Mexico QA questions: {count_questions(eval_mexico_records)}")


Valid QA questions: 27839
Train QA questions: 19487
Eval QA questions: 8352
Train Colombia QA questions: 13648
Eval Colombia QA questions: 5849
Train Mexico QA questions: 5839
Eval Mexico QA questions: 2503


In [17]:
dataset = {"data": records}

In [18]:
dataset

{'data': []}

In [19]:
# Dictionary of datasets and their filenames
datasets = {
    "train_colombia_mexico_dataset.json": {"data": train_records},
    "eval_colombia_mexico_dataset.json": {"data": eval_records},
    "valid_complete_colombia_mexico_dataset.json": {"data": valid_records},
    "train_colombia_dataset.json": {"data": train_colombia_records},
    "eval_colombia_dataset.json": {"data": eval_colombia_records},
    "train_mexico_dataset.json": {"data": train_mexico_records},
    "eval_mexico_dataset.json": {"data": eval_mexico_records},
}

# Save all datasets to GOLD_PATH
for filename, data in datasets.items():
    file_path = os.path.join(GOLD_PATH, filename)
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

print(f"All datasets saved in {GOLD_PATH}!")

All datasets saved in ../datasets/exploration_datasets/gold/!


In [20]:
train_records[9]

{'title': '"No basta" el aumento salarial del 9 por ciento: CNTE',
 'paragraphs': [{'context': '. Ciudad de México. Poco antes del mediodía y con un sol a plomo, miles de maestros arribaron a la plancha del zócalo capitalino que permanece ocupada por cientos de casas de campaña multicolores, que se desbordan hasta las calles aledañas como Palma, 20 de noviembre y 5 de febrero. En el mitin político, que realizan frente a Palacio Nacional, la dirigencia de la Coordinadora Nacional de Trabajadores de la Educación (CNTE) señaló que no basta con el anuncio de incremento salarial dado a conocer por el gobierno federal, de un 9 por ciento global para todos los trabajadores de la educación. Pedro Hernández Morales, secretario general de la sección 9 de Ciudad de México, afirmó que hoy muy cerca de aquí se está festejando a un sector de los maestros, y se está dando un anuncio de un incremento salarial, pero aquí estamos quienes buscamos recuperar un derecho que nos han quitado en beneficio de 