## Jeopardy NER Data Curation

In [1]:
import json
import pandas as pd
from pathlib import Path

# Path to the dataset (place JEOPARDY_QUESTIONS1.json in the notebook folder or update this path)
DATA_PATH = Path('E:\\Cliente\\Desktop\\Career\\MotorolaSolutions_Case\\JEOPARDY_QUESTIONS1.json')

def load_json_flex(path):
    """Robust loader for either a JSON array file or a JSON Lines file.
    Returns a list of dicts. Raises FileNotFoundError if file missing.
    """
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f'File not found: {p}')
    with p.open('r', encoding='utf-8') as f:
        # read a small prefix to detect format
        start = f.read(2048)
        f.seek(0)
        s = start.lstrip()
        if s.startswith('['):
            # JSON array: load entire structure
            return json.load(f)
        # otherwise try JSON lines (one JSON object per line)
        data = []
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                # a line failed to parse -> try loading entire file as a single JSON
                f.seek(0)
                return json.load(f)
        return data

# Load dataset and convert to DataFrame for exploration
data = load_json_flex(DATA_PATH)
print('Loaded records:', len(data))

df = pd.DataFrame(data)
print('Columns:', df.columns.tolist())
print('\nFirst 10 rows (preview):')
display(df.head(10))

print('\nMissing values per column:')
print(df.isnull().sum())

# Quick probe to find likely text columns (common keys in Jeopardy dumps)
for col in ['question', 'Question', 'question_text', 'answer', 'Answer', 'category', 'Category']:
    if col in df.columns:
        print(f"\nColumn: {col} -- non-null: {df[col].notnull().sum()}")
        print(df[col].dropna().astype(str).head(5).tolist())


# Notes:
# - Next steps: implement feature detectors (numbers, non-English tokens, unusual proper nouns),
#   apply them across df to estimate counts and then sample 1000 examples per stratum.
# - I documented the loader helper above. We'll add more helper functions in subsequent steps.

Loaded records: 216930
Columns: ['category', 'air_date', 'question', 'value', 'answer', 'round', 'show_number']

First 10 rows (preview):
Columns: ['category', 'air_date', 'question', 'value', 'answer', 'round', 'show_number']

First 10 rows (preview):


Unnamed: 0,category,air_date,question,value,answer,round,show_number
0,HISTORY,2004-12-31,"'For the last 8 years of his life, Galileo was...",$200,Copernicus,Jeopardy!,4680
1,ESPN's TOP 10 ALL-TIME ATHLETES,2004-12-31,'No. 2: 1912 Olympian; football star at Carlis...,$200,Jim Thorpe,Jeopardy!,4680
2,EVERYBODY TALKS ABOUT IT...,2004-12-31,'The city of Yuma in this state has a record a...,$200,Arizona,Jeopardy!,4680
3,THE COMPANY LINE,2004-12-31,"'In 1963, live on ""The Art Linkletter Show"", t...",$200,McDonald\'s,Jeopardy!,4680
4,EPITAPHS & TRIBUTES,2004-12-31,"'Signer of the Dec. of Indep., framer of the C...",$200,John Adams,Jeopardy!,4680
5,3-LETTER WORDS,2004-12-31,"'In the title of an Aesop fable, this insect s...",$200,the ant,Jeopardy!,4680
6,HISTORY,2004-12-31,'Built in 312 B.C. to link Rome & the South of...,$400,the Appian Way,Jeopardy!,4680
7,ESPN's TOP 10 ALL-TIME ATHLETES,2004-12-31,'No. 8: 30 steals for the Birmingham Barons; 2...,$400,Michael Jordan,Jeopardy!,4680
8,EVERYBODY TALKS ABOUT IT...,2004-12-31,"'In the winter of 1971-72, a record 1,122 inch...",$400,Washington,Jeopardy!,4680
9,THE COMPANY LINE,2004-12-31,'This housewares store was named for the packa...,$400,Crate & Barrel,Jeopardy!,4680



Missing values per column:
category          0
air_date          0
question          0
value          3634
answer            0
round             0
show_number       0
dtype: int64

Column: question -- non-null: 216930
["'For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory'", "'No. 2: 1912 Olympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves'", "'The city of Yuma in this state has a record average of 4,055 hours of sunshine each year'", '\'In 1963, live on "The Art Linkletter Show", this company served its billionth burger\'', "'Signer of the Dec. of Indep., framer of the Constitution of Mass., second President of the United States'"]

Column: answer -- non-null: 216930
['Copernicus', 'Jim Thorpe', 'Arizona', "McDonald\\'s", 'John Adams']

Column: category -- non-null: 216930
['HISTORY', "ESPN's TOP 10 ALL-TIME ATHLETES", 'EVERYBODY TALKS ABOUT IT...', 'THE COMPANY LINE', 'EPITAPHS & TRIBUTES']
cate

In [2]:
print(df)

                               category    air_date  \
0                               HISTORY  2004-12-31   
1       ESPN's TOP 10 ALL-TIME ATHLETES  2004-12-31   
2           EVERYBODY TALKS ABOUT IT...  2004-12-31   
3                      THE COMPANY LINE  2004-12-31   
4                   EPITAPHS & TRIBUTES  2004-12-31   
...                                 ...         ...   
216925                   RIDDLE ME THIS  2006-05-11   
216926                        "T" BIRDS  2006-05-11   
216927           AUTHORS IN THEIR YOUTH  2006-05-11   
216928                       QUOTATIONS  2006-05-11   
216929                   HISTORIC NAMES  2006-05-11   

                                                 question  value  \
0       'For the last 8 years of his life, Galileo was...   $200   
1       'No. 2: 1912 Olympian; football star at Carlis...   $200   
2       'The city of Yuma in this state has a record a...   $200   
3       'In 1963, live on "The Art Linkletter Show", t...   $200   

## Data Cleaning

In [3]:
# Clean original dataset of scraping errors in Question field www.j-archive.com/media.
# The error seems to be a URL mistakenly placed in the question field and includes "www.j-archive.com/media" in the text.
# Print number of rows before and after cleaning.

initial_size = len(df)
print("Number of rows before cleaning:", initial_size)

# Ensure the 'question' column exists
if 'question' not in df.columns:
    raise KeyError("Column 'question' not found in the dataframe")

# Identify rows to drop
rows_to_drop = df[df['question'].str.contains("www.j-archive.com/media", na=False, regex=False)].index

# Drop them explicitly
df = df.drop(index=rows_to_drop)

print("Number of rows after cleaning:", len(df))
print("Number of removed samples with error in Question field:", len(rows_to_drop))


Number of rows before cleaning: 216930
Number of rows after cleaning: 206408
Number of removed samples with error in Question field: 10522
Number of rows after cleaning: 206408
Number of removed samples with error in Question field: 10522


In [4]:
# Clean original dataset of <br /> HTML tags in Question field.
# Print number of rows before and after cleaning.

initial_size = len(df)
print("Number of rows before cleaning:", initial_size)

# Ensure 'question' column exists
if 'question' not in df.columns:
    raise KeyError("Column 'question' not found in the dataframe")

# Identify rows containing <br /> (HTML line breaks)
rows_to_drop = df[df['question'].str.contains("<br />", na=False, regex=False)].index

# Drop those rows
df = df.drop(index=rows_to_drop)

print("Number of rows after cleaning:", len(df))
print("Number of removed samples with '<br />' in Question field:", len(rows_to_drop))


Number of rows before cleaning: 206408
Number of rows after cleaning: 195415
Number of removed samples with '<br />' in Question field: 10993
Number of rows after cleaning: 195415
Number of removed samples with '<br />' in Question field: 10993


In [5]:
#Replace "&" with "and" in Question field.

#Print number of rows containing "&" before cleaning.
initial_size = len(df[df['question'].str.contains("&", na=False, regex=False)])
print("Number of rows with '&':", initial_size)

#Replace '&' with 'and' in Question field.
df['question'] = df['question'].str.replace("&", "and", regex=False)


Number of rows with '&': 39833


## Create datasets of 1000 examples for (NER) algorithm performance comparison

### Numbers strata

In [6]:
# Strata with phrases containing numbers  
import re
from collections import Counter

# We'll use the Hugging Face tokenizer for robust tokenization (subword-safe).
# The tokenizer is only used for token-level heuristics (no model inference here).
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', use_fast=True)

# Regex patterns for numeric forms
digits_re = re.compile(r"\d+")
ordinal_re = re.compile(r"\b\d+(?:st|nd|rd|th)\b", flags=re.I)
year_re = re.compile(r"\b(?:18|19|20)\d{2}\b")

# A modest set of written number words (expandable). Keep them lowercase.
NUMBER_WORDS = {
    'zero','one','two','three','four','five','six','seven','eight','nine','ten',
    'eleven','twelve','thirteen','fourteen','fifteen','sixteen','seventeen','eighteen','nineteen',
    'twenty','thirty','forty','fifty','sixty','seventy','eighty','ninety',
    'hundred','thousand','million','billion','trillion','dozen',
    'first','second','third','fourth','fifth','sixth','seventh','eighth','ninth','tenth',
    'hundredth','thousandth','millionth','1st','2nd','3rd','4th','5th','6th','7th','8th','9th',
    '0th', '1th', '2th', '3th',
}

# Pattern matches complete words only (this is the primary safe check).
pattern = re.compile(r"\b(" + "|".join(re.escape(word) for word in NUMBER_WORDS) + r")\b", re.IGNORECASE)

def tokenize_text(text):
    """Return token list using HF tokenizer if available, else simple word split.
    Do NOT strip WordPiece prefixes — keep them so we can filter subword tokens (start with '##').
    """
    if not text or not isinstance(text, str):
        return []
    if tokenizer is not None:
        toks = tokenizer.tokenize(text)
        # return tokens lowercased but preserve '##' prefix for subword detection
        return [t.lower() for t in toks if t.strip()]
    # fallback: split on non-word characters (safe whole-word tokens)
    return re.findall(r"\w+", text.lower())

def detect_numbers(text):
    """Detect numeric evidence in text.
    Use regex whole-word matches first; then a token-level check that ignores subword pieces.
    Returns a dict with boolean flags and a small summary.
    """
    res = {
        'has_digits': False,
        'has_ordinals': False,
        'has_year': False,
        'has_number_word': False,
        'matched_number_words': []
    }
    if not text or not isinstance(text, str):
        return res
    # regex-based checks
    if digits_re.search(text):
        res['has_digits'] = True
    if ordinal_re.search(text):
        res['has_ordinals'] = True
    if year_re.search(text):
        res['has_year'] = True
    # PRIMARY: whole-word regex for number words (prevents 'gluten' -> 'ten')
    regex_matches = pattern.findall(text)
    if regex_matches:
        res['has_number_word'] = True
        res['matched_number_words'] = list(dict.fromkeys(m.lower() for m in regex_matches))
        return res
    # SECONDARY: token-based check but ignore subword tokens (WordPiece prefixed with '##')
    toks = tokenize_text(text)
    matched_tokens = []
    for t in toks:
        # skip WordPiece continuations (e.g., '##ten') to avoid false positives inside words
        if t.startswith('##'):
            continue
        if t in NUMBER_WORDS:
            matched_tokens.append(t)
    if matched_tokens:
        res['has_number_word'] = True
        res['matched_number_words'] = list(dict.fromkeys(matched_tokens))
    return res

# Choose candidate text columns (common variants). Only apply to question columns as requested.
candidate_question_cols = [c for c in ['question','Question','question_text'] if c in df.columns]

print('Question columns found:', candidate_question_cols)

# Apply detectors and add boolean summary columns only for question fields.
for qcol in candidate_question_cols:
    prefix = 'q_' + qcol
    df[prefix + '_has_digits'] = df[qcol].fillna('').astype(str).apply(lambda t: detect_numbers(t)['has_digits'])
    df[prefix + '_has_ordinals'] = df[qcol].fillna('').astype(str).apply(lambda t: detect_numbers(t)['has_ordinals'])
    df[prefix + '_has_year'] = df[qcol].fillna('').astype(str).apply(lambda t: detect_numbers(t)['has_year'])
    df[prefix + '_has_number_word'] = df[qcol].fillna('').astype(str).apply(lambda t: detect_numbers(t)['has_number_word'])

# Summarize counts to estimate prevalence (print a few stats).
summary = {}
for col in df.columns:
    if col.endswith('_has_digits') or col.endswith('_has_ordinals') or col.endswith('_has_year') or col.endswith('_has_number_word'):
        summary[col] = int(df[col].sum())

print('\nNumber feature summary (counts):')
for k, v in sorted(summary.items()):
    print(f"{k}: {v}")

# Show examples where question contains digits or number words
show_qcol = candidate_question_cols[0] if candidate_question_cols else None
if show_qcol:
    print(f"\nSample questions with digits ({show_qcol}):")
    display(df[df['q_'+show_qcol+'_has_digits']][[show_qcol]].head(10))
    print(f"\nSample questions with number words ({show_qcol}):")
    display(df[df['q_'+show_qcol+'_has_number_word']][[show_qcol]].head(10))

# The detector functions are documented inline above. Next: refine thresholds and sampling logic when
# building the numeric stratum of 1000 examples (we will sample from rows with these question-only flags).

Question columns found: ['question']

Number feature summary (counts):
q_question_has_digits: 76634
q_question_has_number_word: 27483
q_question_has_ordinals: 4669
q_question_has_year: 40742

Sample questions with digits (question):

Number feature summary (counts):
q_question_has_digits: 76634
q_question_has_number_word: 27483
q_question_has_ordinals: 4669
q_question_has_year: 40742

Sample questions with digits (question):


Unnamed: 0,question
0,"'For the last 8 years of his life, Galileo was..."
1,'No. 2: 1912 Olympian; football star at Carlis...
2,'The city of Yuma in this state has a record a...
3,"'In 1963, live on ""The Art Linkletter Show"", t..."
6,'Built in 312 B.C. to link Rome and the South ...
7,'No. 8: 30 steals for the Birmingham Barons; 2...
8,"'In the winter of 1971-72, a record 1,122 inch..."
12,'In 1000 Rajaraja I of the Cholas battled to t...
13,"'No. 1: Lettered in hoops, football and lacros..."
14,"'On June 28, 1994 the nat'l weather service be..."



Sample questions with number words (question):


Unnamed: 0,question
3,"'In 1963, live on ""The Art Linkletter Show"", t..."
4,"'Signer of the Dec. of Indep., framer of the C..."
9,'This housewares store was named for the packa...
11,'Cows regurgitate this from the first stomach ...
15,"'This company's Accutron watch, introduced in ..."
18,'Karl led the first of these Marxist organizat...
20,'Africa's lowest temperature was 11 degrees be...
23,"'In geologic time one of these, shorter than a..."
26,"'The Kirschner brothers, Don and Bill, named t..."
28,"'A single layer of paper, or to perform one's ..."


In [7]:
# Build combined 'any' flags (question OR answer) for numeric features
qcol = candidate_question_cols[0] if candidate_question_cols else None

def _flag_cols(prefix, flag):
    """Return list of existing cols for a given prefix and flag suffix."""
    if prefix is None:
        return []
    col = f"{prefix}_{flag}"
    return [col] if col in df.columns else []

q_pref = ('q_' + qcol) if qcol else None

digit_cols = _flag_cols(q_pref, 'has_digits')
ordinal_cols = _flag_cols(q_pref, 'has_ordinals')
year_cols = _flag_cols(q_pref, 'has_year')
numword_cols = _flag_cols(q_pref, 'has_number_word')

# Create any-flag columns safely (if no contributing columns exist, default False)
df['has_digits_any'] = df[digit_cols].any(axis=1) if digit_cols else False
df['has_ordinals_any'] = df[ordinal_cols].any(axis=1) if ordinal_cols else False
df['has_year_any'] = df[year_cols].any(axis=1) if year_cols else False
df['has_number_word_any'] = df[numword_cols].any(axis=1) if numword_cols else False

# Build a human-readable stratum label from the flags
def make_stratum_label(row):
    parts = []
    if row['has_digits_any']:
        parts.append('digits')
    if row['has_number_word_any']:
        parts.append('number_word')
    if row['has_ordinals_any']:
        parts.append('ordinal')
    if row['has_year_any']:
        parts.append('year')
    return '+'.join(parts)

df['num_stratum'] = df.apply(make_stratum_label, axis=1)

# Filter numeric stratum (at least one numeric signal)
numeric_stratum = df[df['num_stratum'] != '']
total_candidates = len(numeric_stratum)
print(f'Total numeric candidates: {total_candidates}')

if total_candidates == 0:
    print('No numeric candidates found — cannot sample.')
else:
    # Counts per stratum
    counts = numeric_stratum['num_stratum'].value_counts().sort_values(ascending=False)
    print('\nStratum counts (top):')
    print(counts.head(20))

    # Proportional allocation of 1000 samples to strata (integer rounding + distribute remainder)
    n_target = 1000
    props = (counts / counts.sum())
    initial_alloc = (props * n_target).astype(int)
    alloc = initial_alloc.to_dict()
    allocated = sum(alloc.values())
    remainder = n_target - allocated
    if remainder > 0:
        # distribute remainder to strata with largest fractional parts
        fractional = (props * n_target) - initial_alloc
        for label in fractional.sort_values(ascending=False).index[:remainder]:
            alloc[label] += 1

    # Ensure we do not allocate more than available in any stratum
    for label in list(alloc.keys()):
        avail = int(counts.get(label, 0))
        if alloc[label] > avail:
            alloc[label] = avail

    # If we've dropped below 1000 due to limited availabilities, fill from remaining pool
    allocated_after_cap = sum(alloc.values())
    remaining_needed = n_target - allocated_after_cap

    samples = []
    rng = 42
    for label, n_alloc in alloc.items():
        if n_alloc <= 0:
            continue
        pool = numeric_stratum[numeric_stratum['num_stratum'] == label]
        # sample min(n_alloc, len(pool)) to be safe
        take = min(n_alloc, len(pool))
        samples.append(pool.sample(n=take, random_state=rng))

    numbers_df = pd.concat(samples) if samples else numeric_stratum.sample(n=0)

    # If still need more (some strata were smaller than alloc), fill from remaining candidates
    if remaining_needed > 0:
        remaining_pool = numeric_stratum.drop(numbers_df.index, errors='ignore')
        if len(remaining_pool) >= remaining_needed:
            numbers_df = pd.concat([numbers_df, remaining_pool.sample(n=remaining_needed, random_state=rng)])
        else:
            numbers_df = pd.concat([numbers_df, remaining_pool])

    # Final safety: if over 1000 (shouldn't happen), truncate; if under 1000, report actual size
    if len(numbers_df) > n_target:
        numbers_df = numbers_df.sample(n=n_target, random_state=rng)

    print(f'Final sampled count: {len(numbers_df)} (target {n_target})')
    print('\nSampled stratum distribution:')
    print(numbers_df['num_stratum'].value_counts())

    # Save to jsonl for downstream use
    out_path = Path('numeric_stratum_sample_1000.jsonl')
    numbers_df.to_json(out_path, orient='records', lines=True, force_ascii=False)
    print('Wrote sample to', out_path)

    # Show a few examples for quick inspection
    display(numbers_df.head(10))

Total numeric candidates: 90274

Stratum counts (top):
num_stratum
digits+year                        33028
digits                             27173
number_word                        13640
digits+number_word+year             6579
digits+number_word                  5185
digits+ordinal                      2011
digits+number_word+ordinal          1523
digits+ordinal+year                  579
digits+number_word+ordinal+year      556
Name: count, dtype: int64
Final sampled count: 1000 (target 1000)

Sampled stratum distribution:
num_stratum
digits+year                        366
digits                             301
number_word                        151
digits+number_word+year             73
digits+number_word                  58
digits+ordinal                      22
digits+number_word+ordinal          17
digits+ordinal+year                  6
digits+number_word+ordinal+year      6
Name: count, dtype: int64
Wrote sample to numeric_stratum_sample_1000.jsonl
Wrote sample to numeric_stra

Unnamed: 0,category,air_date,question,value,answer,round,show_number,q_question_has_digits,q_question_has_ordinals,q_question_has_year,q_question_has_number_word,has_digits_any,has_ordinals_any,has_year_any,has_number_word_any,num_stratum
123290,1804,2004-11-29,"'This inventor's Feb. 7, 1804 birth made Willi...",$200,John Deere,Jeopardy!,4656,True,False,True,False,True,False,True,False,digits+year
38116,FRENCH CITIES,1999-09-21,'This city's Grand Prix d'Endurance has been r...,$400,Le Mans,Double Jeopardy!,3457,True,False,True,False,True,False,True,False,digits+year
78262,THE THINGS YOU SAY!,2010-09-13,"'""Speak"" this ""to power"", meaning tell the mig...",$1200,truth,Double Jeopardy!,5976,True,False,True,False,True,False,True,False,digits+year
104884,EXCLAMATION POINTS,1998-07-07,"'In 1963 he put up his ""Dukes"" as the title ch...",$400,John Wayne,Double Jeopardy!,3207,True,False,True,False,True,False,True,False,digits+year
106934,STOCK ANSWERS,2004-12-20,"'In 1958 this ""I Love Lucy"" production company...",$1000,Desilu,Jeopardy!,4671,True,False,True,False,True,False,True,False,digits+year
43985,THE NEW YORK TIMES MOVIES,2011-05-04,'Reviews at nytimes. com include Renata Adler'...,$600,2001: A Space Odyssey,Jeopardy!,6143,True,False,True,False,True,False,True,False,digits+year
73597,SITCOM MOMS,2010-10-27,'From 2001 to 2007 this country singer played ...,$1000,Reba McEntire,Jeopardy!,6008,True,False,True,False,True,False,True,False,digits+year
129342,COMPANY HISTORIES,2001-11-30,'In 1959 a company in this state got the right...,$2000,Ohio,Double Jeopardy!,3970,True,False,True,False,True,False,True,False,digits+year
85866,SOLDIERS,1999-10-05,"'This French minister of war died in 1932, too...",$400,Andre Maginot,Double Jeopardy!,3467,True,False,True,False,True,False,True,False,digits+year
200250,JIMMY,1999-10-11,'He's won the U.S Open on 3 surfaces: grass (1...,$200,JImmy Connors,Jeopardy!,3471,True,False,True,False,True,False,True,False,digits+year


In [None]:
print(numbers_df)

#### Non-English words Strata

**Non-English With transformers - Lower results. Kept for comparison purposes**

In [10]:
# Using Transformers
# Non-English detection: HF language classifier + diacritics + token frequency heuristics
import sys, subprocess
from pathlib import Path
from typing import Dict

# 1) Ensure required packages
try:
    from transformers import pipeline
except Exception:
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers'])
    from transformers import pipeline

try:
    from wordfreq import zipf_frequency
except Exception:
    # Install wordfreq for English token frequency heuristics
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'wordfreq'])
    from wordfreq import zipf_frequency

# 2) Load a language-id model via HF text-classification pipeline.
# 'papluca/xlm-roberta-base-language-detection' is a common HF model for language detection.
model_name = 'papluca/xlm-roberta-base-language-detection'
try:
    lang_pipe = pipeline('text-classification', model=model_name, return_all_scores=False)
except Exception as e:
    # Fallback: try a simpler/langid model if available on machine
    print('Could not load', model_name, '->', e)
    lang_pipe = None

# Helper: detect language with confidence
def detect_lang(text: str) -> Dict[str, float]:
    """Return {'lang': code, 'score': float} or {'lang': None, 'score': 0.0} on failure."""
    if not text or not isinstance(text, str):
        return {'lang': None, 'score': 0.0}
    if lang_pipe is None:
        return {'lang': None, 'score': 0.0}
    try:
        # classifier expects short inputs; truncate to 512 chars for speed
        out = lang_pipe(text[:512])
        if isinstance(out, list) and len(out) > 0:
            o = out[0]
            # label may be like 'en' or 'LABEL_XX' depending on model; normalize lowercase
            return {'lang': o.get('label').lower(), 'score': float(o.get('score', 0.0))}
    except Exception:
        return {'lang': None, 'score': 0.0}
    return {'lang': None, 'score': 0.0}

# Helper: diacritics/ non-ASCII detection
def has_diacritics(text: str) -> bool:
    return any(ord(c) > 127 for c in text) if text else False

# Helper: fraction of tokens that are likely English using wordfreq
import re
token_re = re.compile(r"[A-Za-z]+")
def english_token_fraction(text: str) -> float:
    """Return fraction of alphabetic tokens with reasonable English frequency (>1.0 zipf)."""
    if not text or not isinstance(text, str):
        return 0.0
    toks = token_re.findall(text.lower())
    if not toks:
        return 0.0
    scores = [zipf_frequency(t, 'en') for t in toks]
    # zipf_frequency returns -inf for unknown tokens; treat < 1.0 as unlikely English (tunable)
    good = sum(1 for s in scores if s >= 1.0)
    return good / len(toks)

# Decision rule: mark non-English if:
#  - HF language prediction exists and lang != 'en' with confidence > 0.7 OR
#  - diacritics present OR
#  - english_token_fraction < 0.5 (more than half tokens not recognized as English)
CONF_THRESH = 0.7

def is_non_english(text: str) -> Dict:
    """Return dict with indicators and metadata."""
    res = {'lang': None, 'lang_conf': 0.0, 'diacritics': False, 'eng_token_frac': 0.0, 'non_english': False}
    if not text or not isinstance(text, str):
        return res
    det = detect_lang(text)
    res['lang'] = det.get('lang')
    res['lang_conf'] = det.get('score', 0.0)
    res['diacritics'] = has_diacritics(text)
    res['eng_token_frac'] = english_token_fraction(text)
    # rule
    noneng = False
    if res['lang'] and res['lang'] != 'en' and res['lang_conf'] >= CONF_THRESH:
        noneng = True
    if res['diacritics']:
        noneng = True
    if res['eng_token_frac'] < 0.5:
        noneng = True
    res['non_english'] = noneng
    return res

# Apply to both question and answer (if available)
q_cols = [c for c in ['question','Question','question_text'] if c in df.columns]
a_cols = [c for c in ['answer','Answer'] if c in df.columns]
print('Applying non-English detection to Q cols:', q_cols, 'A cols:', a_cols)

for q in q_cols:
    meta = df[q].fillna('').astype(str).apply(is_non_english)
    df['q_'+q+'_lang'] = meta.apply(lambda d: d['lang'])
    df['q_'+q+'_lang_conf'] = meta.apply(lambda d: d['lang_conf'])
    df['q_'+q+'_has_diacritics'] = meta.apply(lambda d: d['diacritics'])
    df['q_'+q+'_eng_token_frac'] = meta.apply(lambda d: d['eng_token_frac'])
    df['q_'+q+'_non_english'] = meta.apply(lambda d: d['non_english'])

for a in a_cols:
    meta = df[a].fillna('').astype(str).apply(is_non_english)
    df['a_'+a+'_lang'] = meta.apply(lambda d: d['lang'])
    df['a_'+a+'_lang_conf'] = meta.apply(lambda d: d['lang_conf'])
    df['a_'+a+'_has_diacritics'] = meta.apply(lambda d: d['diacritics'])
    df['a_'+a+'_eng_token_frac'] = meta.apply(lambda d: d['eng_token_frac'])
    df['a_'+a+'_non_english'] = meta.apply(lambda d: d['non_english'])

# Combined flag (question OR answer non-English)
q_flag_cols = [c for c in df.columns if c.startswith('q_') and c.endswith('_non_english')]
a_flag_cols = [c for c in df.columns if c.startswith('a_') and c.endswith('_non_english')]
combined_cols = q_flag_cols + a_flag_cols
if combined_cols:
    df['non_english_any'] = df[combined_cols].any(axis=1)
else:
    df['non_english_any'] = False

# Summary
print('\nNon-English summary:')
for col in q_flag_cols + a_flag_cols + ['non_english_any']:
    print(f"{col}: {int(df[col].sum()) if col in df.columns else 0}")

# Show samples flagged as non-English
if 'non_english_any' in df.columns and df['non_english_any'].any():
    print('\nSample non-English rows (first 10):')
    sample_cols = (q_cols[:1] if q_cols else []) + (a_cols[:1] if a_cols else []) + ['non_english_any']
    display(df[df['non_english_any']][sample_cols].head(10))
else:
    print('No non-English rows detected by rule.')

# Notes:
# - detect_lang uses an HF text-classification model; model labels vary by checkpoint.
# - english_token_fraction uses wordfreq zipf_frequency: this acts as a fast proxy for "word in English" check.
# - thresholds (CONF_THRESH and eng_token_frac) are tunable for precision/recall tradeoffs.

config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cpu


Applying non-English detection to Q cols: ['question'] A cols: ['answer']

Non-English summary:
q_question_non_english: 3390
a_answer_non_english: 44728
non_english_any: 47142

Sample non-English rows (first 10):

Non-English summary:
q_question_non_english: 3390
a_answer_non_english: 44728
non_english_any: 47142

Sample non-English rows (first 10):


Unnamed: 0,question,answer,non_english_any
15,"'This company's Accutron watch, introduced in ...",Bulova,True
20,'Africa's lowest temperature was 11 degrees be...,Morocco,True
23,"'In geologic time one of these, shorter than a...",era,True
26,"'The Kirschner brothers, Don and Bill, named t...",K2,True
30,'California',Nixon,True
37,"'Ali, who married this man's daughter Fatima, ...",Muhammed,True
41,"'In 2003 this airline agreed to buy KLM, creat...",Air France,True
44,'This stiff silken fabric is favored for brida...,organza,True
52,'In 1534 he and his buddy Francis Xavier found...,(St. Ignatius) Loyola,True
59,'4 x 12',48,True


In [11]:
# Stratified sampling for the non-English stratum (1000 examples)
import math
from pathlib import Path

# Recompute q/a column lists safely (they were created by the non-English detection cell).
q_cols = [c for c in ['question','Question','question_text'] if c in df.columns]
a_cols = [c for c in ['answer','Answer'] if c in df.columns]

def make_noneng_stratum(row):
    """Create a readable label for non-English cause(s) for a row.
    Priority: explicit language code (if not 'en'), then diacritics, then low English token fraction.
    Falls back to 'other_non_english'."""
    parts = []
    # question-level signals (prefer question when present)
    if q_cols:
        q = q_cols[0]
        lang_col = 'q_' + q + '_lang'
        if lang_col in row.index and row.get(lang_col) and row.get(lang_col) != 'en':
            parts.append(str(row.get(lang_col)))
        diac_col = 'q_' + q + '_has_diacritics'
        if diac_col in row.index and row.get(diac_col):
            parts.append('diacritics')
        frac_col = 'q_' + q + '_eng_token_frac'
        if frac_col in row.index and (row.get(frac_col) is not None) and row.get(frac_col) < 0.5:
            parts.append('low_eng_frac')
    # answer-level signals (if question had none, this can add more info)
    if a_cols:
        a = a_cols[0]
        lang_col = 'a_' + a + '_lang'
        if lang_col in row.index and row.get(lang_col) and row.get(lang_col) != 'en':
            parts.append(str(row.get(lang_col)))
        diac_col = 'a_' + a + '_has_diacritics'
        if diac_col in row.index and row.get(diac_col):
            parts.append('diacritics')
        frac_col = 'a_' + a + '_eng_token_frac'
        if frac_col in row.index and (row.get(frac_col) is not None) and row.get(frac_col) < 0.5:
            parts.append('low_eng_frac')
    if not parts:
        return 'other_non_english'
    # prefer language code if present
    langs = [p for p in parts if len(p) >= 2 and p.isalpha() and p != 'diacritics' and p != 'low_eng_frac']
    if langs:
        return langs[0]
    # otherwise join unique parts
    return '+'.join(dict.fromkeys(parts))

# Build stratum labels for rows flagged non-English
if 'non_english_any' not in df.columns:
    raise KeyError("non_english_any column not found. Run non-English detection step first.")

noneng_candidates = df[df['non_english_any'] == True].copy()
total_candidates = len(noneng_candidates)
print(f'Total non-English candidates: {total_candidates}')

if total_candidates == 0:
    print('No non-English candidates found — cannot sample.')
else:
    noneng_candidates['noneng_stratum'] = noneng_candidates.apply(make_noneng_stratum, axis=1)
    counts = noneng_candidates['noneng_stratum'].value_counts().sort_values(ascending=False)
    print('\nNon-English stratum counts (top):')
    print(counts.head(30))

    # Proportional allocation to 1000 samples
    n_target = 1000
    props = (counts / counts.sum())
    initial_alloc = (props * n_target).astype(int)
    alloc = initial_alloc.to_dict()
    allocated = sum(alloc.values())
    remainder = n_target - allocated
    if remainder > 0:
        fractional = (props * n_target) - initial_alloc
        for label in fractional.sort_values(ascending=False).index[:remainder]:
            alloc[label] += 1

    # Cap allocation to available rows in each stratum
    for label in list(alloc.keys()):
        avail = int(counts.get(label, 0))
        if alloc[label] > avail:
            alloc[label] = avail

    allocated_after_cap = sum(alloc.values())
    remaining_needed = n_target - allocated_after_cap

    # Sample per stratum
    samples = []
    rng = 42
    for label, n_alloc in alloc.items():
        if n_alloc <= 0:
            continue
        pool = noneng_candidates[noneng_candidates['noneng_stratum'] == label]
        take = min(n_alloc, len(pool))
        samples.append(pool.sample(n=take, random_state=rng))

    noneng_sampled = pd.concat(samples) if samples else noneng_candidates.sample(n=0)

    # Fill any remaining slots from the leftover pool
    if remaining_needed > 0:
        remaining_pool = noneng_candidates.drop(noneng_sampled.index, errors='ignore')
        if len(remaining_pool) >= remaining_needed:
            noneng_sampled = pd.concat([noneng_sampled, remaining_pool.sample(n=remaining_needed, random_state=rng)])
        else:
            noneng_sampled = pd.concat([noneng_sampled, remaining_pool])

    # Final safety: if over target, truncate; if under, report actual size
    if len(noneng_sampled) > n_target:
        noneng_sampled = noneng_sampled.sample(n=n_target, random_state=rng)

    print(f'Final non-English sampled count: {len(noneng_sampled)} (target {n_target})')
    print('\nSampled non-English stratum distribution:')
    print(noneng_sampled['noneng_stratum'].value_counts())

    # Save to jsonl
    out_path = Path('non_english_stratum_sample_1000.jsonl')
    noneng_sampled.to_json(out_path, orient='records', lines=True, force_ascii=False)
    print('Wrote non-English stratum sample to', out_path)

    # Show a few examples for inspection
    display(noneng_sampled[[q_cols[0]] if q_cols else [] + ([a_cols[0]] if a_cols else []) + ['noneng_stratum']].head(10))

Total non-English candidates: 47142

Non-English stratum counts (top):
noneng_stratum
ur                         10819
it                          9545
sw                          5090
hi                          4649
pt                          3982
nl                          3540
de                          2575
tr                          2220
fr                          1430
pl                          1241
es                           936
diacritics                   369
el                           226
bg                           190
low_eng_frac                 189
ru                           112
th                            13
ar                            11
diacritics+low_eng_frac        2
vi                             2
zh                             1
Name: count, dtype: int64
Final non-English sampled count: 1000 (target 1000)

Sampled non-English stratum distribution:
noneng_stratum
ur              230
it              203
sw              108
hi               99
pt   

Unnamed: 0,question
138994,"'In 1889 this Elizabethport, New Jersey compan..."
74652,'Zapata's Plan of Ayala called for redistribut...
87168,'Term for the chief religious leader of a syna...
171920,'This winged horse carried thunderbolts for Zeus'
184518,'In 1972 this legendary soulman won an Oscar f...
33644,'Word that begins the Gettysburg Address'
61670,"'His novel ""Indignation"" follows a student fro..."
210120,'The Celtic town of Vindobona became a militar...
216842,"'She dedicated ""Jane Eyre"" to William Makepeac..."
193451,'The number of stars on the American flag'


In [None]:
print(noneng_sampled)


In [12]:
# Persist long-running results (run once after your heavy cell finished)
from pathlib import Path
import pandas as pd

out_dir = Path('artifacts')
out_dir.mkdir(exist_ok=True)

# Save full dataframe (fast, column-typed) and a pickle copy for exact reload
try:
    df.to_parquet(out_dir / 'jeopardy_with_features.parquet', index=False)
    df.to_pickle(out_dir / 'jeopardy_with_features.pkl')
    print('Saved full dataframe to parquet + pickle')
except Exception as e:
    print('Failed to save full dataframe (parquet/pickle):', e)

# Save JSONL (readable) sample of entire DF (first 5000 rows or fewer) to avoid huge files
try:
    df.head(5000).to_json(out_dir / 'jeopardy_head_5000.jsonl', orient='records', lines=True, force_ascii=False)
    print('Saved JSONL sample (first 5000 rows)')
except Exception as e:
    print('Failed to save JSONL sample:', e)

# Save metadata columns (q_/a_ prefixed) separately for quick loading
meta_cols = [c for c in df.columns if c.startswith('q_') or c.startswith('a_')]
if meta_cols:
    try:
        df[meta_cols].to_parquet(out_dir / 'jeopardy_meta.parquet', index=False)
        print('Saved meta columns to', out_dir / 'jeopardy_meta.parquet')
    except Exception as e:
        print('Failed to save meta columns:', e)

# Save the non-English sampled dataframe if available, else save derived non-English rows from df
if 'noneng_sampled' in globals():
    try:
        noneng_sampled.to_parquet(out_dir / 'noneng_sampled.parquet', index=False)
        noneng_sampled.to_json(out_dir / 'noneng_sampled.jsonl', orient='records', lines=True, force_ascii=False)
        print('Saved noneng_sampled to parquet + jsonl')
    except Exception as e:
        print('Failed to save noneng_sampled:', e)
else:
    if 'non_english_any' in df.columns and df['non_english_any'].any():
        try:
            tmp = df[df['non_english_any']].copy()
            tmp.to_parquet(out_dir / 'noneng_from_df.parquet', index=False)
            tmp.to_json(out_dir / 'noneng_from_df.jsonl', orient='records', lines=True, force_ascii=False)
            print('Saved non-English rows derived from df')
        except Exception as e:
            print('Failed to save non-English derived rows:', e)

# Small quick-check file for immediate inspection
try:
    df.sample(n=min(200, len(df)), random_state=42).to_json(out_dir / 'jeopardy_quick_sample.jsonl', orient='records', lines=True, force_ascii=False)
    print('Saved quick random sample')
except Exception as e:
    print('Failed to save quick sample:', e)

print('\nSaved artifacts in', out_dir.resolve())

# Quick load notes (run in a fresh session to restore):
# import pandas as pd
# df = pd.read_parquet('artifacts/jeopardy_with_features.parquet')
# noneng = pd.read_parquet('artifacts/noneng_sampled.parquet')  # if exists
# meta = pd.read_parquet('artifacts/jeopardy_meta.parquet')

Saved full dataframe to parquet + pickle
Saved JSONL sample (first 5000 rows)
Saved meta columns to artifacts\jeopardy_meta.parquet
Saved noneng_sampled to parquet + jsonl
Saved quick random sample

Saved artifacts in E:\Cliente\Desktop\Career\MotorolaSolutions_Case\artifacts


**Non-English With langid - Best Result. Final Strata**

In [14]:
# Langid-based non-English detection (efficient, tqdm progress, no heavy deps)
import sys, subprocess

# Install lightweight dependencies if missing
try:
    import langid
except Exception:
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'langid'])
    import langid
try:
    from tqdm import tqdm
except Exception:
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tqdm'])
    from tqdm import tqdm

# Helper: check for non-ASCII characters (diacritics) quickly
def has_diacritics(text: str) -> bool:
    if not text:
        return False
    return any(ord(ch) > 127 for ch in text)

# Safe accessor for text columns
def to_text_series(col_name):
    if col_name in df.columns:
        return df[col_name].fillna('').astype(str)
    return pd.Series([''] * len(df))

# Configure confidence threshold and progress
CONF_THRESH = 0.7
show_progress = True

# Process a single column with langid and diacritics check, return lists
def detect_column_non_english(series):
    langs = []
    confs = []
    non_eng_flags = []
    # use series.items() instead of series.iteritems() for compatibility
    it = series.items()
    if show_progress:
        it = tqdm(it, total=len(series), desc='Detecting language for column')
    for idx, text in it:
        if not text:
            langs.append(None); confs.append(0.0); non_eng_flags.append(False); continue
        try:
            pred_lang, score = langid.classify(text)
            lang = pred_lang.lower() if isinstance(pred_lang, str) else pred_lang
            conf = float(score) if score is not None else 0.0
        except Exception:
            lang, conf = None, 0.0
        diac = has_diacritics(text)
        is_non = (lang is not None and lang != 'en' and conf > CONF_THRESH) or diac
        langs.append(lang); confs.append(conf); non_eng_flags.append(bool(is_non))
    return langs, confs, non_eng_flags

# Apply to question and answer fields
question_series = to_text_series('question')
answer_series = to_text_series('answer')

print('Running langid on question column...')
q_langs, q_confs, q_non = detect_column_non_english(question_series)
df['question_lang'] = q_langs
df['question_conf'] = q_confs
df['question_non_english'] = q_non

print('Running langid on answer column...')
a_langs, a_confs, a_non = detect_column_non_english(answer_series)
df['answer_lang'] = a_langs
df['answer_conf'] = a_confs
df['answer_non_english'] = a_non

# Final combined flag
df['is_non_english'] = df['question_non_english'] | df['answer_non_english']

# Summary counts
total_non = int(df['is_non_english'].sum())
total_rows = len(df)
print(f'Non-English rows detected: {total_non} / {total_rows} ({{:.2%}})'.format(total_non/total_rows if total_rows else 0))
print('\nBreakdown:')
print('question_non_english:', int(df['question_non_english'].sum()))
print('answer_non_english:', int(df['answer_non_english'].sum()))

# Show a small sample of detected non-English rows for manual inspection
if total_non > 0:
    display(df[df['is_non_english']].head(10)[['question','answer','question_lang','question_conf','answer_lang','answer_conf','is_non_english']])
else:
    print('No non-English rows detected by langid+diacritics rule.')

Running langid on question column...


Detecting language for column: 100%|██████████| 195415/195415 [09:57<00:00, 327.00it/s]



Running langid on answer column...


Detecting language for column: 100%|██████████| 195415/195415 [06:46<00:00, 480.48it/s]



Non-English rows detected: 19374 / 195415 (9.91%)

Breakdown:
question_non_english: 1147
answer_non_english: 18336


Unnamed: 0,question,answer,question_lang,question_conf,answer_lang,answer_conf,is_non_english
1,'No. 2: 1912 Olympian; football star at Carlis...,Jim Thorpe,en,-164.037626,de,2.070063,True
3,"'In 1963, live on ""The Art Linkletter Show"", t...",McDonald\'s,en,-93.205609,et,1.000576,True
7,'No. 8: 30 steals for the Birmingham Barons; 2...,Michael Jordan,en,-99.123727,de,3.874381,True
13,"'No. 1: Lettered in hoops, football and lacros...",Jim Brown,en,-215.495155,de,2.070063,True
16,"'Outlaw: ""Murdered by a traitor and a coward w...",Jesse James,en,-220.958717,fr,0.910385,True
82,'These parts of a peach tree grow at nodes alo...,blossoms,en,-293.454914,sv,2.070393,True
90,'A 7.0 magnitude earthquake in this Caribbean ...,Haiti,en,-161.812871,fr,1.982057,True
123,"'She was ""The Untamed Heifer"" and ""The Virgin ...",Elizabeth I,en,-99.724927,es,1.46833,True
160,'Cook Islands',New Zealand,en,9.06184,de,1.281361,True
164,'This Chiricahua Apache was a popular attracti...,Geronimo,en,-172.537018,lt,1.233494,True


In [21]:
# Stratified sampling for non-English stratum using langid results
import math

def stratified_sample_non_english(df, n_samples=1000, lang_col='question_lang', random_state=42):
    """Robust stratified sampling of rows where df['is_non_english'] is True.
    Ensures final sample has exactly n_samples if that many candidates exist.
    """
    non_english_samples = df[df['is_non_english'] == True].copy()
    pool_size = len(non_english_samples)
    rng = random_state

    if pool_size == 0:
        print('No non-English candidates to sample from.')
        return non_english_samples

    # If fewer candidates than requested, return all shuffled
    if pool_size <= n_samples:
        print(f'Only {pool_size} non-English candidates available; returning all.')
        return non_english_samples.sample(n=pool_size, random_state=rng)

    # Prepare language column
    lang_series = non_english_samples[lang_col].fillna('unknown').astype(str)

    # Compute proportional allocation
    lang_counts = lang_series.value_counts()
    props = lang_counts / lang_counts.sum()

    # Initial integer allocation (floor)
    import numpy as np
    initial_alloc = (props * n_samples).astype(int)
    alloc = initial_alloc.to_dict()

    # Distribute remainder by largest fractional parts
    remainder = n_samples - sum(alloc.values())
    if remainder > 0:
        fractional = (props * n_samples) - initial_alloc
        for label in fractional.sort_values(ascending=False).index[:remainder]:
            alloc[label] += 1

    # Cap allocations to availability
    for label in list(alloc.keys()):
        avail = int(lang_counts.get(label, 0))
        if alloc[label] > avail:
            alloc[label] = avail

    # Sample per stratum according to alloc
    samples = []
    selected_idx = set()
    for label, n_alloc in alloc.items():
        if n_alloc <= 0:
            continue
        pool = non_english_samples[lang_series == label]
        take = min(n_alloc, len(pool))
        if take > 0:
            s = pool.sample(n=take, random_state=rng)
            samples.append(s)
            selected_idx.update(s.index.tolist())

    sampled = pd.concat(samples) if samples else non_english_samples.sample(n=0)

    # If we are short, fill from remaining non-selected candidates
    cur_n = len(sampled)
    if cur_n < n_samples:
        remaining_needed = n_samples - cur_n
        remaining_pool = non_english_samples.drop(index=list(selected_idx), errors='ignore')
        if len(remaining_pool) > 0:
            take = min(remaining_needed, len(remaining_pool))
            sampled = pd.concat([sampled, remaining_pool.sample(n=take, random_state=rng)])
            selected_idx.update(sampled.index.tolist())

    # Final safety: if over, truncate; if still short, report (shouldn't happen unless pool < n_samples)
    if len(sampled) > n_samples:
        sampled = sampled.sample(n=n_samples, random_state=rng)
    sampled = sampled.reset_index(drop=True)

    return sampled

# Run improved sampler and save
non_english_sampled = stratified_sample_non_english(df, n_samples=1000, lang_col='question_lang', random_state=42)
print(f'Sampled {len(non_english_sampled)} non-English rows for stratum (target 1000).')
if not non_english_sampled.empty and 'question_lang' in non_english_sampled.columns:
    print('Distribution by language (top):')
    print(non_english_sampled['question_lang'].value_counts().head(20))

out_path = Path('non_english_stratum_sample_langid_1000.jsonl')
non_english_sampled.to_json(out_path, orient='records', lines=True, force_ascii=False)
print('Wrote non-english stratum sample to', out_path)
display(non_english_sampled.head(10))

Sampled 1000 non-English rows for stratum (target 1000).
Distribution by language (top):
question_lang
en    965
de      8
es      7
fr      5
it      4
pl      2
sv      1
da      1
nl      1
id      1
lt      1
mt      1
et      1
fi      1
sl      1
Name: count, dtype: int64
Wrote non-english stratum sample to non_english_stratum_sample_langid_1000.jsonl


Unnamed: 0,category,air_date,question,value,answer,round,show_number,q_question_has_digits,q_question_has_ordinals,q_question_has_year,...,question_lang,question_conf,question_non_english,answer_lang,answer_conf,answer_non_english,is_non_english,question_has_unusual_proper_noun,answer_has_unusual_proper_noun,is_unusual_proper_noun
0,OLYMPIC ODDITIES,2011-02-14,'Milorad Cavic almost upset this man's perfect...,$200,Michael Phelps,Jeopardy!,6086,True,False,True,...,en,-173.943948,False,de,3.874381,True,True,True,False,True
1,PROVERBS,1998-06-15,"'It ""seldom knocks twice"", so make the most of...",$300,Opportunity,Jeopardy!,3191,False,False,False,...,en,-61.595225,False,de,1.967645,True,True,False,False,False
2,A GOOD LAUGH,1998-07-02,"'Amused actor seen here in 1947's ""Kiss of Dea...",$800,Richard Widmark,Double Jeopardy!,3204,True,False,True,...,en,-87.22795,False,de,3.874381,True,True,True,True,True
3,THE OLYMPIC GAMES,2002-12-03,"'The Olympic motto is ""Citius, Altius, Fortius...","$1,200",Stronger,Double Jeopardy!,4202,False,False,False,...,en,-115.005118,False,da,1.301304,True,True,True,True,True
4,HOOP OF THE DAY,2004-10-19,"'At 7'6"", this center from China would be a bi...",$400,Yao Ming,Double Jeopardy!,4627,True,False,False,...,en,-235.160194,False,pt,0.85246,True,True,False,False,False
5,FOOD,1996-09-05,"'Originally from Naples, it's an Italian turno...",$1000,a calzone,Double Jeopardy!,2759,False,False,False,...,en,-349.751002,False,es,1.14003,True,True,False,False,False
6,RODNEY DANGERFIELD,2005-04-19,"'In a rare dramatic turn, Rodney starred as an...",$1200,Natural Born Killers,Double Jeopardy!,4757,False,False,False,...,en,-167.740351,False,fr,1.887248,True,True,False,False,False
7,HITCHCOCK'S BLONDES,1992-11-16,"'Glamour girl whose femme fatale role in ""Vert...",$400,Kim Novak,Jeopardy!,1886,False,False,False,...,en,-96.198303,False,de,2.070063,True,True,True,False,True
8,TV THE KIDS LOVE,1999-03-05,'David Boreanaz plays this sensitive and torme...,$400,Angel,Jeopardy!,3345,False,False,False,...,en,-244.259842,False,de,1.672035,True,True,True,False,True
9,GEOLOGY,1997-04-16,'A moraine is the rocky material left behind b...,$800,a glacier,Double Jeopardy!,2918,False,False,False,...,en,-122.40207,False,es,0.841662,True,True,False,False,False


In [None]:
print(non_english_sampled)

### Unusual Proper Nouns

In [17]:
# Detect unusual proper nouns (rare named entities) in question and answer columns
import re
from collections import Counter
from tqdm import tqdm

# Regex to capture capitalized words like 'London', 'Einstein' (not acronyms)
proper_re = re.compile(r"\b([A-Z][a-z]+)\b")

# Small ignore lists for common capitalized words
COMMON_IGNORE = {
    # months
    'January','February','March','April','May','June','July','August','September','October','November','December',
    # weekdays
    'Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday',
    # common proper-like words
    'The','A','An','I','Mr','Mrs','Ms','Dr','St'
}

def has_diacritics(text: str) -> bool:
    return any(ord(c) > 127 for c in text) if text else False

# Prepare safe text series
q_series = df['question'].fillna('').astype(str) if 'question' in df.columns else pd.Series([''] * len(df))
a_series = df['answer'].fillna('').astype(str) if 'answer' in df.columns else pd.Series([''] * len(df))

# First pass: count frequencies across entire corpus (both Q and A)
counter = Counter()
print('Counting capitalized tokens across corpus...')
for text in tqdm(pd.concat([q_series, a_series]), total=len(q_series)+len(a_series)):
    for tok in proper_re.findall(text):
        # ignore if in common ignore list or looks like acronym (all upper) - pattern won't match acronyms but keep check
        if tok in COMMON_IGNORE:
            continue
        if tok.isupper():
            continue
        counter[tok] += 1

print('Unique capitalized tokens found:', len(counter))

# Define rarity threshold (<= threshold considered unusual). Tune as needed.
RARITY_THRESHOLD = 5

# Helper to check if text contains any unusual proper nouns
def contains_unusual_proper(text: str, counter: Counter, threshold: int) -> bool:
    if not text:
        return False
    # If any token has diacritics, flag immediately
    if has_diacritics(text):
        return True
    for tok in proper_re.findall(text):
        if tok in COMMON_IGNORE:
            continue
        if tok.isupper():
            continue
        freq = counter.get(tok, 0)
        if freq <= threshold:
            return True
    return False

# Apply per-row with tqdm; collect boolean lists then assign columns (avoids per-row df.setitem overhead)
print('Scanning rows to mark unusual proper nouns in question...')
q_flags = []
for text in tqdm(q_series, total=len(q_series), desc='question scan'):
    q_flags.append(contains_unusual_proper(text, counter, RARITY_THRESHOLD))

print('Scanning rows to mark unusual proper nouns in answer...')
a_flags = []
for text in tqdm(a_series, total=len(a_series), desc='answer scan'):
    a_flags.append(contains_unusual_proper(text, counter, RARITY_THRESHOLD))

# Assign boolean columns
df['question_has_unusual_proper_noun'] = q_flags
df['answer_has_unusual_proper_noun'] = a_flags
df['is_unusual_proper_noun'] = df['question_has_unusual_proper_noun'] | df['answer_has_unusual_proper_noun']

# Summaries and examples
total_q = int(df['question_has_unusual_proper_noun'].sum())
total_a = int(df['answer_has_unusual_proper_noun'].sum())
total_any = int(df['is_unusual_proper_noun'].sum())
print(f'Rows with unusual proper noun in question: {total_q}')
print(f'Rows with unusual proper noun in answer:   {total_a}')
print(f'Rows with any unusual proper noun:         {total_any}')

print('\nTop 30 rare tokens (count <=', RARITY_THRESHOLD, '):')
rare_tokens = [tok for tok, cnt in counter.items() if cnt <= RARITY_THRESHOLD]
rare_sample = sorted(rare_tokens, key=lambda t: counter[t])[:30]
print(rare_sample)

print('\nExample rows flagged (first 10):')
if total_any > 0:
    display(df[df['is_unusual_proper_noun']].head(10)[['question','answer','question_has_unusual_proper_noun','answer_has_unusual_proper_noun']])
else:
    print('No rows flagged as unusual proper nouns.')

Counting capitalized tokens across corpus...


100%|██████████| 390830/390830 [00:03<00:00, 127078.72it/s]


Unique capitalized tokens found: 55801
Scanning rows to mark unusual proper nouns in question...


question scan: 100%|██████████| 195415/195415 [00:06<00:00, 31918.68it/s]


Scanning rows to mark unusual proper nouns in answer...


answer scan: 100%|██████████| 195415/195415 [00:01<00:00, 129604.61it/s]


Rows with unusual proper noun in question: 45506
Rows with unusual proper noun in answer:   16340
Rows with any unusual proper noun:         58008

Top 30 rare tokens (count <= 5 ):
['Signer', 'Cholas', 'Lettered', 'Cleats', 'Ifrane', 'Kirschner', 'Tedder', 'Chula', 'Piha', 'Corollas', 'Ashlyn', 'Seferovic', 'Heene', 'Mullaney', 'Untamed', 'Heifer', 'Miquelon', 'Hamnett', 'Yamasaki', 'Pereira', 'Gyo', 'Obata', 'Skidmore', 'Ronaldo', 'Nazario', 'Samsonov', 'Sliwa', 'Zionists', 'Lamu', 'Dye']

Example rows flagged (first 10):


Unnamed: 0,question,answer,question_has_unusual_proper_noun,answer_has_unusual_proper_noun
4,"'Signer of the Dec. of Indep., framer of the C...",John Adams,True,False
9,'This housewares store was named for the packa...,Crate & Barrel,False,True
12,'In 1000 Rajaraja I of the Cholas battled to t...,Ceylon (or Sri Lanka),True,False
13,"'No. 1: Lettered in hoops, football and lacros...",Jim Brown,True,False
15,"'This company's Accutron watch, introduced in ...",Bulova,True,True
16,"'Outlaw: ""Murdered by a traitor and a coward w...",Jesse James,True,False
19,'No. 10: FB/LB for Columbia U. in the 1920s; M...,(Lou) Gehrig,True,False
20,'Africa's lowest temperature was 11 degrees be...,Morocco,True,False
21,'Edward Teller and this man partnered in 1898 ...,(Paul) Bonwit,False,True
26,"'The Kirschner brothers, Don and Bill, named t...",K2,True,False


In [18]:
# Stratified sampling using the most usual proper noun stratum (1000 examples)
import math
def make_proper_stratum(row):
    """Create a readable label for unusual proper noun cause(s) for a row."""
    parts = []
    if row.get('question_has_unusual_proper_noun'):
        parts.append('question')
    if row.get('answer_has_unusual_proper_noun'):
        parts.append('answer')
    return '+'.join(parts)
if 'is_unusual_proper_noun' not in df.columns:
    raise KeyError("is_unusual_proper_noun column not found. Run unusual proper noun detection step first.")
proper_candidates = df[df['is_unusual_proper_noun'] == True].copy()
total_candidates = len(proper_candidates)
print(f'Total unusual proper noun candidates: {total_candidates}')
if total_candidates == 0:
    print('No unusual proper noun candidates found — cannot sample.')
else:
    proper_candidates['proper_stratum'] = proper_candidates.apply(make_proper_stratum, axis=1)
    counts = proper_candidates['proper_stratum'].value_counts().sort_values(ascending=False)
    print('\nUnusual proper noun stratum counts (top):')
    print(counts.head(10))
    n_target = 1000
    props = (counts / counts.sum())
    initial_alloc = (props * n_target).astype(int)
    alloc = initial_alloc.to_dict()
    allocated = sum(alloc.values())
    remainder = n_target - allocated
    if remainder > 0:
        fractional = (props * n_target) - initial_alloc
        for label in fractional.sort_values(ascending=False).index[:remainder]:
            alloc[label] += 1
    for label in list(alloc.keys()):
        avail = int(counts.get(label, 0))
        if alloc[label] > avail:
            alloc[label] = avail
    allocated_after_cap = sum(alloc.values())
    remaining_needed = n_target - allocated_after_cap
    samples = []
    rng = 42
    for label, n_alloc in alloc.items():
        if n_alloc <= 0:
            continue
        pool = proper_candidates[proper_candidates['proper_stratum'] == label]
        take = min(n_alloc, len(pool))
        samples.append(pool.sample(n=take, random_state=rng))
    proper_df = pd.concat(samples) if samples else proper_candidates.sample(n=0)
    if remaining_needed > 0:
        remaining_pool = proper_candidates.drop(proper_df.index, errors='ignore')
        if len(remaining_pool) >= remaining_needed:
            proper_df = pd.concat([proper_df, remaining_pool.sample(n=remaining_needed, random_state=rng)])
        else:
            proper_df = pd.concat([proper_df, remaining_pool])
    if len(proper_df) > n_target:
        proper_df = proper_df.sample(n=n_target, random_state=rng)
    print(f'Final unusual proper noun sampled count: {len(proper_df)} (target {n_target})')
    print('\nSampled unusual proper noun stratum distribution:')
    print(proper_df['proper_stratum'].value_counts())
    out_path = Path('unusual_proper_noun_stratum_sample_1000.jsonl')
    proper_df.to_json(out_path, orient='records', lines=True, force_ascii=False)
    print('Wrote sample to', out_path)
    display(proper_df.head(10))

Total unusual proper noun candidates: 58008

Unusual proper noun stratum counts (top):
proper_stratum
question           41668
answer             12502
question+answer     3838
Name: count, dtype: int64
Final unusual proper noun sampled count: 1000 (target 1000)

Sampled unusual proper noun stratum distribution:
proper_stratum
question           718
answer             216
question+answer     66
Name: count, dtype: int64
Wrote sample to unusual_proper_noun_stratum_sample_1000.jsonl

Unusual proper noun stratum counts (top):
proper_stratum
question           41668
answer             12502
question+answer     3838
Name: count, dtype: int64
Final unusual proper noun sampled count: 1000 (target 1000)

Sampled unusual proper noun stratum distribution:
proper_stratum
question           718
answer             216
question+answer     66
Name: count, dtype: int64
Wrote sample to unusual_proper_noun_stratum_sample_1000.jsonl


Unnamed: 0,category,air_date,question,value,answer,round,show_number,q_question_has_digits,q_question_has_ordinals,q_question_has_year,...,question_conf,question_non_english,answer_lang,answer_conf,answer_non_english,is_non_english,question_has_unusual_proper_noun,answer_has_unusual_proper_noun,is_unusual_proper_noun,proper_stratum
210611,A HAPPY TUNE,2010-10-19,'One of 1988's simple pleasures was this no. 1...,$200,"""Don\'t Worry, Be Happy""",Jeopardy!,6002,True,False,True,...,-190.249252,False,en,9.06184,False,False,True,False,True,question
49849,AUTHORS' NATIVE LANDS,2001-02-15,"'Writer of ""Ficciones"" Jorge Luis Borges'",$1000,Argentina,Double Jeopardy!,3794,False,False,False,...,-69.465963,False,en,9.06184,False,False,True,False,True,question
70349,THE OLD TESTAMENT,2000-12-08,'Job lived in Uz; Abraham lived in this city o...,$800,Ur,Double Jeopardy!,3745,False,False,False,...,-188.387556,False,en,9.06184,False,False,True,False,True,question
47467,SEWING,1995-07-05,'Thread made of this fiber is often mercerized...,$200,cotton,Jeopardy!,2513,False,False,False,...,-234.51937,False,en,9.06184,False,False,True,False,True,question
83836,HISTORY ART,2002-12-20,"'Delacroix, like Byron, sided with this people...",$1600,Greeks,Double Jeopardy!,4215,False,False,False,...,-233.210216,False,en,9.06184,False,False,True,False,True,question
66723,"""MILL""S",2009-06-15,"'Yeehaw! She choreographed the ballet ""Rodeo""'",$1200,Agnes de Mille,Double Jeopardy!,5716,False,False,False,...,-58.209929,False,fr,-28.579963,False,False,True,False,True,question
21497,DRIVING THE GREEN,2004-11-19,'The Altra EV from this company that also make...,$600,Nissan,Jeopardy!,4650,True,False,False,...,-217.041484,False,fr,0.394857,False,False,True,False,True,question
203237,"OH, NO! IT'S OPERA!",2002-05-02,"'This composer of ""Cavalleria Rusticana"" wrote...",$1200,Mascagni,Double Jeopardy!,4079,False,False,False,...,-238.281725,False,en,9.06184,False,False,True,False,True,question
177706,FIRST-NAME BASIS,1998-03-13,'Cecil B.'s niece could tell you that Inez is ...,$400,Agnes,Double Jeopardy!,3125,False,False,False,...,-222.59706,False,en,9.06184,False,False,True,False,True,question
76158,THE ROARING '20s,2000-11-22,"'Seen here in 1921, 16-year-old Margaret Gorma...",$100,Miss America,Jeopardy!,3733,True,False,True,...,-164.557791,False,en,9.06184,False,False,True,False,True,question


In [None]:
print(proper_df)