In [1]:
import json

dataset_path = "../data/JEOPARDY_QUESTIONS1.json"

# Pin the encoding: without it open() falls back to the platform's locale
# encoding, which can fail or mis-decode the file on non-UTF-8 systems.
with open(dataset_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Sanity check once the file handle is closed: record count and first record.
print(f"Total: {len(data)}")
print(data[0])

Total: 216930
{'category': 'HISTORY', 'air_date': '2004-12-31', 'question': "'For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory'", 'value': '$200', 'answer': 'Copernicus', 'round': 'Jeopardy!', 'show_number': '4680'}


In [8]:
import re

def contain_numbers(phrase):
    """Tell whether ``phrase`` has at least one digit character.

    Args:
        phrase: Text to scan.

    Returns:
        True when the regex digit class matches anywhere in ``phrase``,
        False otherwise (including for an empty string).
    """
    first_digit = re.search(r'\d', phrase)
    return first_digit is not None

# One record, copied inline, used to smoke-test contain_numbers.
sample = {
    'category': 'HISTORY',
    'air_date': '2004-12-31',
    'question': "'For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory'",
    'value': '$200',
    'answer': 'Copernicus',
    'round': 'Jeopardy!',
    'show_number': '4680',
}
# The result is wrapped in a one-element set, so it prints with braces.
print({contain_numbers(sample['question'])})

{True}


In [11]:
from nltk.corpus import words
from nltk.tokenize import word_tokenize

import nltk
# Fetch the NLTK resources used in this cell: 'words' backs words.words();
# 'punkt_tab' is presumably required by word_tokenize -- TODO confirm.
for resource in ('words', 'punkt_tab'):
    nltk.download(resource)

# Set of corpus entries, used by contain_non_english for membership tests.
english_words = {*words.words()}

def contain_non_english(phrase):
    """Tell whether ``phrase`` has an alphabetic token missing from ``english_words``.

    The phrase is split with ``word_tokenize``; tokens that are not purely
    alphabetic (numbers, punctuation, mixed tokens) are ignored.

    A token counts as known when either its exact spelling or its lower-cased
    spelling is in ``english_words``. Checking both matters because the set is
    built from the corpus entries as-is: an entry stored only in capitalised
    form would never match a lower-cased token.

    Args:
        phrase: Text to check.

    Returns:
        True if at least one alphabetic token is unknown, False otherwise
        (including when there are no alphabetic tokens at all).
    """
    # any() stops at the first unknown token instead of building a full list.
    return any(
        token.isalpha()
        and token not in english_words
        and token.lower() not in english_words
        for token in word_tokenize(phrase)
    )

# One record, copied inline, used to smoke-test contain_non_english.
sample = {
    'category': 'HISTORY',
    'air_date': '2004-12-31',
    'question': "'For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory'",
    'value': '$200',
    'answer': 'Copernicus',
    'round': 'Jeopardy!',
    'show_number': '4680',
}
# The result is wrapped in a one-element set, so it prints with braces.
print({contain_non_english(sample['question'])})

{True}


[nltk_data] Downloading package words to /Users/erinc/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/erinc/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [28]:
import spacy
import requests

# Load the spaCy pipeline "en_core_web_md" once at cell level;
# contain_unusual_proper_nouns calls this object on each phrase to obtain
# its sentences and named entities.
nlp = spacy.load("en_core_web_md")

def query_wikidata_rarity(noun_tobecheck, timeout=10):
    """Decide whether a proper noun looks rare based on a Wikidata entity search.

    Sends ``noun_tobecheck`` to the Wikidata ``wbsearchentities`` endpoint and
    inspects the matches it returns.

    Args:
        noun_tobecheck: Text of the proper noun to search for.
        timeout: Seconds to wait for the HTTP request. Without a timeout a
            stalled connection would block the notebook indefinitely.
            Defaults to 10.

    Returns:
        True if the search yields no matches, or if no match has a description
        containing one of the common-entity keywords or a ``sitelinks`` value
        above 50. False as soon as one match meets either condition.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests`` when
            the request fails or times out, or when the API answers with an
            HTTP error status.
    """
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbsearchentities",
        "language": "en",
        "format": "json",
        "search": noun_tobecheck,
    }
    response = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors (e.g. rate limiting) rather than letting an
    # error payload with no "search" key be misread as "no matches => rare".
    response.raise_for_status()
    data = response.json()

    matches = data.get("search")
    if not matches:
        return True

    common_keywords = ("person", "city", "state", "country", "organization")
    for result in matches:
        # The description may be missing or null; treat both as empty text.
        description = (result.get("description") or "").lower()
        # Substring match: a keyword inside a longer word also counts.
        if any(keyword in description for keyword in common_keywords):
            return False
        # NOTE(review): wbsearchentities results do not appear to carry a
        # "sitelinks" field, in which case this check never fires -- confirm
        # against the API and fetch sitelink counts separately if needed.
        if result.get("sitelinks", 0) > 50:
            return False
    return True

def contain_unusual_proper_nouns(phrase):
    """Tell whether ``phrase`` mentions a named entity that Wikidata rates as rare.

    Runs the ``nlp`` pipeline on ``phrase`` and, for every entity whose label
    is listed below, asks ``query_wikidata_rarity`` about the entity text.
    Stops at the first entity reported as rare.

    Entity labels considered:
        PERSON - Person's names
        GPE - Geographic locations
        ORG - Organizations
        LOC - Non-GPE locations
        FAC - Facilities
        EVENT - Named events
        WORK_OF_ART - Titles of books, songs, movies, paintings
        LAW - Named documents made into laws

    Args:
        phrase: Text to analyse.

    Returns:
        True if at least one qualifying entity is reported as rare, False
        otherwise (including when the phrase has no qualifying entities).
    """
    proper_noun_labels = {"PERSON", "GPE", "ORG", "LOC", "FAC", "EVENT", "WORK_OF_ART", "LAW"}
    doc = nlp(phrase)

    # The lookup depends only on the entity text, so each distinct text is
    # queried at most once per phrase (every query is a network round trip).
    already_checked = set()
    for sent in doc.sents:
        for ent in sent.ents:
            if ent.label_ not in proper_noun_labels:
                continue
            if ent.text in already_checked:
                continue
            already_checked.add(ent.text)
            if query_wikidata_rarity(ent.text):
                return True
    return False

# One record, copied inline, used to smoke-test contain_unusual_proper_nouns.
sample = {
    'category': 'HISTORY',
    'air_date': '2004-12-31',
    'question': "'For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory'",
    'value': '$200',
    'answer': 'Copernicus',
    'round': 'Jeopardy!',
    'show_number': '4680',
}

# The result is wrapped in a one-element set, so it prints with braces.
print({contain_unusual_proper_nouns(sample['question'])})

{False}
