In [None]:
import re

def contain_numbers(phrase):
    """Report whether ``phrase`` contains at least one digit.

    Args:
        phrase: Text to scan.

    Returns:
        True if a digit occurs anywhere in ``phrase``, otherwise False.
    """
    digit_match = re.search(r'\d', phrase)
    return digit_match is not None

# Example record; its question text is used to exercise contain_numbers.
sample = {
    'category': 'HISTORY',
    'air_date': '2004-12-31',
    'question': "'For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory'",
    'value': '$200',
    'answer': 'Copernicus',
    'round': 'Jeopardy!',
    'show_number': '4680',
}
print(contain_numbers(sample['question']))

In [None]:
import nltk
from nltk.corpus import words, names
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.tree import Tree

# Download the NLTK resources requested by this cell, in the original order.
for resource_name in (
    'words',
    'names',
    'punkt',
    'maxent_ne_chunker',
    'averaged_perceptron_tagger',
):
    nltk.download(resource_name)

# Holds the lowercased vocabulary once it has been built, so the (large)
# corpora are only read and lowercased on the first call.
_ENGLISH_VOCAB_CACHE = {}


def _english_vocabulary():
    """Return the lowercased union of the NLTK ``words`` and ``names`` corpora."""
    if 'vocab' not in _ENGLISH_VOCAB_CACHE:
        _ENGLISH_VOCAB_CACHE['vocab'] = frozenset(
            entry.lower()
            for corpus in (words, names)
            for entry in corpus.words()
        )
    return _ENGLISH_VOCAB_CACHE['vocab']


def contain_non_english(phrase):
    """Report whether ``phrase`` contains an alphabetic token that is not recognised as English.

    A token counts as recognised when its lowercase form is in the NLTK
    ``words`` or ``names`` corpus, or when it is part of a named entity found
    by ``ne_chunk``. Tokens that are not purely alphabetic (numbers,
    punctuation) are ignored.

    Args:
        phrase: Text to inspect. Empty or ``None`` input yields False.

    Returns:
        True if at least one alphabetic token is unrecognised, otherwise False.
    """
    if not phrase:
        return False

    # Corpus entries are lowercased before comparison: the lookups below use
    # token.lower(), which can never match a capitalised corpus entry.
    vocabulary = _english_vocabulary()

    tokens = word_tokenize(phrase)

    # Collect every individual token that belongs to a named entity, so that
    # multi-word entities exempt each of their words, not only the joined text.
    entity_tokens = set()
    for chunk in ne_chunk(pos_tag(tokens)):
        if isinstance(chunk, Tree):
            entity_tokens.update(leaf[0].lower() for leaf in chunk)

    # NOTE(review): the `words` corpus appears to list mostly base forms, so
    # inflected words (plurals, -ing forms) may still be flagged — confirm
    # whether lemmatisation is wanted before relying on this filter.
    return any(
        token.isalpha()
        and token.lower() not in vocabulary
        and token.lower() not in entity_tokens
        for token in tokens
    )


# Example record; its question text is used to exercise contain_non_english.
sample = {
    'category': 'HISTORY',
    'air_date': '2004-12-31',
    'question': "For the last 8 years of his life, was under house arrest for espousing this man's theory'",
    'value': '$200',
    'answer': 'Copernicus',
    'round': 'Jeopardy!',
    'show_number': '4680',
}
print(contain_non_english(sample['question']))


In [None]:
import spacy
import requests

# Load the spaCy "en_core_web_md" pipeline once at cell level;
# contain_unusual_proper_nouns calls this shared `nlp` object for every phrase.
nlp = spacy.load("en_core_web_md")

def _wikidata_max_sitelinks(entity_ids, timeout):
    """Return the largest sitelink count among ``entity_ids`` (0 if none are found)."""
    if not entity_ids:
        return 0
    response = requests.get(
        "https://www.wikidata.org/w/api.php",
        params={
            "action": "wbgetentities",
            "format": "json",
            "ids": "|".join(entity_ids),
            "props": "sitelinks",
        },
        timeout=timeout,
    )
    response.raise_for_status()
    entities = response.json().get("entities") or {}
    return max(
        (len(entity.get("sitelinks") or {}) for entity in entities.values()),
        default=0,
    )


def query_wikidata_rarity(noun_tobecheck, timeout=10, sitelink_threshold=50):
    """Decide whether a proper noun looks rare according to Wikidata.

    The noun is searched with ``wbsearchentities``. It is treated as common
    (not rare) when any match has a description mentioning a person, city,
    state, country or organization, or when any match has more than
    ``sitelink_threshold`` sitelinks.

    Args:
        noun_tobecheck: Text of the proper noun to look up.
        timeout: Seconds to wait for each HTTP request before giving up.
        sitelink_threshold: A match with more sitelinks than this is
            considered well known.

    Returns:
        True if the noun is considered rare (including when the search finds
        nothing or the noun is empty), False otherwise.

    Raises:
        requests.RequestException: If a request times out, fails, or returns
            an HTTP error status.
    """
    # An empty search term cannot match anything; skip the network round trip.
    if not noun_tobecheck or not noun_tobecheck.strip():
        return True

    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbsearchentities",
        "language": "en",
        "format": "json",
        "search": noun_tobecheck,
    }
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, params=params, timeout=timeout)
    # Surface HTTP failures instead of misreading an error page as "no match".
    response.raise_for_status()
    results = response.json().get("search") or []

    if not results:
        return True

    common_keywords = ("person", "city", "state", "country", "organization")
    for result in results:
        description = (result.get("description") or "").lower()
        if any(keyword in description for keyword in common_keywords):
            return False

    # Search results do not include sitelinks, so the counts are fetched
    # separately; this only happens when no description already matched.
    entity_ids = [result["id"] for result in results if result.get("id")]
    return _wikidata_max_sitelinks(entity_ids, timeout) <= sitelink_threshold

def contain_unusual_proper_nouns(phrase):
    """Report whether ``phrase`` mentions a named entity that Wikidata rates as rare.

    The phrase is parsed with the shared spaCy pipeline ``nlp``. Every entity
    whose label is listed below is passed to ``query_wikidata_rarity``; the
    first entity reported as rare ends the search.

    Entity labels considered:
        PERSON - Person's names,
        GPE - Geographic locations,
        ORG - Organizations,
        LOC - Non-GPE locations,
        FAC - Facilities,
        EVENT - Named events,
        WORK_OF_ART - Titles of books, songs, movies, paintings,
        LAW - Named documents made into laws.

    Args:
        phrase: Text to inspect. Empty or ``None`` input yields False.

    Returns:
        True if at least one relevant entity is rated rare, otherwise False.
    """
    if not phrase:
        return False

    relevant_labels = {"PERSON", "GPE", "ORG", "LOC", "FAC", "EVENT", "WORK_OF_ART", "LAW"}
    # Each lookup is a network call, so an entity repeated within the phrase
    # is only queried once.
    already_checked = set()

    for ent in nlp(phrase).ents:
        if ent.label_ not in relevant_labels or ent.text in already_checked:
            continue
        already_checked.add(ent.text)
        if query_wikidata_rarity(ent.text):
            return True
    return False

# Example record; its question text is used to exercise contain_unusual_proper_nouns.
sample = {
    'category': 'HISTORY',
    'air_date': '2004-12-31',
    'question': "'For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory'",
    'value': '$200',
    'answer': 'Copernicus',
    'round': 'Jeopardy!',
    'show_number': '4680',
}

# Print the boolean itself: wrapping the call in braces built a one-element
# set, so the cell displayed "{False}" / "{True}" instead of the plain value.
print(contain_unusual_proper_nouns(sample['question']))


In [None]:


def filter_and_pick(data, filter_func, n=1000):
    """Collect the first ``n`` records whose question or answer passes ``filter_func``.

    Args:
        data: Iterable of records, each indexable by ``'question'`` and
            ``'answer'``.
        filter_func: Callable applied to the question text and, if that does
            not match, to the answer text; a truthy result selects the record.
        n: Maximum number of records to return. ``None`` means no limit;
            zero or a negative number returns an empty list.

    Returns:
        A list of the selected records, in their original order.
    """
    # Check the limit up front: testing it only after an append let n=0
    # return one record.
    if n is not None and n <= 0:
        return []

    picked = []
    for record in data:
        if filter_func(record['question']) or filter_func(record['answer']):
            picked.append(record)
            # The length only changes on append, so this is the one place the
            # limit can be reached.
            if n is not None and len(picked) >= n:
                break
    return picked

# Build one subset (at most 1000 records each) per filter.
# `data` is not defined in the cells shown here; presumably an earlier cell loads it — verify.
# The digit filter is named `contain_numbers`; the previous `contains_numbers` raised NameError.
subset_numbers = filter_and_pick(data, contain_numbers, n=1000)
subset_non_english = filter_and_pick(data, contain_non_english, n=1000)
subset_unusual_proper_nouns = filter_and_pick(data, contain_unusual_proper_nouns, n=1000)