In [10]:
import nltk
from nltk.corpus import words, names, wordnet
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.tree import Tree

nltk.download('words')
nltk.download('names')
nltk.download('punkt')
nltk.download('maxent_ne_chunker_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')

def contain_non_english(phrase):
    english_words = set(words.words()) | set(wordnet.words())
    english_names = set(names.words())
    
    tokens = word_tokenize(phrase)
    
    tagged_tokens = pos_tag(tokens)
    named_entities = set()
    
    for chunk in ne_chunk(tagged_tokens):
        if isinstance(chunk, Tree): 
            named_entity = " ".join(c[0] for c in chunk)
            named_entities.add(named_entity.lower())

    non_english = [
        word for word in tokens 
        if word.isalpha() 
        and word.lower() not in english_words 
        and word.lower() not in english_names 
        and word.lower() not in named_entities
    ]
    print("Tokens:", tokens)
    print("Named Entities:", named_entities)
    print("Non-English Words:", non_english)
    return len(non_english) > 0


sample = {'category': 'HISTORY', 'air_date': '2004-12-31', 'question': "For the last 8 years of his life, was under house arrest for espousing this man's theory'", 'value': '$200', 'answer': 'Copernicus', 'round': 'Jeopardy!', 'show_number': '4680'}
print (contain_non_english(sample['question']))


[nltk_data] Downloading package words to /Users/erinc/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package names to /Users/erinc/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package punkt to /Users/erinc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /Users/erinc/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/erinc/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/erinc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Tokens: ['For', 'the', 'last', '8', 'years', 'of', 'his', 'life', ',', 'was', 'under', 'house', 'arrest', 'for', 'espousing', 'this', 'man', "'s", 'theory', "'"]
Named Entities: set()
Non-English Words: ['espousing']
True


In [17]:
from py3langid.langid import LanguageIdentifier, MODEL_FILE


def contain_non_english(phrase):
    identifier = LanguageIdentifier.from_pickled_model(MODEL_FILE, norm_probs=True)
    tokens = phrase.split()

    non_english = [
        word for word in tokens
        if identifier.classify(word)[0] != 'en'
    ]

    print("Tokens:", tokens)
    print("Non-English Words:", non_english)

    return len(non_english) > 0
    
sample = {'category': 'HISTORY', 'air_date': '2004-12-31', 'question': "For the last 8 years of his life, was under house arrest for espousing this man's theory'", 'value': '$200', 'answer': 'Copernicus', 'round': 'Jeopardy!', 'show_number': '4680'}
print (contain_non_english(sample['question']))

Tokens: ['For', 'the', 'last', '8', 'years', 'of', 'his', 'life,', 'was', 'under', 'house', 'arrest', 'for', 'espousing', 'this', "man's", "theory'"]
Non-English Words: ['espousing']
True
