In [15]:
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.tokenize import MWETokenizer

In [12]:
# Sentences used to exercise the NLTK pipeline: some mention a country or
# nationality, some mention none.
samples = [
    "Some random sentence about a country",
    "I am talking about Portugal",
    "Portuguese people always arrive late",
    "John doesn't think Spain is good for vacation",
    "This was easier than I thought...",
    "Now a Saudi Arabia comment",
    "With a Saudi Arabian nationality notation",
]

In [19]:
# Build the multi-word-expression tokenizer once: it does not depend on the
# sentence, so there is no reason to re-create it on every iteration.
mwe_tokenizer = MWETokenizer([('Saudi', 'Arabia'), ('Saudi', 'Arabian')])

for sentence in samples:
    # Tokenize -> POS-tag -> NE-chunk, then flatten the resulting tree into
    # (word, POS, IOB) triples.
    ne_tree = ne_chunk(pos_tag(word_tokenize(sentence)))
    iob_tagged = tree2conlltags(ne_tree)
    # NOTE(review): the tokenizer receives triples here, not plain word
    # strings, and the recorded output shows 'Saudi' / 'Arabia' still coming
    # out as separate items. If merging was intended, the tokenizer probably
    # needs to run on the word tokens before tagging -- confirm the intent.
    print(mwe_tokenizer.tokenize(iob_tagged))

[('Some', 'DT', 'O'), ('random', 'JJ', 'O'), ('sentence', 'NN', 'O'), ('about', 'IN', 'O'), ('a', 'DT', 'O'), ('country', 'NN', 'O')]
[('I', 'PRP', 'O'), ('am', 'VBP', 'O'), ('talking', 'VBG', 'O'), ('about', 'IN', 'O'), ('Portugal', 'NNP', 'B-GPE')]
[('Portuguese', 'JJ', 'B-GPE'), ('people', 'NNS', 'O'), ('always', 'RB', 'O'), ('arrive', 'VBP', 'O'), ('late', 'JJ', 'O')]
[('John', 'NNP', 'B-PERSON'), ('does', 'VBZ', 'O'), ("n't", 'RB', 'O'), ('think', 'VB', 'O'), ('Spain', 'NNP', 'B-GPE'), ('is', 'VBZ', 'O'), ('good', 'JJ', 'O'), ('for', 'IN', 'O'), ('vacation', 'NN', 'O')]
[('This', 'DT', 'O'), ('was', 'VBD', 'O'), ('easier', 'JJR', 'O'), ('than', 'IN', 'O'), ('I', 'PRP', 'O'), ('thought', 'VBD', 'O'), ('...', ':', 'O')]
[('Now', 'RB', 'O'), ('a', 'DT', 'O'), ('Saudi', 'NNP', 'B-GPE'), ('Arabia', 'NNP', 'I-GPE'), ('comment', 'NN', 'O')]
[('With', 'IN', 'O'), ('a', 'DT', 'O'), ('Saudi', 'NNP', 'B-GPE'), ('Arabian', 'NNP', 'B-PERSON'), ('nationality', 'NN', 'O'), ('notation', 'NN', 'O'

In [8]:
# Refer to 'https://spacy.io/usage/linguistic-features' for more info
import spacy

In [9]:
# Load the English pipeline by its full package name. The 'en' shortcut link
# no longer exists in spaCy v3, so spacy.load('en') fails there.
# NOTE(review): assumes 'en' pointed at en_core_web_sm (the default target of
# `spacy download en`) -- adjust if a different model was linked.
nlp = spacy.load('en_core_web_sm')

In [74]:
# Sentences for the spaCy experiments. This rebinds the earlier `samples`
# list; the last sentence is longer than in the NLTK version.
samples = [
    "Some random sentence about a country",
    "I am talking about Portugal",
    "Portuguese people always arrive late",
    "John doesn't think Spain is good for vacation",
    "This was easier than I thought...",
    "Now a Saudi Arabia comment",
    "With a Saudi Arabian nationality notation, along with religious catholic references",
]

In [17]:
def print_diff(text):
    """Print spaCy tokens and entities for `text` under four casings.

    The text is run through the module-level `nlp` pipeline as given, then
    title-cased, lower-cased and upper-cased, so the effect of casing on
    part-of-speech tags and recognised entities can be compared.

    Args:
        text: The string to analyse.

    Returns:
        None. The report is written to stdout.
    """
    # (section label, name of the str method producing the variant);
    # None means "use the text unchanged".
    casings = (
        ('Raw', None),
        ('Title', 'title'),
        ('Lower', 'lower'),
        ('Upper', 'upper'),
    )

    for label, method_name in casings:
        variant = text if method_name is None else getattr(text, method_name)()
        parsed = nlp(variant)

        token_pairs = [(tok.text, tok.pos_) for tok in parsed]
        entity_pairs = [(span.text, span.label_) for span in parsed.ents]

        print('\n[{}]'.format(label))
        print('Text: {}'.format(variant))
        print('Tokens: {}'.format(token_pairs))
        print('Entities: {}'.format(entity_pairs))
        
# Demo: compare how a single word is tagged across the four casings.
print_diff('Government')


[Raw]
Text: Government
Tokens: [('Government', 'NOUN')]
Entities: []

[Title]
Text: Government
Tokens: [('Government', 'NOUN')]
Entities: []

[Lower]
Text: government
Tokens: [('government', 'NOUN')]
Entities: []

[Upper]
Text: GOVERNMENT
Tokens: [('GOVERNMENT', 'PROPN')]
Entities: []


In [53]:
def get_countries_from_content(text):
    """Extract country-related named entities from `text`.

    The text is analysed twice with the module-level `nlp` pipeline: once as
    given and once title-cased. The title-cased pass is there so the NER also
    picks up entities written with unusual casing. Results from both passes
    are merged; an entity found in both is reported once, keeping the
    spelling from the raw text.

    Args:
        text: The string to analyse.

    Returns:
        A list of `(entity_text, entity_label)` tuples whose label is one of
        the target labels, in order of first appearance (raw pass first).
    """
    # Define interesting entities to keep. In spaCy's English label scheme
    # GPE covers countries/cities/states and NORP covers nationalities and
    # religious or political groups.
    target_entities = ['GPE', 'NORP']

    # Raw text first so that its original spelling wins on duplicates.
    documents = (nlp(text), nlp(text.title()))

    content_entities = []
    seen = set()
    for document in documents:
        for entity in document.ents:
            if entity.label_ not in target_entities:
                continue
            # Compare case-insensitively: the two passes differ only in casing.
            key = (entity.text.lower(), entity.label_)
            if key in seen:
                continue
            seen.add(key)
            content_entities.append((entity.text, entity.label_))
    return content_entities