In [1]:
# Refer to 'https://spacy.io/usage/linguistic-features' for more info
import spacy
import re
import pandas as pd

# Load English language for Spacy analysis
nlp = spacy.load('en')

# Compile alpha regex expression
alpha_regex = re.compile('[^a-zA-Z]')

# Paths for the matching countries, nationalities, religions and currencies
parsed_country_nationality_file = 'data/parsed/parsed_country_nationality.csv'
parsed_currency_country_file = 'data/parsed/parsed_currency_country.csv'
parsed_country_religion_file = 'data/parsed/country_religion_files/parsed_country_religion.csv'
parsed_country_cities_file = 'data/parsed/parsed_country_cities.csv'

# Load the necessary datasets
country_nationality_df = pd.read_csv(parsed_country_nationality_file, encoding='utf-8', compression='gzip', index_col=False)
currency_country_df = pd.read_csv(parsed_currency_country_file, encoding='utf-8', compression='gzip', index_col=False)
country_religion_df = pd.read_csv(parsed_country_religion_file, encoding='utf-8', compression='gzip', index_col=False)
country_cities_df = pd.read_csv(parsed_country_cities_file, encoding='utf-8', compression='gzip', index_col=False)

In [2]:
currency_country_df.head()

Unnamed: 0,ID,Countries
0,AED,['AE']
1,AFN,['AF']
2,ALL,['AL']
3,AMD,['AM']
4,ANG,"['CW', 'SX']"


In [3]:
'Portugal' in country_nationality_df['Common Name'].astype(str)

False

In [4]:
samples = [
    'Some random sentence about a country',
    'I am talking about Portugal',
    'Portuguese people always arrive late',
    'John doesn\'t think Spain is good for vacation',
    'This was easier than I thought...',
    'Now a Saudi Arabia comment',
    'With a Saudi Arabian nationality notation, along with religious catholic references'
]

In [5]:
def print_diff(text):

    def print_(title, text):
        doc = nlp(text)
        print('\n[{}]'.format(title))
        print('Text: {}'.format(text))
        print('Tokens: {}'.format([(token.text, token.pos_) for token in doc]))
        print('Entities: {}'.format([(ent.text, ent.label_) for ent in doc.ents]))
    
    print_('Raw', text)
    print_('Title', text.title())
    print_('Lower', text.lower())
    print_('Upper', text.upper())
        
print_diff('Government')


[Raw]
Text: Government
Tokens: [('Government', 'NOUN')]
Entities: []

[Title]
Text: Government
Tokens: [('Government', 'NOUN')]
Entities: []

[Lower]
Text: government
Tokens: [('government', 'NOUN')]
Entities: []

[Upper]
Text: GOVERNMENT
Tokens: [('GOVERNMENT', 'PROPN')]
Entities: []


In [8]:
def get_result_country_probability_dict(result_row, result_label):
    
    country_probability_dict = {}
    
    # Interpret country results
    if result_label == 'Country':
        country_code = result_row['ID']
        country_probability_dict[country_code] = 1.0
    
    # Interpret nationality results
    elif result_label == 'Nationality':
        country_code = result_row['ID']
        country_probability_dict[country_code] = 1.0
        
    # Interpret religion results
    elif result_label == 'Religion':
        for country_code, country_prob in result_row.drop(['Religion', 'Affiliation']).iteritems():
            if country_prob is not float('NaN'):
                country_probability_dict[country_code] = float(country_prob)
    
    # Interpret currency results
    elif result_label == 'Currency':
        for country_code in result_row['Countries']:
            country_probability_dict[country_code] = 1 / len(result_row['Countries'])
    
    return country_probability_dict

def get_matching_row(text, df, col_labels):
    upper_text = alpha_regex.sub('', text).upper()
    for col_label in col_labels:
        upper_series = df[col_label].astype(str).apply(lambda x: x.upper())
        matching_df = df[upper_series.str.contains(upper_text)]
        if (len(matching_df) > 0):
            return matching_df.iloc[0]
    return None

def get_likely_results(label, text):
    
    result = None
    
    # Check if it is a country
    if label == 'GPE':
        result = get_matching_row(text, country_nationality_df, ['ID', 'Official Name', 'Common Name'])
        if result is not None:
            print('GPE -> Country!')
            return get_result_country_probability_dict(result, 'Country')
    
    # Check if it is a nationality/religion
    elif label == 'NORP':
        # Nationality check
        result = get_matching_row(text, country_nationality_df, ['Nationality'])
        if result is not None:
            print('NORP -> Nationality!')
            return get_result_country_probability_dict(result, 'Nationality')
        
        # Religion check
        result = get_matching_row(text, country_religion_df, ['Religion', 'Affiliation'])
        if result is not None:
            print('NORP -> Religion!')
            return get_result_country_probability_dict(result, 'Religion')
    
    # Check if it is a known currency
    elif label == 'MONEY':
        result = get_matching_row(text, currency_country_df, ['ID'])
        if result is not None:
            print('MONEY -> Currency')
            return get_result_country_probability_dict(result, 'Currency')
            
    return None

def get_interesting_text_entities(text):
    # TODO create as many texts as necessary, with differently formatted
    # text, in order to more accurately find countries.
    target_entities = ['GPE', 'NORP', 'MONEY']
    document = nlp(text)
    return [(entity.label_, entity.text) for entity in document.ents if (entity.label_ in target_entities)]

def get_countries_from_content(text):
    
    interesting_entities = get_interesting_text_entities(text)
    for ent_label, ent_text in interesting_entities:
        print('Matching {} -> {}...'.format(ent_label, ent_text))
        results = get_likely_results(ent_label, ent_text)
        if results is not None:
            print(results)
        else:
            print('unknown')
    
    return interesting_entities

In [12]:
get_countries_from_content('Portugal Portugal Portugal Spain, Christianity BGP 67')

Matching GPE -> Portugal...
GPE -> Country!
{'PT': 1.0}
Matching GPE -> Portugal...
GPE -> Country!
{'PT': 1.0}
Matching GPE -> Portugal...
GPE -> Country!
{'PT': 1.0}
Matching GPE -> Spain...
GPE -> Country!
{'ES': 1.0}


[('GPE', 'Portugal'),
 ('GPE', 'Portugal'),
 ('GPE', 'Portugal'),
 ('GPE', 'Spain')]