In [1]:
# Refer to 'https://spacy.io/usage/linguistic-features' for more info
import spacy
import re
import math
import pandas as pd

# Load the English spaCy pipeline used for named-entity recognition.
# NOTE(review): the 'en' shortcut name is not accepted by spaCy v3+, which
# expects a full package name such as 'en_core_web_sm' — confirm against
# the installed spaCy version.
nlp = spacy.load('en')

# Matches every character that is NOT an ASCII letter; used to strip
# digits, whitespace and punctuation from text before matching.
alpha_regex = re.compile('[^a-zA-Z]')

# Paths for the matching countries, nationalities, religions and currencies
parsed_country_nationality_file = 'data/parsed/parsed_country_nationality.csv'
parsed_currency_country_file = 'data/parsed/parsed_currency_country.csv'
parsed_country_religion_file = 'data/parsed/country_religion_files/parsed_country_religion.csv'
parsed_country_cities_file = 'data/parsed/parsed_country_cities.csv'

# Load the necessary datasets.
# NOTE(review): every file is read with compression='gzip' although the
# paths end in '.csv' — presumably the files are gzip-compressed on disk;
# verify, since a plain-text CSV would fail to load with this setting.
country_nationality_df = pd.read_csv(parsed_country_nationality_file, encoding='utf-8', compression='gzip', index_col=False)
currency_country_df = pd.read_csv(parsed_currency_country_file, encoding='utf-8', compression='gzip', index_col=False)
country_religion_df = pd.read_csv(parsed_country_religion_file, encoding='utf-8', compression='gzip', index_col=False)
country_cities_df = pd.read_csv(parsed_country_cities_file, encoding='utf-8', compression='gzip', index_col=False)

In [2]:
# Example sentences covering countries, nationalities, religions and text
# with no geographic reference at all. None of the functions or calls
# visible in this notebook section read this list.
samples = [
    'Some random sentence about a country',
    'I am talking about Portugal',
    'Portuguese people always arrive late',
    'John doesn\'t think Spain is good for vacation',
    'This was easier than I thought...',
    'Now a Saudi Arabia comment',
    'With a Saudi Arabian nationality notation, along with religious christianity references'
]

In [3]:
def get_result_country_probability_dict(result_row, result_label):
    '''
    Auxiliary method for `get_likely_results`. For
    a given row it outputs a dictionary containing
    the estimated probability for referencing a
    country.

    Args:
        result_row: the matched row (indexable by column
            label), as returned by `get_matching_row`.
        result_label: which kind of dataset the row came
            from: 'Country', 'City', 'Nationality',
            'Religion' or 'Currency'.

    Returns:
        A dict mapping country codes to probabilities.
        It is empty when `result_label` is not one of
        the labels above.
    '''
    country_probability_dict = {}

    # Country and nationality rows both carry the country code in 'ID'
    if result_label in ('Country', 'Nationality'):
        country_probability_dict[result_row['ID']] = 1.0

    # City rows carry the country code in 'Country'
    elif result_label == 'City':
        country_probability_dict[result_row['Country']] = 1.0

    # Religion rows have one column per country code (besides the two
    # descriptive columns), each holding that country's probability
    elif result_label == 'Religion':
        per_country = result_row.drop(['Religion', 'Affiliation'])
        # `.items()` replaces `.iteritems()`, which pandas 2.0 removed
        for country_code, country_prob in per_country.items():
            # An identity test against float('NaN') is always true, so
            # missing values must be detected with pd.isna instead
            if not pd.isna(country_prob):
                country_probability_dict[country_code] = float(country_prob)

    # Currency rows list the countries using the currency; the
    # probability is split evenly between them
    # NOTE(review): assumes 'Countries' holds a list-like of codes. If
    # it is still the raw CSV string it would be iterated character by
    # character — confirm how this column is parsed.
    elif result_label == 'Currency':
        countries = result_row['Countries']
        for country_code in countries:
            country_probability_dict[country_code] = 1 / len(countries)

    return country_probability_dict

def get_matching_row(text, df, col_labels):
    '''
    Tries to find matches between the provided
    text and the content of certain columns of a
    dataframe, and returns a single matching row.

    Both the text and the column values are reduced
    to upper-case ASCII letters before comparing, so
    that spacing, punctuation and casing differences
    (e.g. 'Saudi Arabia' vs 'SAUDIARABIA') do not
    prevent a match.

    Columns are tried in the order given. Within a
    column, a row whose value equals the text is
    preferred; otherwise the first row whose value
    contains the text is used.

    Args:
        text: the entity text to look up.
        df: the dataframe to search.
        col_labels: labels of the columns to search,
            in priority order.

    Returns:
        The matching row, or None when nothing matches
        or when the text contains no letters at all.
    '''
    needle = alpha_regex.sub('', text).upper()

    # An empty pattern is contained in every string, so without this
    # guard letter-free text (e.g. '$100') would match the first row
    if not needle:
        return None

    for col_label in col_labels:
        # Apply the same normalization to the column as to the text
        haystack = df[col_label].astype(str).map(lambda value: alpha_regex.sub('', value).upper())

        # Prefer an exact hit over an arbitrary first substring hit
        exact_mask = haystack == needle
        if exact_mask.any():
            return df[exact_mask].iloc[0]

        # The needle is plain letters, so no regex matching is needed
        partial_mask = haystack.str.contains(needle, regex=False)
        if partial_mask.any():
            return df[partial_mask].iloc[0]

    return None

def get_likely_results(label, text):
    '''
    Resolves a recognized entity to the countries
    it may be referencing.

    Args:
        label: the entity label ('GPE', 'NORP' or
            'MONEY'; anything else yields None).
        text: the entity's text.

    Returns:
        A dictionary whose keys are country codes and
        whose values are the matching probabilities,
        or None when no dataset matches the text.
    '''
    def _lookup(dataset, columns, banner, kind):
        # Search one dataset; announce and convert the row on success
        row = get_matching_row(text, dataset, columns)
        if row is None:
            return None
        print(banner)
        return get_result_country_probability_dict(row, kind)

    # Geopolitical entity: try countries first, then cities
    if label == 'GPE':
        outcome = _lookup(country_nationality_df, ['ID', 'Official Name', 'Common Name'],
                          'GPE -> Country!', 'Country')
        if outcome is None:
            outcome = _lookup(country_cities_df, ['City'], 'GPE -> City!', 'City')
        return outcome

    # Nationality/religious/political group: nationalities, then religions
    if label == 'NORP':
        outcome = _lookup(country_nationality_df, ['Nationality'],
                          'NORP -> Nationality!', 'Nationality')
        if outcome is None:
            outcome = _lookup(country_religion_df, ['Religion', 'Affiliation'],
                              'NORP -> Religion!', 'Religion')
        return outcome

    # Monetary value: look for a known currency
    if label == 'MONEY':
        return _lookup(currency_country_df, ['ID'], 'MONEY -> Currency', 'Currency')

    return None

def get_interesting_text_entities(text):
    '''
    Runs the spaCy pipeline over the text and
    collects the entities labelled 'GPE', 'NORP'
    or 'MONEY'.

    Returns a list of (label, text) tuples, in
    the order the entities appear in the document.
    '''
    # TODO create as many texts as necessary, with differently formatted
    # text, in order to more accurately find countries.
    wanted_labels = ['GPE', 'NORP', 'MONEY']
    found = []
    for ent in nlp(text).ents:
        if ent.label_ in wanted_labels:
            found.append((ent.label_, ent.text))
    return found

def merge_probability_dicts(dict_l, dict_r):
    '''
    Creates a new dictionary containing all
    keys from both dictionaries. In case a
    key exists in both dictionaries, its
    value becomes the sum of both previously
    existing values.

    Neither input dictionary is modified.

    Args:
        dict_l: first country -> probability dict.
        dict_r: second country -> probability dict.

    Returns:
        A new dict holding the merged probabilities.
    '''
    # Work on a copy so the caller's dictionary is left untouched,
    # as the documented contract ("creates a new dictionary") promises
    merged = dict(dict_l)
    for country, probability in dict_r.items():
        merged[country] = merged.get(country, 0) + probability
    return merged

def normalize_probability_dict(probability_dict):
    '''
    Rescales the values of the provided dictionary
    by dividing each one by the sum of all values,
    and returns the result as a new dictionary.
    An empty dictionary yields an empty dictionary.
    '''
    total = sum(probability_dict.values())
    return {
        country: probability / total
        for country, probability in probability_dict.items()
    }
        
def get_countries_from_content(text):
    '''
    Analyses the input text and extracts the
    countries referenced in it, printing each
    recognized entity and what it resolved to.

    The output is a dictionary with the country
    codes as keys and their respective normalized
    probabilities as values.
    '''
    accumulated = {}
    for ent_label, ent_text in get_interesting_text_entities(text):
        print('\n{} -> {}'.format(ent_text, ent_label))
        guess = get_likely_results(ent_label, ent_text)

        # Entities that match no dataset contribute nothing
        if guess is None:
            print('UNKNOWN')
            continue

        print(guess)
        accumulated = merge_probability_dicts(accumulated, guess)

    return normalize_probability_dict(accumulated)

In [4]:
def get_most_likely_countries(country_probability_dict):
    '''
    Returns a list of the countries with
    the highest probability in a dictionary.

    Args:
        country_probability_dict: dict mapping country
            codes to probabilities.

    Returns:
        The country codes whose probability is (within
        floating-point tolerance) equal to the maximum.
        An empty dictionary yields an empty list.
    '''
    # max() raises ValueError on an empty sequence, which happens
    # whenever no country could be extracted from a text
    if not country_probability_dict:
        return []

    max_prob = max(country_probability_dict.values())
    return [
        country
        for country, probability in country_probability_dict.items()
        if math.isclose(probability, max_prob)
    ]

In [5]:
def is_tweet_about_country(tweet_text, country_code):
    '''
    Returns whether or not a Tweet is talking
    about a specific country.

    Args:
        tweet_text: the Tweet's text.
        country_code: the country code to look for.

    Returns:
        True when `country_code` is among the countries
        with the highest probability for this text,
        False otherwise (including when no country at
        all could be extracted from the text).
    '''
    # TODO solve multi-worded countries and nationalities (by searching on every word separately in text.split())
    country_probability_dict = get_countries_from_content(tweet_text)

    # Nothing recognized: answer directly instead of asking for the
    # maximum of an empty set of probabilities
    if not country_probability_dict:
        return False

    return country_code in get_most_likely_countries(country_probability_dict)

In [6]:
# Demo: is 'PT' among the most likely countries referenced by this text?
is_tweet_about_country('I am talking about Portugal and the beautiful city LISBON...', 'PT')


Portugal -> GPE
GPE -> Country!
{'PT': 1.0}

LISBON -> GPE
GPE -> City!
{'PH': 1.0}


True

In [7]:
# Demo: is 'CH' among the most likely countries referenced by this text?
is_tweet_about_country('A nice overview of Swiss people and their connections to Germany', 'CH')


Swiss -> NORP
NORP -> Nationality!
{'CH': 1.0}

Germany -> GPE
GPE -> Country!
{'DE': 1.0}


True