<div style="border:2px solid black; padding:10px">
    
# <font color="blue">Objective: </font>Find any city and/or country names mentioned in each of the news headlines
</div>

# Import Dependencies

In [14]:
import re
import unidecode
import pandas as pd

from geonamescache import GeonamesCache

# ignore all future warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

# Displaying pandas columns and rows
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Import Data

In [2]:
file = "data/headlines.txt"

In [31]:
headline_file = open(file,'r')
headlines = [line.strip()
             for line in headline_file.readlines()]
num_headlines = len(headlines)
print(f"{num_headlines} headines have been loaded")

650 headines have been loaded


<hr style="border-top: 2px solid black;">

# Map Country and City names to regexes

- Remove accents

In [15]:
def name_to_regex(name):
    decoded_name = unidecode.unidecode(name)
    if name != decoded_name:
        regex = fr'\b({name}|{decoded_name})\b'
    else:
        regex = fr'\b{name}\b'
    return re.compile(regex, flags=re.IGNORECASE)

<hr style="border-top: 2px solid black;">

In [7]:
gc = GeonamesCache()

In [9]:
countries = [country['name'] for country in gc.get_countries().values()]

In [16]:
country_to_name = {name_to_regex(name) : name for name in countries}

In [18]:
cities = [city['name'] for city in gc.get_cities().values()]

In [19]:
city_to_name = {name_to_regex(name): name for name in cities}

<hr style="border-top: 2px solid black;">

# Create Dataframe from Extracted Locations in Headlines

In [20]:
def get_name_in_text(text, dictionary):
    for regex, name in sorted(dictionary.items(),
                              key=lambda x: x[1]):
        if regex.search(text):
            return name
    return None

In [32]:
# Creates a list of matched countries that has either the country name or none for headlines without countries
matched_countries = [get_name_in_text(headline, country_to_name)
                     for headline in headlines]

In [34]:
# Creates a list of matched cities that has the names of cities in headlines or 'none' if missing
matched_cities = [get_name_in_text(headline, city_to_name)
                  for headline in headlines]

In [35]:
# Create a pandas dataframe with this information
data = {'Headline': headlines, 'City': matched_cities,
        'Country': matched_countries}

In [37]:
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Headline,City,Country
0,Zika Outbreak Hits Miami,Miami,
1,Could Zika Reach New York City?,New York City,
2,First Case of Zika in Miami Beach,Miami,
3,"Mystery Virus Spreads in Recife, Brazil",Recife,Brazil
4,Dallas man comes down with case of Zika,Dallas,


# Find the city and country names in each row of dataset

# Get the list of cities using <font color="red">gc.get_cities()</font>

In [None]:
cities = gc.get_cities()

In [None]:
cities_list = [cities[key]['name'] for key in cities]

# Extract City

In [None]:
cities_list[0]

In [None]:
city_pattern = '|'.join(cities_list)

In [None]:
city_pattern[:100]

In [None]:
df['city'] = df['headlines'].str.extract('({})'.format(city_pattern), expand=False)

# Get the list of countries using <font color="red">gc.get_countries()</font>

In [None]:
countries = gc.get_countries()

In [None]:
countries_list = [countries[key]['name'] for key in countries]

# Extract Country

In [None]:
country_pattern = '|'.join(countries_list)

In [None]:
country_pattern[:100]

In [None]:
df['country'] = df['headlines'].str.extract('({})'.format(country_pattern), expand=False)

In [None]:
df.head()

In [None]:
df['headlines'].apply(unidecode.unidecode)
df.head()