<div style="border:2px solid black; padding:10px">
    
# <font color="blue">Objective: </font>Extract City and Country Information from News Headlines
</div>

# Import Dependencies

In [1]:
import re
import unidecode
import pandas as pd

# Geolocation information
from geonamescache import GeonamesCache

# ignore all future warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

# Displaying pandas columns and rows
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

<hr style="border-top: 2px solid black;">

# Import Data

In [2]:
# Relative path of the plain-text headlines file that the next cell opens and reads line by line
file = "data/headlines.txt"

In [3]:
# Read the headlines file, one headline per line, stripping surrounding whitespace.
# The `with` block closes the file handle as soon as reading is finished,
# even if an exception is raised part-way through.
with open(file, 'r') as headline_file:
    headlines = [line.strip() for line in headline_file]
num_headlines = len(headlines)
print(f"{num_headlines} headines have been loaded")

650 headines have been loaded


<hr style="border-top: 2px solid black;">

# Function to create regexes for matching

In [4]:
def data_to_regex(name):
    """Compile a case-insensitive regex that matches a location name literally.

    Parameters
    ----------
    name : str
        Location name; it may contain non-ASCII characters and punctuation.

    Returns
    -------
    re.Pattern
        Pattern that matches ``name`` as a whole phrase. When ``name`` differs
        from its ASCII transliteration, the pattern matches either spelling
        (the alternatives are wrapped in one capturing group).
    """
    # ASCII transliteration of the Unicode text
    decoded_name = unidecode.unidecode(name)
    # Escape the name so that characters such as '.', '(' or ')' are matched
    # literally instead of being interpreted as regex syntax
    escaped_name = re.escape(name)
    if name != decoded_name:
        # Accept both the original spelling and its ASCII transliteration
        body = f'({escaped_name}|{re.escape(decoded_name)})'
    else:
        body = escaped_name
    # Lookarounds behave like \b for names that start and end with a word
    # character, and still work when a name starts or ends with punctuation
    regex = fr'(?<!\w){body}(?!\w)'
    # flags=re.IGNORECASE makes the match case-insensitive
    return re.compile(regex, flags=re.IGNORECASE)

<hr style="border-top: 2px solid black;">

# Create lists that contain city and country names using <code>GeonamesCache()</code>

In [5]:
# Instantiate GeonamesCache(); the cells below query it for country and city records.
# NOTE(review): `gc` is also the name of Python's standard-library garbage-collector
# module. That module is not imported in the visible cells, but the name would clash if it were.
gc = GeonamesCache()

## Create lists that contain Country and City information from <code>gc</code>

In [6]:
# Collect the 'name' field of every country record returned by gc.get_countries()
countries = [record['name'] for record in gc.get_countries().values()]

In [7]:
# Collect the 'name' field of every city record returned by gc.get_cities()
cities = [record['name'] for record in gc.get_cities().values()]

## Use a Dictionary Comprehension and the <code>data_to_regex()</code> function to map country and city names to regexes

In [8]:
# Build a lookup keyed by compiled regex: every country name is passed to
# data_to_regex(), and the resulting pattern (the key) maps back to the
# original country name (the value).
country_to_name = {data_to_regex(country): country for country in countries}

In [9]:
# Build a lookup keyed by compiled regex: every city name is passed to
# data_to_regex(), and the resulting pattern (the key) maps back to the
# original city name (the value).
city_to_name = {data_to_regex(city): city for city in cities}

<hr style="border-top: 2px solid black;">

# Create Dataframe from Extracted Locations in Headlines

### Function that checks a headline against each regex and returns the name of the first matching country or city, or <code>None</code> if nothing matches

In [10]:
def get_data_in_text(text, dictionary):
    """Return the first name whose regex matches ``text``, or None.

    ``dictionary`` maps compiled regex objects (keys) to names (values).
    Candidates are tried in ascending order of name, so when several
    patterns match, the name that sorts first is the one returned.
    """
    # Order the (regex, name) pairs by name before testing them
    ordered_pairs = sorted(dictionary.items(), key=lambda pair: pair[1])
    # Yield names lazily and stop at the first pattern found in the text;
    # fall back to None when no pattern matches
    return next(
        (name for pattern, name in ordered_pairs if pattern.search(text)),
        None,
    )

In [11]:
# One entry per headline: the matched country name, or None when no country matches
matched_countries = [
    get_data_in_text(title, country_to_name) for title in headlines
]

In [12]:
# One entry per headline: the matched city name, or None when no city matches
matched_cities = [
    get_data_in_text(title, city_to_name) for title in headlines
]

In [13]:
# Map each output column name to its list of values (one entry per headline);
# this dictionary is turned into a pandas dataframe in the next cell
data = {
    'Headline': headlines,
    'City': matched_cities,
    'Country': matched_countries,
}

In [14]:
# Build the dataframe from the column dictionary and preview its first five rows
df = pd.DataFrame(data=data)
df.head(5)

Unnamed: 0,Headline,City,Country
0,Zika Outbreak Hits Miami,Miami,
1,Could Zika Reach New York City?,New York City,
2,First Case of Zika in Miami Beach,Miami,
3,"Mystery Virus Spreads in Recife, Brazil",Recife,Brazil
4,Dallas man comes down with case of Zika,Dallas,


<hr style="border-top: 2px solid black;">

# Evaluate Data

In [15]:
# Summary statistics (count, unique, top, freq) for each column of the dataframe
df.describe()

Unnamed: 0,Headline,City,Country
count,650,619,15
unique,647,510,10
top,Spanish Flu Spreading through Madrid,Of,Brazil
freq,2,45,3


<div style="border:1px solid black; padding:10px">
<font color="blue">Note:</font><br>
There seems to be an error in the top City value: "Of" is not a city.
</div>

## Evaluate the <code>Of</code> error in the dataset

In [16]:
# Preview the headlines whose matched city is the spurious value 'Of'
df.loc[df["City"] == 'Of'].head()

Unnamed: 0,Headline,City,Country
53,Case of Measles Reported in Vancouver,Of,
74,Authorities are Worried about the Spread of Br...,Of,
80,Authorities are Worried about the Spread of Ma...,Of,
84,Rochester authorities confirmed the spread of ...,Of,
109,Tokyo Encounters Severe Symptoms of Meningitis,Of,


<div style="border:1px solid black; padding:10px">
<font color="blue">Note:</font><br>
There are 45 (7%) instances that have 'Of' listed as a City. This is a mistake. <br>
    These headlines contain real cities that were not captured.
    Some headlines may also mention more than one city.
</div>

<hr style="border-top: 2px solid black;">

# Fix Erroneous City Output

 - Create a function that returns a set containing all the cities listed in a headline.
 - Apply function to city column and create a new column that contains all the cities found in a given headline
 - Create a new column that contains the count of the number of cities counted in a headline
 - Determine the number of headlines with more than one city

In [17]:
def get_cities_in_headline(headline, regex_to_name=None):
    """Return the names of all capitalised cities mentioned in a headline.

    Parameters
    ----------
    headline : str
        Headline text to scan.
    regex_to_name : dict, optional
        Mapping of compiled regex -> city name. Defaults to the notebook-level
        ``city_to_name`` lookup when omitted.

    Returns
    -------
    list of str
        Unique city names found, sorted alphabetically so that the result
        (and any tie-break performed on it later) is the same on every run.
    """
    if regex_to_name is None:
        # Fall back to the city lookup built earlier in the notebook
        regex_to_name = city_to_name
    # A set is used because sets do not contain duplicates
    cities_in_headline = set()
    for regex, name in regex_to_name.items():
        # Inspect every occurrence, not only the first one: a lowercase
        # occurrence (e.g. "nice") must not hide a later capitalised one ("Nice").
        # Only occurrences whose first character is uppercase count as a city.
        if any(headline[match.start()].isupper()
               for match in regex.finditer(headline)):
            cities_in_headline.add(name)
    # Sort instead of list(set): set iteration order is not stable across runs
    return sorted(cities_in_headline)

In [18]:
# New 'Cities' column: for every headline, the list returned by get_cities_in_headline()
df['Cities'] = df['Headline'].map(get_cities_in_headline)

In [19]:
# New 'Num_cities' column: how many cities were found in each headline
df['Num_cities'] = df['Cities'].map(len)

In [20]:
# Keep only the headlines in which more than one city was found
df_multiple_cities = df.loc[df['Num_cities'].gt(1)]

In [21]:
# Count the headlines that matched more than one city.
# len() is used instead of unpacking `.shape` into `_`, because assigning to `_`
# in the notebook namespace stops IPython from updating its last-output variables.
num_rows = len(df_multiple_cities)
print(f"{num_rows} headlines match multiple cities")

69 headlines match multiple cities


# Update the City column: for headlines that match more than one city, keep only the city with the longest name.

 - This will remove 'Of' and incomplete city names (e.g. 'Miami' when the headline says 'Miami Beach')
 - Create a function that returns the longest name in the cities

In [22]:
def get_longest_city(cities):
    """Return the longest name in ``cities``, or None when ``cities`` is empty.

    When several names share the maximum length, the one that appears
    first in ``cities`` is returned.
    """
    # An empty collection means no city was found in the headline
    # (some headlines only mention a country)
    return max(cities, key=len) if cities else None

In [23]:
# Overwrite the 'City' column with the longest city name found in each headline
# (None when the 'Cities' list is empty)
df['City'] = df['Cities'].map(get_longest_city)

# Evaluate the Country Data

In [24]:
# Keep only the rows that have a matched country, restricted to the
# City, Country and Headline columns, and print them without the index
df_countries = df.loc[df['Country'].notna(), ['City', 'Country', 'Headline']]
print(df_countries.to_string(index=False))

             City    Country                                           Headline
           Recife     Brazil            Mystery Virus Spreads in Recife, Brazil
 Ho Chi Minh City    Vietnam     Zika cases in Vietnam's Ho Chi Minh City surge
          Bangkok   Thailand                     Thailand-Zika Virus in Bangkok
       Piracicaba     Brazil                Zika outbreak in Piracicaba, Brazil
            Klang   Malaysia                   Zika surfaces in Klang, Malaysia
   Guatemala City  Guatemala  Rumors about Meningitis spreading in Guatemala...
      Belize City     Belize                 Belize City under threat from Zika
         Campinas     Brazil                   Student sick in Campinas, Brazil
      Mexico City     Mexico               Zika outbreak spreads to Mexico City
    Kota Kinabalu   Malaysia           New Zika Case in Kota Kinabalu, Malaysia
      Johor Bahru   Malaysia                 Zika reaches Johor Bahru, Malaysia
        Hong Kong  Hong Kong            

# Identify Instances with no data (no country or city name)

In [25]:
# Rows for which no city was found, their count, and the first ten such headlines
df_unmatched = df.loc[df['City'].isna()]
num_unmatched = df_unmatched.shape[0]
print(f"{num_unmatched}")
print(df_unmatched[['Headline']].head(10).values)

39
[['Louisiana Zika cases up to 26']
 ['Zika infects pregnant woman in Cebu']
 ['Spanish Flu Sighted in Antigua']
 ['Zika case reported in Oton']
 ['Hillsborough uses innovative trap against Zika 20 minutes ago']
 ['Maka City Experiences Influenza Outbreak']
 ['West Nile Virus Outbreak in Saint Johns']
 ['Malaria Exposure in Sussex']
 ['Greenwich Establishes Zika Task Force']
 ['Will West Nile Virus vaccine help Parsons?']]


<div style="border:1px solid black; padding:10px">
<font color="blue">Note:</font><br>
There are 39 (6%) instances whose city could not be matched against GeonamesCache.<br>
These instances will be dropped in this analysis.
</div>

In [26]:
# Drop the rows without a matched city and keep only the City, Headline and Country columns
df = df.loc[df['City'].notna(), ['City', 'Headline', 'Country']]

<hr style="border-top: 2px solid black;">

# Get and Assign the Latitude and Longitude to each instance in dataframe using <code>get_cities_by_name(city)</code>

 - This method returns every GeoNames city record with the given name; each record includes the city's latitude, longitude and population

In [27]:
# Two empty lists that will collect the coordinates of each city
latitudes = []
longitudes = []

In [28]:
# Start from empty lists so that re-running this cell never appends a second
# copy of the coordinates (which would break the df.assign() call that follows).
latitudes, longitudes = [], []
# Iterate through each value of the City column
for city_name in df.City.values:
    # Each item returned by get_cities_by_name() is a mapping whose first value
    # is the city record; unwrap every item to that record
    candidates = [next(iter(entry.values()))
                  for entry in gc.get_cities_by_name(city_name)]
    # Several cities can share a name: keep the one with the largest population
    city = max(candidates, key=lambda record: record['population'])
    # Extract the coordinates and append them to the lists
    latitudes.append(city['latitude'])
    longitudes.append(city['longitude'])

In [29]:
# Add the collected coordinates as two new columns, Latitude and Longitude
df = df.assign(
    Latitude=latitudes,
    Longitude=longitudes,
)

# Inspect Data

In [30]:
# Preview the first five rows, now including the Latitude and Longitude columns
df.head()

Unnamed: 0,City,Headline,Country,Latitude,Longitude
0,Miami,Zika Outbreak Hits Miami,,25.77427,-80.19366
1,New York City,Could Zika Reach New York City?,,40.71427,-74.00597
2,Miami Beach,First Case of Zika in Miami Beach,,25.79065,-80.13005
3,Recife,"Mystery Virus Spreads in Recife, Brazil",Brazil,-8.05389,-34.88111
4,Dallas,Dallas man comes down with case of Zika,,32.78306,-96.80667


In [31]:
# Persist df, gc and headlines with IPython's %store magic so that they can be
# restored in another kernel or notebook with `%store -r`
%store df
%store gc
%store headlines

Stored 'df' (DataFrame)
Stored 'gc' (GeonamesCache)
Stored 'headlines' (list)


<hr style="border-top: 2px solid black;">