PART ONE: PARSING THE NEWS HEADLINES


    Objective:Find any city and/or country names mentioned in each of the news headlines.

WORKFLOW:

Step One

    Import the required modules: numpy, pandas, unidecode, geonamescache, re, time

In [9]:
import time
import numpy as np
import pandas as pd
from unidecode import unidecode
import geonamescache
import re

Step Two

    bringing in external data through geonamescache

In [10]:
# Load the geonamescache reference datasets once; the matching functions
# below read these names as module-level globals.
gc = geonamescache.GeonamesCache()
# Iterated with .values() and indexed by a city's 'countrycode' further down.
countries_info = gc.get_countries()
# Iterated with .values(); only each entry's 'name' is read.
us_states_info = gc.get_us_states()
# Iterated directly as a sequence; only each entry's 'name' is read.
us_counties_info = gc.get_us_counties()
# Iterated with .items(); the code below reads each entry's 'name',
# 'alternatenames', 'population' and 'countrycode'.
cities_info = gc.get_cities()

Step Three
    
    create function1
    function:get the longest string in the list

In [11]:
def longest_str(original_list):
    """Return the longest string in ``original_list``.

    When several strings share the maximum length, the one that appears
    first in the list is returned.

    Args:
        original_list: a sequence of strings; may be empty or ``None``.

    Returns:
        The longest string, or ``None`` when there is nothing to choose from.
    """
    if not original_list:
        return None
    # max() returns the first maximal element, which matches a manual scan
    # that only replaces the current best on a strictly greater length.
    return max(original_list, key=len)

Step Four

    create function2
    function: find any country names mentioned in each news headline using regular expressions
    Note: the headline and the preliminary country name passed in, as well as the return value, have already been normalized with unidecode. This function is called after match_city_country.

In [12]:
def match_country_from_headline(headline, original_country_name):
    """Pick the country explicitly named in ``headline``, if any.

    Every country name from ``countries_info`` is normalized with unidecode,
    stripped, and searched for in the headline as a whole word
    (case-sensitive). When several countries are mentioned, the longest
    matched name is returned.

    Args:
        headline: the headline text to search.
        original_country_name: value to fall back on when the headline
            mentions no country.

    Returns:
        The longest country name found in the headline, otherwise
        ``original_country_name`` unchanged.
    """
    found = []
    for entry in countries_info.values():
        candidate = unidecode(entry['name']).strip()
        hit = re.search(r'\b' + re.escape(candidate) + r'\b', headline)
        if hit is not None:
            found.append(hit.group(0))
    return longest_str(found) if found else original_country_name

Step Four

    create function3
    function: match cities mentioned in the headline and make a preliminary country match
    Note: the headline passed in has already been normalized with unidecode, and the returned names are normalized as well. This function is called before match_country_from_headline.

In [13]:
counter_match_city = 0
# Lazily filled cache: city ID -> normalized names (official name first).
# Normalizing every name with unidecode is expensive and the result does not
# depend on the headline, so it is done once per city rather than once per
# (city, headline) pair. Assumes cities_info is not modified after loading.
_city_names_cache = {}


def _normalized_city_names(city_ID, city_entry_value):
    """Return a city's stripped, unidecode-normalized names, official name first."""
    names = _city_names_cache.get(city_ID)
    if names is None:
        names = [unidecode(city_entry_value['name']).strip()]
        for alternate in city_entry_value['alternatenames']:
            normalized_alternate = unidecode(alternate).strip()
            if normalized_alternate:
                names.append(normalized_alternate)
        _city_names_cache[city_ID] = names
    return names


def _find_whole_word(name, headline):
    """Return the text of ``name`` matched in ``headline`` on word boundaries, else None."""
    # A whole-word match implies a plain substring match, so this cheap test
    # skips regex compilation for the overwhelming majority of names. Empty
    # names are rejected because r'\b\b' would "match" almost any headline.
    if not name or name not in headline:
        return None
    found = re.search(r'\b' + re.escape(name) + r'\b', headline)
    return found.group(0) if found else None


def _whole_word_matches(names, headline):
    """Return the matched text for every name that occurs in ``headline`` as a whole word."""
    matches = []
    for name in names:
        matched_text = _find_whole_word(name, headline)
        if matched_text is not None:
            matches.append(matched_text)
    return matches


def _city_rank(skimmed_city_info):
    """Build the sort key that encodes the city tie-breaking policy."""
    matched_text, official_name, num_alternate, population = skimmed_city_info
    return len(matched_text), matched_text == official_name, num_alternate, population


def match_city_country(headline):
    """Find the most plausible place named in ``headline`` and its country.

    Cities are tried first. Every city name and alternate name is matched as
    a whole word (case-sensitive). When several cities match, the winner is
    chosen by, in order:

    1. the longest matched text;
    2. a match on the official name rather than an alternate name;
    3. the larger number of alternate names;
    4. the larger population (alternate-name count and population are used
       as a proxy for how well known a city is).

    Remaining ties go to the city that comes first in ``cities_info``.
    If no city matches, US state names and then US county names are tried;
    both map to the country ``'United States'``.

    Side effects: increments the module-level ``counter_match_city`` and
    prints a progress line.

    Args:
        headline: the headline text to search.

    Returns:
        A ``(place_name, country_name)`` tuple, or ``(None, None)`` when
        nothing in the headline could be matched.
    """
    global counter_match_city
    counter_match_city += 1
    print('begin to match headline {}: {}'.format(counter_match_city, headline))

    matched_city = {}
    for city_ID, city_entry_value in cities_info.items():
        city_names = _normalized_city_names(city_ID, city_entry_value)
        best_match = None
        for name in city_names:
            matched_text = _find_whole_word(name, headline)
            if matched_text is None:
                continue
            # Keep the longest match per city. Replacing only on a strictly
            # longer match means the official name (listed first) wins over
            # an equally long alternate, and a short alternate can no longer
            # overwrite a longer match found earlier for the same city.
            if best_match is None or len(matched_text) > len(best_match):
                best_match = matched_text
        if best_match is not None:
            matched_city[city_ID] = (
                best_match,
                city_names[0],
                len(city_entry_value['alternatenames']),
                city_entry_value['population'],
            )

    if matched_city:
        # max() over the lexicographic rank is equivalent to filtering by each
        # criterion in turn, and it returns the first city among full ties.
        best_ID = max(matched_city, key=lambda candidate_ID: _city_rank(matched_city[candidate_ID]))
        country_name = countries_info[cities_info[best_ID]['countrycode']]['name']
        return matched_city[best_ID][1], unidecode(country_name).strip()

    matched_us_state = _whole_word_matches(
        (us_state_entry_value['name'] for us_state_entry_value in us_states_info.values()),
        headline,
    )
    if matched_us_state:
        return longest_str(matched_us_state), 'United States'

    # Several US counties share the same name, but that does not matter here
    # because the corresponding country is always the United States.
    matched_us_county = _whole_word_matches(
        (county_info['name'] for county_info in us_counties_info),
        headline,
    )
    if matched_us_county:
        return longest_str(matched_us_county), 'United States'
    return None, None

Step Five

    create function4
    function:put the extracted data into a pandas DataFrame with three columns: headline, city, country.
    

In [14]:
def read_and_extract_headline_data(start, end=None):
    """Load the headlines and annotate each one with a city and a country.

    Reads ``database/headlines.txt`` (one headline per line), drops duplicate
    headlines, keeps rows ``start:end`` of the de-duplicated list, normalizes
    the text with unidecode, and then runs ``match_city_country`` followed by
    ``match_country_from_headline`` on every headline.

    Args:
        start: position of the first headline to process (after duplicates
            have been removed).
        end: position one past the last headline to process; ``None`` means
            process through the end.

    Returns:
        A DataFrame with the columns ``headline``, ``city`` and ``country``.
        ``city`` / ``country`` are left empty where nothing was matched.
    """
    # comments=None: by default genfromtxt treats '#' as the start of a
    # comment and would silently discard the rest of any headline that
    # contains one.
    raw_headlines = np.genfromtxt("database/headlines.txt", 'str', delimiter='\n', encoding='utf-8',
                                  comments=None)
    # genfromtxt squeezes its output, so a file with a single headline comes
    # back as a 0-d array; make sure there is always one row per headline.
    raw_headlines = np.atleast_1d(raw_headlines)

    result = pd.DataFrame(raw_headlines, columns=['headline'], dtype='str')
    result.drop_duplicates(ignore_index=True, inplace=True)
    # slice(start, None) is the same as start:, so one expression covers both cases.
    result = result.iloc[slice(start, end)].reset_index(drop=True)
    result['headline'] = result['headline'].apply(unidecode)

    result['city'], result['country'] = None, None
    for row in range(len(result)):
        headline = result.loc[row, 'headline']
        city, country = match_city_country(headline)
        result.loc[row, 'city'] = city
        # A country named explicitly in the headline overrides the country
        # implied by the matched city.
        result.loc[row, 'country'] = match_country_from_headline(headline, country)
    return result

Step Six

    create function5
    function:build the main function. Save the result as a CSV file

In [15]:
def main():
    """Extract city/country data for every headline, save it as CSV and print a summary."""
    result = read_and_extract_headline_data(0)
    result.to_csv('database/extracted_data.csv', index=False, encoding='utf-8')

    print(result.describe())
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() emitted an extra "None" line.
    result.info()
    # Show the rows in which at least one column is missing.
    print(result[result.isnull().any(axis=1)])

Step Six

    create function6
    function: measure and print the total running time in seconds

In [16]:
if __name__ == "__main__":
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock is adjusted.
    started_at = time.perf_counter()
    main()
    elapsed_seconds = time.perf_counter() - started_at
    # The message reads "Total time taken: <n> seconds".
    print("总共用时 %f 秒" % elapsed_seconds)

begin to match headline 1: Zika Outbreak Hits Miami
begin to match headline 2: Could Zika Reach New York City?
begin to match headline 3: First Case of Zika in Miami Beach
begin to match headline 4: Mystery Virus Spreads in Recife, Brazil
begin to match headline 5: Dallas man comes down with case of Zika
begin to match headline 6: Trinidad confirms first Zika case
begin to match headline 7: Zika Concerns are Spreading in Houston
begin to match headline 8: Geneve Scientists Battle to Find Cure
begin to match headline 9: The CDC in Atlanta is Growing Worried
begin to match headline 10: Zika Infested Monkeys in Sao Paulo
begin to match headline 11: Brownsville teen contracts Zika virus
begin to match headline 12: Mosquito control efforts in St. Louis take new tactics with Zika threat
begin to match headline 13: San Juan reports 1st U.S. Zika-related death amid outbreak
begin to match headline 14: Flu outbreak in Galveston, Texas
begin to match headline 15: Zika alert a Manila now threaten

KeyboardInterrupt: 