PHASE ONE: PARSING THE NEWS HEADLINES


    Objective:Find any city and/or country names mentioned in each of the news headlines.

WORKFLOW:

Step One

    Import the required modules: numpy, pandas, unidecode, geonamescache, re, time.

In [1]:
import time
import numpy as np
import pandas as pd
from unidecode import unidecode
import geonamescache
import re

Step Two

    Load the reference data (countries, US states, US counties, cities) from geonamescache.

In [2]:
# Load the geonamescache reference tables once at module level; the matching
# functions below read them as globals.
gc = geonamescache.GeonamesCache()
# Mapping whose values carry a 'name' key; indexed by a city's 'countrycode'.
countries_info = gc.get_countries()
# Mapping whose values carry a 'name' key.
us_states_info = gc.get_us_states()
# Iterated directly as a sequence of entries that carry a 'name' key.
us_counties_info = gc.get_us_counties()
# Mapping of city id -> entry with 'name', 'alternatenames', 'population'
# and 'countrycode' keys.
cities_info = gc.get_cities()

Step Three
    
    create function1
    function:get the longest string in the list

In [3]:
def longest_str(original_list):
    """Return the longest item of *original_list*, or None when it is empty.

    When several items share the maximum length, the one that appears first
    in the list is returned.
    """
    if not original_list:
        return None
    # max() keeps the first element among equals, matching a strict ">" scan.
    return max(original_list, key=len)

Step Four

    create function2
    function: find any country name mentioned in a news headline using regular expressions.
    Notice: the headline and the preliminary country name passed in have already been normalized with unidecode. This function is called after match_city_country.

In [4]:
def match_country_from_headline(headline, original_country_name):
    """Return the country named in *headline*, else *original_country_name*.

    Every country name in ``countries_info`` is normalized with ``unidecode``
    and searched for in the headline as a case-sensitive, word-bounded
    literal. When several names are found, the longest one is returned.

    Args:
        headline: Headline text to search.
        original_country_name: Fallback returned unchanged when no country
            name occurs in the headline (may be None).
    """
    mentioned = []
    for entry in countries_info.values():
        candidate = unidecode(entry['name']).strip()
        pattern = r'\b' + re.escape(candidate) + r'\b'
        # The pattern is an escaped literal, so any hit is the name itself.
        if re.search(pattern, headline):
            mentioned.append(candidate)
    if not mentioned:
        return original_country_name
    return longest_str(mentioned)

Step Four

    create function3
    function: match a city from the headline and derive a preliminary country from it.
    Notice: the headline is passed in already normalized with unidecode, and the returned names are normalized as well. This function is called before match_country_from_headline.

In [5]:
# Number of headlines processed so far; only used for the progress message.
counter_match_city = 0


def _mentions(headline, name):
    """Return True when *name* occurs in *headline* at word boundaries."""
    # A regex hit implies a plain substring hit, so this cheap test avoids
    # building a pattern for the overwhelming majority of names.
    if name not in headline:
        return False
    return re.search(r'\b' + re.escape(name) + r'\b', headline) is not None


def _city_rank(item):
    """Sort key for a ``(city_id, (matched, formal, n_alternates, population))`` pair."""
    matched, formal, num_alternates, population = item[1]
    return len(matched), matched == formal, num_alternates, population


def match_city_country(headline):
    """Find the place mentioned in *headline* and the country it belongs to.

    Cities from ``cities_info`` are tried first (formal name and every
    non-empty alternate name, all normalized with ``unidecode``). If no city
    matches, US state names and then US county names are tried.

    When several cities match, the winner is chosen by, in order:
    the longest matched name, a match on the formal name rather than an
    alternate, the larger number of alternate names, and the larger
    population. Remaining ties go to the first city in ``cities_info`` order.

    Side effects: increments the global ``counter_match_city`` and prints a
    progress line.

    Args:
        headline: Headline text to search.

    Returns:
        ``(formal_city_name, country_name)`` for a city match,
        ``(state_or_county_name, 'United States')`` for a US state or county
        match, or ``(None, None)`` when nothing matches.
    """
    global counter_match_city
    counter_match_city += 1
    print('begin to match headline {}: {}'.format(counter_match_city, headline))

    candidates = {}
    for city_id, city_entry in cities_info.items():
        formal_name = unidecode(city_entry['name']).strip()
        city_names = [formal_name]
        for alternate in city_entry['alternatenames']:
            normalized_alternate = unidecode(alternate).strip()
            if normalized_alternate:
                city_names.append(normalized_alternate)

        # Keep the longest matching name for this city. The formal name is
        # tried first and only a strictly longer name replaces the current
        # best, so a shorter alias can never hide a longer or formal match.
        best_match = None
        for name in city_names:
            if best_match is not None and len(name) <= len(best_match):
                continue
            if _mentions(headline, name):
                best_match = name
        if best_match is not None:
            candidates[city_id] = (
                best_match,
                formal_name,
                len(city_entry['alternatenames']),
                city_entry['population'],
            )

    if candidates:
        # max() returns the first maximal item, so ties resolve to the
        # earliest city in cities_info order.
        best_city_id, best_city = max(candidates.items(), key=_city_rank)
        country_name = countries_info[cities_info[best_city_id]['countrycode']]['name']
        return best_city[1], unidecode(country_name).strip()

    matched_us_state = [
        state_entry['name']
        for state_entry in us_states_info.values()
        if _mentions(headline, state_entry['name'])
    ]
    if matched_us_state:
        return longest_str(matched_us_state), 'United States'

    # Several US counties share a name, but that is harmless here because
    # they all map to the same country.
    matched_us_county = [
        county_info['name']
        for county_info in us_counties_info
        if _mentions(headline, county_info['name'])
    ]
    if matched_us_county:
        return longest_str(matched_us_county), 'United States'
    return None, None

Step Five

    create function4
    function:put the extracted data into a pandas DataFrame with three columns: headline, city, country.
    

In [6]:
def read_and_extract_headline_data(start, end=None):
    """Read the headlines file and extract a city and country for each headline.

    Reads ``database/headlines.txt`` (UTF-8, one headline per line), skips
    blank lines, drops duplicate headlines, keeps the rows in the positional
    slice ``[start:end]``, normalizes each headline with ``unidecode`` and
    then runs ``match_city_country`` followed by
    ``match_country_from_headline`` on it.

    Args:
        start: Index of the first de-duplicated headline to process.
        end: Index one past the last headline to process; None means
            process through the last headline.

    Returns:
        A pandas DataFrame with the columns ``headline``, ``city`` and
        ``country``; ``city`` / ``country`` are None where nothing matched.
    """
    # Read the file line by line instead of np.genfromtxt: genfromtxt treats
    # '#' as a comment marker (truncating such headlines) and returns a 0-d
    # array for a single-line file.
    with open("database/headlines.txt", encoding='utf-8') as headline_file:
        stripped_lines = [line.strip(" \r\n") for line in headline_file]
    headlines = [line for line in stripped_lines if line]

    result = pd.DataFrame({'headline': headlines}, dtype='str')
    result.drop_duplicates(ignore_index=True, inplace=True)
    # A slice end of None runs to the last row.
    result = result.iloc[start:end].reset_index(drop=True)
    result['headline'] = result['headline'].apply(unidecode)

    cities, countries = [], []
    for headline in result['headline']:
        city, country = match_city_country(headline)
        cities.append(city)
        # A country named explicitly in the headline overrides the one
        # derived from the matched city.
        countries.append(match_country_from_headline(headline, country))
    result['city'] = cities
    result['country'] = countries
    return result

Step Six

    create function5
    function:build the main function. Save the result as a CSV file

In [7]:
def main():
    """Extract locations for every headline, save them as CSV and print a summary.

    Writes the extracted table to ``database/extracted_data.csv`` and prints
    descriptive statistics, the DataFrame info report, and the rows in which
    any column is missing.
    """
    result = read_and_extract_headline_data(0)
    result.to_csv('database/extracted_data.csv', index=False, encoding='utf-8')

    print(result.describe())
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line.
    result.info()
    # Rows where at least one column is null.
    print(result[result.isnull().any(axis=1)])

Step Six

    script entry point
    function: run main() and report the total running time in seconds

In [8]:
if __name__ == "__main__":
    # Time the whole run and report the elapsed wall-clock seconds.
    started_at = time.time()
    main()
    elapsed_seconds = time.time() - started_at
    print("总共用时 %f 秒" % elapsed_seconds)

begin to match headline 1: Zika Outbreak Hits Miami
begin to match headline 2: Could Zika Reach New York City?
begin to match headline 3: First Case of Zika in Miami Beach
begin to match headline 4: Mystery Virus Spreads in Recife, Brazil
begin to match headline 5: Dallas man comes down with case of Zika
begin to match headline 6: Trinidad confirms first Zika case
begin to match headline 7: Zika Concerns are Spreading in Houston
begin to match headline 8: Geneve Scientists Battle to Find Cure
begin to match headline 9: The CDC in Atlanta is Growing Worried
begin to match headline 10: Zika Infested Monkeys in Sao Paulo
begin to match headline 11: Brownsville teen contracts Zika virus
begin to match headline 12: Mosquito control efforts in St. Louis take new tactics with Zika threat
begin to match headline 13: San Juan reports 1st U.S. Zika-related death amid outbreak
begin to match headline 14: Flu outbreak in Galveston, Texas
begin to match headline 15: Zika alert a Manila now threaten

begin to match headline 122: Villavicencio under Zika threat
begin to match headline 123: Two more Zika cases confirmed in Laredo
begin to match headline 124: New Delhi Addressing Zika Concerns
begin to match headline 125: New Zika Case in Kota Kinabalu, Malaysia
begin to match headline 126: New medicine wipes out West Nile Virus in Ventura
begin to match headline 127: Meningitis re-emerges in Nassau
begin to match headline 128: Zika reaches Johor Bahru, Malaysia
begin to match headline 129: Nha Trang Zika Outbreak
begin to match headline 130: First Zika Case in Cincinnati
begin to match headline 131: Molo Cholera Spread Causing Concern
begin to match headline 132: Norovirus Exposure in Hong Kong
begin to match headline 133: 19 new Zika Cases in Sengkang
begin to match headline 134: Zika cases concern Charlotte residents
begin to match headline 135: Batangas Tourism Takes a Hit as Virus Spreads
begin to match headline 136: Johannesburg Patient in Critical Condition after Contracting Pn

begin to match headline 242: Frisco Woman Tests Positive For Zika Virus
begin to match headline 243: Authorities are Worried about the Spread of Norovirus in Dubai
begin to match headline 244: Benton Residents Recieve Pneumonia vaccine
begin to match headline 245: Case of Hepatitis A Reported in Calgary
begin to match headline 246: Zika Outbreak in Pinewood
begin to match headline 247: Will Swine Flu vaccine help Ljubljana?
begin to match headline 248: Bronchitis re-emerges in Tehran
begin to match headline 249: Greenwich Establishes Zika Task Force
begin to match headline 250: Mad Cow case in Murcia
begin to match headline 251: Mad Cow Disease Spreads to Margate
begin to match headline 252: Zika Strikes St. Petersburg
begin to match headline 253: Will West Nile Virus vaccine help Parsons?
begin to match headline 254: Will the Mad Cow Outbreak Reach Vienna?
begin to match headline 255: More Patients in Orange are Getting Diagnosed with Chickenpox
begin to match headline 256: Lower Hosp

begin to match headline 356: Lower Hospitalization in Monroe after Hepatitis D Vaccine becomes Mandatory
begin to match headline 357: New medicine wipes out Dengue in Saginaw
begin to match headline 358: Spike of Rhinovirus Cases in Sevierville
begin to match headline 359: Duisburg up in Arms over Mad Cow Disease
begin to match headline 360: New medicine wipes out Mumps in Saint Charles
begin to match headline 361: Case of Chikungunya Reported in Gaithersburg
begin to match headline 362: How to Avoid Respiratory Syncytial Virus in San Bernardino
begin to match headline 363: Laventille authorities confirmed the spread of Rhinovirus
begin to match headline 364: More Zika patients reported in Waco
begin to match headline 365: Will Herpes vaccine help Newcastle?
begin to match headline 366: Erie County sets Zika traps
begin to match headline 367: More people in Huron are infected with Dengue every year
begin to match headline 368: How to Avoid Norovirus in Greenville
begin to match headlin

begin to match headline 470: Durango is infested with Hepatitis B
begin to match headline 471: Hepatitis A has not Left Staten Island
begin to match headline 472: Mpika authorities confirmed the spread of Chikungunya
begin to match headline 473: Gladstone Encounters Severe Symptoms of Dengue
begin to match headline 474: Mad Cow Disease Hits London
begin to match headline 475: New Milford Patient in Critical Condition after Contracting Respiratory Syncytial Virus
begin to match headline 476: More people in Palo Alto are infected with HIV every year
begin to match headline 477: HIV Symptoms Spread all over Hinthada
begin to match headline 478: Zika symptoms spotted in Quito
begin to match headline 479: Vineland authorities confirmed the spread of Chlamydia
begin to match headline 480: Sulu, Zamboanga brace for Zika
begin to match headline 481: More Zika patients reported in Custodia
begin to match headline 482: Tupelo Residents Recieve Syphilis vaccine
begin to match headline 483: Rumors

begin to match headline 584: Marquette tests new cure for Hepatitis A
begin to match headline 585: Norovirus Vaccine is now Required in Anaheim
begin to match headline 586: Zika Outbreak in Manaus
begin to match headline 587: Zika spreads to Caucasia
begin to match headline 588: Case of Malaria Reported in Aurora
begin to match headline 589: Manchester Residents Recieve Influenza vaccine
begin to match headline 590: Zika spreads to Kamphaeng Phet
begin to match headline 591: Rotavirus Vaccine is now Required in Labasa
begin to match headline 592: The Spread of Hepatitis E in Fargo has been Confirmed
begin to match headline 593: Zika Outbreak in Mexicali
begin to match headline 594: West Nile Virus Keeps Spreading in Hauppauge
begin to match headline 595: Bronchitis Keeps Spreading in Hayward
begin to match headline 596: Schools in Coamo Closed Due to Rhinovirus Outbreak
begin to match headline 597: Zika arrives in Dangriga
begin to match headline 598: Zika spreads to Plant City
begin t