# Discovering disease outbreaks
## 1. extracting city & country data from news headlines
- analyse and identify potential problems in of the data
- normalize headlines and remove accents marks (unidecode)
- optimize where possible

In [1]:
from unidecode import unidecode
import geonamescache
import re
import timeit
import pandas as pd

In [2]:
# get and decode the headlines
headlines=[]
with open('data\headlines.txt', 'r') as dirty_headlines:
    for line in dirty_headlines:
        headlines.append(unidecode(line.strip()))

In [3]:
# let's check the difference
if len(headlines) != len(set(headlines)):
    print(f'The list of headlines does contain doubles. ' +
          f'\n# headlines: {len(headlines)}, converted to set we have {len(set(headlines))}')
    
# using a dict, we lose some precision

The list of headlines does contain doubles. 
# headlines: 650, converted to set we have 647


In [4]:
# load geo data
gc = geonamescache.GeonamesCache()
countries = gc.get_countries_by_names()
cities = gc.get_cities()

In [5]:
# some test/verification methods

def cities_contains(search: str) -> bool:
    # does the cities list contain the given city?
    for city in cities.values():
        if search in city.get("name"):
            return True
    return False

def countries_contains(search: str) -> bool:
    # does the countries list contain the searched country
    return search in countries

**The first version of the script needed about *1000sec* to execute and required a better approach:**

In [6]:
result = []


def analyse_data():
    #analyse all headlines

    for headline in headlines:
        print(f'Current headline: "{headline}"')

        result_country = None

        # let's test the countries
        for country in countries:
            # make regex (compiled version probably doesnt do much in this case)
            regex = '\\b'+unidecode(country)+'\\b'
            compiled_regex = re.compile(regex)
            match = compiled_regex.search(headline)
            if match:
                result_country = country
                print(f' - country: {country}')
                break

        resulted_city = None

        # let's test cities
        for city in cities.values():
            # make regex
            current_city = unidecode(city.get('name'))
            regex = '\\b' + current_city + '\\b'
            compiled_regex = re.compile(regex)
            match = compiled_regex.search(headline)
            if match:
                if resulted_city is None or len(current_city) > len(resulted_city):
                    resulted_city = current_city

        print(f' - city: {resulted_city}') 

        #add the data to the result list
        result.append([headline, result_country, resulted_city])
        
print("skipping v1 ...")
# execution_time = timeit.timeit(analyse_data, number=1)
# print(execution_time)

skipping v1 ...


In [7]:
result_v2 = [[headline, None, None] for headline in headlines]

def analyse_data_v2():
    
    #analyse all countries in the headlines
    for country in countries:
        # compile regex for faster checking, match complete "words"
        regex = '\\b' + country + '\\b'
        compiled_regex = re.compile(regex)

        # search country in headline
        for headline_item in result_v2:
            match = compiled_regex.search(headline_item[0])
            if match:
                headline_item[1] = country
                print(f'Found country "{country}" in headline: "{headline_item[0]}"')
                break
    
    #analyse all cities appearing in the headlines
    for city in cities.values():
        current_city = city.get('name')

        # compile regex for faster checking, match complete "words"
        regex = '\\b' + current_city + '\\b'
        compiled_regex = re.compile(regex)

        for headline_item in result_v2:
            match = compiled_regex.search(headline_item[0])  # best not to use "ignore case" for cases like "Goes" and "Of"
            if match:
                # if there are multiple cities in a headline, we pick the larger length city as "New York City" takes
                # precedence over "York"
                if headline_item[2] is None:
                    headline_item[2] = current_city
                    print(f'Found city: "{current_city}" in "{headline_item[0]}"')
                elif len(headline_item[2]) < len(current_city):
                    print(f'Replacing previously found city "{headline_item[2]}" in "{headline_item[0]}" with "{current_city}"')
                    headline_item[2] = current_city

execution_time = timeit.timeit(analyse_data_v2, number=1)
print(f'\nExecuted v2 in {execution_time} seconds')
                
# print(result_v2)

Found country "Brazil" in headline: "Mystery Virus Spreads in Recife, Brazil"
Found country "Belize" in headline: "Belize City under threat from Zika"
Found country "Guatemala" in headline: "Rumors about Meningitis spreading in Guatemala City have been refuted"
Found country "Hong Kong" in headline: "Norovirus Exposure in Hong Kong"
Found country "Mexico" in headline: "Zika outbreak spreads to Mexico City"
Found country "Malaysia" in headline: "Zika surfaces in Klang, Malaysia"
Found country "Panama" in headline: "Outbreak of Zika in Panama City"
Found country "Singapore" in headline: "Zika cases in Singapore reach 393"
Found country "Thailand" in headline: "Thailand-Zika Virus in Bangkok"
Found country "Vietnam" in headline: "Zika cases in Vietnam's Ho Chi Minh City surge"
Found city: "Dubai" in "Authorities are Worried about the Spread of Norovirus in Dubai"
Found city: "Yerevan" in "West Nile Virus Symptoms Spread all over Yerevan"
Found city: "Luanda" in "Tuberculosis Hits Luanda"


Found city: "Havana" in "Zika Virus Transmission Detected in Havana"
Found city: "Praia" in "Praia tests new cure for Pneumonia"
Found city: "Pilsen" in "Pilsen Encounters Severe Symptoms of Malaria"
Found city: "Stuttgart" in "More Patients in Stuttgart are Getting Diagnosed with Rabies"
Found city: "Hilden" in "Case of Mad Cow Disease Reported in Hilden"
Found city: "Hamburg" in "Will Meningitis vaccine help Hamburg?"
Found city: "Duisburg" in "Duisburg up in Arms over Mad Cow Disease"
Found city: "Cham" in "Zika Troubles come to Kampong Cham"
Found city: "Bonn" in "Contaminated Meat Brings Trouble for Bonn Farmers"
Found city: "Berlin" in "Spike of Chlamydia Cases in Berlin"
Found city: "Copenhagen" in "Norovirus Keeps Spreading in Copenhagen"
Found city: "Quisqueya" in "Zika symptoms spotted in Quisqueya"
Found city: "Quito" in "Zika symptoms spotted in Quito"
Found city: "Portoviejo" in "Outbreak of Zika in Portoviejo"
Found city: "La Libertad" in "Zika Troubles come to La Liberta

Found city: "Mexicali" in "Zika Outbreak in Mexicali"
Found city: "Medina" in "Meningitis re-emerges in Medina"
Found city: "Madera" in "Madera Patient in Critical Condition after Contracting Rabies"
Found city: "Hidalgo" in "Four cases of Zika in Hidalgo County"
Found city: "Johor Bahru" in "Zika reaches Johor Bahru, Malaysia"
Found city: "Klang" in "Zika surfaces in Klang, Malaysia"
Replacing previously found city "Kota" in "New Zika Case in Kota Kinabalu, Malaysia" with "Kota Kinabalu"
Found city: "Petaling Jaya" in "Petaling Jaya man ill from Zika"
Found city: "Kuala Lumpur" in "Kuala Lumpur is Hit By Zika Threat"
Found city: "Kuching" in "Zika spreads to Kuching"
Found city: "Sibu" in "Zika symptoms spotted in Sibu"
Found city: "Miri" in "Zika arrives in Miri"
Found city: "Lagos" in "Varicella re-emerges in Lagos"
Found city: "Ibadan" in "Vericella spreading in Ibadan"
Found city: "Ibadan" in "Ibadan tests new cure for Malaria"
Found city: "Abuja" in "Authorities are Worried about

Found city: "Princeton" in "Princeton Encounters Severe Symptoms of Dengue"
Found city: "St. Petersburg" in "Zika Strikes St. Petersburg"
Found city: "Sarasota" in "New Zika Case Confirmed in Sarasota County"
Replacing previously found city "Florida" in "Zika Patient in Seminole, Florida" with "Seminole"
Found city: "Tallahassee" in "Tallahassee Doctors Discuss Zika Virus"
Found city: "Tamarac" in "Zika in Tamarac!"
Replacing previously found city "Bay" in "Tampa Bay Area Zika Case Count Climbs" with "Tampa"
Found city: "Vero Beach" in "Can Zika make it here to Vero Beach?"
Replacing previously found city "Palm Beach" in "Zika arrives in West Palm Beach" with "West Palm Beach"
Found city: "Westchester" in "Zika virus confirmed in Westchester"
Found city: "Winter Park" in "Zika spreads to Winter Park"
Found city: "Alpharetta" in "West Nile Virus Exposure in Alpharetta"
Found city: "Atlanta" in "The CDC in Atlanta is Growing Worried"
Found city: "Canton" in "More Patients in Canton are G

Found city: "East Moline" in "Will Gonorrhea vaccine help East Moline?"
Found city: "Joliet" in "Rumors about Rotavirus Spreading in Joliet have been Refuted"
Found city: "Lansing" in "The Spread of Hepatitis E in Lansing has been Confirmed"
Found city: "Urbana" in "Urbana Encounters Severe Symptoms of Meningitis"
Found city: "Crawfordsville" in "More Patients in Crawfordsville are Getting Diagnosed with Rabies"
Found city: "South Bend" in "How to Avoid Hepatitis E in South Bend"
Found city: "Belmont" in "Zika in Belmont, Belmont worry"
Found city: "Danvers" in "Hepatitis D Keeps Spreading in Danvers"
Found city: "Framingham" in "Framingham Residents Receive Measles vaccine"
Found city: "Medford" in "How to Avoid Rhinovirus in Medford"
Replacing previously found city "Bedford" in "Rumors about Mumps Spreading in New Bedford have been Refuted" with "New Bedford"
Found city: "Rockland" in "Zika virus case reported in Rockland"
Found city: "Detroit" in "Varicella Keeps Spreading in Detroi

Found city: "Caracas" in "Caracas patient dies"
Replacing previously found city "Croix" in "Saint Croix under Zika threat" with "Saint Croix"
Replacing previously found city "Ho" in "Zika cases in Vietnam's Ho Chi Minh City surge" with "Ho Chi Minh City"
Replacing previously found city "Trang" in "Nha Trang Zika Outbreak" with "Nha Trang"
Found city: "Hanoi" in "Hospitals in Hanoi fill up with Zika patients"
Found city: "Sanaa" in "Chlamydia Exposure in Sanaa"
Found city: "Springs" in "Tuberculosis re-emerges in Silver Springs"
Found city: "Johannesburg" in "Malaria is Spreading in Johannesburg"
Found city: "Johannesburg" in "Johannesburg Patient in Critical Condition after Contracting Pneumonia"
Found city: "Mpika" in "Mpika authorities confirmed the spread of Chikungunya"
Found city: "Kitwe" in "More people in Kitwe are infected with Respiratory Syncytial Virus every year"
Found city: "Harare" in "Harare is infested with Pneumonia"

Executed v2 in 19.301241800000007 seconds


In [8]:
columns = ["headline","countries","cities"]
df = pd.DataFrame.from_records(result_v2, columns=columns)
df.head()

Unnamed: 0,headline,countries,cities
0,Zika Outbreak Hits Miami,,Miami
1,Could Zika Reach New York City?,,New York City
2,First Case of Zika in Miami Beach,,Miami Beach
3,"Mystery Virus Spreads in Recife, Brazil",Brazil,Recife
4,Dallas man comes down with case of Zika,,Dallas
