# Setup

### Import Dependencies

In [35]:
import pandas as pd
import geopandas as gpd
import geopy
import numpy as np
import re
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import matplotlib.pyplot as plt
import folium
from folium.plugins import FastMarkerCluster

# Import Data

### Import the full finishers list

In [36]:
# bring in the csv
df = pd.read_csv("../00_data/raw_data/M2019_finishers.csv")
df.head()

Unnamed: 0,name,geo_subregion,country,gender,age,bib,team,official_time,pace_per_mile,place_overall,...,20m,21m,35k,22m,23m,24m,40k,25m,26m,mar
0,Michel Butter,Castricum,NLD,M,33,7,New Balance,2:25:06,05:33,37,...,1:43:56,1:50:22,1:55:04,1:56:42,2:03:26,2:10:32,2:16:06,2:17:07,2:23:41,2:25:06
1,Geoffrey Kamworor,Kapchorwa District,KEN,M,26,3,NIKE,2:08:13,04:54,1,...,1:38:59,1:44:07,1:47:34,1:48:44,1:53:20,1:57:59,2:01:48,2:02:30,2:07:11,2:08:13
2,Jack Rayner,Melbourne,AUS,M,23,9,NIKE,2:16:58,05:14,22,...,1:40:59,1:46:40,1:50:55,1:52:22,1:58:10,2:04:07,2:08:58,2:09:51,2:15:39,2:16:58
3,Stephen Sambu,"Tucson, AZ",USA,M,31,10,NIKE,2:11:11,05:01,7,...,1:39:12,1:44:15,1:47:58,1:49:13,1:54:16,1:59:39,2:03:59,2:04:46,2:09:58,2:11:11
4,Jared Ward,"Mapleton, UT",USA,M,31,6,Saucony,2:10:45,05:00,6,...,1:39:13,1:44:15,1:47:57,1:49:12,1:54:16,1:59:35,2:03:47,2:04:34,2:09:37,2:10:45


In [37]:
df.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53639 entries, 0 to 53638
Data columns (total 54 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   name                   53639 non-null  object
 1   geo_subregion          53622 non-null  object
 2   country                53639 non-null  object
 3   gender                 53639 non-null  object
 4   age                    53639 non-null  int64 
 5   bib                    53639 non-null  int64 
 6   team                   9282 non-null   object
 7   official_time          53639 non-null  object
 8   pace_per_mile          53639 non-null  object
 9   place_overall          53639 non-null  object
 10  place_gender           53639 non-null  object
 11  age_group              53639 non-null  object
 12  place_age-group        53639 non-null  object
 13  country_group          53637 non-null  object
 14  place_country          53639 non-null  object
 15  place_age‐graded   

### Set up country coding assistant

In [38]:
class GroupMap:
    def __init__(self):
        self.data = {}
        
    # Based on discussion here: https://stackoverflow.com/questions/55892600/python-triplet-dictionary
    # Takes in a dictionary and creates an identical dictionary under each value in the original.
    # This provides a means to look up keys from values.
    # For example, {'a':'apple', 'b':'butter', 'c':'cocoa'} is stored as
    # {'apple':{'a':'apple', 'b':'butter', 'c':'cocoa'},
    # 'butter':{'a':'apple', 'b':'butter', 'c':'cocoa'},
    # 'cocoa':{'a':'apple', 'b':'butter', 'c':'cocoa'}}
    # so, self['butter'] returns {'a':'apple', 'b':'butter', 'c':'cocoa'}, 
    # and self['butter']['a'] returns 'apple'.
    # Be warned, newer entries overwrite older ones.

    def add(self, group):
        for thing in group.keys():
            self.data[group[thing]] = group

    def __getitem__(self, item):
        return self.data[item]

In [39]:
# Read in the conversion table from Wikipedia.
temp_df = pd.read_html('https://en.wikipedia.org/wiki/Comparison_of_alphabetic_country_codes')[0]    


# Tidy up the columm headings.
temp_df = temp_df[['Country', 'IOC', 'ISO']]
temp_df = temp_df.rename(columns={'IOC': 'IOC_code', 'ISO': 'ISO_code'})

# Drop the footnote citations from the table text.
citation_pattern = re.compile('\[[1-9][0-9]?\]')

temp_df['Country'] = temp_df['Country'].str.replace(citation_pattern,'')

# Drop the parenthetical notes from the table text. (Note the leading space.)
parenthetical_pattern = re.compile(' \(.+\)')

temp_df['Country'] = temp_df['Country'].str.replace(parenthetical_pattern,'')

# Reorder the comma separated entries.
temp_df['Country'] = temp_df['Country'].str.split(', ').apply(lambda x: ' '.join(x[::-1]))


In [40]:
country_code_trictionary = GroupMap()

thang = temp_df.to_dict('records')

for record in thang:
    country_code_trictionary.add(record)

In [41]:
def country_coder(country_query, desired_code):
    try:
        output = country_code_trictionary[country_query][desired_code]
        return output
    except:
        return None

# Data Prep and Cleaning

In [42]:
# First we need to correct some known errors in the data.

# Countries

# Some addresses for China (CHN) are mis-coded as 'CHI'.
df['country'] = df['country'].str.replace('CHI','CHN')

# Some addresses for Romania (ROU) are mis-coded as 'ROM'.
df['country'] = df['country'].str.replace('ROM','ROU')

# Some addresses for the Democratic Republic of the Congo (COD) are mis-coded as 'Dem'
df['country'] = df['country'].str.replace('Dem','COD')

# One address for the USA (USA) is mis-coded as 'Sai'
df['country'] = df['country'].str.replace('Sai','USA')

# Geo_subregions

# The periods in 'L.I.C.' generate errors.
df['geo_subregion'] = df['geo_subregion'].str.replace('L.I.C.','Long Island City')

In [43]:
# How many countries are represented?
len(df['country'].unique()) 

137

In [44]:
# Now convert the country abbreviations to the expanded country names.
# The geocoder is a lot more successful with expanded names.

df['long_country'] = df['country'].apply(lambda country: country_coder(country, 'Country'))

In [45]:
# How many records were not expandable?

df.long_country.isnull().sum()

0

# Geocoding

### Pull in known addresses

In [46]:
address_list_df = pd.read_csv("address_cache.csv")
address_list_df.head()

Unnamed: 0,address,full_address,location
0,Castricum Netherlands,"Castricum, Noord-Holland, Nederland","(52.558830549999996, 4.639675526200153)"
1,Kapchorwa District Kenya,"District Farm Institute, Sironko Kapchorwa Roa...","(1.2992853, 34.3193213)"
2,Melbourne Australia,"Melbourne, City of Melbourne, Victoria, 3000, ...","(-37.8142176, 144.9631608)"
3,"Tucson, AZ United States","Tucson, Pima County, Arizona, United States","(32.2228765, -110.9748477)"
4,"Mapleton, UT United States","Mapleton, Utah County, Utah, 84664, United States","(40.1302338, -111.5785281)"


### Merge known addresses to the finisher data

In [47]:
# construct an 'address' for geocoding purposes
df['address'] = df['geo_subregion'] + " " + df['long_country']

# Merge with the cached address list.
df = df.merge(address_list_df, how='left', on='address')

In [48]:
df.head()

Unnamed: 0,name,geo_subregion,country,gender,age,bib,team,official_time,pace_per_mile,place_overall,...,23m,24m,40k,25m,26m,mar,long_country,address,full_address,location
0,Michel Butter,Castricum,NLD,M,33,7,New Balance,2:25:06,05:33,37,...,2:03:26,2:10:32,2:16:06,2:17:07,2:23:41,2:25:06,Netherlands,Castricum Netherlands,"Castricum, Noord-Holland, Nederland","(52.558830549999996, 4.639675526200153)"
1,Geoffrey Kamworor,Kapchorwa District,KEN,M,26,3,NIKE,2:08:13,04:54,1,...,1:53:20,1:57:59,2:01:48,2:02:30,2:07:11,2:08:13,Kenya,Kapchorwa District Kenya,"District Farm Institute, Sironko Kapchorwa Roa...","(1.2992853, 34.3193213)"
2,Jack Rayner,Melbourne,AUS,M,23,9,NIKE,2:16:58,05:14,22,...,1:58:10,2:04:07,2:08:58,2:09:51,2:15:39,2:16:58,Australia,Melbourne Australia,"Melbourne, City of Melbourne, Victoria, 3000, ...","(-37.8142176, 144.9631608)"
3,Stephen Sambu,"Tucson, AZ",USA,M,31,10,NIKE,2:11:11,05:01,7,...,1:54:16,1:59:39,2:03:59,2:04:46,2:09:58,2:11:11,United States,"Tucson, AZ United States","Tucson, Pima County, Arizona, United States","(32.2228765, -110.9748477)"
4,Jared Ward,"Mapleton, UT",USA,M,31,6,Saucony,2:10:45,05:00,6,...,1:54:16,1:59:35,2:03:47,2:04:34,2:09:37,2:10:45,United States,"Mapleton, UT United States","Mapleton, Utah County, Utah, 84664, United States","(40.1302338, -111.5785281)"


In [52]:
df['location'][0].split(', ')[1]

'4.639675526200153)'

In [14]:
# How many records still need geocoding?

df.location.isnull().sum()

694

### Create an address list for geocoding

In [15]:
# build a list of unique addresses so that we can spend less time making geocode requests.
address_list = df.loc[df['location'].isnull(), 'address'].unique()
len(address_list)

535

In [16]:
address_list

array(['Kapkitony Kenya', "New Taipei City People's Republic of China",
       'O Fallon, MO United States', 'Chesitek Kenya', nan,
       "Hsinchu City People's Republic of China", 'Fpo, AP United States',
       'København Sv Denmark', "Miaoli County People's Republic of China",
       'Loenen Gld Netherlands', 'Seoul, Mapo-gu United States',
       'Nakorn Ratchasima Thailand', 'North Plainfiel, NJ United States',
       'Steeplechase Corner Costa Rica', 'Rocca D Evandro Italy',
       'Segataya-Ku Japan', "Kibbutz Mishmar Ha'emek Israel",
       'Fort Lauderdalefort Lauderdale, FL United States',
       'New York City Uzbekistan', 'Castelldfels Spain',
       'Saint Pierre D Oleron France',
       'La Plata, Buenos aires United States', 'Zargoza Spain',
       'Tavagnacco Ud Italy', 'Ballerup, Denmark United States',
       'Amsterdam-Zuid Antigua and Barbuda', 'Maspalmos Spain',
       'Seeblick Ot Wassersuppe Germany',
       'Buenos Aires Argentina, Martinez United States',
    

In [17]:
query_df = pd.DataFrame(address_list)
query_df.to_csv('..00_data/geodata/list.csv')

### Commence geocoding

In [18]:
# set up the geocoder

locator = Nominatim(user_agent="myGeocoder")
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

In [19]:
location_dict = {}
problem_locations = []
counter = 0
goal = len(address_list)

for address in address_list:
    counter += 1
    try:
        location_result = geocode(address)
        if location_result == None:
            problem_locations.append(address)
            print("{} of {}. No location found for {}.".format(counter, goal, address))
        else:
            location_dict[address] = geocode(address)
            print("{} of {}. Found location for {}.".format(counter, goal, address))
    except:
        problem_locations.append(address)
        print("{} of {}. No location found for {}.".format(counter, goal, address))


1 of 535. No location found for Kapkitony Kenya.
2 of 535. No location found for New Taipei City People's Republic of China.
3 of 535. No location found for O Fallon, MO United States.
4 of 535. No location found for Chesitek Kenya.
5 of 535. Found location for nan.
6 of 535. No location found for Hsinchu City People's Republic of China.
7 of 535. No location found for Fpo, AP United States.
8 of 535. No location found for København Sv Denmark.
9 of 535. No location found for Miaoli County People's Republic of China.
10 of 535. No location found for Loenen Gld Netherlands.
11 of 535. No location found for Seoul, Mapo-gu United States.
12 of 535. No location found for Nakorn Ratchasima Thailand.
13 of 535. No location found for North Plainfiel, NJ United States.
14 of 535. No location found for Steeplechase Corner Costa Rica.
15 of 535. No location found for Rocca D Evandro Italy.
16 of 535. No location found for Segataya-Ku Japan.
17 of 535. No location found for Kibbutz Mishmar Ha'eme

130 of 535. No location found for Hamme Denmark.
131 of 535. No location found for Khuder Soum Mongolia.
132 of 535. No location found for Saint-Herblains France.
133 of 535. No location found for New York, État de New York United States.
134 of 535. No location found for Vitacura Chad.
135 of 535. No location found for Johannesburg, Missouriteng United States.
136 of 535. No location found for Distrito Nacional, Santo Domingo United States.
137 of 535. No location found for East Waterboro, QLD United States.
138 of 535. No location found for Guishan Dist. People's Republic of China.
139 of 535. No location found for Kyoto Mukoucity Japan.
140 of 535. No location found for New York, New YorkNY United States.
141 of 535. No location found for Kaohsung People's Republic of China.
142 of 535. No location found for Ulaanbaatar, Bayanzurkh district United States.
143 of 535. No location found for Taichung City People's Republic of China.
144 of 535. No location found for Neufgrange French P

255 of 535. No location found for Peterboroughbaston United Kingdom.
256 of 535. No location found for Hamburg Andorra.
257 of 535. No location found for Drancy Antarctica.
258 of 535. No location found for Bangsaothong Thailand.
259 of 535. No location found for Floral Park, Colombia United States.
260 of 535. No location found for Uniondale, riyadh United States.
261 of 535. No location found for Las Palmas - Islas Canarias Spain.
262 of 535. No location found for Bienrode-Waggum-Bevenrode Germany.
263 of 535. No location found for Bronx, État de New York United States.
264 of 535. No location found for Te Awamutu, Waikato United States.
265 of 535. No location found for Nara-Shi, Nara-Ken United States.
266 of 535. No location found for Snaoya Norway.
267 of 535. No location found for Rambouilllet France.
268 of 535. No location found for Condomínio Chácaras Do Alto Da Nova Campinas Brazil.
269 of 535. No location found for Bezirk Treptow-Köpenick Germany.
270 of 535. No location fo

383 of 535. No location found for Sebago Lake, QLD United States.
384 of 535. No location found for Margherita Di Savoia Jamaica.
385 of 535. No location found for Cumming Italy.
386 of 535. No location found for Herewegwijk En Helpman Netherlands.
387 of 535. No location found for New Rochelle, New orkN United States.
388 of 535. No location found for Bladelbladel Netherlands.
389 of 535. No location found for Dublin, Dun Laoghaire United States.
390 of 535. No location found for S.M. De Tucuman Argentina.
391 of 535. No location found for Stadtbezirke Iv Germany.
392 of 535. No location found for U.S Virgin Island, St. Thomas United States.
393 of 535. No location found for Brain Sur L Authion France.
394 of 535. No location found for Longs, South CarolinaJeff Leake United States.
395 of 535. No location found for Melbounre Australia.
396 of 535. No location found for Salles D Angles France.
397 of 535. No location found for Montugny Le Bretonneux France.
398 of 535. No location foun

508 of 535. No location found for Esbo, Finland United States.
509 of 535. No location found for Kualalumpur Malaysia.
510 of 535. No location found for Apo, Oost Vlaanderen United States.
511 of 535. No location found for Worcester, Buenos aires United States.
512 of 535. No location found for Framingham, Metro Manila United States.
513 of 535. No location found for Bodegraven, Holandia Południowa United States.
514 of 535. No location found for Fort Worth, UNITED STATES United States.
515 of 535. No location found for Amphoe Dan Khun Thot Thailand.
516 of 535. No location found for Hungtington, NY United States.
517 of 535. No location found for Taguig City, Metro Manila United States.
518 of 535. Found location for Thonburi Thailand.
519 of 535. Found location for Wishart Australia.
520 of 535. Found location for Oadby United Kingdom.
521 of 535. Found location for Pollestres France.
522 of 535. No location found for Clearwater Bay, Sai Kung People's Republic of China.
523 of 535. N

In [20]:
len(problem_locations)

530

In [21]:
problem_locations_df = pd.DataFrame(problem_locations)
problem_locations_df.to_csv('..00_data/geodata/problem_addresses.csv')

In [29]:
df['location'] = df['address'].apply(lambda address: location_dict[address] if address in location_dict else None)
df['latitude'] = df['location'].apply(lambda loc: loc.latitude if loc else None)
df['longitude'] = df['location'].apply(lambda loc: loc.longitude if loc else None)

In [24]:
df.location.isnull().sum()

53618

In [34]:
df['full_address'][0].loc

AttributeError: 'str' object has no attribute 'loc'

In [None]:
address_cache_df = pd.DataFrame.from_dict(location_dict)
address_cache_df = address_cache_df.transpose()

In [None]:
address_cache_df

In [None]:
address_cache_df = address_cache_df.reset_index()
address_cache_df.rename(columns={'index': 'address', 0: 'full_address', 1: 'location'}, inplace=True)

In [None]:
address_cache_df


In [None]:
# define the path and name for the output file
output_csv = "..00_data/geodata/address_cache.csv"

import csv

# We want to match the order of columns if we are appending new entries to an existing file.
### Because we are using a dictionary to capture the race results, and of the way dictionaries are ordered,
### It is possible for different scraping runs to attempt to order the columns differently.
### That would be confusing, so we force whatever order was established in the first run.
### (The bottom line is that it would really be a good idea to use a database here instead.)

# Try to read the header from the existing file. If they exist, match the column order.
try:
    with open(output_csv, 'r', encoding="utf-8") as f:
        reader = csv.reader(f)
        header_list = next(reader)
        # It is possible to be missing columns, so check to see if we have everything.
        for x in header_list:
            if x not in address_cache_df.columns:
                address_cache_df[x] = ''
        address_cache_df = address_cache_df[header_list]
# If we weren't able to open the file, we are going to create one from scratch.
except:
    print(f"{output_csv} does not exist, so I'll create it!")

# This either creates the file, or appends to it without headers. We already took steps to make sure
# the columns in finishers_df match the headers (if they exist).
with open(output_csv, 'a', encoding="utf-8") as f:
    address_cache_df.to_csv(f, header=f.tell()==0, index = False, line_terminator = '\n')

In [None]:
mapping_df = df[pd.notnull(df["latitude"])]

In [None]:
mapping_df.info(verbose = True)

In [None]:
df.info(verbose=True)

In [None]:
map1 = folium.Map(
    location=[40.7128, -74.0060],
    tiles='cartodbpositron',
    zoom_start=12,
)

In [None]:
mapping_df.apply(lambda row:folium.CircleMarker(location=[row["latitude"], row["longitude"]]).add_to(map1), axis=1)
map1

In [None]:
map1.save("map7.html")

In [None]:
folium_map = folium.Map(location=[40.7128, -74.0060],
                        zoom_start=12,
                        tiles='cartodbpositron')


FastMarkerCluster(data=list(zip(mapping_df['latitude'].values, mapping_df['longitude'].values))).add_to(folium_map)
folium.LayerControl().add_to(folium_map)
folium_map

In [None]:
folium_map.save("map8.html")