# Setup

### Import dependencies

In [1]:
import pandas as pd
import csv
import datetime

### Set file locations

In [2]:
# File location for import.
# Exactly one `raw_data_file` assignment should be active; the commented-out lines are
# alternative race result files kept for quick switching.
# NOTE(review): the mapping cells further down read `location`, `latitude`, `longitude` and
# `address`, so presumably only a geocoded file supports them — verify before switching.
# raw_data_file = "../00_data/raw_data/a70414_finishers.csv" # 2007 Brooklyn Half
# raw_data_file = "../00_data/raw_data/19nyc60_finishers.csv" # 2019 NYC 60K
# raw_data_file = "../00_data/raw_data/20WH5K_finishers.csv" # 2020 Washington Heights 5K
raw_data_file = "../00_data/geodata/19nyc60_finishers_geocoded.csv" # 2019 NYC 60K with geocoding

# file locations for export

# Import Data

### Import data from csv

In [3]:
# Load the finisher results from the CSV selected in `raw_data_file`.
df = pd.read_csv(raw_data_file)

In [4]:
# List every column with its non-null count and dtype to see what was loaded.
df.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375 entries, 0 to 374
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   name                   375 non-null    object 
 1   geo_subregion          375 non-null    object 
 2   country                375 non-null    object 
 3   gender                 375 non-null    object 
 4   age                    375 non-null    int64  
 5   bib                    375 non-null    int64  
 6   team                   183 non-null    object 
 7   official_time          375 non-null    object 
 8   pace_per_mile          375 non-null    object 
 9   place_overall          375 non-null    int64  
 10  place_gender           375 non-null    int64  
 11  age_group              375 non-null    object 
 12  place_age-group        375 non-null    int64  
 13  country_group          375 non-null    object 
 14  place_country          375 non-null    int64  
 15  place_

In [5]:
# Discard a leading 'Unnamed: 0' column when the file has one
# (presumably an index saved by an earlier export — confirm against the source CSV).
if 'Unnamed: 0' == df.columns[0]:
    df = df.drop(columns=[df.columns[0]])

In [6]:
# Preview the first rows of the loaded results.
df.head()

Unnamed: 0,name,geo_subregion,country,gender,age,bib,team,official_time,pace_per_mile,place_overall,...,time_age‐graded,percentage_age‐graded,net_time,net_place,long_country,address,full_address,location,latitude,longitude
0,Bobby Asher,"Bronx, NY",USA,M,34,275,Van Cortlandt TC,5:21:44,08:38,29,...,0:00:00,0%,5:21:44,29,United States,"Bronx, NY United States","The Bronx, Bronx County, New York, United States","(40.8466508, -73.8785937)",40.846651,-73.878594
1,Manuel Romero,"New York, NY",USA,M,48,292,Front Runners NY,5:58:35,09:38,75,...,0:00:00,0%,5:58:35,75,United States,"New York, NY United States","New York, United States","(40.7127281, -74.0060152)",40.712728,-74.006015
2,Pierre Rousseau,Montreal,CAN,M,56,53,,5:26:59,08:47,35,...,0:00:00,0%,5:26:59,35,Canada,Montreal Canada,"Montréal, Agglomération de Montréal, Montréal ...","(45.4972159, -73.6103642)",45.497216,-73.610364
3,Deborah McDuffie-Saat,"New York, NY",USA,F,61,369,New York Flyers,7:49:42,12:36,265,...,0:00:00,0%,7:49:42,265,United States,"New York, NY United States","New York, United States","(40.7127281, -74.0060152)",40.712728,-74.006015
4,Robert Wilson,"Bronx, NY",USA,M,41,282,,6:09:54,09:56,86,...,0:00:00,0%,6:09:54,86,United States,"Bronx, NY United States","The Bronx, Bronx County, New York, United States","(40.8466508, -73.8785937)",40.846651,-73.878594


### Manipulating time observations

In [7]:
# Parse the finish-time strings (e.g. '5:21:44') into timedeltas so they can be compared and subtracted.
df['official_time'] = pd.to_timedelta(df['official_time'])

In [8]:
# pd.to_timedelta needs arguments in the form 'HH:MM:SS', which means we need to add '00:' to the front of the
# 'MM:SS' pace per mile times. This assumes a pace of less than 1 hour per mile, which is a safe assumption.
# The dtype guard keeps this cell safe to re-run: once the column is already timedelta64, its string form
# (e.g. '0 days 00:08:38') with '00:' prepended is no longer a valid 'HH:MM:SS' value.
if not pd.api.types.is_timedelta64_dtype(df['pace_per_mile']):
    df['pace_per_mile'] = pd.to_timedelta('00:' + df['pace_per_mile'].astype(str))

In [9]:
# Re-check the dtypes: official_time and pace_per_mile should now be timedelta64[ns].
df.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375 entries, 0 to 374
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype          
---  ------                 --------------  -----          
 0   name                   375 non-null    object         
 1   geo_subregion          375 non-null    object         
 2   country                375 non-null    object         
 3   gender                 375 non-null    object         
 4   age                    375 non-null    int64          
 5   bib                    375 non-null    int64          
 6   team                   183 non-null    object         
 7   official_time          375 non-null    timedelta64[ns]
 8   pace_per_mile          375 non-null    timedelta64[ns]
 9   place_overall          375 non-null    int64          
 10  place_gender           375 non-null    int64          
 11  age_group              375 non-null    object         
 12  place_age-group        375 non-null    int64      

In [10]:
# Display the frame to eyeball the converted time columns.
df

Unnamed: 0,name,geo_subregion,country,gender,age,bib,team,official_time,pace_per_mile,place_overall,...,time_age‐graded,percentage_age‐graded,net_time,net_place,long_country,address,full_address,location,latitude,longitude
0,Bobby Asher,"Bronx, NY",USA,M,34,275,Van Cortlandt TC,05:21:44,00:08:38,29,...,0:00:00,0%,5:21:44,29,United States,"Bronx, NY United States","The Bronx, Bronx County, New York, United States","(40.8466508, -73.8785937)",40.846651,-73.878594
1,Manuel Romero,"New York, NY",USA,M,48,292,Front Runners NY,05:58:35,00:09:38,75,...,0:00:00,0%,5:58:35,75,United States,"New York, NY United States","New York, United States","(40.7127281, -74.0060152)",40.712728,-74.006015
2,Pierre Rousseau,Montreal,CAN,M,56,53,,05:26:59,00:08:47,35,...,0:00:00,0%,5:26:59,35,Canada,Montreal Canada,"Montréal, Agglomération de Montréal, Montréal ...","(45.4972159, -73.6103642)",45.497216,-73.610364
3,Deborah McDuffie-Saat,"New York, NY",USA,F,61,369,New York Flyers,07:49:42,00:12:36,265,...,0:00:00,0%,7:49:42,265,United States,"New York, NY United States","New York, United States","(40.7127281, -74.0060152)",40.712728,-74.006015
4,Robert Wilson,"Bronx, NY",USA,M,41,282,,06:09:54,00:09:56,86,...,0:00:00,0%,6:09:54,86,United States,"Bronx, NY United States","The Bronx, Bronx County, New York, United States","(40.8466508, -73.8785937)",40.846651,-73.878594
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370,Robert Katrinak,"Sandy Hook, CT",USA,M,56,223,,05:48:12,00:09:21,58,...,0:00:00,0%,5:48:12,58,United States,"Sandy Hook, CT United States","Sandy Hook, Elliott County, Kentucky, United S...","(38.0864739, -83.1262839)",38.086474,-83.126284
371,Kiyoshi Matsukawa,"New York, NY",USA,M,47,163,,06:18:37,00:10:10,103,...,0:00:00,0%,6:18:37,103,United States,"New York, NY United States","New York, United States","(40.7127281, -74.0060152)",40.712728,-74.006015
372,Rebecca Valencia,"Brooklyn, NY",USA,F,30,388,,06:20:03,00:10:12,106,...,0:00:00,0%,6:20:03,106,United States,"Brooklyn, NY United States","Brooklyn, Kings County, New York, United States","(40.6501038, -73.9495823)",40.650104,-73.949582
373,Bethanne Souza,"East Elmhurst, NY",USA,F,36,458,Hellgate RR,07:56:49,00:12:48,279,...,0:00:00,0%,7:56:49,279,United States,"East Elmhurst, NY United States","East Elmhurst, Queens, Queens County, New York...","(40.7612123, -73.8651358)",40.761212,-73.865136


In [11]:
# Find official time for a specific finisher.
# The row mask and the column label go in a single .loc call rather than chaining [] onto the
# .loc result, which is the form pandas recommends for label-based selection.
df.loc[df['name'] == 'Jack Craft', 'official_time']

341   07:07:39
Name: official_time, dtype: timedelta64[ns]

In [12]:
# Select every finisher whose official time lies strictly between 5 and 6 hours.
df[
    (pd.to_timedelta(5, unit = 'h') < df['official_time'])
    & (df['official_time'] < pd.to_timedelta(6, unit = 'h'))
]

Unnamed: 0,name,geo_subregion,country,gender,age,bib,team,official_time,pace_per_mile,place_overall,...,time_age‐graded,percentage_age‐graded,net_time,net_place,long_country,address,full_address,location,latitude,longitude
0,Bobby Asher,"Bronx, NY",USA,M,34,275,Van Cortlandt TC,05:21:44,00:08:38,29,...,0:00:00,0%,5:21:44,29,United States,"Bronx, NY United States","The Bronx, Bronx County, New York, United States","(40.8466508, -73.8785937)",40.846651,-73.878594
1,Manuel Romero,"New York, NY",USA,M,48,292,Front Runners NY,05:58:35,00:09:38,75,...,0:00:00,0%,5:58:35,75,United States,"New York, NY United States","New York, United States","(40.7127281, -74.0060152)",40.712728,-74.006015
2,Pierre Rousseau,Montreal,CAN,M,56,53,,05:26:59,00:08:47,35,...,0:00:00,0%,5:26:59,35,Canada,Montreal Canada,"Montréal, Agglomération de Montréal, Montréal ...","(45.4972159, -73.6103642)",45.497216,-73.610364
5,Julio Avalos,"New York, NY",USA,M,38,183,West Side Y,05:31:40,00:08:54,39,...,0:00:00,0%,5:31:40,39,United States,"New York, NY United States","New York, United States","(40.7127281, -74.0060152)",40.712728,-74.006015
10,Farah Visslailli,"Brooklyn, NY",USA,F,34,309,South Central Brooklyn Runners,05:54:11,00:09:30,71,...,0:00:00,0%,5:54:11,71,United States,"Brooklyn, NY United States","Brooklyn, Kings County, New York, United States","(40.6501038, -73.9495823)",40.650104,-73.949582
12,Qiang Chen,"Syosset, NY",USA,M,52,352,Misty Mountain Runners,05:28:13,00:08:49,38,...,0:00:00,0%,5:28:13,38,United States,"Syosset, NY United States","Syosset, Oyster Bay, Nassau County, New York, ...","(40.818714150000005, -73.50015652837524)",40.818714,-73.500157
13,David Law,"Corona, NY",USA,M,32,273,Queens Distance Runners,05:52:50,00:09:28,66,...,0:00:00,0%,5:52:50,66,United States,"Corona, NY United States","Corona, Queens, Queens County, New York, 11368...","(40.7469593, -73.8601456)",40.746959,-73.860146
20,Conrado Bermudez,"Jersey City, NJ",USA,M,46,23,,05:16:53,00:08:30,25,...,0:00:00,0%,5:16:53,25,United States,"Jersey City, NJ United States","Jersey City, Hudson County, New Jersey, United...","(40.7281575, -74.0776417)",40.728158,-74.077642
21,Prelja Sinistaj,"Kew Gardens, NY",USA,M,57,202,47 American Sign Language & English HS,05:36:52,00:09:03,48,...,0:00:00,0%,5:36:52,48,United States,"Kew Gardens, NY United States","Kew Gardens, Queens, Queens County, New York, ...","(40.7139415, -73.830742)",40.713941,-73.830742
22,Nicolas Wical,"Brooklyn, NY",USA,M,35,12,Prospect Park Track Club,05:22:40,00:08:40,31,...,0:00:00,0%,5:22:40,31,United States,"Brooklyn, NY United States","Brooklyn, Kings County, New York, United States","(40.6501038, -73.9495823)",40.650104,-73.949582


In [13]:
# Reference value: bounds compared against `official_time` need to be Timedelta objects like this one.
pd.to_timedelta(30, unit = 'm')

Timedelta('0 days 00:30:00')

In [14]:
# Build a time window of +/- `half_window` around one runner's official finish time.
target_runner = 'Jack Craft'
half_window = pd.to_timedelta(10, unit = 'm')

# Row mask and column label go in one .loc call (no chained indexing).
target_times = df.loc[df['name'] == target_runner, 'official_time']
if target_times.empty:
    # Without this guard an unknown name only surfaces as a bare IndexError from the `[0]` lookup.
    raise ValueError(f"No finisher named {target_runner!r} was found in the results.")

# If several finishers share the name, the first match is used.
window_center = pd.to_timedelta(target_times.iloc[0])
window_max = window_center + half_window
window_min = window_center - half_window

In [15]:
# Confirm the half-window is the 10 minutes set above.
half_window

Timedelta('0 days 00:10:00')

In [16]:
# Confirm the window is centered on the target runner's official time.
window_center

Timedelta('0 days 07:07:39')

In [17]:
# Upper edge of the window: center plus the half-window.
window_max

Timedelta('0 days 07:17:39')

In [18]:
# Lower edge of the window: center minus the half-window.
window_min

Timedelta('0 days 06:57:39')

In [19]:
# Keep the finishers whose official time falls strictly inside (window_min, window_max).
# The target runner's own row is included, because their time is the window center.
finish_buddies_df = df.loc[
    (window_min < df['official_time']) & (df['official_time'] < window_max)
]
finish_buddies_df

Unnamed: 0,name,geo_subregion,country,gender,age,bib,team,official_time,pace_per_mile,place_overall,...,time_age‐graded,percentage_age‐graded,net_time,net_place,long_country,address,full_address,location,latitude,longitude
14,Ki Chi Chi,"Brooklyn, NY",USA,M,39,525,,07:02:22,00:11:20,188,...,0:00:00,0%,7:02:22,188,United States,"Brooklyn, NY United States","Brooklyn, Kings County, New York, United States","(40.6501038, -73.9495823)",40.650104,-73.949582
16,Al Shabana,"New York, NY",USA,M,48,121,,07:12:21,00:11:36,203,...,0:00:00,0%,7:12:21,203,United States,"New York, NY United States","New York, United States","(40.7127281, -74.0060152)",40.712728,-74.006015
36,John Hagel,"Washington, DC",USA,M,39,222,,07:01:08,00:11:18,185,...,0:00:00,0%,7:01:08,185,United States,"Washington, DC United States","Washington, District of Columbia, United States","(38.8949924, -77.0365581)",38.894992,-77.036558
39,Maciej Macak,"Brooklyn, NY",USA,M,52,143,POLSKA Running Team,07:15:13,00:11:41,212,...,0:00:00,0%,7:15:13,212,United States,"Brooklyn, NY United States","Brooklyn, Kings County, New York, United States","(40.6501038, -73.9495823)",40.650104,-73.949582
40,Kaitlin Peretto,"Collinsville, CT",USA,F,33,430,,07:07:00,00:11:28,195,...,0:00:00,0%,7:07:00,195,United States,"Collinsville, CT United States","Collinsville, Madison County, Illinois, 62234,...","(38.6703267,-89.9845476)",38.670327,-89.984548
63,Yasia Sorbo,"Staten Island, NY",USA,F,44,315,Staten Island AC,07:03:18,00:11:22,189,...,0:00:00,0%,7:03:18,189,United States,"Staten Island, NY United States","Staten Island, Richmond County, New York, Unit...","(40.5834557, -74.1496048)",40.583456,-74.149605
64,Ari Gonzales,"Middlesex, NJ",USA,M,43,524,Fil Am Tri,07:17:29,00:11:45,223,...,0:00:00,0%,7:17:29,223,United States,"Middlesex, NJ United States","Middlesex, Middlesex County, New Jersey, 08846...","(40.5734911, -74.494037)",40.573491,-74.494037
68,Keith Binder,"Patterson, NY",USA,M,46,119,,07:09:09,00:11:31,199,...,0:00:00,0%,7:09:09,199,United States,"Patterson, NY United States","Patterson, 1, Front Street, Patterson, Town of...","(41.5117325, -73.6043034)",41.511733,-73.604303
77,Dmitry Zlotsky,"Livingston, NJ",USA,M,59,158,,07:16:37,00:11:43,219,...,0:00:00,0%,7:16:37,219,United States,"Livingston, NJ United States","Livingston, Essex County, New Jersey, 07039, U...","(40.7959335, -74.3148713)",40.795934,-74.314871
82,Benjamin Sarsgard,"Baltimore, MD",USA,M,40,261,,07:00:51,00:11:18,184,...,0:00:00,0%,7:00:51,184,United States,"Baltimore, MD United States","Baltimore, Maryland, United States","(39.2908816, -76.610759)",39.290882,-76.610759


In [20]:
### Map the finish buddies!
import matplotlib.pyplot as plt
import folium

In [21]:
# Only rows that have a value in "location" are kept for the map.
mapping_df = finish_buddies_df.loc[
    pd.notnull(finish_buddies_df["location"])
]

In [22]:
# Base map centered on the hard-coded coordinates below, using the 'cartodbpositron' tiles.
map1 = folium.Map(location=[40.7128, -74.0060], tiles='cartodbpositron', zoom_start=12)

# One circle marker per finish buddy, placed at their latitude/longitude with the address as tooltip.
for lat, lon, label in zip(mapping_df["latitude"], mapping_df["longitude"], mapping_df["address"]):
    folium.CircleMarker(location=[lat, lon], tooltip=label).add_to(map1)

map1

## Where are runners from?
(This was moved to the 'Geocoder' notebook.)

# Checking completeness

I would like to know whether the race data is complete, i.e., whether every finisher has been captured. We can check whether the place numbers form an unbroken numeric sequence from 1st place to last place; there should be no gaps. There may be duplicate or triplicate place numbers because different classifications (runners, wheelchair, handcycle) are mixed together.

Some ideas:
1. Ignore the duplicates and look for gaps only. This could miss a gap among the runners if a wheelchair athlete's place number fills it, and vice versa.
2. Check to make sure the duplicates are also sequential, i.e. there are no gaps in the duplicates.
3. Infer athlete classification during scraping. Note that athlete classification is not explicitly stated in the race results. See https://results.nyrr.org/event/M2019/result/251 as an example.
4. Infer athlete classification during cleaning. See https://results.nyrr.org/event/M2019/result/251, https://results.nyrr.org/event/M2019/result/101, and https://results.nyrr.org/event/M2019/result/341 as examples.

### Finding Gaps

In [23]:
def missing_elements(L):
    """Return the integers absent from ``L`` between its smallest and largest value.

    Duplicates and the order of ``L`` are ignored.

    Parameters
    ----------
    L : iterable of int
        Values to check, e.g. finishing places or bib numbers.

    Returns
    -------
    list of int
        Every integer strictly between ``min(L)`` and ``max(L)`` that does not occur in ``L``,
        in ascending order. Empty when ``L`` is empty or has no gaps.
    """
    present = set(L)
    if not present:
        # min()/max() raise ValueError on an empty sequence; no values means no gaps.
        return []
    # Walking the range in order keeps the result sorted without depending on set iteration order.
    return [n for n in range(min(present), max(present) + 1) if n not in present]

In [24]:
import random

# Fixtures for missing_elements; the expected gaps are noted beside each list.
L1 = [10,11,13,14,15,16,17,19,20] # missing 12 and 18
L2 = [10,11,13,14,15,16,16,17,18,20] # missing 12 and 19, has an extra 16
L3 = L1 + L2 # missing 12; two of everything else except 18 and 19 (one each) and 16 (three)
# Shuffle with a private, seeded generator so the fixture order is the same on every run
# and the global random state is left untouched.
random.Random(0).shuffle(L3)

In [25]:
# Test it out.
# The asserts pin the expected gaps for each list, so a regression fails the cell instead of
# relying on someone reading the printed output. Results are sorted before comparing because
# only the set of missing values matters here.
assert sorted(missing_elements(L1)) == [12, 18]
assert sorted(missing_elements(L2)) == [12, 19]
assert sorted(missing_elements(L3)) == [12]

print(f"List 1 result:{missing_elements(L1)}")
print(f"List 2 result:{missing_elements(L2)}")
print(f"List 3 result:{missing_elements(L3)}")

List 1 result:[12, 18]
List 2 result:[12, 19]
List 3 result:[12]


In [26]:
# Try it on the list of finisher places. This should be empty if we found all the finishers.
# missing_elements only scans between the smallest and largest value it is given, so a 0 sentinel
# is prepended: the scan then starts below 1st place, and a missing 1st place is reported too.
# (Missing places after the last captured finisher still cannot be detected from the places alone.)
print(missing_elements([0] + df.place_overall.tolist()))

[]


In [27]:
# Run the same gap check on the bib numbers. Unlike the places, this should not come back empty.
bib_gaps = missing_elements(df['bib'].tolist())
print(bib_gaps)

[2, 3, 10, 13, 14, 15, 16, 17, 20, 26, 30, 36, 39, 46, 61, 62, 67, 72, 83, 85, 88, 90, 91, 94, 100, 102, 105, 111, 118, 122, 125, 128, 129, 130, 131, 132, 135, 136, 137, 139, 151, 152, 159, 173, 181, 193, 195, 200, 201, 212, 215, 217, 224, 225, 226, 240, 247, 248, 250, 253, 255, 260, 267, 276, 281, 285, 287, 288, 294, 295, 297, 298, 301, 313, 320, 321, 323, 325, 329, 335, 340, 342, 356, 376, 377, 379, 380, 387, 391, 399, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 431, 435, 439, 440, 442, 443, 444, 446, 447, 448, 449, 452, 456, 459, 460, 461, 465, 472, 475, 485, 487, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 504, 513, 514, 515, 519, 526]
