#### Complete data extraction from ITU API

In [2]:
import requests
import pandas as pd
import os
from collections import defaultdict
import re
import numpy as np
from scipy.special import binom
# Path hack.
import sys, os
sys.path.insert(0, os.path.abspath('..'))
# import helper functions for cleaning df
from src.helpers import *

### TODO
- have all years of data read in here
- have all years data cleaned and organized in a separate notebook
- get rid of mixed relay
- Hamburg MALE 2018 is missing!

### To get all race results, each race needs a `program ID` and an `event ID`

In [3]:
# The API key is read from the ITU_API environment variable.
password = os.environ['ITU_API']

# Events query (category 351, 2011-01-01 through 2019-12-31);
# the records it returns are where the event IDs come from.
url = "https://api.triathlon.org/v1/events?category_id=351&start_date=2011-01-01&end_date=2019-12-31"
headers = {'apikey': password}
response = requests.get(url, headers=headers)

# Only the 'data' member of the decoded JSON body is kept.
races_2010s = response.json()['data']

#### Create all the required directories to hold the races
- only need to run once

In [6]:
# One top-level folder per season. The range matches the seasons the
# download loop writes to (2009 through 2019).
dirs = [f"{year}_races" for year in range(2009, 2020)]

# Sub-folders inside every season folder: raw results, cleaned results,
# and the one-vs-one tables.
subdirs = ("races", "races_clean", "ovo_races")

for season_dir in dirs:
    for subdir in subdirs:
        # makedirs also creates the season folder itself when it is missing,
        # and exist_ok=True makes re-running this cell harmless.
        os.makedirs(os.path.join(season_dir, subdir), exist_ok=True)

#### Program ID for all WTS races  - do this once

In [4]:
# Statistics query grouped by event name, program id and program name.
# Its result rows are the lookup used to find the program IDs of the races.
url = "https://api.triathlon.org/v1/statistics/results?analysis=count_unique&target_property=event.name&group_by=event.name|program.id|program.name"
headers = {'apikey': password}
response = requests.get(url, headers=headers)

# Drill down to the list of grouped rows in the decoded JSON body.
all_events = response.json()['data']['result']

In [38]:
def _race_location(event_title):
    """Return the short race label used in the output file names.

    The label is the last word of ``event_title`` (the word must be preceded
    by whitespace). When that word is 'Triathlon', the second
    space-separated token of the title is used instead.

    Raises:
        IndexError: if the title does not end in a whitespace-preceded word.
    """
    # Raw string: '\s' and '\w' are invalid escape sequences in a plain literal.
    location = re.findall(r'\s(\w+)$', event_title)[0]
    if location == 'Triathlon':
        location = event_title.split(' ')[1]
    return location


def _results_to_frame(race_result, prog_id):
    """Build a DataFrame with one row per athlete record in ``race_result``.

    Args:
        race_result: iterable of athlete dicts, as found under
            ``['data']['results']`` in the results response.
        prog_id: program ID written into every row.

    Returns:
        pandas.DataFrame with the columns program_id, athlete_id,
        athlete_first, athlete_last, nationality, start_number, swim, t1,
        bike, t2, run, position, total_time (in that order). An empty
        ``race_result`` gives an empty frame without columns.
    """
    # Dictionary of lists: one list per output column.
    race = defaultdict(list)
    for athlete in race_result:
        # The five entries of 'splits' are read positionally as
        # swim, t1, bike, t2, run.
        splits = athlete['splits']
        race['program_id'].append(prog_id)
        race['athlete_id'].append(athlete['athlete_id'])
        race['athlete_first'].append(athlete['athlete_first'])
        race['athlete_last'].append(athlete['athlete_last'])
        race['nationality'].append(athlete['athlete_noc'])
        race['start_number'].append(athlete['start_num'])
        race['swim'].append(splits[0])
        race['t1'].append(splits[1])
        race['bike'].append(splits[2])
        race['t2'].append(splits[3])
        race['run'].append(splits[4])
        race['position'].append(athlete['position'])
        race['total_time'].append(athlete['total_time'])
    return pd.DataFrame(race)


year_list = list(range(2009, 2020))

# The API key header is identical for every request, so build it once.
headers = {'apikey': password}

# Running time-period counter handed to one_vs_one (truncated to int there).
period = 1
for year in year_list:
    # Once the counter has moved past its start value, jump it by 5
    # before processing the next year.
    if period > 1:
        period += 5

    # get the Event ID of races from that year!
    url = f"https://api.triathlon.org/v1/events?category_id=351&start_date={year}-01-01&end_date={year}-12-31"
    response = requests.request("GET", url, headers=headers)
    races = response.json()['data']

    # dict to store event and program ID of all races during the season:
    # event title -> [event_id, (program_id, program_name), ...]
    season_dict = defaultdict(list)

    # get race name and race event ID, skipping any title containing 'Relay'
    for race in races:
        if 'Relay' not in race['event_title']:
            season_dict[race['event_title']].append(race['event_id'])

    # map race event ID with program ID
    for event in all_events:
        if event['event.name'] in season_dict:
            season_dict[event['event.name']].append((event['program.id'], event['program.name']))

    # have program and event ID from all races, so can get and store all results
    for key, values in season_dict.items():
        # Only events with exactly three entries (the event ID plus two
        # programs) are processed; everything else is skipped.
        if len(values) != 3:
            continue

        race_name = _race_location(key)
        event_id = values[0]

        for program_id in (values[1], values[2]):
            prog_id = program_id[0]
            # NOTE(review): every program whose name is not exactly
            # 'Elite Women' is labelled 'male'.
            gender = 'female' if program_id[1] == 'Elite Women' else 'male'

            # get those specific results
            url = f"https://api.triathlon.org/v1/events/{event_id}/programs/{prog_id}/results"
            response = requests.request("GET", url, headers=headers)
            race_result = response.json()['data']['results']

            # turn the per-athlete records into a df
            race_df = _results_to_frame(race_result, prog_id)

            # clean df (helper imported from src.helpers)
            # NOTE(review): 'run' is not in the column list passed here -- confirm intended.
            clean_df = clean_dataframe(race_df, ['swim', 't1', 'bike', 't2', 'total_time'])

            # ovo DF (for use in glicko func); helper imported from src.helpers
            ovo_df = one_vs_one(race_df, int(period))

            # write all to file
            race_df.to_csv(f"{year}_races/races/{race_name}_{gender}.csv", index = False)
            clean_df.to_csv(f"{year}_races/races_clean/{race_name}_{gender}.csv", index = False)
            ovo_df.to_csv(f"{year}_races/ovo_races/{race_name}_{gender}.csv", index = False)

            # update time period of race
            # adding half so every two races (M and F) iterates by 1
            period += 0.5

In [39]:
def test_time_period():
    """
    Check a few cases to make sure the time period is working.

    For each listed one-vs-one CSV file, the first cell (first row, first
    column) is compared with the expected time period.

    Raises:
        AssertionError: if the first cell of a file differs from the
            expected value.
    """
    # CSV path -> expected value of the first cell.
    expected_periods = {
        "2009_races/ovo_races/Coast_male.csv": 8,
        "2009_races/ovo_races/Coast_female.csv": 8,
        "2010_races/ovo_races/Sydney_male.csv": 14,
        "2010_races/ovo_races/Sydney_female.csv": 14,
        "2011_races/ovo_races/Sydney_male.csv": 26,
        "2012_races/ovo_races/Sydney_male.csv": 39,
    }

    for csv_path, expected in expected_periods.items():
        ovo_df = pd.read_csv(csv_path)
        # .iloc[0, 0] addresses the cell purely by position, without going
        # through an intermediate row Series.
        assert ovo_df.iloc[0, 0] == expected, "Time period incorrect"

In [41]:
# Run the time-period spot checks defined above; a failed check raises and stops the cell.
test_time_period()