# `augment_races.ipynb`

### Author: Anthony Hein

#### Last updated: 10/19/2021

# Overview:

At this point, the data has been cleaned and trimmed. We will now augment the races dataset with the features we have collected and maintained in dictionaries in `utils` for easy access.

---

## Setup

In [1]:
import git
import os
from typing import List
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Walk up from the current working directory to the enclosing git repository
# and use its working tree root as the base for every path in this notebook.
repo = git.Repo(os.getcwd(), search_parent_directories=True)
BASE_DIR = repo.working_dir
BASE_DIR

'/Users/anthonyhein/Desktop/SML310/project'

In [5]:
import sys

# Put the repository's `utils/` directory on the import path so the lookup
# modules below can be imported by bare module name.
sys.path.append(f'{BASE_DIR}/utils/')

# Feature dictionaries; the margin, runners and weather ones are turned into
# DataFrames with their keys as the `rid` column further down.
# NOTE(review): RID_TO_STATION_IE is imported but not referenced in the cells
# visible here.
from rid_to_margin import RID_TO_MARGIN
from rid_to_runners import RID_TO_RUNNERS
from rid_to_station_ie import RID_TO_STATION_IE
from rid_to_weather import RID_TO_WEATHER

---

## Load `races_aticnmidg.csv`

In [3]:
# Read the cleaned/trimmed races table; low_memory=False makes pandas parse the
# file in one pass instead of in chunks.
races_csv_path = f"{BASE_DIR}/data/csv/races_aticnmidg.csv"
races_aticnmidg = pd.read_csv(races_csv_path, low_memory=False)
races_aticnmidg.head()

Unnamed: 0,rid,course,time,date,hurdles,prizes,winningTime,metric,countryCode,ncond,class
0,302858,Thurles (IRE),01:15,97/01/09,,[],277.2,3821.0,IE,1,0
1,291347,Punchestown (IRE),03:40,97/02/16,,[],447.2,5229.0,IE,5,0
2,377929,Leopardstown (IRE),03:00,97/05/11,,[],106.4,1609.0,IE,4,0
3,275117,Curragh (IRE),03:35,97/05/25,,[],125.9,2011.0,IE,4,0
4,66511,Leopardstown (IRE),04:30,97/06/02,,[],116.3,1810.0,IE,1,0


In [4]:
# (rows, columns) of the races table as loaded, before any features are merged in.
races_aticnmidg.shape

(19271, 11)

---

## Make DFs for Each Dictionary

In [22]:
# from_dict(orient='index') puts the dictionary keys in the index and the
# values in a single column labelled 0; reset_index() then exposes the keys as
# a column named 'index'. Both are renamed to meaningful names here.
rename_cols = {'index': 'rid', 0: 'margin'}

df_margin = (
    pd.DataFrame
    .from_dict(RID_TO_MARGIN, orient='index')
    .reset_index()
    .rename(columns=rename_cols)
)
df_margin.head()

Unnamed: 0,rid,margin
0,267255,1.168254
1,297570,1.256241
2,334421,1.195723
3,366304,1.247935
4,13063,1.171156


In [23]:
# Same reshaping as for the margin lookup: dictionary keys become the `rid`
# column and the single value column (labelled 0) becomes `runners`.
rename_cols = {'index': 'rid', 0: 'runners'}

df_runners = (
    pd.DataFrame
    .from_dict(RID_TO_RUNNERS, orient='index')
    .reset_index()
    .rename(columns=rename_cols)
)
df_runners.head()

Unnamed: 0,rid,runners
0,267255,6
1,297570,11
2,334421,10
3,366304,10
4,13063,9


In [24]:
# Only the key column needs renaming here: the weather values already arrive
# with their own column names (see the output below), so just 'index' -> 'rid'.
rename_cols = {'index': 'rid'}

df_weather = (
    pd.DataFrame
    .from_dict(RID_TO_WEATHER, orient='index')
    .reset_index()
    .rename(columns=rename_cols)
)
df_weather.head()

Unnamed: 0,rid,date,temp,msl,rain,rhum,Station number
0,302858,1/9/97 13:00,2.2,1012.7,0.0,82,4919
1,291347,2/16/97 16:00,8.1,992.8,0.0,79,3723
2,377929,5/11/97 15:00,10.1,996.7,0.0,76,532
3,275117,5/25/97 16:00,15.8,1030.1,0.0,53,3723
4,66511,6/2/97 17:00,16.3,1022.9,0.0,53,532


In [28]:
# The weather observation timestamp is not needed as a feature, and the races
# table already carries its own `date` column, so keeping this one would yield
# suffixed `date_x` / `date_y` columns after the merge below.
# errors='ignore' makes this cell safe to re-run: because it rebinds
# `df_weather` to itself, a second execution would otherwise raise KeyError
# once `date` has already been removed.
df_weather = df_weather.drop(columns=['date'], errors='ignore')

---

## Inner Merge with `races_aticnmidg`

In [30]:
# Inner-join each feature frame onto the races table by `rid`; a race survives
# only if it has an entry in all three feature frames.
# NOTE: this rebinds `races_aticnmidg`, so re-running the cell without first
# reloading the CSV would merge the feature columns in a second time.
races_aticnmidg = (
    races_aticnmidg
    .merge(df_margin, how='inner', on='rid')
    .merge(df_runners, how='inner', on='rid')
    .merge(df_weather, how='inner', on='rid')
)
races_aticnmidg.head()

Unnamed: 0,rid,course,time,date,hurdles,prizes,winningTime,metric,countryCode,ncond,class,margin,runners,temp,msl,rain,rhum,Station number
0,302858,Thurles (IRE),01:15,97/01/09,,[],277.2,3821.0,IE,1,0,1.219263,6,2.2,1012.7,0.0,82,4919
1,291347,Punchestown (IRE),03:40,97/02/16,,[],447.2,5229.0,IE,5,0,1.218049,9,8.1,992.8,0.0,79,3723
2,377929,Leopardstown (IRE),03:00,97/05/11,,[],106.4,1609.0,IE,4,0,1.204927,5,10.1,996.7,0.0,76,532
3,275117,Curragh (IRE),03:35,97/05/25,,[],125.9,2011.0,IE,4,0,1.083838,5,15.8,1030.1,0.0,53,3723
4,66511,Leopardstown (IRE),04:30,97/06/02,,[],116.3,1810.0,IE,1,0,1.077871,5,16.3,1022.9,0.0,53,532


---

## Save Dataframes

In [31]:
# Persist the augmented races table; index=False keeps the positional row
# index out of the file.
augmented_csv_path = f"{BASE_DIR}/data/csv/races_clean_augment.csv"
races_aticnmidg.to_csv(augmented_csv_path, index=False)

---