# `trim_augment_races.ipynb`

### Author: Anthony Hein

#### Last updated: 10/19/2021

# Overview:

We can once again trim the dataset, since we have made several simplifications since the last time it was trimmed.

---

## Setup

In [11]:
from datetime import datetime
import git
import os
from typing import List
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Resolve the repository root by searching upward from the current working
# directory, so the data paths used below do not depend on where the notebook
# was launched from inside the repo.
_repo = git.Repo(os.getcwd(), search_parent_directories=True)
BASE_DIR = _repo.working_dir
BASE_DIR

'/Users/anthonyhein/Desktop/SML310/project'

---

## Load `races_clean_augment.csv`

In [4]:
# Read the augmented races table from the repo's data directory.
# `low_memory=False` makes pandas read the file in one pass instead of chunks.
races_csv_path = f"{BASE_DIR}/data/csv/races_clean_augment.csv"
races_clean_augment = pd.read_csv(races_csv_path, low_memory=False)
races_clean_augment.head()

Unnamed: 0,rid,course,time,date,hurdles,prizes,winningTime,metric,countryCode,ncond,class,margin,runners,temp,msl,rain,rhum,Station number
0,302858,Thurles (IRE),01:15,97/01/09,,[],277.2,3821.0,IE,1,0,1.219263,6,2.2,1012.7,0.0,82,4919
1,291347,Punchestown (IRE),03:40,97/02/16,,[],447.2,5229.0,IE,5,0,1.218049,9,8.1,992.8,0.0,79,3723
2,377929,Leopardstown (IRE),03:00,97/05/11,,[],106.4,1609.0,IE,4,0,1.204927,5,10.1,996.7,0.0,76,532
3,275117,Curragh (IRE),03:35,97/05/25,,[],125.9,2011.0,IE,4,0,1.083838,5,15.8,1030.1,0.0,53,3723
4,66511,Leopardstown (IRE),04:30,97/06/02,,[],116.3,1810.0,IE,1,0,1.077871,5,16.3,1022.9,0.0,53,532


In [5]:
# Show (rows, columns) of the loaded frame as a baseline before trimming.
races_clean_augment.shape

(19271, 18)

---

## Drop Unnecessary Columns

In particular, we no longer need `hurdles` (since the values are all `NaN`), we have realized that `prizes` is unnecessary (it is only needed for a potential Monte Carlo simulation), we no longer need `countryCode` because every race is in Ireland, and we can drop `class` since there are only 11 instances of a class not equal to 0 (we first remove those rows, then drop the column).

In [7]:
# Keep only the rows whose `class` equals 0; once the other rows are gone the
# column is constant and can be dropped in the next cell.
is_class_zero = races_clean_augment['class'] == 0
races_clean_augment = races_clean_augment.loc[is_class_zero]
races_clean_augment

Unnamed: 0,rid,course,time,date,hurdles,prizes,winningTime,metric,countryCode,ncond,class,margin,runners,temp,msl,rain,rhum,Station number
0,302858,Thurles (IRE),01:15,97/01/09,,[],277.2,3821.0,IE,1,0,1.219263,6,2.2,1012.7,0.0,82,4919
1,291347,Punchestown (IRE),03:40,97/02/16,,[],447.2,5229.0,IE,5,0,1.218049,9,8.1,992.8,0.0,79,3723
2,377929,Leopardstown (IRE),03:00,97/05/11,,[],106.4,1609.0,IE,4,0,1.204927,5,10.1,996.7,0.0,76,532
3,275117,Curragh (IRE),03:35,97/05/25,,[],125.9,2011.0,IE,4,0,1.083838,5,15.8,1030.1,0.0,53,3723
4,66511,Leopardstown (IRE),04:30,97/06/02,,[],116.3,1810.0,IE,1,0,1.077871,5,16.3,1022.9,0.0,53,532
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19266,227342,Cork (IRE),03:30,99/12/12,,[],248.2,3218.0,IE,9,0,1.540875,14,4.0,1003.5,0.4,83,3904
19267,243638,Down Royal (IRE),03:30,99/12/27,,[],252.0,3218.0,IE,9,0,1.576670,14,2.5,988.8,0.0,93,2437
19268,44932,Clonmel (IRE),03:35,99/12/27,,[],296.2,3620.0,IE,12,0,1.595269,12,3.5,990.3,0.0,85,3613
19269,360255,Leopardstown (IRE),03:45,99/12/29,,[],253.0,3218.0,IE,11,0,1.296711,10,2.7,1021.3,0.0,88,532


In [8]:
# Columns that no longer carry information for this dataset.
columns_to_drop = ['hurdles', 'prizes', 'countryCode', 'class']
races_clean_augment = races_clean_augment.drop(columns=columns_to_drop)
races_clean_augment

Unnamed: 0,rid,course,time,date,winningTime,metric,ncond,margin,runners,temp,msl,rain,rhum,Station number
0,302858,Thurles (IRE),01:15,97/01/09,277.2,3821.0,1,1.219263,6,2.2,1012.7,0.0,82,4919
1,291347,Punchestown (IRE),03:40,97/02/16,447.2,5229.0,5,1.218049,9,8.1,992.8,0.0,79,3723
2,377929,Leopardstown (IRE),03:00,97/05/11,106.4,1609.0,4,1.204927,5,10.1,996.7,0.0,76,532
3,275117,Curragh (IRE),03:35,97/05/25,125.9,2011.0,4,1.083838,5,15.8,1030.1,0.0,53,3723
4,66511,Leopardstown (IRE),04:30,97/06/02,116.3,1810.0,1,1.077871,5,16.3,1022.9,0.0,53,532
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19266,227342,Cork (IRE),03:30,99/12/12,248.2,3218.0,9,1.540875,14,4.0,1003.5,0.4,83,3904
19267,243638,Down Royal (IRE),03:30,99/12/27,252.0,3218.0,9,1.576670,14,2.5,988.8,0.0,93,2437
19268,44932,Clonmel (IRE),03:35,99/12/27,296.2,3620.0,12,1.595269,12,3.5,990.3,0.0,85,3613
19269,360255,Leopardstown (IRE),03:45,99/12/29,253.0,3218.0,11,1.296711,10,2.7,1021.3,0.0,88,532


---

## Merge `date` and `time`

Let's also make `date` and `time` one column which is just `datetime`, and we will save this in a more standard format to prevent going back-and-forth.

In [13]:
# Combine the separate `date` (yy/mm/dd) and `time` (hh:mm) strings into a single
# timestamp column. The whole column is parsed in one vectorized call instead of
# building Python datetimes row by row with `iterrows()`.
#
# `time` carries no AM/PM marker, so ' PM' is appended to every value and the
# result is read on a 12-hour clock (e.g. 01:15 -> 13:15).
# NOTE(review): this assumes no race starts before noon; a value such as 11:30
# would be parsed as 23:30 -- confirm against the data.
races_clean_augment['datetime'] = pd.to_datetime(
    races_clean_augment['date'] + ' ' + races_clean_augment['time'] + ' PM',
    format='%y/%m/%d %I:%M %p',
)

# A missing `date` or `time` would otherwise turn into NaT silently; fail loudly.
assert races_clean_augment['datetime'].notna().all(), "unparseable date/time values"

races_clean_augment.head()

Unnamed: 0,rid,course,time,date,winningTime,metric,ncond,margin,runners,temp,msl,rain,rhum,Station number,datetime
0,302858,Thurles (IRE),01:15,97/01/09,277.2,3821.0,1,1.219263,6,2.2,1012.7,0.0,82,4919,1997-01-09 13:15:00
1,291347,Punchestown (IRE),03:40,97/02/16,447.2,5229.0,5,1.218049,9,8.1,992.8,0.0,79,3723,1997-02-16 15:40:00
2,377929,Leopardstown (IRE),03:00,97/05/11,106.4,1609.0,4,1.204927,5,10.1,996.7,0.0,76,532,1997-05-11 15:00:00
3,275117,Curragh (IRE),03:35,97/05/25,125.9,2011.0,4,1.083838,5,15.8,1030.1,0.0,53,3723,1997-05-25 15:35:00
4,66511,Leopardstown (IRE),04:30,97/06/02,116.3,1810.0,1,1.077871,5,16.3,1022.9,0.0,53,532,1997-06-02 16:30:00


Now, we can drop `date` and `time`.

In [14]:
# `date` and `time` are now redundant with the combined `datetime` column.
merged_source_columns = ['date', 'time']
races_clean_augment = races_clean_augment.drop(columns=merged_source_columns)
races_clean_augment

Unnamed: 0,rid,course,winningTime,metric,ncond,margin,runners,temp,msl,rain,rhum,Station number,datetime
0,302858,Thurles (IRE),277.2,3821.0,1,1.219263,6,2.2,1012.7,0.0,82,4919,1997-01-09 13:15:00
1,291347,Punchestown (IRE),447.2,5229.0,5,1.218049,9,8.1,992.8,0.0,79,3723,1997-02-16 15:40:00
2,377929,Leopardstown (IRE),106.4,1609.0,4,1.204927,5,10.1,996.7,0.0,76,532,1997-05-11 15:00:00
3,275117,Curragh (IRE),125.9,2011.0,4,1.083838,5,15.8,1030.1,0.0,53,3723,1997-05-25 15:35:00
4,66511,Leopardstown (IRE),116.3,1810.0,1,1.077871,5,16.3,1022.9,0.0,53,532,1997-06-02 16:30:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19266,227342,Cork (IRE),248.2,3218.0,9,1.540875,14,4.0,1003.5,0.4,83,3904,1999-12-12 15:30:00
19267,243638,Down Royal (IRE),252.0,3218.0,9,1.576670,14,2.5,988.8,0.0,93,2437,1999-12-27 15:30:00
19268,44932,Clonmel (IRE),296.2,3620.0,12,1.595269,12,3.5,990.3,0.0,85,3613,1999-12-27 15:35:00
19269,360255,Leopardstown (IRE),253.0,3218.0,11,1.296711,10,2.7,1021.3,0.0,88,532,1999-12-29 15:45:00


---

## Save Dataframes

In [15]:
# Write the trimmed frame next to the input file; `index=False` keeps the
# DataFrame's row index out of the CSV.
trimmed_csv_path = f"{BASE_DIR}/data/csv/races_clean_augment_clean.csv"
races_clean_augment.to_csv(trimmed_csv_path, index=False)

---