# `featurize_races.ipynb`

### Author: Anthony Hein

#### Last updated: 10/19/2021

# Overview:

At this point, the data has been cleaned and trimmed, and we will not augment the races dataset with any further features. Now, we can featurize the data by transforming the remaining categorical data into bins or one-hot encoded columns.

---

## Setup

In [1]:
from datetime import datetime
import git
import os
import re
from typing import List
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Root of the enclosing git working tree, found by searching upward from the
# current working directory; all data paths below are built from it.
BASE_DIR = git.Repo(os.getcwd(), search_parent_directories=True).working_dir
BASE_DIR

'/Users/anthonyhein/Desktop/SML310/project'

---

## Load `races_clean_augment_clean.csv`

In [3]:
# Load the cleaned and augmented races table produced by the earlier pipeline stages.
# low_memory=False makes pandas read the file in one pass instead of in chunks.
races_clean_augment_clean = pd.read_csv(f"{BASE_DIR}/data/csv/races_clean_augment_clean.csv", low_memory=False)
races_clean_augment_clean.head()

Unnamed: 0,rid,course,winningTime,metric,ncond,margin,runners,temp,msl,rain,rhum,Station number,datetime
0,302858,Thurles (IRE),277.2,3821.0,1,1.219263,6,2.2,1012.7,0.0,82,4919,1997-01-09 13:15:00
1,291347,Punchestown (IRE),447.2,5229.0,5,1.218049,9,8.1,992.8,0.0,79,3723,1997-02-16 15:40:00
2,377929,Leopardstown (IRE),106.4,1609.0,4,1.204927,5,10.1,996.7,0.0,76,532,1997-05-11 15:00:00
3,275117,Curragh (IRE),125.9,2011.0,4,1.083838,5,15.8,1030.1,0.0,53,3723,1997-05-25 15:35:00
4,66511,Leopardstown (IRE),116.3,1810.0,1,1.077871,5,16.3,1022.9,0.0,53,532,1997-06-02 16:30:00


In [4]:
# (rows, columns) of the loaded frame, as a baseline before any rows are dropped.
races_clean_augment_clean.shape

(19260, 13)

In [5]:
# Column names available for featurization.
races_clean_augment_clean.columns

Index(['rid', 'course', 'winningTime', 'metric', 'ncond', 'margin', 'runners',
       'temp', 'msl', 'rain', 'rhum', 'Station number', 'datetime'],
      dtype='object')

In [6]:
# Work on a copy so the loaded frame is left untouched by the transformations below.
races_featurized = races_clean_augment_clean.copy()
races_featurized.head()

Unnamed: 0,rid,course,winningTime,metric,ncond,margin,runners,temp,msl,rain,rhum,Station number,datetime
0,302858,Thurles (IRE),277.2,3821.0,1,1.219263,6,2.2,1012.7,0.0,82,4919,1997-01-09 13:15:00
1,291347,Punchestown (IRE),447.2,5229.0,5,1.218049,9,8.1,992.8,0.0,79,3723,1997-02-16 15:40:00
2,377929,Leopardstown (IRE),106.4,1609.0,4,1.204927,5,10.1,996.7,0.0,76,532,1997-05-11 15:00:00
3,275117,Curragh (IRE),125.9,2011.0,4,1.083838,5,15.8,1030.1,0.0,53,3723,1997-05-25 15:35:00
4,66511,Leopardstown (IRE),116.3,1810.0,1,1.077871,5,16.3,1022.9,0.0,53,532,1997-06-02 16:30:00


---

## One-Hot Encode `course`

In [7]:
# Frequency of each raw course label, to spot rare or duplicated categories.
races_featurized['course'].value_counts()

Dundalk (AW) (IRE)    3466
Curragh (IRE)         2091
Leopardstown (IRE)    1877
Cork (IRE)             888
Naas (IRE)             878
Gowran Park (IRE)      842
Tipperary (IRE)        818
Navan (IRE)            680
Fairyhouse (IRE)       664
Galway (IRE)           633
Killarney (IRE)        627
Down Royal (IRE)       606
Roscommon (IRE)        572
Limerick (IRE)         567
Listowel (IRE)         478
Tramore (IRE)          453
Sligo (IRE)            403
Ballinrobe (IRE)       358
Bellewstown (IRE)      347
Clonmel (IRE)          311
Tralee (IRE)           303
Punchestown (IRE)      290
Wexford (RH) (IRE)     266
Downpatrick (IRE)      254
Thurles (IRE)          159
Laytown (IRE)          115
Wexford (IRE)          114
Kilbeggan (IRE)        103
Dundalk (IRE)           97
Name: course, dtype: int64

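One weird thing here is that Dundalk appears twice because one of its labels carries an extra parenthesized tag, `(AW)`, in front of the `(IRE)` country code (Wexford has the same issue with `(RH)`). Let's remove all of these parenthesized tags, country codes included.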

In [8]:
# Sanity-check the pattern on the worst case (two parenthesized tags).
# The greedy `.*` spans from the first "(" to the last ")", so both tags are
# removed by a single substitution; strip() drops the leftover trailing space.
# The pattern is a raw string because "\(" is an invalid escape sequence in a
# regular string literal (a warning today, an error in future Python versions).
re.sub(r"\(.*\)", "", 'Dundalk (AW) (IRE)').strip()

'Dundalk'

In [9]:
# Remove every parenthesized tag from the course label and trim the leftover
# whitespace, so that e.g. 'Dundalk (AW) (IRE)' and 'Dundalk (IRE)' both
# become 'Dundalk'.
# - Raw-string pattern: "\(" is an invalid escape in a regular string literal.
# - regex=True is passed explicitly because the default of Series.str.replace
#   differs between pandas versions.
# - The vectorized .str accessor replaces the per-row Python lambda.
races_featurized['course'] = (
    races_featurized['course']
    .str.replace(r"\(.*\)", "", regex=True)
    .str.strip()
)
races_featurized['course'].value_counts()

Dundalk         3563
Curragh         2091
Leopardstown    1877
Cork             888
Naas             878
Gowran Park      842
Tipperary        818
Navan            680
Fairyhouse       664
Galway           633
Killarney        627
Down Royal       606
Roscommon        572
Limerick         567
Listowel         478
Tramore          453
Sligo            403
Wexford          380
Ballinrobe       358
Bellewstown      347
Clonmel          311
Tralee           303
Punchestown      290
Downpatrick      254
Thurles          159
Laytown          115
Kilbeggan        103
Name: course, dtype: int64

In [10]:
# Number of distinct course labels after cleanup (missing values, if any, count as one).
races_featurized['course'].nunique(dropna=False)

27

Great, that solved it. We can proceed with the featurization.

In [11]:
# One-hot encode `course` and attach the indicator columns (aligned on the row index).
# The prefix 'course_' combined with the default '_' separator yields column
# names of the form 'course__<name>'; this is kept as-is so the saved column
# names do not change.
# dtype='uint8' pins the indicators to 0/1 integers: newer pandas releases
# default to bool, which would write True/False into the saved CSV.
races_featurized = races_featurized.join(
    pd.get_dummies(races_featurized['course'], prefix='course_', dtype='uint8')
)
races_featurized.head()

Unnamed: 0,rid,course,winningTime,metric,ncond,margin,runners,temp,msl,rain,...,course__Naas,course__Navan,course__Punchestown,course__Roscommon,course__Sligo,course__Thurles,course__Tipperary,course__Tralee,course__Tramore,course__Wexford
0,302858,Thurles,277.2,3821.0,1,1.219263,6,2.2,1012.7,0.0,...,0,0,0,0,0,1,0,0,0,0
1,291347,Punchestown,447.2,5229.0,5,1.218049,9,8.1,992.8,0.0,...,0,0,1,0,0,0,0,0,0,0
2,377929,Leopardstown,106.4,1609.0,4,1.204927,5,10.1,996.7,0.0,...,0,0,0,0,0,0,0,0,0,0
3,275117,Curragh,125.9,2011.0,4,1.083838,5,15.8,1030.1,0.0,...,0,0,0,0,0,0,0,0,0,0
4,66511,Leopardstown,116.3,1810.0,1,1.077871,5,16.3,1022.9,0.0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# The raw `course` column is redundant once its indicator columns exist.
# NOTE: not idempotent - re-running this cell raises KeyError because the column is gone.
races_featurized = races_featurized.drop(columns=['course'])
races_featurized

Unnamed: 0,rid,winningTime,metric,ncond,margin,runners,temp,msl,rain,rhum,...,course__Naas,course__Navan,course__Punchestown,course__Roscommon,course__Sligo,course__Thurles,course__Tipperary,course__Tralee,course__Tramore,course__Wexford
0,302858,277.2,3821.0,1,1.219263,6,2.2,1012.7,0.0,82,...,0,0,0,0,0,1,0,0,0,0
1,291347,447.2,5229.0,5,1.218049,9,8.1,992.8,0.0,79,...,0,0,1,0,0,0,0,0,0,0
2,377929,106.4,1609.0,4,1.204927,5,10.1,996.7,0.0,76,...,0,0,0,0,0,0,0,0,0,0
3,275117,125.9,2011.0,4,1.083838,5,15.8,1030.1,0.0,53,...,0,0,0,0,0,0,0,0,0,0
4,66511,116.3,1810.0,1,1.077871,5,16.3,1022.9,0.0,53,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19255,227342,248.2,3218.0,9,1.540875,14,4.0,1003.5,0.4,83,...,0,0,0,0,0,0,0,0,0,0
19256,243638,252.0,3218.0,9,1.576670,14,2.5,988.8,0.0,93,...,0,0,0,0,0,0,0,0,0,0
19257,44932,296.2,3620.0,12,1.595269,12,3.5,990.3,0.0,85,...,0,0,0,0,0,0,0,0,0,0
19258,360255,253.0,3218.0,11,1.296711,10,2.7,1021.3,0.0,88,...,0,0,0,0,0,0,0,0,0,0


---

## One-Hot Encode `ncond`

In [13]:
# Frequency of each `ncond` code, to find categories too small to encode.
races_featurized['ncond'].value_counts()

1     3820
0     3577
2     3139
5     1988
9     1553
6     1397
4     1159
11    1055
12     832
8      674
10      62
17       4
Name: ncond, dtype: int64

In [14]:
# Number of distinct `ncond` codes (missing values, if any, count as one).
races_featurized['ncond'].nunique(dropna=False)

12

We will drop all rows with `ncond = 17` since there aren't enough to substantiate a feature.

In [15]:
# Keep every race whose `ncond` code is not 17; that category is too small
# to justify its own indicator column. The original row labels are retained.
races_featurized = races_featurized.loc[races_featurized['ncond'].ne(17)]
races_featurized

Unnamed: 0,rid,winningTime,metric,ncond,margin,runners,temp,msl,rain,rhum,...,course__Naas,course__Navan,course__Punchestown,course__Roscommon,course__Sligo,course__Thurles,course__Tipperary,course__Tralee,course__Tramore,course__Wexford
0,302858,277.2,3821.0,1,1.219263,6,2.2,1012.7,0.0,82,...,0,0,0,0,0,1,0,0,0,0
1,291347,447.2,5229.0,5,1.218049,9,8.1,992.8,0.0,79,...,0,0,1,0,0,0,0,0,0,0
2,377929,106.4,1609.0,4,1.204927,5,10.1,996.7,0.0,76,...,0,0,0,0,0,0,0,0,0,0
3,275117,125.9,2011.0,4,1.083838,5,15.8,1030.1,0.0,53,...,0,0,0,0,0,0,0,0,0,0
4,66511,116.3,1810.0,1,1.077871,5,16.3,1022.9,0.0,53,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19255,227342,248.2,3218.0,9,1.540875,14,4.0,1003.5,0.4,83,...,0,0,0,0,0,0,0,0,0,0
19256,243638,252.0,3218.0,9,1.576670,14,2.5,988.8,0.0,93,...,0,0,0,0,0,0,0,0,0,0
19257,44932,296.2,3620.0,12,1.595269,12,3.5,990.3,0.0,85,...,0,0,0,0,0,0,0,0,0,0
19258,360255,253.0,3218.0,11,1.296711,10,2.7,1021.3,0.0,88,...,0,0,0,0,0,0,0,0,0,0


Now, one-hot encode the remainder.

In [16]:
# One-hot encode `ncond` and attach the indicator columns (aligned on the row index).
# The prefix 'ncond_' combined with the default '_' separator yields column
# names of the form 'ncond__<code>'; this is kept as-is so the saved column
# names do not change.
# dtype='uint8' pins the indicators to 0/1 integers: newer pandas releases
# default to bool, which would write True/False into the saved CSV.
races_featurized = races_featurized.join(
    pd.get_dummies(races_featurized['ncond'], prefix='ncond_', dtype='uint8')
)
races_featurized.head()

Unnamed: 0,rid,winningTime,metric,ncond,margin,runners,temp,msl,rain,rhum,...,ncond__1,ncond__2,ncond__4,ncond__5,ncond__6,ncond__8,ncond__9,ncond__10,ncond__11,ncond__12
0,302858,277.2,3821.0,1,1.219263,6,2.2,1012.7,0.0,82,...,1,0,0,0,0,0,0,0,0,0
1,291347,447.2,5229.0,5,1.218049,9,8.1,992.8,0.0,79,...,0,0,0,1,0,0,0,0,0,0
2,377929,106.4,1609.0,4,1.204927,5,10.1,996.7,0.0,76,...,0,0,1,0,0,0,0,0,0,0
3,275117,125.9,2011.0,4,1.083838,5,15.8,1030.1,0.0,53,...,0,0,1,0,0,0,0,0,0,0
4,66511,116.3,1810.0,1,1.077871,5,16.3,1022.9,0.0,53,...,1,0,0,0,0,0,0,0,0,0


In [17]:
# The raw `ncond` column is redundant once its indicator columns exist.
# NOTE: not idempotent - re-running this cell raises KeyError because the column is gone.
races_featurized = races_featurized.drop(columns=['ncond'])
races_featurized

Unnamed: 0,rid,winningTime,metric,margin,runners,temp,msl,rain,rhum,Station number,...,ncond__1,ncond__2,ncond__4,ncond__5,ncond__6,ncond__8,ncond__9,ncond__10,ncond__11,ncond__12
0,302858,277.2,3821.0,1.219263,6,2.2,1012.7,0.0,82,4919,...,1,0,0,0,0,0,0,0,0,0
1,291347,447.2,5229.0,1.218049,9,8.1,992.8,0.0,79,3723,...,0,0,0,1,0,0,0,0,0,0
2,377929,106.4,1609.0,1.204927,5,10.1,996.7,0.0,76,532,...,0,0,1,0,0,0,0,0,0,0
3,275117,125.9,2011.0,1.083838,5,15.8,1030.1,0.0,53,3723,...,0,0,1,0,0,0,0,0,0,0
4,66511,116.3,1810.0,1.077871,5,16.3,1022.9,0.0,53,532,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19255,227342,248.2,3218.0,1.540875,14,4.0,1003.5,0.4,83,3904,...,0,0,0,0,0,0,1,0,0,0
19256,243638,252.0,3218.0,1.576670,14,2.5,988.8,0.0,93,2437,...,0,0,0,0,0,0,1,0,0,0
19257,44932,296.2,3620.0,1.595269,12,3.5,990.3,0.0,85,3613,...,0,0,0,0,0,0,0,0,0,1
19258,360255,253.0,3218.0,1.296711,10,2.7,1021.3,0.0,88,532,...,0,0,0,0,0,0,0,0,1,0


---

## One-Hot Encode `runners`

In [18]:
# Frequency of each field size, to find categories too small to encode.
races_featurized['runners'].value_counts()

14    3382
13    2111
10    1982
9     1958
12    1910
8     1846
11    1810
7     1665
6     1186
5      845
4      412
3      141
2        8
Name: runners, dtype: int64

Similarly, drop races with only 2 runners because this class is too small and only adds noise.

In [19]:
# Keep every race that does not have exactly 2 runners; that category is too
# small to justify its own indicator column. The original row labels are retained.
races_featurized = races_featurized.loc[races_featurized['runners'].ne(2)]
races_featurized

Unnamed: 0,rid,winningTime,metric,margin,runners,temp,msl,rain,rhum,Station number,...,ncond__1,ncond__2,ncond__4,ncond__5,ncond__6,ncond__8,ncond__9,ncond__10,ncond__11,ncond__12
0,302858,277.2,3821.0,1.219263,6,2.2,1012.7,0.0,82,4919,...,1,0,0,0,0,0,0,0,0,0
1,291347,447.2,5229.0,1.218049,9,8.1,992.8,0.0,79,3723,...,0,0,0,1,0,0,0,0,0,0
2,377929,106.4,1609.0,1.204927,5,10.1,996.7,0.0,76,532,...,0,0,1,0,0,0,0,0,0,0
3,275117,125.9,2011.0,1.083838,5,15.8,1030.1,0.0,53,3723,...,0,0,1,0,0,0,0,0,0,0
4,66511,116.3,1810.0,1.077871,5,16.3,1022.9,0.0,53,532,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19255,227342,248.2,3218.0,1.540875,14,4.0,1003.5,0.4,83,3904,...,0,0,0,0,0,0,1,0,0,0
19256,243638,252.0,3218.0,1.576670,14,2.5,988.8,0.0,93,2437,...,0,0,0,0,0,0,1,0,0,0
19257,44932,296.2,3620.0,1.595269,12,3.5,990.3,0.0,85,3613,...,0,0,0,0,0,0,0,0,0,1
19258,360255,253.0,3218.0,1.296711,10,2.7,1021.3,0.0,88,532,...,0,0,0,0,0,0,0,0,1,0


One-hot encode the remainder.

In [20]:
# One-hot encode `runners` and attach the indicator columns (aligned on the row index).
# The prefix 'runners_' combined with the default '_' separator yields column
# names of the form 'runners__<n>'; this is kept as-is so the saved column
# names do not change.
# dtype='uint8' pins the indicators to 0/1 integers: newer pandas releases
# default to bool, which would write True/False into the saved CSV.
races_featurized = races_featurized.join(
    pd.get_dummies(races_featurized['runners'], prefix='runners_', dtype='uint8')
)
races_featurized.head()

Unnamed: 0,rid,winningTime,metric,margin,runners,temp,msl,rain,rhum,Station number,...,runners__5,runners__6,runners__7,runners__8,runners__9,runners__10,runners__11,runners__12,runners__13,runners__14
0,302858,277.2,3821.0,1.219263,6,2.2,1012.7,0.0,82,4919,...,0,1,0,0,0,0,0,0,0,0
1,291347,447.2,5229.0,1.218049,9,8.1,992.8,0.0,79,3723,...,0,0,0,0,1,0,0,0,0,0
2,377929,106.4,1609.0,1.204927,5,10.1,996.7,0.0,76,532,...,1,0,0,0,0,0,0,0,0,0
3,275117,125.9,2011.0,1.083838,5,15.8,1030.1,0.0,53,3723,...,1,0,0,0,0,0,0,0,0,0
4,66511,116.3,1810.0,1.077871,5,16.3,1022.9,0.0,53,532,...,1,0,0,0,0,0,0,0,0,0


In [21]:
# The raw `runners` column is redundant once its indicator columns exist.
# NOTE: not idempotent - re-running this cell raises KeyError because the column is gone.
races_featurized = races_featurized.drop(columns=['runners'])
races_featurized

Unnamed: 0,rid,winningTime,metric,margin,temp,msl,rain,rhum,Station number,datetime,...,runners__5,runners__6,runners__7,runners__8,runners__9,runners__10,runners__11,runners__12,runners__13,runners__14
0,302858,277.2,3821.0,1.219263,2.2,1012.7,0.0,82,4919,1997-01-09 13:15:00,...,0,1,0,0,0,0,0,0,0,0
1,291347,447.2,5229.0,1.218049,8.1,992.8,0.0,79,3723,1997-02-16 15:40:00,...,0,0,0,0,1,0,0,0,0,0
2,377929,106.4,1609.0,1.204927,10.1,996.7,0.0,76,532,1997-05-11 15:00:00,...,1,0,0,0,0,0,0,0,0,0
3,275117,125.9,2011.0,1.083838,15.8,1030.1,0.0,53,3723,1997-05-25 15:35:00,...,1,0,0,0,0,0,0,0,0,0
4,66511,116.3,1810.0,1.077871,16.3,1022.9,0.0,53,532,1997-06-02 16:30:00,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19255,227342,248.2,3218.0,1.540875,4.0,1003.5,0.4,83,3904,1999-12-12 15:30:00,...,0,0,0,0,0,0,0,0,0,1
19256,243638,252.0,3218.0,1.576670,2.5,988.8,0.0,93,2437,1999-12-27 15:30:00,...,0,0,0,0,0,0,0,0,0,1
19257,44932,296.2,3620.0,1.595269,3.5,990.3,0.0,85,3613,1999-12-27 15:35:00,...,0,0,0,0,0,0,0,1,0,0
19258,360255,253.0,3218.0,1.296711,2.7,1021.3,0.0,88,532,1999-12-29 15:45:00,...,0,0,0,0,0,1,0,0,0,0


---

## One-Hot Encode `datetime`

To make datetime more digestible as a feature, we will do two things to encode this. First, we will encode the month, which implicitly uses the assumption that weather is similar/constant throughout a month. I feel like this is a fair assumption given how we talk about how cold or warm a _month_ is. Then, we will extract the year and leave it as its own variable. This helps a potential model capture any trends over time. In summary, we propose that anything a model may learn from a datetime is in fact some baseline knowledge about the month of the datetime plus some perturbation introduced by the year. To make this more concrete, this is similar to saying "December is very cold, but gets warmer as the years go on because of global warming."

In [22]:
# Extract the calendar month (1-12) from the `datetime` strings.
# Parsing is vectorized with pd.to_datetime, and the format uses %S rather
# than a literal "00", so timestamps with non-zero seconds parse instead of
# raising ValueError.
races_featurized['month'] = pd.to_datetime(
    races_featurized['datetime'], format='%Y-%m-%d %H:%M:%S'
).dt.month
races_featurized

Unnamed: 0,rid,winningTime,metric,margin,temp,msl,rain,rhum,Station number,datetime,...,runners__6,runners__7,runners__8,runners__9,runners__10,runners__11,runners__12,runners__13,runners__14,month
0,302858,277.2,3821.0,1.219263,2.2,1012.7,0.0,82,4919,1997-01-09 13:15:00,...,1,0,0,0,0,0,0,0,0,1
1,291347,447.2,5229.0,1.218049,8.1,992.8,0.0,79,3723,1997-02-16 15:40:00,...,0,0,0,1,0,0,0,0,0,2
2,377929,106.4,1609.0,1.204927,10.1,996.7,0.0,76,532,1997-05-11 15:00:00,...,0,0,0,0,0,0,0,0,0,5
3,275117,125.9,2011.0,1.083838,15.8,1030.1,0.0,53,3723,1997-05-25 15:35:00,...,0,0,0,0,0,0,0,0,0,5
4,66511,116.3,1810.0,1.077871,16.3,1022.9,0.0,53,532,1997-06-02 16:30:00,...,0,0,0,0,0,0,0,0,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19255,227342,248.2,3218.0,1.540875,4.0,1003.5,0.4,83,3904,1999-12-12 15:30:00,...,0,0,0,0,0,0,0,0,1,12
19256,243638,252.0,3218.0,1.576670,2.5,988.8,0.0,93,2437,1999-12-27 15:30:00,...,0,0,0,0,0,0,0,0,1,12
19257,44932,296.2,3620.0,1.595269,3.5,990.3,0.0,85,3613,1999-12-27 15:35:00,...,0,0,0,0,0,0,1,0,0,12
19258,360255,253.0,3218.0,1.296711,2.7,1021.3,0.0,88,532,1999-12-29 15:45:00,...,0,0,0,0,1,0,0,0,0,12


In [23]:
# Number of races per calendar month; every month should be represented before encoding.
races_featurized['month'].value_counts()

8     3008
7     2884
6     2600
9     2123
5     2028
10    1734
4     1376
11    1036
3      782
12     660
1      522
2      495
Name: month, dtype: int64

In [24]:
# One-hot encode `month` and attach the indicator columns (aligned on the row index).
# The prefix 'month_' combined with the default '_' separator yields column
# names of the form 'month__<n>'; this is kept as-is so the saved column
# names do not change.
# dtype='uint8' pins the indicators to 0/1 integers: newer pandas releases
# default to bool, which would write True/False into the saved CSV.
races_featurized = races_featurized.join(
    pd.get_dummies(races_featurized['month'], prefix='month_', dtype='uint8')
)
races_featurized.head()

Unnamed: 0,rid,winningTime,metric,margin,temp,msl,rain,rhum,Station number,datetime,...,month__3,month__4,month__5,month__6,month__7,month__8,month__9,month__10,month__11,month__12
0,302858,277.2,3821.0,1.219263,2.2,1012.7,0.0,82,4919,1997-01-09 13:15:00,...,0,0,0,0,0,0,0,0,0,0
1,291347,447.2,5229.0,1.218049,8.1,992.8,0.0,79,3723,1997-02-16 15:40:00,...,0,0,0,0,0,0,0,0,0,0
2,377929,106.4,1609.0,1.204927,10.1,996.7,0.0,76,532,1997-05-11 15:00:00,...,0,0,1,0,0,0,0,0,0,0
3,275117,125.9,2011.0,1.083838,15.8,1030.1,0.0,53,3723,1997-05-25 15:35:00,...,0,0,1,0,0,0,0,0,0,0
4,66511,116.3,1810.0,1.077871,16.3,1022.9,0.0,53,532,1997-06-02 16:30:00,...,0,0,0,1,0,0,0,0,0,0


In [25]:
# The helper `month` column is redundant once its indicator columns exist.
# NOTE: not idempotent - re-running this cell raises KeyError because the column is gone.
races_featurized = races_featurized.drop(columns=['month'])
races_featurized

Unnamed: 0,rid,winningTime,metric,margin,temp,msl,rain,rhum,Station number,datetime,...,month__3,month__4,month__5,month__6,month__7,month__8,month__9,month__10,month__11,month__12
0,302858,277.2,3821.0,1.219263,2.2,1012.7,0.0,82,4919,1997-01-09 13:15:00,...,0,0,0,0,0,0,0,0,0,0
1,291347,447.2,5229.0,1.218049,8.1,992.8,0.0,79,3723,1997-02-16 15:40:00,...,0,0,0,0,0,0,0,0,0,0
2,377929,106.4,1609.0,1.204927,10.1,996.7,0.0,76,532,1997-05-11 15:00:00,...,0,0,1,0,0,0,0,0,0,0
3,275117,125.9,2011.0,1.083838,15.8,1030.1,0.0,53,3723,1997-05-25 15:35:00,...,0,0,1,0,0,0,0,0,0,0
4,66511,116.3,1810.0,1.077871,16.3,1022.9,0.0,53,532,1997-06-02 16:30:00,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19255,227342,248.2,3218.0,1.540875,4.0,1003.5,0.4,83,3904,1999-12-12 15:30:00,...,0,0,0,0,0,0,0,0,0,1
19256,243638,252.0,3218.0,1.576670,2.5,988.8,0.0,93,2437,1999-12-27 15:30:00,...,0,0,0,0,0,0,0,0,0,1
19257,44932,296.2,3620.0,1.595269,3.5,990.3,0.0,85,3613,1999-12-27 15:35:00,...,0,0,0,0,0,0,0,0,0,1
19258,360255,253.0,3218.0,1.296711,2.7,1021.3,0.0,88,532,1999-12-29 15:45:00,...,0,0,0,0,0,0,0,0,0,1


Now, append the year.

In [26]:
# Extract the calendar year from the `datetime` strings and keep it as a
# single numeric column (not one-hot encoded).
# Parsing is vectorized with pd.to_datetime, and the format uses %S rather
# than a literal "00", so timestamps with non-zero seconds parse instead of
# raising ValueError.
races_featurized['year'] = pd.to_datetime(
    races_featurized['datetime'], format='%Y-%m-%d %H:%M:%S'
).dt.year
races_featurized

Unnamed: 0,rid,winningTime,metric,margin,temp,msl,rain,rhum,Station number,datetime,...,month__4,month__5,month__6,month__7,month__8,month__9,month__10,month__11,month__12,year
0,302858,277.2,3821.0,1.219263,2.2,1012.7,0.0,82,4919,1997-01-09 13:15:00,...,0,0,0,0,0,0,0,0,0,1997
1,291347,447.2,5229.0,1.218049,8.1,992.8,0.0,79,3723,1997-02-16 15:40:00,...,0,0,0,0,0,0,0,0,0,1997
2,377929,106.4,1609.0,1.204927,10.1,996.7,0.0,76,532,1997-05-11 15:00:00,...,0,1,0,0,0,0,0,0,0,1997
3,275117,125.9,2011.0,1.083838,15.8,1030.1,0.0,53,3723,1997-05-25 15:35:00,...,0,1,0,0,0,0,0,0,0,1997
4,66511,116.3,1810.0,1.077871,16.3,1022.9,0.0,53,532,1997-06-02 16:30:00,...,0,0,1,0,0,0,0,0,0,1997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19255,227342,248.2,3218.0,1.540875,4.0,1003.5,0.4,83,3904,1999-12-12 15:30:00,...,0,0,0,0,0,0,0,0,1,1999
19256,243638,252.0,3218.0,1.576670,2.5,988.8,0.0,93,2437,1999-12-27 15:30:00,...,0,0,0,0,0,0,0,0,1,1999
19257,44932,296.2,3620.0,1.595269,3.5,990.3,0.0,85,3613,1999-12-27 15:35:00,...,0,0,0,0,0,0,0,0,1,1999
19258,360255,253.0,3218.0,1.296711,2.7,1021.3,0.0,88,532,1999-12-29 15:45:00,...,0,0,0,0,0,0,0,0,1,1999


We can now drop the `datetime` column, since it is not a feature itself.

In [27]:
# The raw timestamp string is dropped now that month and year have been extracted from it.
# NOTE: not idempotent - re-running this cell raises KeyError because the column is gone.
races_featurized = races_featurized.drop(columns=['datetime'])
races_featurized

Unnamed: 0,rid,winningTime,metric,margin,temp,msl,rain,rhum,Station number,course__Ballinrobe,...,month__4,month__5,month__6,month__7,month__8,month__9,month__10,month__11,month__12,year
0,302858,277.2,3821.0,1.219263,2.2,1012.7,0.0,82,4919,0,...,0,0,0,0,0,0,0,0,0,1997
1,291347,447.2,5229.0,1.218049,8.1,992.8,0.0,79,3723,0,...,0,0,0,0,0,0,0,0,0,1997
2,377929,106.4,1609.0,1.204927,10.1,996.7,0.0,76,532,0,...,0,1,0,0,0,0,0,0,0,1997
3,275117,125.9,2011.0,1.083838,15.8,1030.1,0.0,53,3723,0,...,0,1,0,0,0,0,0,0,0,1997
4,66511,116.3,1810.0,1.077871,16.3,1022.9,0.0,53,532,0,...,0,0,1,0,0,0,0,0,0,1997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19255,227342,248.2,3218.0,1.540875,4.0,1003.5,0.4,83,3904,0,...,0,0,0,0,0,0,0,0,1,1999
19256,243638,252.0,3218.0,1.576670,2.5,988.8,0.0,93,2437,0,...,0,0,0,0,0,0,0,0,1,1999
19257,44932,296.2,3620.0,1.595269,3.5,990.3,0.0,85,3613,0,...,0,0,0,0,0,0,0,0,1,1999
19258,360255,253.0,3218.0,1.296711,2.7,1021.3,0.0,88,532,0,...,0,0,0,0,0,0,0,0,1,1999


---

## Drop Non-Feature Columns

The `winningTime` and `Station number` columns are not features. The former is not known at the start of the race, and for the latter we have no reason to believe it will be useful, since it was only a means to get the weather data.

In [28]:
# Remove the columns that must not be used as model inputs.
# NOTE: not idempotent - re-running this cell raises KeyError because the columns are gone.
races_featurized = races_featurized.drop(columns=['winningTime', 'Station number'])
races_featurized

Unnamed: 0,rid,metric,margin,temp,msl,rain,rhum,course__Ballinrobe,course__Bellewstown,course__Clonmel,...,month__4,month__5,month__6,month__7,month__8,month__9,month__10,month__11,month__12,year
0,302858,3821.0,1.219263,2.2,1012.7,0.0,82,0,0,0,...,0,0,0,0,0,0,0,0,0,1997
1,291347,5229.0,1.218049,8.1,992.8,0.0,79,0,0,0,...,0,0,0,0,0,0,0,0,0,1997
2,377929,1609.0,1.204927,10.1,996.7,0.0,76,0,0,0,...,0,1,0,0,0,0,0,0,0,1997
3,275117,2011.0,1.083838,15.8,1030.1,0.0,53,0,0,0,...,0,1,0,0,0,0,0,0,0,1997
4,66511,1810.0,1.077871,16.3,1022.9,0.0,53,0,0,0,...,0,0,1,0,0,0,0,0,0,1997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19255,227342,3218.0,1.540875,4.0,1003.5,0.4,83,0,0,0,...,0,0,0,0,0,0,0,0,1,1999
19256,243638,3218.0,1.576670,2.5,988.8,0.0,93,0,0,0,...,0,0,0,0,0,0,0,0,1,1999
19257,44932,3620.0,1.595269,3.5,990.3,0.0,85,0,0,1,...,0,0,0,0,0,0,0,0,1,1999
19258,360255,3218.0,1.296711,2.7,1021.3,0.0,88,0,0,0,...,0,0,0,0,0,0,0,0,1,1999


## Save Dataframes

In [29]:
# Write the featurized table to disk; index=False leaves the row index out of the CSV.
races_featurized.to_csv(f"{BASE_DIR}/data/csv/races_featurized.csv", index=False)

---