# `featurize_races_distance.ipynb`

### Author: Anthony Hein

#### Last updated: 10/20/2021

# Overview:

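We have decided that we also want to trim the dataset to only include races run over the most popular distances, and then featurize the distance (the `metric` column) as one indicator column per retained distance (`metric_0`, `metric_1`, ...).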

---

## Setup

In [1]:
from datetime import datetime
import git
import os
from typing import List
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Locate the root of the enclosing git repository (searching upwards from the
# current working directory) so that data paths below can be built from it.
BASE_DIR = git.Repo(
    path=os.getcwd(),
    search_parent_directories=True,
).working_dir
BASE_DIR

'/Users/anthonyhein/Desktop/SML310/project'

---

## Load `races_featurized_weather.csv`

In [16]:
# Read the weather-featurized races table; `low_memory=False` makes pandas
# parse the file in one pass instead of in chunks.
races_featurized_weather = pd.read_csv(
    filepath_or_buffer=f"{BASE_DIR}/data/csv/races_featurized_weather.csv",
    low_memory=False,
)
races_featurized_weather.head(n=5)

Unnamed: 0,rid,metric,margin,temp,msl,rain,rhum,course__Ballinrobe,course__Bellewstown,course__Clonmel,...,pressure_level_1,pressure_level_2,pressure_level_3,pressure_level_4,is_raining,rhum_level_0,rhum_level_1,rhum_level_2,rhum_level_3,rhum_level_4
0,302858,3821.0,1.219263,2.2,1012.7,0.0,82,0,0,0,...,0,0,1,0,0,0,0,0,1,0
1,291347,5229.0,1.218049,8.1,992.8,0.0,79,0,0,0,...,0,1,0,0,0,0,0,0,1,0
2,377929,1609.0,1.204927,10.1,996.7,0.0,76,0,0,0,...,0,1,0,0,0,0,0,0,1,0
3,275117,2011.0,1.083838,15.8,1030.1,0.0,53,0,0,0,...,0,0,0,1,0,0,1,0,0,0
4,66511,1810.0,1.077871,16.3,1022.9,0.0,53,0,0,0,...,0,0,1,0,0,0,1,0,0,0


In [17]:
# (rows, columns) of the loaded table, recorded as a baseline before any rows are dropped.
races_featurized_weather.shape

(19248, 86)

In [18]:
# Work on an independent (deep) copy so the loaded weather frame stays untouched.
races_featurized_distance = races_featurized_weather.copy(deep=True)
races_featurized_distance.head(n=5)

Unnamed: 0,rid,metric,margin,temp,msl,rain,rhum,course__Ballinrobe,course__Bellewstown,course__Clonmel,...,pressure_level_1,pressure_level_2,pressure_level_3,pressure_level_4,is_raining,rhum_level_0,rhum_level_1,rhum_level_2,rhum_level_3,rhum_level_4
0,302858,3821.0,1.219263,2.2,1012.7,0.0,82,0,0,0,...,0,0,1,0,0,0,0,0,1,0
1,291347,5229.0,1.218049,8.1,992.8,0.0,79,0,0,0,...,0,1,0,0,0,0,0,0,1,0
2,377929,1609.0,1.204927,10.1,996.7,0.0,76,0,0,0,...,0,1,0,0,0,0,0,0,1,0
3,275117,2011.0,1.083838,15.8,1030.1,0.0,53,0,0,0,...,0,0,0,1,0,0,1,0,0,0
4,66511,1810.0,1.077871,16.3,1022.9,0.0,53,0,0,0,...,0,0,1,0,0,0,1,0,0,0


---

## Featurize Distance

In [19]:
# Number of races per distinct `metric` value, most frequent first.
races_featurized_distance.loc[:, 'metric'].value_counts(sort=True, ascending=False)

1407.0    2979
1609.0    2627
3218.0    2037
1206.0    1747
2413.0    1709
2011.0    1388
1005.0    1358
2111.5     801
2815.0     543
1810.0     539
1709.5     452
3620.0     310
1910.5     295
1507.5     280
1306.5     275
3419.0     268
2614.0     235
4022.0     201
2212.0     195
3318.5     170
2513.5     133
2312.5     128
3821.0     119
4827.0      67
1105.5      55
4122.5      49
4424.0      45
3519.5      31
3921.5      29
5028.0      26
3016.0      25
3720.5      19
4223.0      18
5128.5      14
5631.0      11
2714.5      10
4625.0       9
4725.5       9
3116.5       8
4323.5       8
4927.5       7
5329.5       5
6637.0       4
4524.5       2
6838.0       2
5229.0       2
6436.0       2
6737.5       1
2915.5       1
Name: metric, dtype: int64

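We will take the top 5, i.e. keep only the races run over one of the five most common distances in the counts above and drop races run over any other distance.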

In [20]:
# The five most frequent `metric` values, in descending order of frequency.
# The position in this list becomes the suffix of the indicator column
# (`metric_0` ... `metric_4`) created further down.
lst = [
    1407.0,  # 2979 races
    1609.0,  # 2627 races
    3218.0,  # 2037 races
    1206.0,  # 1747 races
    2413.0,  # 1709 races
]

In [21]:
# Keep only the races whose `metric` value is one of the retained distances in `lst`.
# The explicit `.copy()` makes the filtered result an independent frame, so that
# adding columns to it afterwards does not raise pandas' SettingWithCopyWarning.
races_featurized_distance = races_featurized_distance[
    races_featurized_distance['metric'].isin(lst)
].copy()
races_featurized_distance

Unnamed: 0,rid,metric,margin,temp,msl,rain,rhum,course__Ballinrobe,course__Bellewstown,course__Clonmel,...,pressure_level_1,pressure_level_2,pressure_level_3,pressure_level_4,is_raining,rhum_level_0,rhum_level_1,rhum_level_2,rhum_level_3,rhum_level_4
2,377929,1609.0,1.204927,10.1,996.7,0.0,76,0,0,0,...,0,1,0,0,0,0,0,0,1,0
6,353432,1407.0,1.178877,14.0,1004.5,0.0,94,0,0,0,...,0,1,0,0,0,0,0,0,0,1
8,326282,1206.0,1.153438,15.0,1012.0,0.0,76,0,0,0,...,0,0,1,0,0,0,0,0,1,0
10,214002,1609.0,1.135714,14.5,1011.9,0.0,78,0,0,0,...,0,0,1,0,0,0,0,0,1,0
14,266884,1407.0,1.245532,20.7,1015.0,0.0,69,0,0,0,...,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19242,176467,3218.0,1.447925,5.2,987.4,0.0,80,0,0,1,...,1,0,0,0,0,0,0,0,1,0
19243,227342,3218.0,1.540875,4.0,1003.5,0.4,83,0,0,0,...,0,1,0,0,1,0,0,0,1,0
19244,243638,3218.0,1.576670,2.5,988.8,0.0,93,0,0,0,...,1,0,0,0,0,0,0,0,0,1
19246,360255,3218.0,1.296711,2.7,1021.3,0.0,88,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [22]:
# (rows, columns) after restricting to the retained distances; the column count is unchanged by the filter.
races_featurized_distance.shape

(11099, 86)

In [23]:
# One-hot encode the distance: `metric_<i>` is 1 when the race's `metric`
# equals `lst[i]` and 0 otherwise.
# The comparisons are vectorised (no per-element Python loop) and the columns
# are attached with `assign`, which returns a new frame. This avoids the
# SettingWithCopyWarning that in-place column assignment on a filtered frame
# triggers, and keeps the columns in `metric_0`, `metric_1`, ... order.
metric_indicators = {
    f"metric_{idx}": (races_featurized_distance['metric'] == distance).astype('int64')
    for idx, distance in enumerate(lst)
}
races_featurized_distance = races_featurized_distance.assign(**metric_indicators)

races_featurized_distance.head(10)

Unnamed: 0,rid,metric,margin,temp,msl,rain,rhum,course__Ballinrobe,course__Bellewstown,course__Clonmel,...,rhum_level_0,rhum_level_1,rhum_level_2,rhum_level_3,rhum_level_4,metric_0,metric_1,metric_2,metric_3,metric_4
2,377929,1609.0,1.204927,10.1,996.7,0.0,76,0,0,0,...,0,0,0,1,0,0,1,0,0,0
6,353432,1407.0,1.178877,14.0,1004.5,0.0,94,0,0,0,...,0,0,0,0,1,1,0,0,0,0
8,326282,1206.0,1.153438,15.0,1012.0,0.0,76,0,0,0,...,0,0,0,1,0,0,0,0,1,0
10,214002,1609.0,1.135714,14.5,1011.9,0.0,78,0,0,0,...,0,0,0,1,0,0,1,0,0,0
14,266884,1407.0,1.245532,20.7,1015.0,0.0,69,0,0,0,...,0,0,1,0,0,1,0,0,0,0
15,48165,1206.0,1.097269,18.0,1028.6,0.0,66,0,0,0,...,0,0,1,0,0,0,0,0,1,0
17,333629,2413.0,1.13956,12.4,1023.1,0.0,86,0,0,0,...,0,0,0,0,1,0,0,0,0,1
21,50025,1407.0,1.376904,14.3,1015.6,0.0,63,0,0,0,...,0,0,1,0,0,1,0,0,0,0
22,401124,1407.0,1.202464,14.8,1022.3,0.0,86,0,0,0,...,0,0,0,0,1,1,0,0,0,0
24,170309,3218.0,1.107718,6.4,1001.6,0.0,78,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [24]:
# The raw distance is now fully represented by the `metric_*` indicator
# columns, so remove the original `metric` column.
races_featurized_distance = races_featurized_distance.drop('metric', axis='columns')
races_featurized_distance

Unnamed: 0,rid,margin,temp,msl,rain,rhum,course__Ballinrobe,course__Bellewstown,course__Clonmel,course__Cork,...,rhum_level_0,rhum_level_1,rhum_level_2,rhum_level_3,rhum_level_4,metric_0,metric_1,metric_2,metric_3,metric_4
2,377929,1.204927,10.1,996.7,0.0,76,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
6,353432,1.178877,14.0,1004.5,0.0,94,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
8,326282,1.153438,15.0,1012.0,0.0,76,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
10,214002,1.135714,14.5,1011.9,0.0,78,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
14,266884,1.245532,20.7,1015.0,0.0,69,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19242,176467,1.447925,5.2,987.4,0.0,80,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
19243,227342,1.540875,4.0,1003.5,0.4,83,0,0,0,1,...,0,0,0,1,0,0,0,1,0,0
19244,243638,1.576670,2.5,988.8,0.0,93,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
19246,360255,1.296711,2.7,1021.3,0.0,88,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0


---

## Save Dataframes

In [25]:
# Persist the distance-featurized table next to the other CSVs; `index=False`
# keeps the dataframe index out of the written file.
races_featurized_distance.to_csv(
    path_or_buf=f"{BASE_DIR}/data/csv/races_featurized_distance.csv",
    index=False,
)

---