# `streamline_data_trim.ipynb`

### Author: Anthony Hein

#### Last updated: 11/3/2021

# Overview:

This notebook was written well after the actual data cleaning took place. Its purpose is to replicate the results of trimming the data (i.e. removing unnecessary columns) in a much cleaner fashion: since it is now known exactly which data will be used, fewer avenues have to be explored in the process. Of course, this does not invalidate any of the earlier work, nor can it substitute for that work, since claiming so would reverse cause and effect. In other words, we are able to write this slimmer notebook precisely because we wrote the larger notebooks, which made us knowledgeable about the data.

This is primarily for ease of reproduction by other users.

---

## Setup

In [1]:
from datetime import datetime
import git
import os
import re
from typing import List
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Walk upwards from the current working directory until a git repository is
# found, and use that repository's working directory as the project root.
repo = git.Repo(os.getcwd(), search_parent_directories=True)
BASE_DIR = repo.working_dir
BASE_DIR

'/Users/anthonyhein/Desktop/SML310/project'

---

## Load `horses_selected.csv`

In [3]:
# Read the per-horse table; low_memory=False makes pandas parse the file in
# one pass instead of in chunks.
horses_selected_path = f"{BASE_DIR}/data/streamline/horses_selected.csv"
horses_selected = pd.read_csv(horses_selected_path, low_memory=False)
horses_selected.head()

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,isFav,trainerName,jockeyName,position,positionL,...,TR,OR,father,mother,gfather,runners,margin,weight,res_win,res_place
0,302858,Kings Return,6.0,4.0,0.6,1,W P Mullins,D J Casey,1,,...,,,King's Ride,Browne's Return,Deep Run,6,1.219263,73,1.0,1.0
1,302858,Majestic Red I,6.0,5.0,0.047619,0,John Hackett,Conor O'Dwyer,2,8,...,,,Long Pond,Courtlough Lady,Giolla Mear,6,1.219263,73,0.0,1.0
2,302858,Clearly Canadian,6.0,2.0,0.166667,0,D T Hughes,G Cotter,3,1.5,...,,,Nordico,Over The Seas,North Summit,6,1.219263,71,0.0,0.0
3,302858,Bernestic Wonder,8.0,1.0,0.058824,0,E McNamara,J Old Jones,4,dist,...,,,Roselier,Miss Reindeer,Reindeer,6,1.219263,73,0.0,0.0
4,302858,Beauty's Pride,5.0,6.0,0.038462,0,J J Lennon,T Martin,5,dist,...,,,Noalto,Elena's Beauty,Tarqogan,6,1.219263,66,0.0,0.0


In [4]:
# (rows, columns) of the horse table as loaded, before any columns are dropped.
horses_selected.shape

(205138, 27)

In [5]:
# Start the trimmed table as an independent (deep) copy, so the frame as
# loaded stays available unchanged.
horses_selected_trimmed = horses_selected.copy(deep=True)
horses_selected_trimmed.head()

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,isFav,trainerName,jockeyName,position,positionL,...,TR,OR,father,mother,gfather,runners,margin,weight,res_win,res_place
0,302858,Kings Return,6.0,4.0,0.6,1,W P Mullins,D J Casey,1,,...,,,King's Ride,Browne's Return,Deep Run,6,1.219263,73,1.0,1.0
1,302858,Majestic Red I,6.0,5.0,0.047619,0,John Hackett,Conor O'Dwyer,2,8,...,,,Long Pond,Courtlough Lady,Giolla Mear,6,1.219263,73,0.0,1.0
2,302858,Clearly Canadian,6.0,2.0,0.166667,0,D T Hughes,G Cotter,3,1.5,...,,,Nordico,Over The Seas,North Summit,6,1.219263,71,0.0,0.0
3,302858,Bernestic Wonder,8.0,1.0,0.058824,0,E McNamara,J Old Jones,4,dist,...,,,Roselier,Miss Reindeer,Reindeer,6,1.219263,73,0.0,0.0
4,302858,Beauty's Pride,5.0,6.0,0.038462,0,J J Lennon,T Martin,5,dist,...,,,Noalto,Elena's Beauty,Tarqogan,6,1.219263,66,0.0,0.0


---

## Load `races_selected_augment_with_weather.csv`

In [6]:
# Read the per-race table (already augmented with weather readings);
# low_memory=False makes pandas parse the file in one pass instead of in chunks.
races_selected_path = f"{BASE_DIR}/data/streamline/races_selected_augment_with_weather.csv"
races_selected = pd.read_csv(races_selected_path, low_memory=False)
races_selected.head()

Unnamed: 0,rid,course,time,date,title,rclass,band,ages,distance,condition,...,station name,station lat,station lng,dist to station,station reading date,temp,msl,rain,rhum,station reading timedelta
0,302858,Thurles,01:15,97/01/09,Liffey Maiden Hurdle (Div 1),,,5yo+,2m3f,Good,...,BIRR,53.0525,-7.5325,45.288813,1/9/97 12:00,1.6,1012.4,0.0,87,15.0
1,291347,Punchestown,03:40,97/02/16,Ericsson G.S.M. Grand National Trial Handicap ...,,,5yo+,3m2f,Soft,...,CASEMENT,53.182,-6.262,24.477602,2/16/97 15:00,8.0,992.5,0.4,87,20.0
2,75447,Listowel,03:00,97/03/01,Ballybunion E.B.F. Beginners S'chase,,,4yo+,2m2f,Soft,...,SHANNON AIRPORT,52.4125,-8.5505,63.534139,3/1/97 14:00,12.0,1003.5,0.0,73,0.0
3,358038,Punchestown,02:40,97/04/24,Quinns Of Baltinglass Chase (La Touche) (Cross...,,,5yo+,4m1f,Good,...,CASEMENT,53.182,-6.262,24.477602,4/24/97 14:00,12.6,1011.9,0.0,72,20.0
4,78982,Dundalk,05:15,97/05/02,Carlingford Handicap Chase,,0-109,4yo+,3m,Firm,...,CLONES,54.11,-7.14,50.368275,5/2/97 14:00,21.3,1021.4,0.0,44,135.0


In [7]:
# (rows, columns) of the race table as loaded, before any columns are dropped.
races_selected.shape

(20574, 45)

In [8]:
# Start the trimmed table as an independent (deep) copy, so the frame as
# loaded stays available unchanged.
races_selected_trimmed = races_selected.copy(deep=True)
races_selected_trimmed.head()

Unnamed: 0,rid,course,time,date,title,rclass,band,ages,distance,condition,...,station name,station lat,station lng,dist to station,station reading date,temp,msl,rain,rhum,station reading timedelta
0,302858,Thurles,01:15,97/01/09,Liffey Maiden Hurdle (Div 1),,,5yo+,2m3f,Good,...,BIRR,53.0525,-7.5325,45.288813,1/9/97 12:00,1.6,1012.4,0.0,87,15.0
1,291347,Punchestown,03:40,97/02/16,Ericsson G.S.M. Grand National Trial Handicap ...,,,5yo+,3m2f,Soft,...,CASEMENT,53.182,-6.262,24.477602,2/16/97 15:00,8.0,992.5,0.4,87,20.0
2,75447,Listowel,03:00,97/03/01,Ballybunion E.B.F. Beginners S'chase,,,4yo+,2m2f,Soft,...,SHANNON AIRPORT,52.4125,-8.5505,63.534139,3/1/97 14:00,12.0,1003.5,0.0,73,0.0
3,358038,Punchestown,02:40,97/04/24,Quinns Of Baltinglass Chase (La Touche) (Cross...,,,5yo+,4m1f,Good,...,CASEMENT,53.182,-6.262,24.477602,4/24/97 14:00,12.6,1011.9,0.0,72,20.0
4,78982,Dundalk,05:15,97/05/02,Carlingford Handicap Chase,,0-109,4yo+,3m,Firm,...,CLONES,54.11,-7.14,50.368275,5/2/97 14:00,21.3,1021.4,0.0,44,135.0


---

## Trim `horses_selected.csv`

In [9]:
# Full column listing of the horse table, used to decide what to drop.
horses_selected.columns.tolist()

['rid',
 'horseName',
 'age',
 'saddle',
 'decimalPrice',
 'isFav',
 'trainerName',
 'jockeyName',
 'position',
 'positionL',
 'dist',
 'weightSt',
 'weightLb',
 'overWeight',
 'outHandicap',
 'headGear',
 'RPR',
 'TR',
 'OR',
 'father',
 'mother',
 'gfather',
 'runners',
 'margin',
 'weight',
 'res_win',
 'res_place']

In [10]:
# Horse-table columns to remove, grouped by the reason each is redundant.
drop_columns = [
    # Can be calculated from the `weight` column.
    'weightSt', 'weightLb', 'overWeight',
    # Seems irrelevant, and the codes are not available.
    'headGear',
    # Stored as part of the race record instead.
    'runners', 'margin',
]

In [11]:
# Build the trimmed frame from the untouched `horses_selected` rather than
# from `horses_selected_trimmed` itself. This makes the cell idempotent:
# re-running it no longer raises a KeyError for columns that were already
# removed. At this point `horses_selected_trimmed` is still an unmodified copy
# of `horses_selected`, so the result of a top-to-bottom run is unchanged.
# `drop` returns a new frame and leaves `horses_selected` as loaded.
# NOTE: `drop_columns` is reassigned further down for the race table, so
# re-run the cell defining the horse list before re-running this one.
horses_selected_trimmed = horses_selected.drop(columns=drop_columns)

In [12]:
# Columns that remain in the horse table after trimming.
horses_selected_trimmed.columns.tolist()

['rid',
 'horseName',
 'age',
 'saddle',
 'decimalPrice',
 'isFav',
 'trainerName',
 'jockeyName',
 'position',
 'positionL',
 'dist',
 'outHandicap',
 'RPR',
 'TR',
 'OR',
 'father',
 'mother',
 'gfather',
 'weight',
 'res_win',
 'res_place']

In [13]:
# (rows, columns) of the horse table after the column drop.
horses_selected_trimmed.shape

(205138, 21)

---

## Trim `races_selected_augment_with_weather.csv`

In [14]:
# Full column listing of the race table, used to decide what to drop.
races_selected_trimmed.columns.tolist()

['rid',
 'course',
 'time',
 'date',
 'title',
 'rclass',
 'band',
 'ages',
 'distance',
 'condition',
 'hurdles',
 'prizes',
 'winningTime',
 'prize',
 'metric',
 'countryCode',
 'ncond',
 'class',
 'runners',
 'margin',
 '1st_place_rank_in_odds',
 '2nd_place_rank_in_odds',
 '3rd_place_rank_in_odds',
 '1st_rank_in_odds_place',
 '2nd_rank_in_odds_place',
 '3rd_rank_in_odds_place',
 'placeAvailable',
 'showAvailable',
 'favoriteWon',
 'favoritePlaced',
 'favoriteShowed',
 'lat',
 'lng',
 'datetime',
 'station no',
 'station name',
 'station lat',
 'station lng',
 'dist to station',
 'station reading date',
 'temp',
 'msl',
 'rain',
 'rhum',
 'station reading timedelta']

In [15]:
# Race-table columns to remove, grouped by the reason each is redundant.
drop_columns = [
    # The `datetime` column carries this information.
    'time', 'date',
    # Easier to use `class`.
    'rclass',
    # Irrelevant; the quality of the horses is derived elsewhere.
    'band',
    # Age data is attached to each horse individually.
    'ages',
    # Easier to use `metric`.
    'distance',
    # Easier to use `ncond`.
    'condition',
    # We don't consider races with hurdles.
    'hurdles',
    # Unnecessary for our analysis.
    'prizes',
    # Can be calculated from `prizes`.
    'prize',
    # We don't consider anything except for Ireland.
    'countryCode',
]

In [16]:
# Build the trimmed frame from the untouched `races_selected` rather than
# from `races_selected_trimmed` itself. This makes the cell idempotent:
# re-running it no longer raises a KeyError for columns that were already
# removed. At this point `races_selected_trimmed` is still an unmodified copy
# of `races_selected`, so the result of a top-to-bottom run is unchanged.
# `drop` returns a new frame and leaves `races_selected` as loaded.
# NOTE: `drop_columns` must hold the race list here; the same name is used
# earlier in the notebook for the horse-table list.
races_selected_trimmed = races_selected.drop(columns=drop_columns)

In [17]:
# Columns that remain in the race table after trimming.
races_selected_trimmed.columns.tolist()

['rid',
 'course',
 'title',
 'winningTime',
 'metric',
 'ncond',
 'class',
 'runners',
 'margin',
 '1st_place_rank_in_odds',
 '2nd_place_rank_in_odds',
 '3rd_place_rank_in_odds',
 '1st_rank_in_odds_place',
 '2nd_rank_in_odds_place',
 '3rd_rank_in_odds_place',
 'placeAvailable',
 'showAvailable',
 'favoriteWon',
 'favoritePlaced',
 'favoriteShowed',
 'lat',
 'lng',
 'datetime',
 'station no',
 'station name',
 'station lat',
 'station lng',
 'dist to station',
 'station reading date',
 'temp',
 'msl',
 'rain',
 'rhum',
 'station reading timedelta']

In [18]:
# (rows, columns) of the race table after the column drop.
races_selected_trimmed.shape

(20574, 34)

---

## Save Dataframes

In [19]:
# Write the trimmed horse table into the same directory the source CSV was
# read from; index=False keeps the row index out of the file.
horses_trimmed_path = f"{BASE_DIR}/data/streamline/horses_selected_trimmed.csv"
horses_selected_trimmed.to_csv(horses_trimmed_path, index=False)

In [20]:
# Write the trimmed race table into the same directory the source CSV was
# read from; index=False keeps the row index out of the file.
races_trimmed_path = f"{BASE_DIR}/data/streamline/races_selected_augment_with_weather_trimmed.csv"
races_selected_trimmed.to_csv(races_trimmed_path, index=False)

---