# `remove_entries_w_bad_weather_data.ipynb`

### Author: Anthony Hein

#### Last updated: 10/19/2021

# Overview:

Since we have decided to keep an entry only if it matches a weather entry with a high amount of "goodness", we can drop any races (and the horse entries belonging to them) that don't meet this criterion.

---

## Setup

In [1]:
import git
import os
from typing import List
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Resolve the project root as the working directory of the git repository that
# contains the current directory (parent directories are searched), so data
# paths below do not depend on where the notebook is launched from.
BASE_DIR = git.Repo(
    path=os.getcwd(),
    search_parent_directories=True,
).working_dir
BASE_DIR

'/Users/anthonyhein/Desktop/SML310/project'

In [7]:
import sys

# Make the project's `utils/` directory importable so that the
# `rid_to_weather` module can be loaded from it. The append is guarded so that
# re-running this cell does not keep adding duplicate entries to `sys.path`.
UTILS_DIR = f'{BASE_DIR}/utils/'
if UTILS_DIR not in sys.path:
    sys.path.append(UTILS_DIR)

# RID_TO_WEATHER is used below as a mapping whose keys are race ids (rid);
# presumably each value is the matched weather entry -- confirm in utils.
from rid_to_weather import RID_TO_WEATHER

---

## Load `races_aticnmid.csv`

In [3]:
# Load the races table produced by the previous pipeline step.
# NOTE(review): the `Unnamed: 0` column in the output looks like an index that
# was written out by an earlier `to_csv` call -- confirm whether it is needed.
races_aticnmid = pd.read_csv(
    f"{BASE_DIR}/data/csv/races_aticnmid.csv",
    low_memory=False,
)
races_aticnmid.head()

Unnamed: 0,rid,course,time,date,hurdles,prizes,winningTime,metric,countryCode,ncond,class
0,302858,Thurles (IRE),01:15,97/01/09,,[],277.2,3821.0,IE,1,0
1,291347,Punchestown (IRE),03:40,97/02/16,,[],447.2,5229.0,IE,5,0
2,377929,Leopardstown (IRE),03:00,97/05/11,,[],106.4,1609.0,IE,4,0
3,275117,Curragh (IRE),03:35,97/05/25,,[],125.9,2011.0,IE,4,0
4,66511,Leopardstown (IRE),04:30,97/06/02,,[],116.3,1810.0,IE,1,0


In [4]:
# (rows, columns) of the races table before any weather-based filtering.
races_aticnmid.shape

(19510, 11)

---

## Load `horses_aticnmi.csv`

In [5]:
# Load the per-horse entries table; each row carries the `rid` of its race.
# NOTE(review): the `Unnamed: 0` column in the output looks like an index that
# was written out by an earlier `to_csv` call -- confirm whether it is needed.
horses_aticnmi = pd.read_csv(
    f"{BASE_DIR}/data/csv/horses_aticnmi.csv",
    low_memory=False,
)
horses_aticnmi.head()

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,trainerName,jockeyName,position,positionL,dist,outHandicap,RPR,TR,OR,father,mother,gfather,weight
0,302858,Kings Return,6.0,4.0,0.6,W P Mullins,D J Casey,1,,0.0,0.0,102.0,51.591987,79.654604,King's Ride,Browne's Return,Deep Run,73
1,302858,Majestic Red I,6.0,5.0,0.047619,John Hackett,Conor O'Dwyer,2,8,0.0,0.0,94.0,51.591987,79.654604,Long Pond,Courtlough Lady,Giolla Mear,73
2,302858,Clearly Canadian,6.0,2.0,0.166667,D T Hughes,G Cotter,3,1.5,9.5,0.0,92.0,51.591987,79.654604,Nordico,Over The Seas,North Summit,71
3,302858,Bernestic Wonder,8.0,1.0,0.058824,E McNamara,J Old Jones,4,dist,39.5,0.0,71.87665,51.591987,79.654604,Roselier,Miss Reindeer,Reindeer,73
4,302858,Beauty's Pride,5.0,6.0,0.038462,J J Lennon,T Martin,5,dist,69.5,0.0,71.87665,51.591987,79.654604,Noalto,Elena's Beauty,Tarqogan,66


In [6]:
# (rows, columns) of the horses table before any weather-based filtering.
horses_aticnmi.shape

(197491, 18)

---

## Remove Entries w/ Bad Weather

In [8]:
# The keys of RID_TO_WEATHER are the race ids to keep. Iterating a mapping
# yields its keys in order, so the values do not need to be unpacked.
rids = list(RID_TO_WEATHER)
rids[:5]

[302858, 291347, 377929, 275117, 66511]

In [9]:
# Number of race ids present in RID_TO_WEATHER, i.e. how many races we expect
# to remain after filtering.
len(rids)

19271

In [10]:
# Keep only the races whose `rid` appears among the ids taken from
# RID_TO_WEATHER; all other races are dropped.
race_has_weather = races_aticnmid['rid'].isin(rids)
races_aticnmid = races_aticnmid[race_has_weather]
races_aticnmid.head()

Unnamed: 0,rid,course,time,date,hurdles,prizes,winningTime,metric,countryCode,ncond,class
0,302858,Thurles (IRE),01:15,97/01/09,,[],277.2,3821.0,IE,1,0
1,291347,Punchestown (IRE),03:40,97/02/16,,[],447.2,5229.0,IE,5,0
2,377929,Leopardstown (IRE),03:00,97/05/11,,[],106.4,1609.0,IE,4,0
3,275117,Curragh (IRE),03:35,97/05/25,,[],125.9,2011.0,IE,4,0
4,66511,Leopardstown (IRE),04:30,97/06/02,,[],116.3,1810.0,IE,1,0


In [11]:
# Sanity check: the number of surviving race rows must equal the number of
# race ids taken from RID_TO_WEATHER. A mismatch means some weather rid has no
# row in the races table, or some rid occurs on more than one row. The message
# reports both counts so a failure can be diagnosed without re-running cells.
assert len(races_aticnmid) == len(rids), (
    f"expected {len(rids)} races after filtering, found {len(races_aticnmid)}"
)

In [12]:
# Keep only the horse entries that belong to a race whose `rid` appears among
# the ids taken from RID_TO_WEATHER.
horse_in_kept_race = horses_aticnmi['rid'].isin(rids)
horses_aticnmi = horses_aticnmi[horse_in_kept_race]
horses_aticnmi.head()

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,trainerName,jockeyName,position,positionL,dist,outHandicap,RPR,TR,OR,father,mother,gfather,weight
0,302858,Kings Return,6.0,4.0,0.6,W P Mullins,D J Casey,1,,0.0,0.0,102.0,51.591987,79.654604,King's Ride,Browne's Return,Deep Run,73
1,302858,Majestic Red I,6.0,5.0,0.047619,John Hackett,Conor O'Dwyer,2,8,0.0,0.0,94.0,51.591987,79.654604,Long Pond,Courtlough Lady,Giolla Mear,73
2,302858,Clearly Canadian,6.0,2.0,0.166667,D T Hughes,G Cotter,3,1.5,9.5,0.0,92.0,51.591987,79.654604,Nordico,Over The Seas,North Summit,71
3,302858,Bernestic Wonder,8.0,1.0,0.058824,E McNamara,J Old Jones,4,dist,39.5,0.0,71.87665,51.591987,79.654604,Roselier,Miss Reindeer,Reindeer,73
4,302858,Beauty's Pride,5.0,6.0,0.038462,J J Lennon,T Martin,5,dist,69.5,0.0,71.87665,51.591987,79.654604,Noalto,Elena's Beauty,Tarqogan,66


In [14]:
# Number of horse entries remaining after the weather-based filter.
len(horses_aticnmi)

195052

In [13]:
# The two filtered tables must cover exactly the same set of race ids: no race
# without horse entries and no horse entry without its race.
race_rids = set(races_aticnmid['rid'])
horse_rids = set(horses_aticnmi['rid'])
assert race_rids == horse_rids

---

## Save Dataframes

In [15]:
# Write the filtered races table to a new file; the pandas index is not
# written.
races_aticnmid.to_csv(
    path_or_buf=f"{BASE_DIR}/data/csv/races_aticnmidg.csv",
    index=False,
)

In [16]:
# Write the filtered horses table to a new file; the pandas index is not
# written.
horses_aticnmi.to_csv(
    path_or_buf=f"{BASE_DIR}/data/csv/horses_aticnmig.csv",
    index=False,
)

---