# `get_horses_races_intersection.ipynb`

### Author: Anthony Hein

#### Last updated: 10/19/2021

# Overview:

We must again synchronize the horses and races datasets, since we have dropped some entries asymmetrically. After this notebook, both datasets contain exactly the same set of race ids (`rid`).

---

## Setup

In [1]:
import git
import os
from typing import List
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Resolve the repository root by searching upward from the current working
# directory; every data path in this notebook is built relative to it.
BASE_DIR = git.Repo(
    path=os.getcwd(),
    search_parent_directories=True,
).working_dir
BASE_DIR

'/Users/anthonyhein/Desktop/SML310/project'

---

## Load `horses_aticnmig.csv`

In [3]:
# Read the horses table from the repository's data/csv directory.
horses_aticnmig = pd.read_csv(
    filepath_or_buffer=f"{BASE_DIR}/data/csv/horses_aticnmig.csv",
    low_memory=False,
)
# Preview the first five rows.
horses_aticnmig.head(n=5)

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,trainerName,jockeyName,position,positionL,dist,outHandicap,RPR,TR,OR,father,mother,gfather,weight
0,302858,Kings Return,6.0,4.0,0.6,W P Mullins,D J Casey,1,,0.0,0.0,102.0,51.591987,79.654604,King's Ride,Browne's Return,Deep Run,73
1,302858,Majestic Red I,6.0,5.0,0.047619,John Hackett,Conor O'Dwyer,2,8,0.0,0.0,94.0,51.591987,79.654604,Long Pond,Courtlough Lady,Giolla Mear,73
2,302858,Clearly Canadian,6.0,2.0,0.166667,D T Hughes,G Cotter,3,1.5,9.5,0.0,92.0,51.591987,79.654604,Nordico,Over The Seas,North Summit,71
3,302858,Bernestic Wonder,8.0,1.0,0.058824,E McNamara,J Old Jones,4,dist,39.5,0.0,71.87665,51.591987,79.654604,Roselier,Miss Reindeer,Reindeer,73
4,302858,Beauty's Pride,5.0,6.0,0.038462,J J Lennon,T Martin,5,dist,69.5,0.0,71.87665,51.591987,79.654604,Noalto,Elena's Beauty,Tarqogan,66


In [4]:
# (rows, columns) of the horses table as loaded, before any filtering.
horses_aticnmig.shape

(195052, 18)

---

## Load `races_featurized.csv`

In [5]:
# Read the featurized races table from the repository's data/csv directory.
races_featurized = pd.read_csv(
    filepath_or_buffer=f"{BASE_DIR}/data/csv/races_featurized.csv",
    low_memory=False,
)
# Preview the first five rows.
races_featurized.head(n=5)

Unnamed: 0,rid,metric,margin,temp,msl,rain,rhum,course__Ballinrobe,course__Bellewstown,course__Clonmel,...,month__4,month__5,month__6,month__7,month__8,month__9,month__10,month__11,month__12,year
0,302858,3821.0,1.219263,2.2,1012.7,0.0,82,0,0,0,...,0,0,0,0,0,0,0,0,0,1997
1,291347,5229.0,1.218049,8.1,992.8,0.0,79,0,0,0,...,0,0,0,0,0,0,0,0,0,1997
2,377929,1609.0,1.204927,10.1,996.7,0.0,76,0,0,0,...,0,1,0,0,0,0,0,0,0,1997
3,275117,2011.0,1.083838,15.8,1030.1,0.0,53,0,0,0,...,0,1,0,0,0,0,0,0,0,1997
4,66511,1810.0,1.077871,16.3,1022.9,0.0,53,0,0,0,...,0,0,1,0,0,0,0,0,0,1997


In [6]:
# (rows, columns) of the races table as loaded.
races_featurized.shape

(19248, 70)

---

## Delete Races w/o Horse Information 

In [7]:
# Race ids present in races_featurized that have no rows in horses_aticnmig.
# A dedicated name is used here (rather than `bad_rids`) so that re-running
# this cell can never overwrite the set that the horse-filtering cell uses.
races_without_horses = set(races_featurized['rid']) - set(horses_aticnmig['rid'])

# This notebook does not remove any rows from races_featurized, so fail here
# with a clear message if a race without horse rows ever shows up.
assert not races_without_horses, (
    f"{len(races_without_horses)} race(s) have no horse rows; "
    "drop them from races_featurized before continuing"
)
len(races_without_horses)

0

---

## Delete Horses w/o Race Information

In [8]:
# Race ids that appear in the horses table but are missing from the races
# table; rows carrying these ids are filtered out in the next cell.
bad_rids = set(horses_aticnmig['rid']).difference(
    set(races_featurized['rid'])
)
len(bad_rids)

23

In [9]:
# Keep only the horse rows whose race id is not in `bad_rids`.
horses_augment = horses_aticnmig.loc[
    ~horses_aticnmig['rid'].isin(bad_rids)
]
# Number of horse rows that remain after the filter.
horses_augment.shape[0]

194898

---

## Sanity Check

In [10]:
# Both tables must now reference exactly the same set of race ids.
assert (
    set(horses_augment['rid']) == set(races_featurized['rid'])
)

---

## Save Dataframes

In [11]:
# races_featurized is intentionally not re-saved: this notebook removes no rows
# from it, so the file on disk is already up to date.
# races_featurized.to_csv(f"{BASE_DIR}/data/csv/races_featurized.csv", index=False)

In [12]:
# Write the filtered horses table next to the other CSVs; the dataframe index
# is not written to the file.
horses_augment.to_csv(
    path_or_buf=f"{BASE_DIR}/data/csv/horses_augment.csv",
    index=False,
)

---