# `create_races_all.ipynb`

### Author: Anthony Hein

#### Last updated: 9/19/2021

# Overview:

Concatenates `races_x.csv` files found at [https://www.kaggle.com/hwaitt/horse-racing](https://www.kaggle.com/hwaitt/horse-racing) into one file `races_all.csv`.

---

## Setup

In [1]:
import git
import os
import glob
import pandas as pd
import numpy as np

In [2]:
# Resolve the project root: open the git repository that contains the current
# working directory (searching parent directories as needed) and use its
# working directory as the base for every path built below.
repo = git.Repo(os.getcwd(), search_parent_directories=True)
BASE_DIR = repo.working_dir
BASE_DIR

'/Users/anthonyhein/Desktop/SML310/project'

---

## Find Files

In [3]:
# Collect the path of every per-year races CSV under the raw data directory.
# NOTE: glob.glob() returns matches in arbitrary (filesystem) order, and later
# cells refer to frames by position in this list.
races_glob_pattern = f"{BASE_DIR}/raw/csv/racing/races_*.csv"
races_x_csvs = glob.glob(races_glob_pattern)
races_x_csvs

['/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/races_1997.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/races_1996.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/races_1994.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/races_1995.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/races_1991.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/races_1990.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/races_1992.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/races_1993.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/races_2018.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/races_2019.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/races_2009.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/races_2020.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/races_2008.csv',
 '/Users/ant

In [4]:
# Count how many files the glob pattern matched.
len(races_x_csvs)

31

---

## Extract Dataframes

In [5]:
# Parse each per-year CSV into its own DataFrame, in the same order as
# `races_x_csvs`. low_memory=False turns off pandas' chunked parsing, so each
# column's dtype is inferred from the whole file at once.
dfs = []
for csv_path in races_x_csvs:
    dfs.append(pd.read_csv(csv_path, low_memory=False))

# Preview the first frame loaded.
dfs[0].head()

Unnamed: 0,rid,course,time,date,title,rclass,band,ages,distance,condition,hurdles,prizes,winningTime,prize,metric,countryCode,ncond,class
0,267255,Southwell (AW),03:40,97/01/01,New Year Handicap Class E,Class 5,0-70,3yo,1m,Standard,,"[2752.25, 833.0, 406.5, 193.25]",106.9,4184.0,1609.0,GB,0,5
1,297570,Southwell (AW),12:35,97/01/01,Resolution Claiming Stakes Class F (Div I),Class 6,,4yo+,7f,Standard,,"[1944.0, 544.0, 264.0]",91.0,2752.0,1407.0,GB,0,6
2,334421,Southwell (AW),01:05,97/01/01,One Too Many Median Auction Maiden Apprentices...,Class 6,,4-6yo,1m3f,Standard,,"[2502.0, 702.0, 342.0]",150.7,3546.0,2212.0,GB,0,6
3,366304,Southwell (AW),03:10,97/01/01,Morning Call Selling Stakes Class G Southwell ...,Class 6,,3yo,1m,Standard,,"[2189.0, 614.0, 299.0]",108.6,3102.0,1609.0,GB,0,6
4,13063,Southwell (AW),02:40,97/01/01,Thinking &amp; Drinking Handicap Class E,Class 5,0-70,4yo+,2m½f,Standard,,"[2726.25, 825.0, 402.5, 191.25]",231.4,4144.0,3318.5,GB,0,5


---

## Clean Dataframes

In [6]:
# Use the first frame's header as the reference, then report every frame whose
# header (names and order) differs from it, together with the column names
# that are present in only one of the two headers.
columns = dfs[0].columns
expected_names = list(columns)
for i, frame in enumerate(dfs):
    if list(frame.columns) == expected_names:
        continue
    print(f"Failed at index {i}")
    print(set(frame.columns) ^ set(columns))

Failed at index 11
{'currency'}


In [7]:
# Preview the frame that the column check above reported ("Failed at index 11").
# NOTE(review): the index depends on the order glob.glob() returned the files
# in, which is not guaranteed -- confirm it against the check's output.
dfs[11].head()

Unnamed: 0,rid,course,time,date,title,rclass,band,ages,distance,condition,hurdles,prizes,winningTime,prize,metric,countryCode,ncond,class,currency
0,10312,Fakenham,02:55,20/01/01,Happy New Year Maiden Hurdle (Div I),Class 4,,4yo+,2m,Good To Soft,9 hurdles,"[5198.4, 1526.4, 763.2, 381.6]",253.88,7869,3218.0,GB,10,4,
1,10896,Cheltenham,03:50,20/01/01,EBF Stallions &amp; Cheltenham Pony Club (A St...,Class 1,,4yo,1m6f,Soft,,"[14237.5, 5342.5, 2675.0, 1332.5, 670.0, 335.0]",206.55,24592,2815.0,GB,5,1,
2,23038,Tramore (IRE),02:55,20/01/01,Jerry O'Donovan Memorial Rated Novice Chase,,,5yo+,2m,Soft,12 fences,"[7387.5, 2387.5, 1137.5, 512.5, 262.5, 137.5]",266.4,11826,3218.0,IE,5,0,
3,23986,Fairyhouse (IRE),02:40,20/01/01,Follow Fairyhouse On Social Media Beginners Chase,,,5yo+,2m5f,Yielding,13 fences,"[8274.0, 2674.0, 1274.0, 574.0, 294.0, 154.0]",340.9,13244,4223.0,IE,6,0,
4,25123,Fairyhouse (IRE),02:05,20/01/01,Fairyhouse Launches New Brand In 2020 Handicap...,,80-109,4yo+,3m,Yielding,13 hurdles,"[7092.0, 2292.0, 1092.0, 492.0, 252.0, 132.0]",389.0,11352,4827.0,IE,6,0,


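One of the files (index 11 above, `races_2020.csv`) has an extra `currency` column that the other files lack. Let's remove it so that every dataframe has the same columns.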

In [8]:
# Drop the extra `currency` column from whichever frame(s) carry it, instead of
# from a hard-coded position. The order of `dfs` follows glob.glob(), which
# makes no ordering guarantee, so the offending frame's index can differ from
# machine to machine; targeting the wrong index would silently leave the
# column in place.
# errors="ignore" leaves frames without the column unchanged, which also makes
# this cell safe to re-run.
dfs = [df.drop(columns="currency", errors="ignore") for df in dfs]

# Preview the frame the column check flagged in this run (index 11).
dfs[11].head()

Unnamed: 0,rid,course,time,date,title,rclass,band,ages,distance,condition,hurdles,prizes,winningTime,prize,metric,countryCode,ncond,class
0,10312,Fakenham,02:55,20/01/01,Happy New Year Maiden Hurdle (Div I),Class 4,,4yo+,2m,Good To Soft,9 hurdles,"[5198.4, 1526.4, 763.2, 381.6]",253.88,7869,3218.0,GB,10,4
1,10896,Cheltenham,03:50,20/01/01,EBF Stallions &amp; Cheltenham Pony Club (A St...,Class 1,,4yo,1m6f,Soft,,"[14237.5, 5342.5, 2675.0, 1332.5, 670.0, 335.0]",206.55,24592,2815.0,GB,5,1
2,23038,Tramore (IRE),02:55,20/01/01,Jerry O'Donovan Memorial Rated Novice Chase,,,5yo+,2m,Soft,12 fences,"[7387.5, 2387.5, 1137.5, 512.5, 262.5, 137.5]",266.4,11826,3218.0,IE,5,0
3,23986,Fairyhouse (IRE),02:40,20/01/01,Follow Fairyhouse On Social Media Beginners Chase,,,5yo+,2m5f,Yielding,13 fences,"[8274.0, 2674.0, 1274.0, 574.0, 294.0, 154.0]",340.9,13244,4223.0,IE,6,0
4,25123,Fairyhouse (IRE),02:05,20/01/01,Fairyhouse Launches New Brand In 2020 Handicap...,,80-109,4yo+,3m,Yielding,13 hurdles,"[7092.0, 2292.0, 1092.0, 492.0, 252.0, 132.0]",389.0,11352,4827.0,IE,6,0


In [9]:
# Re-run the header comparison after cleaning: every frame must now have
# exactly the same columns, in the same order, as the first frame.
columns = dfs[0].columns
mismatched = []
for i, df in enumerate(dfs):
    # The length test comes first so the element-wise comparison only runs on
    # headers of equal length.
    if len(df.columns) != len(columns) or not np.all(df.columns == columns):
        print(f"Failed at index {i}")
        print(set(df.columns).symmetric_difference(set(columns)))
        mismatched.append(i)

# Stop here on any mismatch. pd.concat would otherwise align the frames on the
# union of their column names and fill the gaps with NaN without complaint.
assert not mismatched, f"Frames with unexpected columns: {mismatched}"

---

## Concatenate Dataframes

In [10]:
# Stack the per-year frames row-wise into a single table.
# ignore_index=True assigns a fresh 0..N-1 index; without it each file's own
# 0..n-1 row labels are kept, leaving `races_all` with a non-unique index.
races_all = pd.concat(dfs, ignore_index=True)
races_all.head()

Unnamed: 0,rid,course,time,date,title,rclass,band,ages,distance,condition,hurdles,prizes,winningTime,prize,metric,countryCode,ncond,class
0,267255,Southwell (AW),03:40,97/01/01,New Year Handicap Class E,Class 5,0-70,3yo,1m,Standard,,"[2752.25, 833.0, 406.5, 193.25]",106.9,4184.0,1609.0,GB,0,5
1,297570,Southwell (AW),12:35,97/01/01,Resolution Claiming Stakes Class F (Div I),Class 6,,4yo+,7f,Standard,,"[1944.0, 544.0, 264.0]",91.0,2752.0,1407.0,GB,0,6
2,334421,Southwell (AW),01:05,97/01/01,One Too Many Median Auction Maiden Apprentices...,Class 6,,4-6yo,1m3f,Standard,,"[2502.0, 702.0, 342.0]",150.7,3546.0,2212.0,GB,0,6
3,366304,Southwell (AW),03:10,97/01/01,Morning Call Selling Stakes Class G Southwell ...,Class 6,,3yo,1m,Standard,,"[2189.0, 614.0, 299.0]",108.6,3102.0,1609.0,GB,0,6
4,13063,Southwell (AW),02:40,97/01/01,Thinking &amp; Drinking Handicap Class E,Class 5,0-70,4yo+,2m½f,Standard,,"[2726.25, 825.0, 402.5, 191.25]",231.4,4144.0,3318.5,GB,0,5


In [11]:
# (rows, columns) of the concatenated table.
races_all.shape

(396572, 18)

In [12]:
# Sanity check: concatenation must neither drop nor duplicate rows, so the
# combined length has to equal the sum of the individual frame lengths.
expected_row_count = sum(map(len, dfs))
assert len(races_all) == expected_row_count

---

## Save Dataframe

In [13]:
# Write the combined table to <BASE_DIR>/data/csv/races_all.csv, omitting the
# DataFrame index. The output directory is created first so that a checkout
# that does not yet contain it does not fail at to_csv.
output_dir = f"{BASE_DIR}/data/csv"
os.makedirs(output_dir, exist_ok=True)
races_all.to_csv(f"{output_dir}/races_all.csv", index=False)