# `create_horses_all.ipynb`

### Author: Anthony Hein

#### Last updated: 9/19/2021

# Overview:

Concatenates `horses_x.csv` files found at [https://www.kaggle.com/hwaitt/horse-racing](https://www.kaggle.com/hwaitt/horse-racing) into one file `horses_all.csv`.

---

## Setup

In [1]:
import git
import os
import glob
import pandas as pd
import numpy as np

In [2]:
# Locate the enclosing git repository by searching upward from the current
# working directory, and use its working-tree root as the project base path.
repo = git.Repo(os.getcwd(), search_parent_directories=True)
BASE_DIR = repo.working_dir
BASE_DIR

'/Users/anthonyhein/Desktop/SML310/project'

---

## Find Files

In [3]:
# Collect every per-year `horses_*.csv` under the raw racing data directory.
# NOTE: glob returns matches in arbitrary (filesystem-dependent) order, so the
# position of a given year in this list is not stable across machines.
horses_csv_pattern = f"{BASE_DIR}/raw/csv/racing/horses_*.csv"
horses_x_csvs = glob.glob(horses_csv_pattern)
horses_x_csvs

['/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/horses_1997.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/horses_1996.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/horses_1994.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/horses_1995.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/horses_1991.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/horses_1990.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/horses_1992.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/horses_1993.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/horses_2018.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/horses_2019.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/horses_2009.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/horses_2008.csv',
 '/Users/anthonyhein/Desktop/SML310/project/raw/csv/racing/horses_2020.csv',

In [4]:
# Number of per-year CSV files matched by the glob above.
len(horses_x_csvs)

31

---

## Extract Dataframes

In [5]:
# Read each CSV into its own dataframe, keeping the same order as
# `horses_x_csvs`. low_memory=False makes pandas parse each file in one pass
# rather than in chunks, avoiding mixed-dtype inference within a column.
dfs = []
for csv_path in horses_x_csvs:
    dfs.append(pd.read_csv(csv_path, low_memory=False))

# Preview the first frame.
dfs[0].head()

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,isFav,trainerName,jockeyName,position,positionL,...,TR,OR,father,mother,gfather,runners,margin,weight,res_win,res_place
0,267255,Going For Broke,3.0,4.0,0.1,0,P C Haslam,Seb Sanders,1,,...,62.0,62.0,Simply Great,Empty Purse,Pennine Walk,6,1.168254,58,1.0,1
1,267255,Pinchincha,3.0,3.0,0.266667,0,Dave Morris,Tony Clark,2,4.0,...,56.0,65.0,Priolo,Western Heights,Shirley Heights,6,1.168254,60,0.0,1
2,267255,Skelton Sovereign,3.0,5.0,0.142857,0,Reg Hollinshead,D Griffiths,3,3.0,...,40.0,60.0,Contract Law,Mrs Lucky,Royal Match,6,1.168254,55,0.0,0
3,267255,Fast Spin,3.0,6.0,0.380952,1,David Barron,Tony Culhane,4,7.0,...,30.0,59.0,Formidable I,Topwinder,Topsider,6,1.168254,57,0.0,0
4,267255,As-Is,3.0,2.0,0.166667,0,Mark Johnston,J Weaver,5,7.0,...,21.0,65.0,Lomond,Capriati I,Diesis,6,1.168254,60,0.0,0


---

## Clean Dataframes

In [6]:
# Use the first frame's header as the reference and report every frame whose
# columns deviate from it, together with the differing column names.
columns = dfs[0].columns
for idx, frame in enumerate(dfs):
    # The element-wise comparison is only evaluated when the widths agree
    # (short-circuit), since it is not defined for headers of unequal length.
    if len(frame.columns) == len(columns) and np.all(frame.columns == columns):
        continue
    print(f"Failed at index {idx}")
    print(set(frame.columns).symmetric_difference(set(columns)))

Failed at index 12
{'price'}


In [7]:
# Inspect the frame carrying the extra `price` column reported by the check
# above. The frame is selected by column name instead of a fixed list position
# because glob returns files in arbitrary order, so the affected frame is not
# guaranteed to sit at the same index on every machine.
frames_with_price = [df for df in dfs if "price" in df.columns]
# Displays nothing if no frame has the column (e.g. it was already dropped).
frames_with_price[0].head() if frames_with_price else None

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,isFav,trainerName,jockeyName,position,positionL,...,OR,father,mother,gfather,runners,margin,weight,res_win,res_place,price
0,10312,Waterproof,4.0,9.0,0.076923,0,Shaun Keightley,Brendan Powell,1,,...,,Pour Moi,Laughing Water,Duke Of Marmalade,9,1.199095,66,1.0,1.0,
1,10312,Eva's Diva,6.0,7.0,0.444444,1,Phil Middleton,Mr Sam Lee,2,15.0,...,110.0,Getaway,Shouette,Sadler's Wells,9,1.199095,65,0.0,1.0,
2,10312,Incredible Dream,7.0,5.0,0.019608,0,Conrad Allen,Joshua Moore,3,5.5,...,,Vale Of York,Finnmark,Halling,9,1.199095,72,0.0,1.0,
3,10312,Hats Off To Larry,6.0,4.0,0.230769,0,Mick Channon,Marc Goldstein,4,10.0,...,,Sixties Icon,Highland Jig,Norse Dancer,9,1.199095,72,0.0,0.0,
4,10312,Taqwaa,7.0,6.0,0.038462,0,Laura Morgan,Richie McLernon,5,5.5,...,,Iffraaj,Hallowed Park,Barathea,9,1.199095,72,0.0,0.0,


This dataframe has an extraneous `price` column that the others lack. Let's remove it.

In [8]:
# Drop the extraneous `price` column from every dataframe that carries it.
# Frames are found by column name rather than by a hard-coded list position:
# glob returns files in arbitrary order, so the affected frame is not
# guaranteed to sit at the same index on every machine.
price_idxs = [i for i, df in enumerate(dfs) if "price" in df.columns]
for i in price_idxs:
    dfs[i] = dfs[i].drop(columns="price")

if not price_idxs:
    # Re-running this cell after a successful drop is a harmless no-op.
    print("Already dropped this column.")

# Preview the first cleaned frame (nothing is displayed on a no-op re-run).
dfs[price_idxs[0]].head() if price_idxs else None

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,isFav,trainerName,jockeyName,position,positionL,...,TR,OR,father,mother,gfather,runners,margin,weight,res_win,res_place
0,10312,Waterproof,4.0,9.0,0.076923,0,Shaun Keightley,Brendan Powell,1,,...,103.0,,Pour Moi,Laughing Water,Duke Of Marmalade,9,1.199095,66,1.0,1.0
1,10312,Eva's Diva,6.0,7.0,0.444444,1,Phil Middleton,Mr Sam Lee,2,15.0,...,89.0,110.0,Getaway,Shouette,Sadler's Wells,9,1.199095,65,0.0,1.0
2,10312,Incredible Dream,7.0,5.0,0.019608,0,Conrad Allen,Joshua Moore,3,5.5,...,88.0,,Vale Of York,Finnmark,Halling,9,1.199095,72,0.0,1.0
3,10312,Hats Off To Larry,6.0,4.0,0.230769,0,Mick Channon,Marc Goldstein,4,10.0,...,75.0,,Sixties Icon,Highland Jig,Norse Dancer,9,1.199095,72,0.0,0.0
4,10312,Taqwaa,7.0,6.0,0.038462,0,Laura Morgan,Richie McLernon,5,5.5,...,67.0,,Iffraaj,Hallowed Park,Barathea,9,1.199095,72,0.0,0.0


In [9]:
# Re-validate after cleaning: every frame must now share the reference header
# (same names, same order). Any mismatch is printed and then raised as an
# error, because concatenating frames with differing columns would silently
# produce NaN-padded columns instead of failing.
columns = dfs[0].columns
mismatches = {
    i: set(df.columns).symmetric_difference(set(columns))
    for i, df in enumerate(dfs)
    if not df.columns.equals(columns)
}
for i, diff in mismatches.items():
    print(f"Failed at index {i}")
    print(diff)
assert not mismatches, f"Column mismatch at dataframe indices {sorted(mismatches)}"

---

## Concatenate Dataframes

In [10]:
# Stack the per-year frames into a single dataframe. ignore_index=True gives
# the result a fresh 0..n-1 index; without it each file's own 0-based row
# labels are carried over, leaving duplicate index labels in `horses_all`.
horses_all = pd.concat(dfs, ignore_index=True)
horses_all.head()

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,isFav,trainerName,jockeyName,position,positionL,...,TR,OR,father,mother,gfather,runners,margin,weight,res_win,res_place
0,267255,Going For Broke,3.0,4.0,0.1,0,P C Haslam,Seb Sanders,1,,...,62.0,62.0,Simply Great,Empty Purse,Pennine Walk,6,1.168254,58,1.0,1.0
1,267255,Pinchincha,3.0,3.0,0.266667,0,Dave Morris,Tony Clark,2,4.0,...,56.0,65.0,Priolo,Western Heights,Shirley Heights,6,1.168254,60,0.0,1.0
2,267255,Skelton Sovereign,3.0,5.0,0.142857,0,Reg Hollinshead,D Griffiths,3,3.0,...,40.0,60.0,Contract Law,Mrs Lucky,Royal Match,6,1.168254,55,0.0,0.0
3,267255,Fast Spin,3.0,6.0,0.380952,1,David Barron,Tony Culhane,4,7.0,...,30.0,59.0,Formidable I,Topwinder,Topsider,6,1.168254,57,0.0,0.0
4,267255,As-Is,3.0,2.0,0.166667,0,Mark Johnston,J Weaver,5,7.0,...,21.0,65.0,Lomond,Capriati I,Diesis,6,1.168254,60,0.0,0.0


In [11]:
# (rows, columns) of the concatenated dataframe.
horses_all.shape

(4107315, 27)

In [12]:
# Sanity check: concatenation must neither drop nor duplicate rows, so the
# combined length has to equal the total of the individual frame lengths.
expected_row_count = sum(map(len, dfs))
assert len(horses_all) == expected_row_count

---

## Save Dataframe

In [13]:
# Write the combined dataframe to disk. The output directory is created first
# so the save does not fail on a checkout where `data/csv` does not exist yet.
# index=False keeps the row index out of the file.
output_dir = f"{BASE_DIR}/data/csv"
os.makedirs(output_dir, exist_ok=True)
horses_all.to_csv(f"{output_dir}/horses_all.csv", index=False)