In [None]:
#
# Brian Heaphy - 14160846
#
# Dataset: https://www.kaggle.com/lantanacamara/hong-kong-horse-racing#race-result-horse.csv
#

In [10]:
# Standard library
import os

# Linear algebra
import numpy as np

# Data processing
import pandas as pd

# Data visualization
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style

# Algorithms
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC

In [2]:
# Read in dataset.
# The CSV location can be overridden with the RACE_RESULT_CSV environment variable so the
# notebook can run on other machines; when the variable is unset it falls back to the
# original local path.
DATA_PATH = os.environ.get("RACE_RESULT_CSV", "/home/brian/cs4055/race-result-horse.csv")
df = pd.read_csv(DATA_PATH)

In [6]:
# Here we want to focus on 'win_odds'.
# Its mean is about 30 (30/1), with the lowest odds being 1/1 and the highest 99/1.
# Note that the summary below covers only the float64 columns.
df.describe()

Unnamed: 0,horse_number,draw,running_position_1,running_position_2,running_position_3,running_position_4,win_odds
count,29851.0,29598.0,29574.0,29560.0,29542.0,16618.0,29598.0
mean,6.885397,6.862051,6.833942,6.831326,6.826992,6.942472,30.255274
std,3.751176,3.742622,3.727267,3.725933,3.724886,3.79808,31.623839
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,4.0,4.0,4.0,4.0,4.0,4.0,7.7
50%,7.0,7.0,7.0,7.0,7.0,7.0,16.0
75%,10.0,10.0,10.0,10.0,10.0,10.0,41.0
max,14.0,15.0,14.0,14.0,14.0,14.0,99.0


In [7]:
# Here we can see the current data type of each field.
# There are 30189 records in the dataset along with 17 features
# (10 stored as object and 7 as float64).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30189 entries, 0 to 30188
Data columns (total 17 columns):
finishing_position       30187 non-null object
horse_number             29851 non-null float64
horse_name               30189 non-null object
horse_id                 30189 non-null object
jockey                   30160 non-null object
trainer                  30189 non-null object
actual_weight            30189 non-null object
declared_horse_weight    30189 non-null object
draw                     29598 non-null float64
length_behind_winner     29598 non-null object
running_position_1       29574 non-null float64
running_position_2       29560 non-null float64
running_position_3       29542 non-null float64
running_position_4       16618 non-null float64
finish_time              29520 non-null object
win_odds                 29598 non-null float64
race_id                  30189 non-null object
dtypes: float64(7), object(10)
memory usage: 3.9+ MB


In [4]:
# From the first 10 rows we can see that we will have to convert a lot of fields to numeric ones
# (for example, finish_time and length_behind_winner are stored as text).
df.head(10)

Unnamed: 0,finishing_position,horse_number,horse_name,horse_id,jockey,trainer,actual_weight,declared_horse_weight,draw,length_behind_winner,running_position_1,running_position_2,running_position_3,running_position_4,finish_time,win_odds,race_id
0,1,1.0,DOUBLE DRAGON,K019,B Prebble,D Cruz,133,1032,1.0,-,1.0,2.0,2.0,1.0,1.22.33,3.8,2014-001
1,2,2.0,PLAIN BLUE BANNER,S070,D Whyte,D E Ferraris,133,1075,13.0,2,8.0,9.0,9.0,2.0,1.22.65,8.0,2014-001
2,3,10.0,GOLDWEAVER,P072,Y T Cheng,Y S Tsui,121,1065,3.0,2,2.0,1.0,1.0,3.0,1.22.66,5.7,2014-001
3,4,3.0,SUPREME PROFIT,P230,J Moreira,C S Shum,132,1222,2.0,2,6.0,4.0,5.0,4.0,1.22.66,6.1,2014-001
4,5,7.0,THE ONLY KID,H173,Z Purton,K W Lui,125,1136,9.0,4-1/4,9.0,10.0,10.0,5.0,1.23.02,6.1,2014-001
5,6,9.0,WINNING ADVANTAGE,N359,A Suborics,A T Millard,123,1100,11.0,5-1/2,12.0,13.0,13.0,6.0,1.23.20,24.0,2014-001
6,7,13.0,CARE FREE ELEGANCE,P340,C Y Ho,K L Man,115,1053,12.0,5-1/2,4.0,3.0,3.0,7.0,1.23.22,99.0,2014-001
7,8,4.0,COOL PAL,S035,H W Lai,L Ho,129,1203,8.0,5-3/4,5.0,6.0,6.0,8.0,1.23.25,21.0,2014-001
8,9,6.0,TAI PO FORTUNE,P081,K Teetan,T P Yung,127,1073,6.0,6-1/4,7.0,7.0,7.0,9.0,1.23.33,10.0,2014-001
9,10,11.0,SUPER HORSE,L308,T H So,C W Chang,119,1137,7.0,6-3/4,11.0,11.0,12.0,10.0,1.23.41,27.0,2014-001


In [8]:
 # Here we can see how many values are missing from each field.
 # isnull().sum() counts the nulls per column in one vectorised pass, instead of running
 # Python's built-in sum() element by element over every column; the result is the same.
df.isnull().sum()

finishing_position           2
horse_number               338
horse_name                   0
horse_id                     0
jockey                      29
trainer                      0
actual_weight                0
declared_horse_weight        0
draw                       591
length_behind_winner       591
running_position_1         615
running_position_2         629
running_position_3         647
running_position_4       13571
finish_time                669
win_odds                   591
race_id                      0
dtype: int64

In [9]:
# The features of the dataset can be seen below. Our target feature will be finishing_position.
# What features could contribute to the finishing position of the horse?
# running_position_1-4 show what position the horse was in at 4 separate points of the race.
# win_odds shows what odds the bookmakers gave the horse of winning before the race.
# trainer shows who trained the horse. Maybe some trainers do a better job than others and produce more winners.
# jockey shows who rode the horse. Maybe some jockeys get better horses more frequently.
df.columns.values

array(['finishing_position', 'horse_number', 'horse_name', 'horse_id',
       'jockey', 'trainer', 'actual_weight', 'declared_horse_weight',
       'draw', 'length_behind_winner', 'running_position_1',
       'running_position_2', 'running_position_3', 'running_position_4',
       'finish_time', 'win_odds', 'race_id'], dtype=object)

In [11]:
# Split the data into a training set and a test set. The test set is 10% of the original rows.
# random_state is fixed so the same rows land in each split on every run; without it,
# train_test_split shuffles differently each time and results are not reproducible.
df_train, df_test = train_test_split(df, test_size=0.1, random_state=42)