In [1]:
import pandas as pd
import numpy as np
from time import time
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV


import lightgbm as lgb
from sklearn.metrics import precision_score, classification_report, confusion_matrix

In [2]:
runs = pd.read_csv("../input/hkracing/runs.csv")
races = pd.read_csv('../input/hkracing/races.csv')

In [3]:
runs_data = runs[['race_id', 'won', 'horse_age', 'horse_country', 'horse_type', 'horse_rating',
       'horse_gear', 'declared_weight', 'actual_weight', 'draw', 'win_odds',
       'place_odds', 'horse_id']]

In [4]:
races_data = races[['race_id', 'venue', 'config', 'surface', 'distance', 'going', 'race_class', 'date']]

In [5]:
df = pd.merge(runs_data, races_data)
df.horse_country.isnull().value_counts(ascending=True)

True         2
False    79445
Name: horse_country, dtype: int64

In [6]:
df = df.dropna()

In [7]:
df.date = pd.to_datetime(df.date)
df.date.dtype

dtype('<M8[ns]')

In [8]:
start_time = min(df.date).strftime('%d %B %Y')
end_time = max(df.date).strftime('%d %B %Y')
no_of_horses = df.horse_id.nunique()
no_of_races = df.race_id.nunique()

print(f'The dataset was collected from {start_time} to {end_time}, which contains information about {no_of_horses} horses and {no_of_races} races. ')

The dataset was collected from 02 June 1997 to 05 June 2005, which contains information about 4280 horses and 6047 races. 


In [9]:
df = df.drop(columns=['horse_id', 'date'])

In [10]:
def horse_gear_impute(cols):
    if cols == '--':
        return 0
    else: 
        return 1

In [11]:
df.horse_gear = df.horse_gear.apply(horse_gear_impute)

In [12]:
df = pd.get_dummies(df, drop_first=True)

In [13]:
last_raceid = max(df.race_id)
last_race = df[df.race_id == last_raceid]

In [14]:
new_data = df[:75696]   # drop the last race data for modeling
new_data = new_data.drop(columns='race_id')   # drop the unnecessary race_id column

In [15]:
X = new_data.drop(columns='won')
y = new_data['won']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

In [17]:
k_range = range(1,10)
scores = {}
scores_list = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    # precision ratio: tp / (tp + fp), aiming at minimize fp (predict: win, actual: lose)
    scores[k] = precision_score(y_test, y_pred)
    scores_list.append(precision_score(y_test, y_pred))

In [18]:
import operator
max(scores.items(), key=operator.itemgetter(1))

(8, 0.3170731707317073)

In [19]:
mnb = MultinomialNB()
scores = cross_val_score(mnb, X_train, y_train, cv=10, scoring='precision')
average_precision = sum(scores) / len(scores) 
print(f'MultinomialNB average precision: {average_precision}')

MultinomialNB average precision: 0.13422945509692777


In [20]:
lr= linear_model.LinearRegression()
cv= cross_val_score(lr,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.03874367 0.04104367 0.03780946 0.03961284 0.03484063]
0.03841005467497516


In [21]:
knn= KNeighborsRegressor(n_neighbors=4)
cv= cross_val_score(knn,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[-0.13502545 -0.11486084 -0.14137424 -0.12304418 -0.1556116 ]
-0.1339832618991806


In [22]:
tree=DecisionTreeRegressor(random_state=1)
cv= cross_val_score(tree,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[-0.91554053 -0.89854578 -1.06022677 -0.987343   -1.02642104]
-0.9776154234326991


In [23]:
rf=RandomForestRegressor(random_state=1)
cv= cross_val_score(rf,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.04618942 0.04726448 0.05227916 0.05094625 0.03073855]
0.045483571675523596


In [24]:
lr.fit(X_train,y_train)
y_lr=lr.predict(X_test)
#"---------------------------------------------------------------------------------------------------")
rf.fit(X_train,y_train)
y_rf=rf.predict(X_test)

In [25]:
print("Linear Regression results:")
print("R^2",metrics.r2_score(y_test,y_lr))
print("Mean Absolute Error", metrics.mean_absolute_error(y_test,y_lr))
print("Mean Squared Error", metrics.mean_squared_error(y_test,y_lr))
print("Root Mean Squared Error",np.square(metrics.mean_squared_error(y_test,y_lr)))

print("---------------------------------------------------------------------------------------------------")

print("Random Forest results:")
print("R^2",metrics.r2_score(y_test,y_rf))
print("Mean Absolute Error", metrics.mean_absolute_error(y_test,y_rf))
print("Mean Squared Error", metrics.mean_squared_error(y_test,y_rf))
print("Root Mean Squared Error",np.square(metrics.mean_squared_error(y_test,y_rf)))

Linear Regression results:
R^2 0.03948219722399671
Mean Absolute Error 0.1501344972753171
Mean Squared Error 0.07126917089106909
Root Mean Squared Error 0.00507929471950041
---------------------------------------------------------------------------------------------------
Random Forest results:
R^2 0.0522783989177672
Mean Absolute Error 0.1439550858652576
Mean Squared Error 0.07031970937912813
Root Mean Squared Error 0.00494486152716504
