In [24]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import time

In [25]:
# data preparation
data = pd.read_csv('data/training.csv')
m_data = data.shape[0]
df_train, df_test = data[0:int(0.7*m_data)], data[int(0.7*m_data)+1:m_data]
df_test = df_test.reset_index(drop=True)

df = pd.read_csv('data/race-result-horse.csv')
m, n = np.shape(df)
m_train, _ = np.shape(df_train)
m_test, _ = np.shape(df_test)

#produce dicts for horse,jockey,trainer
horse=list()
jockey=list()
trainer=list()
for i in range(len(df)):
    if df.horse_name[i] not in horse:
        horse.append(df.horse_name[i])
    if df.jockey[i] not in jockey:
        jockey.append(df.jockey[i])
    if df.trainer[i] not in jockey:
        trainer.append(df.trainer[i])
train_horse=np.zeros((m_train,1))
train_jockey=np.zeros((m_train,1))
train_trainer=np.zeros((m_train,1))

In [26]:
# prepare training data
for i in range(m_train):
    train_horse[i,0] = horse.index(df_train.horse_name[i])
    train_jockey[i,0] = jockey.index(df_train.jockey[i])
    train_trainer[i,0] = trainer.index(df_train.trainer[i])
actual_weight = df_train.actual_weight.reshape((m_train,1))
declared_weight = df_train.declared_horse_weight.reshape((m_train,1))
draw = df_train.draw.reshape((m_train,1))
win_odds = df_train.win_odds.reshape((m_train,1))
race_distance = df_train.race_distance.reshape((m_train,1))

# we use horse, jockey, trainer, actual weight, declared weight, win odds, race distance as independent variables
X_train = np.hstack((train_horse, train_jockey, train_trainer, actual_weight,
                     declared_weight, draw, win_odds, race_distance)) 
y_train = df_train.finishing_position

# prepare test data
test_horse=np.zeros((m_test,1))
test_jockey=np.zeros((m_test,1))
test_trainer=np.zeros((m_test,1))

#print(df_test.horse_name[])
for i in range(len(df_test)):
    test_horse[i,0] = horse.index(df_test.horse_name[i])
    test_jockey[i,0] = jockey.index(df_test.jockey[i])
    test_trainer[i,0] = trainer.index(df_test.trainer[i])
actual_weight_test = df_test.actual_weight.reshape((m_test,1))
declared_weight_test = df_test.declared_horse_weight.reshape((m_test,1))
draw_test = df_test.draw.reshape((m_test,1))
win_odds_test = df_test.win_odds.reshape((m_test,1))
race_distance_test = df_test.race_distance.reshape((m_test,1))
X_test = np.hstack((test_horse,test_jockey,test_trainer,actual_weight_test,
                    declared_weight_test, draw_test, win_odds_test, race_distance_test))
y_test = df_test.finishing_position

  
  import sys
  
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


In [43]:
#3.1.1 Logistic
start_time = time.time()
lr_model = LogisticRegression(C=0.1, random_state=0)
lr_model.fit(X_train,y_train)
lr_result = lr_model.predict(X_test)
print('training time: ', time.time()-start_time, 'secends')
# print(lr_result.shape)
lr_score = lr_model.score(X_test,y_test)
print(lr_score)


training time:  0.8329892158508301 secends
0.13381580814531005


In [41]:
print('start')
parameters = {'penalty': ['l1', 'l2'], 'C': [0.06, 0.08, 0.1, 1], 'random_state': [0]}
lr = LogisticRegression()
clf = GridSearchCV(lr, parameters)
clf.fit(X_train, y_train)
print('finish')
sorted(clf.cv_results_.keys())

start
finish


['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'mean_train_score',
 'param_C',
 'param_penalty',
 'param_random_state',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split0_train_score',
 'split1_test_score',
 'split1_train_score',
 'split2_test_score',
 'split2_train_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score',
 'std_train_score']

In [44]:
# print(clf.cv_results_)
print(clf.best_estimator_)
print(clf.best_params_)
lr_new = clf.best_estimator_
lr_new.fit(X_train,y_train)
print(lr_new.score(X_test, y_test))

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
{'C': 1, 'penalty': 'l1', 'random_state': 0}
0.13126152972896268


In [None]:
# 3.1.2 Naïve Bayes
start_time = time.time()
nb_model = sklearn.naive_bayes.GaussianNB()
nb_model.fit(X_train, y_train)
nb_result=nb_model.predict(X_test)
print('training time: ', time.time()-start_time, 'secends')
nb_score = nb_model.score(X_test, y_test)
print(nb_score)


In [29]:
# 3.1.3 SVM
# first normalize the data


from sklearn import svm
svm_model = svm.SVC(kernel='rbf',random_state=0,gamma=0.005,C=0.7)
#print('model')
#svm_model.fit(X_train, y_train)
#print('fit')
#svm_score = svm_model.score(X_test, y_test)
#print(svm_score)
start_time = time.time()
#print('model')
svm_model.fit(X_train, y_train)
#print('fit')
svm_result=svm_model.predict(X_test)
print('training time: ', time.time()-start_time, 'secends')
svm_score = svm_model.score(X_test, y_test)
#print(svm_model.coef_)
#print(svm_model.predict(X_test))
print(svm_score)


training time:  21.498592615127563 secends
0.08485880516531857


In [63]:
parameters = {'kernel':['rbf', 'poly'], 'C':[0.1, 0.5, 1], 'gamma': ['auto', 0.01, 0.1], 
              'random_state': [0], 'max_iter': [100, 500, 1000]}
svc = svm.SVC()
print('start trainin')
clf = GridSearchCV(svc, parameters)
clf.fit(X_train, y_train)
print('finish')
sorted(clf.cv_results_.keys())

start trainin












finish




['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'mean_train_score',
 'param_C',
 'param_gamma',
 'param_kernel',
 'param_max_iter',
 'param_random_state',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split0_train_score',
 'split1_test_score',
 'split1_train_score',
 'split2_test_score',
 'split2_train_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score',
 'std_train_score']

In [64]:
print(clf.best_params_)
svm_new = clf.best_estimator_
svm_new.fit(X_train,y_train)
print(svm_new.score(X_test, y_test))

{'C': 0.1, 'gamma': 0.01, 'kernel': 'rbf', 'max_iter': 500, 'random_state': 0}




0.0842911877394636


In [66]:
print('start')
parameters = {'n_estimators': [30, 50, 100], 'max_features': ['sqrt', 'log2', None], 
              'max_depth': [None,1, 2, 5], 'random_state': [0]}
raf = RandomForestClassifier()
clf = GridSearchCV(raf, parameters)
clf.fit(X_train, y_train)
print('finish')
sorted(clf.cv_results_.keys())


start
finish


['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'mean_train_score',
 'param_max_depth',
 'param_max_features',
 'param_n_estimators',
 'param_random_state',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split0_train_score',
 'split1_test_score',
 'split1_train_score',
 'split2_test_score',
 'split2_train_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score',
 'std_train_score']

In [69]:
print(clf.best_params_)
raf_new = clf.best_estimator_
raf_new.fit(X_train,y_train)
print(raf_new.score(X_test, y_test))
print(clf.best_estimator_)
print(raf_new.feature_importances_)

{'max_depth': 2, 'max_features': None, 'n_estimators': 50, 'random_state': 0}
0.1326805732936001
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)
[0. 0. 0. 0. 0. 0. 1. 0.]


In [57]:
#3.1.4 Random Forest

start_time = time.time()
rf_model=RandomForestClassifier(max_depth=2,random_state=0)

rf_model.fit(X_train,y_train)
rf_result=rf_model.predict(X_test)
print('training time: ', time.time()-start_time, 'secends')
#print(rf_model.predict(X_test))
print(rf_model.score(X_test,y_test))

training time:  0.07471179962158203 secends
0.12430821626223926


In [31]:
#3.2
from collections import Counter
#The lr matrix
horse_win_lr=np.zeros((len(lr_result),1))
horse_top3_lr=np.zeros((len(lr_result),1))
horse_top50percent_lr=np.zeros((len(lr_result),1))
count_of_race_participation=Counter(df_test.race_id)
for i in range(len(lr_result)):
    if lr_result[i] == 1:
         horse_win_lr[i]=1
    if lr_result[i] <= 3:
         horse_top3_lr[i] = 1
    if lr_result[i]<= np.floor(count_of_race_participation[df_test.race_id[i]]/2):
         horse_top50percent_lr[i] = 1
headers=['RaceID','HorseID','HorseWin','HorseRankTop3','HorseRankTop50Percent']
import csv
with open('lr_predictions.csv','w') as f1:
     lr_csv=csv.writer(f1)
     lr_csv.writerow(headers)
     for i in range(len(lr_result)):
         lr_csv.writerow([df_test.race_id[i],df_test.horse_id[i],horse_win_lr[i][0],horse_top3_lr[i][0],horse_top50percent_lr[i][0]])
#the naive bayesian matrix
horse_win_nb=np.zeros((len(nb_result),1))
horse_top3_nb=np.zeros((len(nb_result),1))
horse_top50percent_nb=np.zeros((len(nb_result),1))
count_of_race_participation=Counter(df_test.race_id)
for i in range(len(nb_result)):
    if nb_result[i] == 1:
        horse_win_nb[i]=1
    if nb_result[i] <= 3:
        horse_top3_nb[i] = 1
    if nb_result[i]<= np.floor(count_of_race_participation[df_test.race_id[i]]/2):
        horse_top50percent_nb[i] = 1
with open('nb_predictions.csv','w') as f2:
     nb_csv=csv.writer(f2)
     nb_csv.writerow(headers)
     for i in range(len(nb_result)):
         nb_csv.writerow([df_test.race_id[i],df_test.horse_id[i],horse_win_nb[i][0],horse_top3_nb[i][0],horse_top50percent_nb[i][0]])
#the svm matrix
horse_win_svm=np.zeros((len(svm_result),1))
horse_top3_svm=np.zeros((len(svm_result),1))
horse_top50percent_svm=np.zeros((len(svm_result),1))
count_of_race_participation=Counter(df_test.race_id)
for i in range(len(svm_result)):
    if svm_result[i] == 1:
        horse_win_svm[i]=1
    if svm_result[i] <= 3:
        horse_top3_svm[i] = 1
    if svm_result[i]<= np.floor(count_of_race_participation[df_test.race_id[i]]/2):
        horse_top50percent_svm[i] = 1
with open('svm_predictions.csv','w') as f3:
    svm_csv=csv.writer(f3)
    svm_csv.writerow(headers)
    for i in range(len(svm_result)):
        svm_csv.writerow([df_test.race_id[i],df_test.horse_id[i],horse_win_svm[i][0],horse_top3_svm[i][0],horse_top50percent_svm[i][0]])
#the rf matrix
horse_win_rf=np.zeros((len(rf_result),1))
horse_top3_rf=np.zeros((len(rf_result),1))
horse_top50percent_rf=np.zeros((len(rf_result),1))
count_of_race_participation=Counter(df_test.race_id)
for i in range(len(rf_result)):
    if rf_result[i] == 1:
        horse_win_rf[i]=1
    if rf_result[i] <= 3:
        horse_top3_rf[i] = 1
    if rf_result[i]<= np.floor(count_of_race_participation[df_test.race_id[i]]/2):
        horse_top50percent_rf[i] = 1
with open('rf_predictions.csv','w') as f4:
    rf_csv=csv.writer(f4)
    rf_csv.writerow(headers)
    for i in range(len(rf_result)):
        rf_csv.writerow([df_test.race_id[i],df_test.horse_id[i],horse_win_rf[i][0],horse_top3_rf[i][0],horse_top50percent_rf[i][0]])

In [32]:
#3.3
#get actual results from y_test
horse_win_actual=np.zeros((len(y_test),1))
horse_top3_actual=np.zeros((len(y_test),1))
horse_top50percent_actual=np.zeros((len(y_test),1))
count_of_race_participation=Counter(df_test.race_id)
for i in range(len(y_test)):
    if y_test[i]==1:
         horse_win_actual[i]=1
    if y_test[i]<=3:
         horse_top3_actual[i]=1
    if y_test[i]<=np.floor(count_of_race_participation[df_test.race_id[i]]/2):
         horse_top50percent_actual[i]=1
#lr model
TP=0;FP=0;FN=0;TN=0
for i in range(len(X_test)):#lr horse_win
    if (horse_win_lr[i]==1 and horse_win_actual[i]==1):
        TP=TP+1
    if (horse_win_lr[i]==1 and horse_win_actual[i]==0):
        FP=FP+1
    if (horse_win_lr[i]==0 and horse_win_actual[i]==1):
        FN=FN+1
    if (horse_win_lr[i]==0 and horse_win_actual[i]==0):
        TN=TN+1
print("Recall of logistic model horse_win prediction= ", TP/(TP+FN))
print("Precision of logistic model horse_win prediction= ", TP/(TP+FP))
TP=0;FP=0;FN=0;TN=0
for i in range(len(X_test)):#lr_top3
    if (horse_top3_lr[i]==1 and horse_top3_actual[i]==1):
        TP=TP+1
    if (horse_top3_lr[i]==1 and horse_top3_actual[i]==0):
        FP=FP+1
    if (horse_top3_lr[i]==0 and horse_top3_actual[i]==1):
        FN=FN+1
    if (horse_top3_lr[i]==0 and horse_top3_actual[i]==0):
        TN=TN+1
print("Recall of logistic model horse_top3 prediction= ", TP/(TP+FN))
print("Precision of logistic model horse_top3 prediction= ", TP/(TP+FP))
TP=0;FP=0;FN=0;TN=0
for i in range(len(X_test)):#lr_top50percent
    if (horse_top50percent_lr[i]==1 and horse_top50percent_actual[i]==1):
        TP=TP+1
    if (horse_top50percent_lr[i]==1 and horse_top50percent_actual[i]==0):
        FP=FP+1
    if (horse_top50percent_lr[i]==0 and horse_top50percent_actual[i]==1):
        FN=FN+1
    if (horse_top50percent_lr[i]==0 and horse_top50percent_actual[i]==0):
        TN=TN+1
print("Recall of logistic model horse_top50percent prediction= ", TP/(TP+FN))
print("Precision of logistic model horse_top50percent prediction= ", TP/(TP+FP))
#naive bayesian
TP=0;FP=0;FN=0;TN=0
for i in range(len(X_test)):#nb horse_win
    if (horse_win_nb[i]==1 and horse_win_actual[i]==1):
        TP=TP+1
    if (horse_win_nb[i]==1 and horse_win_actual[i]==0):
        FP=FP+1
    if (horse_win_nb[i]==0 and horse_win_actual[i]==1):
        FN=FN+1
    if (horse_win_nb[i]==0 and horse_win_actual[i]==0):
        TN=TN+1
print("Recall of naive bayesian model horse_win prediction= ", TP/(TP+FN))
print("Precision of naive bayesian model horse_win prediction= ", TP/(TP+FP))
TP=0;FP=0;FN=0;TN=0
for i in range(len(X_test)):#lr_top3
    if (horse_top3_nb[i]==1 and horse_top3_actual[i]==1):
        TP=TP+1
    if (horse_top3_nb[i]==1 and horse_top3_actual[i]==0):
        FP=FP+1
    if (horse_top3_nb[i]==0 and horse_top3_actual[i]==1):
        FN=FN+1
    if (horse_top3_nb[i]==0 and horse_top3_actual[i]==0):
        TN=TN+1
print("Recall of naive bayesian model horse_top3 prediction= ", TP/(TP+FN))
print("Precision of naive bayesian model horse_top3 prediction= ", TP/(TP+FP))
TP=0;FP=0;FN=0;TN=0
for i in range(len(X_test)):#lr_top50percent
    if (horse_top50percent_nb[i]==1 and horse_top50percent_actual[i]==1):
        TP=TP+1
    if (horse_top50percent_nb[i]==1 and horse_top50percent_actual[i]==0):
        FP=FP+1
    if (horse_top50percent_nb[i]==0 and horse_top50percent_actual[i]==1):
        FN=FN+1
    if (horse_top50percent_nb[i]==0 and horse_top50percent_actual[i]==0):
        TN=TN+1
print("Recall of naive bayesian model horse_top50percent prediction= ", TP/(TP+FN))
print("Precision of naive bayesian model horse_top50percent prediction= ", TP/(TP+FP))
#svm
TP=0;FP=0;FN=0;TN=0
for i in range(len(X_test)):#nb horse_win
    if (horse_win_svm[i]==1 and horse_win_actual[i]==1):
        TP=TP+1
    if (horse_win_svm[i]==1 and horse_win_actual[i]==0):
        FP=FP+1
    if (horse_win_svm[i]==0 and horse_win_actual[i]==1):
        FN=FN+1
    if (horse_win_svm[i]==0 and horse_win_actual[i]==0):
        TN=TN+1
if ((TP+FN)==0 or (TP+FP)==0):
    print("Recall of svm model horse_win prediction and Precision of svm model horse_win prediction= 0")
else:
    print("Recall of svm model horse_win prediction= ", TP/(TP+FN))
    print("Precision of svm model horse_win prediction= ", TP/(TP+FP))
TP=0;FP=0;FN=0;TN=0
for i in range(len(X_test)):#lr_top3
    if (horse_top3_svm[i]==1 and horse_top3_actual[i]==1):
        TP=TP+1
    if (horse_top3_svm[i]==1 and horse_top3_actual[i]==0):
        FP=FP+1
    if (horse_top3_svm[i]==0 and horse_top3_actual[i]==1):
        FN=FN+1
    if (horse_top3_svm[i]==0 and horse_top3_actual[i]==0):
        TN=TN+1
if ((TP+FN)==0 or (TP+FP)==0):
     print("Recall of svm model horse_top3 prediction and Precision of svm model horse_top3 prediction= 0")
else:
     print("Recall of svm model horse_top3 prediction= ", TP/(TP+FN))
     print("Precision of svm model horse_top3 prediction= ", TP/(TP+FP))
TP=0;FP=0;FN=0;TN=0
for i in range(len(X_test)):#lr_top50percent
    if (horse_top50percent_svm[i]==1 and horse_top50percent_actual[i]==1):
        TP=TP+1
    if (horse_top50percent_svm[i]==1 and horse_top50percent_actual[i]==0):
        FP=FP+1
    if (horse_top50percent_svm[i]==0 and horse_top50percent_actual[i]==1):
        FN=FN+1
    if (horse_top50percent_svm[i]==0 and horse_top50percent_actual[i]==0):
        TN=TN+1
if ((TP+FN)==0 or (TP+FP)==0):
     print("Recall of svm model horse_top50percent prediction and Precision of svm model horse_top50percent prediction= 0")
else:
     print("Recall of svm model horse_top50percent prediction= ", TP/(TP+FN))
     print("Precision of svm model horse_top50percent prediction= ", TP/(TP+FP))
#rf model
TP=0;FP=0;FN=0;TN=0
for i in range(len(X_test)):#nb horse_win
    if (horse_win_rf[i]==1 and horse_win_actual[i]==1):
        TP=TP+1
    if (horse_win_rf[i]==1 and horse_win_actual[i]==0):
        FP=FP+1
    if (horse_win_rf[i]==0 and horse_win_actual[i]==1):
        FN=FN+1
    if (horse_win_rf[i]==0 and horse_win_actual[i]==0):
        TN=TN+1
print("Recall of rf model horse_win prediction= ", TP/(TP+FN))
print("Precision of rf model horse_win prediction= ", TP/(TP+FP))
TP=0;FP=0;FN=0;TN=0
for i in range(len(X_test)):#lr_top3
    if (horse_top3_rf[i]==1 and horse_top3_actual[i]==1):
        TP=TP+1
    if (horse_top3_rf[i]==1 and horse_top3_actual[i]==0):
        FP=FP+1
    if (horse_top3_rf[i]==0 and horse_top3_actual[i]==1):
        FN=FN+1
    if (horse_top3_rf[i]==0 and horse_top3_actual[i]==0):
        TN=TN+1
print("Recall of rf model horse_top3 prediction= ", TP/(TP+FN))
print("Precision of rf model horse_top3 prediction= ", TP/(TP+FP))
TP=0;FP=0;FN=0;TN=0
for i in range(len(X_test)):#lr_top50percent
    if (horse_top50percent_rf[i]==1 and horse_top50percent_actual[i]==1):
        TP=TP+1
    if (horse_top50percent_rf[i]==1 and horse_top50percent_actual[i]==0):
        FP=FP+1
    if (horse_top50percent_rf[i]==0 and horse_top50percent_actual[i]==1):
        FN=FN+1
    if (horse_top50percent_rf[i]==0 and horse_top50percent_actual[i]==0):
        TN=TN+1
print("Recall of rf model horse_top50percent prediction= ", TP/(TP+FN))
print("Precision of rf model horse_top50percent prediction= ", TP/(TP+FP))


Recall of logistic model horse_win prediction=  0.7301587301587301
Precision of logistic model horse_win prediction=  0.15640347563279183
Recall of logistic model horse_top3 prediction=  0.784070796460177
Precision of logistic model horse_top3 prediction=  0.36591409691629956
Recall of logistic model horse_top50percent prediction=  0.8261120739456961
Precision of logistic model horse_top50percent prediction=  0.6059322033898306
Recall of naive bayesian model horse_win prediction=  0.4656084656084656
Precision of naive bayesian model horse_win prediction=  0.15556865055981142
Recall of naive bayesian model horse_top3 prediction=  0.424188790560472
Precision of naive bayesian model horse_top3 prediction=  0.4236888626988804
Recall of naive bayesian model horse_top50percent prediction=  0.865684575389948
Precision of naive bayesian model horse_top50percent prediction=  0.5685828116107
Recall of svm model horse_win prediction=  0.0582010582010582
Precision of svm model horse_win prediction