In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv(os.path.join('train.csv'))
test = pd.read_csv(os.path.join('test.csv'))

In [3]:
features = [
    'tvghorseweight', 
    'tvghorseage', 
    'tvghorsedaysoff', 
    'tvghorsenumberofwins', 
    'tvghorsepowerrating', 
    'tvghorseaverageclassrating',
    'tvghorseaveragespeed'
]
target = 'won'

In [4]:
X_train = train[features]
y_train = train[target].values.reshape(-1,1)

In [5]:
X_test = test[features]
y_test = test[target].values.reshape(-1,1)

# Machine Learning Models

## SVM

### Training the SVM model and finding a score

In [6]:
from sklearn.svm import SVC

from sklearn.svm import SVC 
model_svm = SVC(kernel='sigmoid')
model_svm
model_svm.fit(X_train, y_train)
train_score_svm = model_svm.score(X_train, y_train)
test_score_svm = model_svm.score(X_test, y_test)
predictions_svm = model_svm.predict(X_test)

print(f'''
R\u00b2 Scores
-----------------------
Training Score: {100*round(train_score_svm,4)}%
Test Score:     {100*round(test_score_svm,4)}%
''')



R² Scores
-----------------------
Training Score: 81.81%
Test Score:     80.4%



### SVM Classification Report

In [7]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions_svm,
                            target_names=["won", "lost"]))

              precision    recall  f1-score   support

         win       0.89      0.89      0.89      1410
        lost       0.11      0.11      0.11       177

    accuracy                           0.80      1587
   macro avg       0.50      0.50      0.50      1587
weighted avg       0.80      0.80      0.80      1587



## XGBoost

### Training the XGBoost Classifier and finding a score

In [9]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

model_XGB = XGBClassifier()
model_XGB.fit(X_train, y_train)
# make predictions for test data
y_pred = model_XGB.predict(X_test)
predictions_XGB = [value for value in y_pred]

accuracy_XGB = accuracy_score(y_test, predictions_XGB)
print("Accuracy score for testing")
print(f"Testing Accuracy: {round(accuracy_XGB,4) * 100.0}%"  )

from sklearn.model_selection import cross_val_score
scores_XGB = cross_val_score(model_XGB, X_train, y_train, cv=5)
print('Cross Validation for Training')
print(f"Training Accuracy: {round(accuracy_XGB,4)*100}%")

Accuracy score for testing
Testing Accuracy: 90.42%
Cross Validation for Training
Training Accuracy: 90.42%


### XGBoost Classification Report

In [10]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions_XGB,
                            target_names=["won", "lost"]))

              precision    recall  f1-score   support

         win       0.90      1.00      0.95      1410
        lost       1.00      0.14      0.25       177

    accuracy                           0.90      1587
   macro avg       0.95      0.57      0.60      1587
weighted avg       0.91      0.90      0.87      1587



## Naive-Bayes Gaussian Classifier

### Training the Naive-Bayes Gaussian model and finding a score

In [12]:
from sklearn.naive_bayes import GaussianNB

model_nb = GaussianNB()
model_nb.fit(X_train, y_train)

train_score_nb = model_nb.score(X_train, y_train)
test_score_nb = model_nb.score(X_test, y_test)

predictions_nb = model_nb.predict(X_test)
model_nb.score(X_test, predictions_nb)

print(f'''
R\u00b2 Scores
-----------------------
Training Score: {100*round(train_score_nb,4)}%
Test Score:     {100*round(test_score_nb,4)}%
''')


R² Scores
-----------------------
Training Score: 87.35000000000001%
Test Score:     87.78%



### Naive-Bayes Classification Report

In [13]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions_nb,
                            target_names=["win", "lost"]))

              precision    recall  f1-score   support

         win       0.89      0.99      0.93      1410
        lost       0.16      0.02      0.04       177

    accuracy                           0.88      1587
   macro avg       0.52      0.50      0.49      1587
weighted avg       0.81      0.88      0.83      1587



Creating the DataFrame for the predicted data

In [15]:
test_copy = test.copy()

In [16]:
test_copy['won_SVM'] = predictions_svm

In [17]:
test_copy['won_XGBoost'] = predictions_XGB

In [18]:
test_copy['won_GaussianNB'] = predictions_nb

In [20]:
test_copy.to_csv('test_data_and_predictions.csv', index=False)

In [21]:
final_df = pd.read_csv('test_data_and_predictions.csv')

In [22]:
final_df.columns

Index(['racedater', 'tvgtrackcode', 'race', 'bettinginterestnumber',
       'horsename', 'morninglineodds', 'currentodds', 'tvghorseweight',
       'tvghorsedamsirename', 'tvghorseage', 'tvghorsesex', 'tvghorsedaysoff',
       'tvghorsenumberofwins', 'tvghorsenumberofstarts', 'tvghorsepowerrating',
       'tvghorseaveragespeed', 'tvghorseaverageclassrating', 'currentodds.1',
       'winpayout', 'placepayout', 'showpayout', 'scratched', 'won', 'won_SVM',
       'won_XGBoost', 'won_GaussianNB'],
      dtype='object')

In [23]:
cols = [
    'racedater',
    'race',
    'horsename',
    'tvghorseweight', 
    'tvghorseage', 
    'tvghorsedaysoff', 
    'tvghorsenumberofwins', 
    'tvghorsepowerrating', 
    'tvghorseaverageclassrating',
    'tvghorseaveragespeed',
    'won',
    'won_SVM',
    'won_XGBoost',
    'won_GaussianNB'
]

selected_final_df = final_df[cols]

won_svm = selected_final_df.loc[(selected_final_df.won == 1) & (selected_final_df.won_SVM == 1)]
won_xgboost = selected_final_df.loc[(selected_final_df.won == 1) & (selected_final_df.won_XGBoost == 1)]
won_gaussian = selected_final_df.loc[(selected_final_df.won == 1) & (selected_final_df.won_GaussianNB == 1)]

lost_svm = selected_final_df.loc[(selected_final_df.won == 0) & (selected_final_df.won_SVM == 0)]
lost_xgboost = selected_final_df.loc[(selected_final_df.won == 0) & (selected_final_df.won_XGBoost == 0)]
lost_gaussian = selected_final_df.loc[(selected_final_df.won == 0) & (selected_final_df.won_GaussianNB == 0)]

In [24]:
cols_rename = {
    'racedater':'Date',
    'race':'Race',
    'horsename':'Name',
    'tvghorseweight':'Weight', 
    'tvghorseage':'Age', 
    'tvghorsedaysoff':'Days off', 
    'tvghorsenumberofwins':'Number of Wins', 
    'tvghorsepowerrating':'Power Rating', 
    'tvghorseaverageclassrating':'Average Class Rating',
    'tvghorseaveragespeed':'Average Speed',
    'won':'Won',
    'won_SVM':'Won SVM',
    'won_XGBoost':'Won XGBoost',
    'won_GaussianNB':'Won GaussianNB'
}

In [25]:
won_svm.rename(columns = cols_rename, inplace=True)
won_svm

Unnamed: 0,Date,Race,Name,Weight,Age,Days off,Number of Wins,Power Rating,Average Class Rating,Average Speed,Won,Won SVM,Won XGBoost,Won GaussianNB
77,2020-06-20,9,factor this,122,5,90,7,111.9,108,102,1,1,1,0
172,2020-06-26,8,spectacular gem,122,4,131,3,99.9,102,93,1,1,0,0
353,2020-06-07,7,lead guitar,123,4,232,1,84.2,86,89,1,1,1,0
482,2020-06-14,5,chaos theory,121,5,119,3,101.4,107,101,1,1,0,0
505,2020-06-14,8,catch a bid,122,4,239,2,94.0,96,98,1,1,0,0
526,2020-05-31,8,crystal cliffs (fr),118,3,167,0,104.2,91,97,1,1,1,0
592,2020-06-14,10,duplicity,118,3,85,0,109.0,95,100,1,1,1,0
654,2020-06-27,9,midnight bisou,124,5,118,11,107.2,112,111,1,1,1,1
661,2020-06-27,10,tom's d'etat,124,7,76,10,111.2,115,109,1,1,1,1
867,2020-06-13,11,admission office,120,5,34,3,106.9,114,105,1,1,0,0


In [26]:
won_xgboost.rename(columns = cols_rename, inplace=True)
won_xgboost

Unnamed: 0,Date,Race,Name,Weight,Age,Days off,Number of Wins,Power Rating,Average Class Rating,Average Speed,Won,Won SVM,Won XGBoost,Won GaussianNB
66,2020-06-20,8,union maiden,118,3,25,0,66.4,82,76,1,0,1,0
77,2020-06-20,9,factor this,122,5,90,7,111.9,108,102,1,1,1,0
290,2020-06-11,9,tizahra,125,4,73,0,88.3,83,71,1,0,1,0
326,2020-06-07,4,quiet company,125,4,29,0,96.7,83,69,1,0,1,0
353,2020-06-07,7,lead guitar,123,4,232,1,84.2,86,89,1,1,1,0
487,2020-06-14,6,guska mon shoes,118,10,15,11,74.4,96,85,1,0,1,1
526,2020-05-31,8,crystal cliffs (fr),118,3,167,0,104.2,91,97,1,1,1,0
539,2020-05-31,9,royal flag,122,4,113,2,102.3,87,94,1,0,1,0
592,2020-06-14,10,duplicity,118,3,85,0,109.0,95,100,1,1,1,0
599,2020-06-27,2,frost or frippery,123,7,36,12,92.0,95,92,1,0,1,1


In [27]:
won_gaussian.rename(columns = cols_rename, inplace=True)
won_gaussian

Unnamed: 0,Date,Race,Name,Weight,Age,Days off,Number of Wins,Power Rating,Average Class Rating,Average Speed,Won,Won SVM,Won XGBoost,Won GaussianNB
487,2020-06-14,6,guska mon shoes,118,10,15,11,74.4,96,85,1,0,1,1
599,2020-06-27,2,frost or frippery,123,7,36,12,92.0,95,92,1,0,1,1
654,2020-06-27,9,midnight bisou,124,5,118,11,107.2,112,111,1,1,1,1
661,2020-06-27,10,tom's d'etat,124,7,76,10,111.2,115,109,1,1,1,1


In [28]:
won_svm.to_csv('won_svm.csv', index=False)

In [29]:
won_xgboost.to_csv('won_xgboost.csv', index=False)

In [30]:
won_gaussian.to_csv('won_gaussian.csv', index=False)

In [31]:
lost_svm.rename(columns = cols_rename, inplace=True)
lost_svm

Unnamed: 0,Date,Race,Name,Weight,Age,Days off,Number of Wins,Power Rating,Average Class Rating,Average Speed,Won,Won SVM,Won XGBoost,Won GaussianNB
0,2020-06-20,1,epic west,118,3,160,0,27.7,92,1,0,0,0,0
1,2020-06-20,1,spelling bee,118,3,28,0,68.4,78,69,0,0,0,0
3,2020-06-20,1,ugo,118,3,13,0,66.8,84,64,0,0,0,0
4,2020-06-20,1,conquistador show,118,3,105,0,65.8,81,65,0,0,0,0
5,2020-06-20,1,suspect,124,4,20,0,72.8,86,64,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1581,2020-05-29,9,uncle momo,123,4,54,1,77.3,87,69,0,0,0,0
1582,2020-05-29,9,bebop shoes,123,5,26,1,88.6,96,83,0,0,0,0
1583,2020-05-29,9,irish spirit,123,5,24,0,78.4,97,89,0,0,0,0
1584,2020-05-29,9,alex's strike,123,4,35,1,84.4,87,80,0,0,0,0


In [32]:
lost_svm.to_csv('lost_svm.csv', index=False)

In [33]:
lost_xgboost.rename(columns = cols_rename, inplace=True)
lost_xgboost

Unnamed: 0,Date,Race,Name,Weight,Age,Days off,Number of Wins,Power Rating,Average Class Rating,Average Speed,Won,Won SVM,Won XGBoost,Won GaussianNB
0,2020-06-20,1,epic west,118,3,160,0,27.7,92,1,0,0,0,0
1,2020-06-20,1,spelling bee,118,3,28,0,68.4,78,69,0,0,0,0
3,2020-06-20,1,ugo,118,3,13,0,66.8,84,64,0,0,0,0
4,2020-06-20,1,conquistador show,118,3,105,0,65.8,81,65,0,0,0,0
5,2020-06-20,1,suspect,124,4,20,0,72.8,86,64,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1581,2020-05-29,9,uncle momo,123,4,54,1,77.3,87,69,0,0,0,0
1582,2020-05-29,9,bebop shoes,123,5,26,1,88.6,96,83,0,0,0,0
1583,2020-05-29,9,irish spirit,123,5,24,0,78.4,97,89,0,0,0,0
1584,2020-05-29,9,alex's strike,123,4,35,1,84.4,87,80,0,0,0,0


In [34]:
lost_xgboost.to_csv('lost_xgboost.csv', index=False)

In [35]:
lost_gaussian.rename(columns = cols_rename, inplace=True)
lost_gaussian

Unnamed: 0,Date,Race,Name,Weight,Age,Days off,Number of Wins,Power Rating,Average Class Rating,Average Speed,Won,Won SVM,Won XGBoost,Won GaussianNB
0,2020-06-20,1,epic west,118,3,160,0,27.7,92,1,0,0,0,0
1,2020-06-20,1,spelling bee,118,3,28,0,68.4,78,69,0,0,0,0
3,2020-06-20,1,ugo,118,3,13,0,66.8,84,64,0,0,0,0
4,2020-06-20,1,conquistador show,118,3,105,0,65.8,81,65,0,0,0,0
5,2020-06-20,1,suspect,124,4,20,0,72.8,86,64,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1581,2020-05-29,9,uncle momo,123,4,54,1,77.3,87,69,0,0,0,0
1582,2020-05-29,9,bebop shoes,123,5,26,1,88.6,96,83,0,0,0,0
1583,2020-05-29,9,irish spirit,123,5,24,0,78.4,97,89,0,0,0,0
1584,2020-05-29,9,alex's strike,123,4,35,1,84.4,87,80,0,0,0,0


In [36]:
lost_gaussian.to_csv('lost_gaussian.csv', index=False)