# `logistic_regression_without_race_without_weather_pairwise_winners.ipynb`

### Author: Anthony Hein

#### Last updated: 11/20/2021

# Overview:

Use the datasets
* `X_train_preprocess_without_race.csv`
* `X_dev_preprocess_without_race.csv`
* `X_test_preprocess_without_race.csv`

and the targets in
* `X_train_pairwise_winner_labels.csv`
* `X_dev_pairwise_winner_labels.csv`
* `X_test_pairwise_winner_labels.csv`

to make a logistic regression model that tries to predict the pairwise winner between each pair of runners.

---

## Setup

In [1]:
from datetime import datetime
import git
import os
from typing import List
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
BASE_DIR = git.Repo(os.getcwd(), search_parent_directories=True).working_dir
BASE_DIR

'/Users/anthonyhein/Desktop/SML310/project'

---

## Remove Features that Involve Weather

In [3]:
drop_cols = [
    'horse1_jockey_prev_1_position_temp',
    'horse1_jockey_prev_2_position_temp',
    'horse1_jockey_prev_3_position_temp',
    'horse1_jockey_prev_1_finishing_time_ratio_temp',
    'horse1_jockey_prev_2_finishing_time_ratio_temp',
    'horse1_jockey_prev_3_finishing_time_ratio_temp',
    'horse1_jockey_prev_1_position_msl',
    'horse1_jockey_prev_2_position_msl',
    'horse1_jockey_prev_3_position_msl',
    'horse1_jockey_prev_1_finishing_time_ratio_msl',
    'horse1_jockey_prev_2_finishing_time_ratio_msl',
    'horse1_jockey_prev_3_finishing_time_ratio_msl',
    'horse1_jockey_prev_1_position_rain',
    'horse1_jockey_prev_2_position_rain',
    'horse1_jockey_prev_3_position_rain',
    'horse1_jockey_prev_1_finishing_time_ratio_rain',
    'horse1_jockey_prev_2_finishing_time_ratio_rain',
    'horse1_jockey_prev_3_finishing_time_ratio_rain',
    'horse1_jockey_prev_1_position_rhum',
    'horse1_jockey_prev_2_position_rhum',
    'horse1_jockey_prev_3_position_rhum',
    'horse1_jockey_prev_1_finishing_time_ratio_rhum',
    'horse1_jockey_prev_2_finishing_time_ratio_rhum',
    'horse1_jockey_prev_3_finishing_time_ratio_rhum',
    'horse2_jockey_prev_1_position_temp',
    'horse2_jockey_prev_2_position_temp',
    'horse2_jockey_prev_3_position_temp',
    'horse2_jockey_prev_1_finishing_time_ratio_temp',
    'horse2_jockey_prev_2_finishing_time_ratio_temp',
    'horse2_jockey_prev_3_finishing_time_ratio_temp',
    'horse2_jockey_prev_1_position_msl',
    'horse2_jockey_prev_2_position_msl',
    'horse2_jockey_prev_3_position_msl',
    'horse2_jockey_prev_1_finishing_time_ratio_msl',
    'horse2_jockey_prev_2_finishing_time_ratio_msl',
    'horse2_jockey_prev_3_finishing_time_ratio_msl',
    'horse2_jockey_prev_1_position_rain',
    'horse2_jockey_prev_2_position_rain',
    'horse2_jockey_prev_3_position_rain',
    'horse2_jockey_prev_1_finishing_time_ratio_rain',
    'horse2_jockey_prev_2_finishing_time_ratio_rain',
    'horse2_jockey_prev_3_finishing_time_ratio_rain',
    'horse2_jockey_prev_1_position_rhum',
    'horse2_jockey_prev_2_position_rhum',
    'horse2_jockey_prev_3_position_rhum',
    'horse2_jockey_prev_1_finishing_time_ratio_rhum',
    'horse2_jockey_prev_2_finishing_time_ratio_rhum',
    'horse2_jockey_prev_3_finishing_time_ratio_rhum'
]

---

## Load `X_train_preprocess_without_race.csv`

In [4]:
X_train = pd.read_csv(f"{BASE_DIR}/data/analysis/X_train_preprocess_without_race.csv", low_memory=False)
X_train.head()

Unnamed: 0,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_outHandicap,horse1_RPR,horse1_weight,horse1_jockey_d_last_race,horse1_jockey_d_first_race,horse1_jockey_prev_1_position,...,horse2_jockey_prev_3_position_rain,horse2_jockey_prev_1_finishing_time_ratio_rain,horse2_jockey_prev_2_finishing_time_ratio_rain,horse2_jockey_prev_3_finishing_time_ratio_rain,horse2_jockey_prev_1_position_rhum,horse2_jockey_prev_2_position_rhum,horse2_jockey_prev_3_position_rhum,horse2_jockey_prev_1_finishing_time_ratio_rhum,horse2_jockey_prev_2_finishing_time_ratio_rhum,horse2_jockey_prev_3_finishing_time_ratio_rhum
0,0.2,0.230769,0.19697,0.0,0.0,0.73125,0.692308,0.007631,0.236544,0.1,...,0.05,0.805104,0.780139,0.783438,0.1,0.025,0.025,0.805104,0.799642,0.780139
1,0.4,0.038462,0.072222,0.0,0.0,0.73125,0.74359,0.00505,0.236544,0.1,...,0.1,0.801416,0.781238,0.79416,0.075,0.05,0.075,0.801416,0.800487,0.785703
2,0.133333,0.423077,0.098485,0.0,0.0,0.65,0.705128,0.00074,0.237283,0.025,...,0.1,0.799909,0.780139,0.786816,0.025,0.1,0.025,0.799642,0.806486,0.780139
3,0.133333,0.423077,0.098485,0.0,0.0,0.65,0.705128,0.00074,0.237283,0.025,...,0.075,0.800176,0.781623,0.78187,0.05,0.075,0.05,0.801163,0.801416,0.780963
4,0.133333,0.192308,0.090278,0.0,0.0,0.6625,0.730769,0.00074,0.234655,0.05,...,0.1,0.799642,0.783601,0.785468,0.125,0.1,0.025,0.803191,0.805104,0.780139


In [5]:
X_train.shape

(800666, 144)

In [6]:
X_train = X_train.drop(columns=drop_cols)
X_train.shape

(800666, 96)

---

## Load `X_dev_preprocess_without_race.csv`

In [7]:
X_dev = pd.read_csv(f"{BASE_DIR}/data/analysis/X_dev_preprocess_without_race.csv", low_memory=False)
X_dev.head()

Unnamed: 0,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_outHandicap,horse1_RPR,horse1_weight,horse1_jockey_d_last_race,horse1_jockey_d_first_race,horse1_jockey_prev_1_position,...,horse2_jockey_prev_3_position_rain,horse2_jockey_prev_1_finishing_time_ratio_rain,horse2_jockey_prev_2_finishing_time_ratio_rain,horse2_jockey_prev_3_finishing_time_ratio_rain,horse2_jockey_prev_1_position_rhum,horse2_jockey_prev_2_position_rhum,horse2_jockey_prev_3_position_rhum,horse2_jockey_prev_1_finishing_time_ratio_rhum,horse2_jockey_prev_2_finishing_time_ratio_rhum,horse2_jockey_prev_3_finishing_time_ratio_rhum
0,0.285714,0.105263,0.2,0.0,0.0,0.608392,0.769231,0.001516,0.629407,0.25,...,0.2,0.828972,0.837641,0.843255,0.075,0.05,0.275,0.834583,0.828972,0.853796
1,0.285714,0.210526,0.157143,0.0,0.0,0.258741,0.769231,0.000445,0.286087,0.075,...,0.075,0.826039,0.826039,0.842799,0.025,0.025,0.075,0.826039,0.826039,0.842799
2,0.285714,0.210526,0.157143,0.0,0.0,0.258741,0.769231,0.000445,0.286087,0.075,...,0.125,0.826413,0.841758,0.844367,0.125,0.05,0.275,0.843955,0.826413,0.841758
3,0.285714,0.210526,0.157143,0.0,0.0,0.258741,0.769231,0.000445,0.286087,0.075,...,0.05,0.832401,0.832114,0.828366,0.1,0.075,0.05,0.832401,0.837064,0.828519
4,0.285714,0.210526,0.157143,0.0,0.0,0.258741,0.769231,0.000445,0.286087,0.075,...,0.175,0.857255,0.835266,0.834946,0.15,0.025,0.2,0.857255,0.826039,0.837392


In [8]:
X_dev.shape

(228766, 144)

In [9]:
X_dev = X_dev.drop(columns=drop_cols)
X_dev.shape

(228766, 96)

---

## Load `X_test_preprocess_without_race.csv`

In [10]:
X_test = pd.read_csv(f"{BASE_DIR}/data/analysis/X_test_preprocess_without_race.csv", low_memory=False)
X_test.head()

Unnamed: 0,horse1_age,horse1_saddle,horse1_decimalPrice,horse1_isFav,horse1_outHandicap,horse1_RPR,horse1_weight,horse1_jockey_d_last_race,horse1_jockey_d_first_race,horse1_jockey_prev_1_position,...,horse2_jockey_prev_3_position_rain,horse2_jockey_prev_1_finishing_time_ratio_rain,horse2_jockey_prev_2_finishing_time_ratio_rain,horse2_jockey_prev_3_finishing_time_ratio_rain,horse2_jockey_prev_1_position_rhum,horse2_jockey_prev_2_position_rhum,horse2_jockey_prev_3_position_rhum,horse2_jockey_prev_1_finishing_time_ratio_rhum,horse2_jockey_prev_2_finishing_time_ratio_rhum,horse2_jockey_prev_3_finishing_time_ratio_rhum
0,0.230769,0.333333,0.03268,0.0,0.0,0.344828,0.74359,0.001053,0.49501,0.225,...,0.2,0.865544,0.847264,0.853154,0.025,0.2,0.3,0.840533,0.850856,0.864844
1,0.230769,0.333333,0.03268,0.0,0.0,0.344828,0.74359,0.001053,0.49501,0.225,...,0.15,0.864781,0.840533,0.855519,0.025,0.025,0.025,0.840533,0.840533,0.840533
2,0.230769,0.333333,0.03268,0.0,0.0,0.344828,0.74359,0.001053,0.49501,0.225,...,0.075,0.838534,0.843337,0.846422,0.2,0.1,0.125,0.853412,0.842583,0.847859
3,0.230769,0.333333,0.03268,0.0,0.0,0.344828,0.74359,0.001053,0.49501,0.225,...,0.25,0.851685,0.849673,0.854624,0.15,0.25,0.2,0.849673,0.854624,0.858051
4,0.230769,0.333333,0.03268,0.0,0.0,0.344828,0.74359,0.001053,0.49501,0.225,...,0.25,0.847892,0.842776,0.862423,0.175,0.075,0.275,0.853634,0.844529,0.862845


In [11]:
X_test.shape

(114392, 144)

In [12]:
X_test = X_test.drop(columns=drop_cols)
X_test.shape

(114392, 96)

---

## Load Pairwise Winner Labels

In [13]:
X_train_pairwise_winner_labels = np.loadtxt(f"{BASE_DIR}/data/analysis/x_train_pairwise_winner_labels.csv",
                                            dtype=int,
                                            delimiter=',')

In [14]:
X_dev_pairwise_winner_labels = np.loadtxt(f"{BASE_DIR}/data/analysis/X_dev_pairwise_winner_labels.csv",
                                          dtype=int,
                                          delimiter=',')

In [15]:
X_test_pairwise_winner_labels = np.loadtxt(f"{BASE_DIR}/data/analysis/X_test_pairwise_winner_labels.csv",
                                           dtype=int,
                                           delimiter=',')

---

## Logistic Regression Model

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import PredefinedSplit, GridSearchCV

## Fixing Random Seed for Reproducibility
np.random.seed(0)

In [17]:
## Hyperparameter Space
space = {
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    'penalty': ['none', 'l1', 'l2', 'elasticnet'],
    'C': [0.01, 0.1, 10, 100],
}

In [18]:
X_train_plus_dev = pd.concat([X_train, X_dev])
X_train_plus_dev.shape

(1029432, 96)

In [19]:
X_train_plus_dev_pairwise_winner_labels = np.concatenate(
    (X_train_pairwise_winner_labels, X_dev_pairwise_winner_labels)
)
X_train_plus_dev_pairwise_winner_labels

array([1, 0, 1, ..., 1, 1, 1])

In [20]:
len(X_train), len(X_dev)

(800666, 228766)

In [21]:
test_fold = np.concatenate(
    (np.full(len(X_train), -1),
     np.zeros(len(X_dev)))
)
test_fold

array([-1., -1., -1., ...,  0.,  0.,  0.])

In [22]:
## Predefined Split
ps = PredefinedSplit(test_fold)
ps

PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0]))

In [23]:
lr = LogisticRegression(max_iter=1000)

In [24]:
lr = GridSearchCV(lr, space, verbose=25, refit=False, cv=ps)

In [25]:
search = lr.fit(X_train_plus_dev.to_numpy(), X_train_plus_dev_pairwise_winner_labels)

Fitting 1 folds for each of 48 candidates, totalling 48 fits
[CV 1/1; 1/48] START C=0.01, penalty=none, solver=newton-cg.....................




[CV 1/1; 1/48] END C=0.01, penalty=none, solver=newton-cg;, score=0.943 total time= 1.4min
[CV 1/1; 2/48] START C=0.01, penalty=none, solver=lbfgs.........................




[CV 1/1; 2/48] END C=0.01, penalty=none, solver=lbfgs;, score=0.943 total time=  58.5s
[CV 1/1; 3/48] START C=0.01, penalty=none, solver=liblinear.....................
[CV 1/1; 3/48] END C=0.01, penalty=none, solver=liblinear;, score=nan total time=   0.7s
[CV 1/1; 4/48] START C=0.01, penalty=l1, solver=newton-cg.......................
[CV 1/1; 4/48] END C=0.01, penalty=l1, solver=newton-cg;, score=nan total time=   0.5s
[CV 1/1; 5/48] START C=0.01, penalty=l1, solver=lbfgs...........................
[CV 1/1; 5/48] END C=0.01, penalty=l1, solver=lbfgs;, score=nan total time=   0.6s
[CV 1/1; 6/48] START C=0.01, penalty=l1, solver=liblinear.......................
[CV 1/1; 6/48] END C=0.01, penalty=l1, solver=liblinear;, score=0.942 total time= 7.8min
[CV 1/1; 7/48] START C=0.01, penalty=l2, solver=newton-cg.......................
[CV 1/1; 7/48] END C=0.01, penalty=l2, solver=newton-cg;, score=0.926 total time=  27.9s
[CV 1/1; 8/48] START C=0.01, penalty=l2, solver=lbfgs..................



[CV 1/1; 13/48] END C=0.1, penalty=none, solver=newton-cg;, score=0.943 total time= 1.2min
[CV 1/1; 14/48] START C=0.1, penalty=none, solver=lbfgs.........................




[CV 1/1; 14/48] END C=0.1, penalty=none, solver=lbfgs;, score=0.943 total time=  48.5s
[CV 1/1; 15/48] START C=0.1, penalty=none, solver=liblinear.....................
[CV 1/1; 15/48] END C=0.1, penalty=none, solver=liblinear;, score=nan total time=   0.5s
[CV 1/1; 16/48] START C=0.1, penalty=l1, solver=newton-cg.......................
[CV 1/1; 16/48] END C=0.1, penalty=l1, solver=newton-cg;, score=nan total time=   0.5s
[CV 1/1; 17/48] START C=0.1, penalty=l1, solver=lbfgs...........................
[CV 1/1; 17/48] END C=0.1, penalty=l1, solver=lbfgs;, score=nan total time=   0.5s
[CV 1/1; 18/48] START C=0.1, penalty=l1, solver=liblinear.......................
[CV 1/1; 18/48] END C=0.1, penalty=l1, solver=liblinear;, score=0.943 total time=27.9min
[CV 1/1; 19/48] START C=0.1, penalty=l2, solver=newton-cg.......................
[CV 1/1; 19/48] END C=0.1, penalty=l2, solver=newton-cg;, score=0.939 total time=  39.0s
[CV 1/1; 20/48] START C=0.1, penalty=l2, solver=lbfgs..................



[CV 1/1; 25/48] END C=10, penalty=none, solver=newton-cg;, score=0.943 total time= 1.1min
[CV 1/1; 26/48] START C=10, penalty=none, solver=lbfgs..........................




[CV 1/1; 26/48] END C=10, penalty=none, solver=lbfgs;, score=0.943 total time=  53.0s
[CV 1/1; 27/48] START C=10, penalty=none, solver=liblinear......................
[CV 1/1; 27/48] END C=10, penalty=none, solver=liblinear;, score=nan total time=   0.5s
[CV 1/1; 28/48] START C=10, penalty=l1, solver=newton-cg........................
[CV 1/1; 28/48] END C=10, penalty=l1, solver=newton-cg;, score=nan total time=   0.5s
[CV 1/1; 29/48] START C=10, penalty=l1, solver=lbfgs............................
[CV 1/1; 29/48] END C=10, penalty=l1, solver=lbfgs;, score=nan total time=   0.5s
[CV 1/1; 30/48] START C=10, penalty=l1, solver=liblinear........................
[CV 1/1; 30/48] END C=10, penalty=l1, solver=liblinear;, score=0.943 total time=84.7min
[CV 1/1; 31/48] START C=10, penalty=l2, solver=newton-cg........................
[CV 1/1; 31/48] END C=10, penalty=l2, solver=newton-cg;, score=0.943 total time= 1.0min
[CV 1/1; 32/48] START C=10, penalty=l2, solver=lbfgs.........................



[CV 1/1; 37/48] END C=100, penalty=none, solver=newton-cg;, score=0.943 total time= 1.2min
[CV 1/1; 38/48] START C=100, penalty=none, solver=lbfgs.........................




[CV 1/1; 38/48] END C=100, penalty=none, solver=lbfgs;, score=0.943 total time=  48.0s
[CV 1/1; 39/48] START C=100, penalty=none, solver=liblinear.....................
[CV 1/1; 39/48] END C=100, penalty=none, solver=liblinear;, score=nan total time=   0.5s
[CV 1/1; 40/48] START C=100, penalty=l1, solver=newton-cg.......................
[CV 1/1; 40/48] END C=100, penalty=l1, solver=newton-cg;, score=nan total time=   0.5s
[CV 1/1; 41/48] START C=100, penalty=l1, solver=lbfgs...........................
[CV 1/1; 41/48] END C=100, penalty=l1, solver=lbfgs;, score=nan total time=   0.5s
[CV 1/1; 42/48] START C=100, penalty=l1, solver=liblinear.......................
[CV 1/1; 42/48] END C=100, penalty=l1, solver=liblinear;, score=0.943 total time=88.2min
[CV 1/1; 43/48] START C=100, penalty=l2, solver=newton-cg.......................
[CV 1/1; 43/48] END C=100, penalty=l2, solver=newton-cg;, score=0.943 total time= 1.1min
[CV 1/1; 44/48] START C=100, penalty=l2, solver=lbfgs..................

24 fits failed out of a total of 48.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 464, in _check_solver
    raise ValueError("penalty='none' is not supported for the liblinear solver")
ValueError: penalty='none' is not supported for the liblinear so

In [26]:
print(f"Best Score: {search.best_score_}")
print(f"Best Hyperparameters: {search.best_params_}")

Best Score: 0.9430728342498448
Best Hyperparameters: {'C': 0.01, 'penalty': 'none', 'solver': 'lbfgs'}


In [27]:
pd.DataFrame(search.cv_results_).sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty,param_solver,params,split0_test_score,mean_test_score,std_test_score,rank_test_score
1,58.446221,0.0,0.048647,0.0,0.01,none,lbfgs,"{'C': 0.01, 'penalty': 'none', 'solver': 'lbfgs'}",0.943073,0.943073,0.0,1
25,52.934558,0.0,0.056634,0.0,10.0,none,lbfgs,"{'C': 10, 'penalty': 'none', 'solver': 'lbfgs'}",0.943073,0.943073,0.0,1
37,47.96895,0.0,0.054066,0.0,100.0,none,lbfgs,"{'C': 100, 'penalty': 'none', 'solver': 'lbfgs'}",0.943073,0.943073,0.0,1
13,48.460661,0.0,0.052436,0.0,0.1,none,lbfgs,"{'C': 0.1, 'penalty': 'none', 'solver': 'lbfgs'}",0.943073,0.943073,0.0,1
0,86.506494,0.0,0.243583,0.0,0.01,none,newton-cg,"{'C': 0.01, 'penalty': 'none', 'solver': 'newt...",0.943047,0.943047,0.0,5
24,65.907702,0.0,0.056217,0.0,10.0,none,newton-cg,"{'C': 10, 'penalty': 'none', 'solver': 'newton...",0.943047,0.943047,0.0,5
36,71.469281,0.0,0.054061,0.0,100.0,none,newton-cg,"{'C': 100, 'penalty': 'none', 'solver': 'newto...",0.943047,0.943047,0.0,5
12,69.134254,0.0,0.048242,0.0,0.1,none,newton-cg,"{'C': 0.1, 'penalty': 'none', 'solver': 'newto...",0.943047,0.943047,0.0,5
44,46.755374,0.0,0.061386,0.0,100.0,l2,liblinear,"{'C': 100, 'penalty': 'l2', 'solver': 'libline...",0.943042,0.943042,0.0,9
29,5082.06063,0.0,0.510089,0.0,10.0,l1,liblinear,"{'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}",0.943029,0.943029,0.0,10


In [28]:
pd.DataFrame(search.cv_results_).to_csv(
    f"{BASE_DIR}/data/analysis/logistic_regression/without_race_without_weather.csv", index=False
)

In [29]:
import pickle

lr = LogisticRegression(
    max_iter=1000,
    C=100,
    penalty='none',
    solver='lbfgs',
).fit(X_train.to_numpy(), X_train_pairwise_winner_labels)

with open(f"{BASE_DIR}/data/analysis/logistic_regression/without_race_without_weather.pkl",'wb') as f:
    pickle.dump(lr, f)



---