In [12]:
import pandas as pd
import numpy as np
from pprint import pprint
from sqlalchemy import create_engine
from keys import sqlkey
from imblearn.over_sampling import SMOTE
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.linear_model import LogisticRegression
import joblib



In [13]:
#CREATE DATABASE CONNECTION
engine = create_engine('postgresql://postgres:'+sqlkey+'@localhost:5432/horse_races')
connection = engine.connect()

In [14]:
data_df = pd.read_sql(sql='select * from best_ranked_data',con=connection)

In [15]:
connection.close()

In [16]:
data_df

Unnamed: 0,race_id,horse_id,won,distance,race_class,sec_time1,sec_time2,sec_time3,sec_time4,ldr_time1,...,behind_sec1,behind_sec2,behind_sec3,behind_sec4,time1,time2,time3,time4,win_odds,place_odds


In [8]:
data_df = data_df.dropna()

In [9]:
data_df

Unnamed: 0,race_id,horse_id,won,distance,race_class,sec_time1,sec_time2,sec_time3,sec_time4,ldr_time1,...,behind_sec1,behind_sec2,behind_sec3,behind_sec4,time1,time2,time3,time4,win_odds,place_odds


In [11]:
from sklearn.model_selection import train_test_split

X_smote = data_df.drop(columns=["won"])

y_smote = data_df["won"]


X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X_smote, y_smote, random_state=1, stratify = y_smote)

ValueError: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [11]:

X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy=1.0).fit_resample(X_train_smote, y_train_smote)


model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_resampled, y_resampled)
y_pred = model.predict(X_test_smote)
balanced_accuracy_score(y_test_smote, y_pred)
confusion_matrix(y_test_smote, y_pred)
print(classification_report_imbalanced(y_test_smote, y_pred))


                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      1.00      1.00      1.00      1.00     10164
          1       0.97      1.00      1.00      0.98      1.00      1.00       868

avg / total       1.00      1.00      1.00      1.00      1.00      1.00     11032



In [12]:
# X_train_best.head(50)

X_train_smote.to_csv("X_train_smote.csv")
y_train_smote.to_csv("y_train_smote.csv")

X_test_smote.to_csv("X_test_smote.csv")
y_test_smote.to_csv("y_test_smote.csv")

In [13]:
# print(f"Training Data Score: {model.score(X_train_scaled_best, y_train_best)}")
# print(f"Testing Data Score: {model.score(X_test_scaled_best, y_test_best)}")
# print(f"for one single line:{model.predict(X_train_scaled_best[12].reshape(1,-1))}")

In [14]:
model.coef_

array([[ 1.07186982e-06, -1.71315765e-05, -1.02404674e-03,
        -5.26603009e-02, -1.44194656e-01, -2.72862448e-01,
        -4.25629248e-01,  3.27668214e+00, -1.44194656e-01,
        -4.17057104e-01, -8.42686352e-01,  2.43399579e+00,
        -3.07767670e+01,  1.93379453e-02, -5.58264656e-02,
        -8.46558807e-01,  1.21059263e+00, -8.76090813e-01,
        -8.21911015e-01, -1.17268034e+00, -5.54683350e+00,
        -1.14645333e-02,  4.18957671e-02]])

In [15]:
sorted(zip(model.coef_[0], np.asarray(X_smote.columns)), reverse=True)

[(3.2766821391032104, 'sec_time4'),
 (2.433995787070235, 'ldr_time4'),
 (1.210592627785416, 'behind_sec4'),
 (0.04189576710471376, 'place_odds'),
 (0.01933794528992767, 'behind_sec1'),
 (1.0718698194407978e-06, 'race_id'),
 (-1.713157645996128e-05, 'horse_id'),
 (-0.001024046737486871, 'distance'),
 (-0.011464533336377898, 'win_odds'),
 (-0.0526603009345888, 'race_class'),
 (-0.05582646560813039, 'behind_sec2'),
 (-0.1441946562200837, 'sec_time1'),
 (-0.1441946562200837, 'ldr_time1'),
 (-0.2728624481339311, 'sec_time2'),
 (-0.41705710430456594, 'ldr_time2'),
 (-0.4256292477312175, 'sec_time3'),
 (-0.8219110146459716, 'time2'),
 (-0.8426863520943787, 'ldr_time3'),
 (-0.8465588065971713, 'behind_sec3'),
 (-0.876090812636561, 'time1'),
 (-1.1726803389941436, 'time3'),
 (-5.5468334974224405, 'time4'),
 (-30.776766984984356, 'lengths_behind')]

In [None]:
# i = 0
# while i < 5000:
#     print(f"{i}for one single line:{horse_model_best.predict(X_train_scaled_best[i].reshape(1,-1))}")
#     i+=1

In [None]:

# X_train_smote
# X_train_best.columns

In [16]:
filename = 'horse_model_smote.sav'
joblib.dump(model, filename)

['horse_model_smote.sav']

In [17]:
model = joblib.load('horse_model_smote.sav')

In [18]:
Xtest = pd.DataFrame([[4871,1619,1800,3,14.05,21.86,24.17,24.31,14.05,35.91,60.08,84.39,0,1.75,4.25,3.5,1,14.33,22.26,24.05,23.91,4.9,1.0
]])



In [19]:
print(f"for one single line:{model.predict(Xtest)}")



for one single line:[1]


In [20]:
#CREATE DATABASE CONNECTION ONLY DO THESE ONCE EVEN IF RUNNING MODEL AGAIN...
engine = create_engine('postgresql://postgres:'+sqlkey+'@localhost:5432/horse_races')
connection = engine.connect()

In [21]:
#Need to push data being used for tableau
#PUSH DATAFRAME TO POSTGRESQL ONLY PUSH ONCE
data_df.to_sql(name='best_data_set', con=connection, if_exists='append', index=False)

In [22]:
connection.close()