In [14]:
# kNN model
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine

In [2]:
# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

In [3]:
# SQL setup
# create_engine is imported in the notebook's top import cell.
# Connect to the local SQLite file boxscores.db; the three-slash URL form is a
# path relative to the notebook's working directory.
engine = create_engine("sqlite:///boxscores.db")

In [4]:
# Read the tables named '2020', '2021' and '2022' through the engine,
# one DataFrame per table, in that order.
df_2020, df_2021, df_2022 = (
    pd.read_sql(table_name, engine) for table_name in ('2020', '2021', '2022')
)

In [5]:
# Stack the three seasons into one frame.
all_years = [df_2020, df_2021, df_2022]
# ignore_index=True gives the result a fresh 0..n-1 index. Without it the row
# labels of each input frame are carried over, so labels can repeat across
# seasons and label-based lookups (df.loc[i]) may return several rows.
df = pd.concat(all_years, ignore_index=True)

In [24]:
# Select new group of features
# model 9 batting only
X = df[['BA_RISP', 'HR_b', 'RBI', 'BA', 'OBP', 'SLG', 'OPS', 'Pit_b', 'RE24_b']]
y = df['Won']

# Train / test / validation split: 20% is held out as the test set, then 25% of
# the remaining 80% becomes the validation set (0.25 * 0.8 = 0.2), i.e. 60/20/20.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Scaling data: the scaler is fitted on the training rows only, and the same
# fitted transform is then applied to all three splits.
stand = StandardScaler()
Fit = stand.fit(X_train)
X_train_scaled = Fit.transform(X_train)
X_test_scaled = Fit.transform(X_test)
X_val_scaled = Fit.transform(X_val)

# Neighbour counts to evaluate. For a sweep over odd k from 1 to 99, use
# list(range(1, 100, 2)) here instead.
# NOTE(review): if k was chosen by comparing the test scores printed by such a
# sweep, the test metrics below are optimistic; compare on X_val_scaled / y_val
# instead. Confirm how k was selected.
k_values = [33]
for k in k_values:
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train_scaled, y_train)
    y_pred_train = neigh.predict(X_train_scaled)
    y_pred_test = neigh.predict(X_test_scaled)

    # Accuracy is computed from the predictions already made. neigh.score()
    # returns the same metric but runs predict() a second time.
    train_acc_score = accuracy_score(y_train, y_pred_train)
    train_prec_score = precision_score(y_train, y_pred_train)
    train_recall_score = recall_score(y_train, y_pred_train)
    confusion_matrix_train = confusion_matrix(y_train, y_pred_train)

    test_acc_score = accuracy_score(y_test, y_pred_test)
    test_prec_score = precision_score(y_test, y_pred_test)
    test_recall_score = recall_score(y_test, y_pred_test)
    confusion_matrix_test = confusion_matrix(y_test, y_pred_test)

    print(f'K= {k}')
    print(f'Train Acc Score= {train_acc_score}')
    print(f'Train Precision Score= {train_prec_score}')
    print(f'Train Recall Score= {train_recall_score}')
    print(confusion_matrix_train)
    print(f'Test Acc Score= {test_acc_score}')
    print(f'Test Precision Score= {test_prec_score}')
    print(f'Test Recall Score= {test_recall_score}')
    print(confusion_matrix_test)
    print('')




K= 33
Train Acc Score= 0.7875252088735235
Train Precision Score= 0.7915697674418605
Train Recall Score= 0.7822464808962941
[[2744  717]
 [ 758 2723]]
Test Acc Score= 0.783923941227312
Test Precision Score= 0.7914338919925512
Test Recall Score= 0.7548845470692718
[[964 224]
 [276 850]]



In [32]:
# Select new group of features
# model 10 pitching/defense only
X = df[['H_p', 'BB_p', 'HR_p', 'ERA', 'Pit_p', 'GSc', 'IR', 'IS', 'RE24_p', 'E']]
y = df['Won']

# Train / test / validation split: 20% is held out as the test set, then 25% of
# the remaining 80% becomes the validation set (0.25 * 0.8 = 0.2), i.e. 60/20/20.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Scaling data: the scaler is fitted on the training rows only, and the same
# fitted transform is then applied to all three splits.
stand = StandardScaler()
Fit = stand.fit(X_train)
X_train_scaled = Fit.transform(X_train)
X_test_scaled = Fit.transform(X_test)
X_val_scaled = Fit.transform(X_val)

# Neighbour counts to evaluate. For a sweep over odd k from 1 to 99, use
# list(range(1, 100, 2)) here instead.
# NOTE(review): if k was chosen by comparing the test scores printed by such a
# sweep, the test metrics below are optimistic; compare on X_val_scaled / y_val
# instead. Confirm how k was selected.
k_values = [53]
for k in k_values:
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train_scaled, y_train)
    y_pred_train = neigh.predict(X_train_scaled)
    y_pred_test = neigh.predict(X_test_scaled)

    # Accuracy is computed from the predictions already made. neigh.score()
    # returns the same metric but runs predict() a second time.
    train_acc_score = accuracy_score(y_train, y_pred_train)
    train_prec_score = precision_score(y_train, y_pred_train)
    train_recall_score = recall_score(y_train, y_pred_train)
    confusion_matrix_train = confusion_matrix(y_train, y_pred_train)

    test_acc_score = accuracy_score(y_test, y_pred_test)
    test_prec_score = precision_score(y_test, y_pred_test)
    test_recall_score = recall_score(y_test, y_pred_test)
    confusion_matrix_test = confusion_matrix(y_test, y_pred_test)

    print(f'K= {k}')
    print(f'Train Acc Score= {train_acc_score}')
    print(f'Train Precision Score= {train_prec_score}')
    print(f'Train Recall Score= {train_recall_score}')
    print(confusion_matrix_train)
    print(f'Test Acc Score= {test_acc_score}')
    print(f'Test Precision Score= {test_prec_score}')
    print(f'Test Recall Score= {test_recall_score}')
    print(confusion_matrix_test)
    print('')




K= 53
Train Acc Score= 0.7814750792278882
Train Precision Score= 0.7668478260869566
Train Recall Score= 0.8106865843148521
[[2603  858]
 [ 659 2822]]
Test Acc Score= 0.7783059636992221
Test Precision Score= 0.7552039966694422
Test Recall Score= 0.80550621669627
[[894 294]
 [219 907]]



In [35]:
# Select new group of features
# model 11 Batting + Pitching/Defense, no RE24_b and no RE24_p
X = df[[
    'BA_RISP', 'HR_b', 'RBI', 'BA', 'OBP', 'SLG', 'OPS', 'Pit_b',
    'H_p', 'BB_p', 'HR_p', 'ERA', 'Pit_p', 'GSc', 'IR', 'IS', 'E',
]]
y = df['Won']

# Train / test / validation split: 20% is held out as the test set, then 25% of
# the remaining 80% becomes the validation set (0.25 * 0.8 = 0.2), i.e. 60/20/20.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Scaling data: the scaler is fitted on the training rows only, and the same
# fitted transform is then applied to all three splits.
stand = StandardScaler()
Fit = stand.fit(X_train)
X_train_scaled = Fit.transform(X_train)
X_test_scaled = Fit.transform(X_test)
X_val_scaled = Fit.transform(X_val)

# Neighbour counts to evaluate. For a sweep over odd k from 1 to 99, use
# list(range(1, 100, 2)) here instead.
# NOTE(review): if k was chosen by comparing the test scores printed by such a
# sweep, the test metrics below are optimistic; compare on X_val_scaled / y_val
# instead. Confirm how k was selected.
k_values = [77]
for k in k_values:
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train_scaled, y_train)
    y_pred_train = neigh.predict(X_train_scaled)
    y_pred_test = neigh.predict(X_test_scaled)

    # Accuracy is computed from the predictions already made. neigh.score()
    # returns the same metric but runs predict() a second time.
    train_acc_score = accuracy_score(y_train, y_pred_train)
    train_prec_score = precision_score(y_train, y_pred_train)
    train_recall_score = recall_score(y_train, y_pred_train)
    confusion_matrix_train = confusion_matrix(y_train, y_pred_train)

    test_acc_score = accuracy_score(y_test, y_pred_test)
    test_prec_score = precision_score(y_test, y_pred_test)
    test_recall_score = recall_score(y_test, y_pred_test)
    confusion_matrix_test = confusion_matrix(y_test, y_pred_test)

    print(f'K= {k}')
    print(f'Train Acc Score= {train_acc_score}')
    print(f'Train Precision Score= {train_prec_score}')
    print(f'Train Recall Score= {train_recall_score}')
    print(confusion_matrix_train)
    print(f'Test Acc Score= {test_acc_score}')
    print(f'Test Precision Score= {test_prec_score}')
    print(f'Test Recall Score= {test_recall_score}')
    print(confusion_matrix_test)
    print('')




K= 77
Train Acc Score= 0.9235090751944685
Train Precision Score= 0.9110925306577481
Train Recall Score= 0.9390979603562195
[[3142  319]
 [ 212 3269]]
Test Acc Score= 0.9196197061365601
Test Precision Score= 0.9010238907849829
Test Recall Score= 0.9378330373001776
[[1072  116]
 [  70 1056]]



In [38]:
# Select new group of features
# model 17 from logistic regression
X = df[['OPS', 'RBI', 'BA_RISP', 'ERA', 'BB_p', 'E']]
y = df['Won']

# Train / test / validation split: 20% is held out as the test set, then 25% of
# the remaining 80% becomes the validation set (0.25 * 0.8 = 0.2), i.e. 60/20/20.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Scaling data: the scaler is fitted on the training rows only, and the same
# fitted transform is then applied to all three splits.
stand = StandardScaler()
Fit = stand.fit(X_train)
X_train_scaled = Fit.transform(X_train)
X_test_scaled = Fit.transform(X_test)
X_val_scaled = Fit.transform(X_val)

# Neighbour counts to evaluate. For a sweep over odd k from 1 to 99, use
# list(range(1, 100, 2)) here instead.
# NOTE(review): if k was chosen by comparing the test scores printed by such a
# sweep, the test metrics below are optimistic; compare on X_val_scaled / y_val
# instead. Confirm how k was selected.
k_values = [21]
for k in k_values:
    # neigh and k remain defined after the loop; the validation cell reads them.
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train_scaled, y_train)
    y_pred_train = neigh.predict(X_train_scaled)
    y_pred_test = neigh.predict(X_test_scaled)

    # Accuracy is computed from the predictions already made. neigh.score()
    # returns the same metric but runs predict() a second time.
    train_acc_score = accuracy_score(y_train, y_pred_train)
    train_prec_score = precision_score(y_train, y_pred_train)
    train_recall_score = recall_score(y_train, y_pred_train)
    confusion_matrix_train = confusion_matrix(y_train, y_pred_train)

    test_acc_score = accuracy_score(y_test, y_pred_test)
    test_prec_score = precision_score(y_test, y_pred_test)
    test_recall_score = recall_score(y_test, y_pred_test)
    confusion_matrix_test = confusion_matrix(y_test, y_pred_test)

    print(f'K= {k}')
    print(f'Train Acc Score= {train_acc_score}')
    print(f'Train Precision Score= {train_prec_score}')
    print(f'Train Recall Score= {train_recall_score}')
    print(confusion_matrix_train)
    print(f'Test Acc Score= {test_acc_score}')
    print(f'Test Precision Score= {test_prec_score}')
    print(f'Test Recall Score= {test_recall_score}')
    print(confusion_matrix_test)
    print('')




K= 21
Train Acc Score= 0.9530394698934025
Train Precision Score= 0.9493021931073768
Train Recall Score= 0.9574834817581155
[[3283  178]
 [ 148 3333]]
Test Acc Score= 0.945980985306828
Test Precision Score= 0.940193491644679
Test Recall Score= 0.9493783303730018
[[1120   68]
 [  57 1069]]



In [41]:
# model 17 validation
# Uses neigh, k, X_val_scaled and y_val exactly as the model 17 cell left them
# in the kernel, so that cell must be the most recently run model cell.

y_pred_val = neigh.predict(X_val_scaled)

# Accuracy is computed from the predictions already made. neigh.score()
# returns the same metric but runs predict() a second time.
val_acc_score = accuracy_score(y_val, y_pred_val)
val_prec_score = precision_score(y_val, y_pred_val)
val_recall_score = recall_score(y_val, y_pred_val)
confusion_matrix_val = confusion_matrix(y_val, y_pred_val)

print(f'K= {k}')
print(f'Val Acc Score= {val_acc_score}')
print(f'Val Precision Score= {val_prec_score}')
print(f'Val Recall Score= {val_recall_score}')
print(confusion_matrix_val)

print('')

K= 21
Val Acc Score= 0.9537597234226448
Val Precision Score= 0.9534292972057579
Val Recall Score= 0.9558573853989814
[[1081   55]
 [  52 1126]]

