In [72]:
# kNN model
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import seaborn as sns
from sklearn.preprocessing import StandardScaler
import numpy as np

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
# SQL setup
from sqlalchemy import create_engine
# connect to a local database
engine = create_engine("sqlite:///boxscores.db")

In [5]:
# load in data
df_2020=pd.read_sql('2020',engine)
df_2021=pd.read_sql('2021',engine)
df_2022=pd.read_sql('2022',engine)

In [104]:
# concatenate them together
all_years=[df_2020,df_2021,df_2022]
df=pd.concat(all_years)

In [7]:
# model 1
# select data to use
X=df[['AB','R_b','H_b','BA_RISP','H_RISP','HR_b','RBI','BB_b','PA','BA','OBP','SLG','OPS','Pit_b','RE24_b','H_p','R_p','ER','BB_p','HR_p','ERA','BF','Pit_p','Str_p','Ctct','StL','FB','LD','GSc','IR','IS','RE24_p','E']]
y=df['Won']

In [12]:
# train test val split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [13]:
# Scaling data

# creating object 
stand= StandardScaler()

# fit data
Fit= stand.fit(X_train)

# transform data
X_train_scaled = Fit.transform(X_train)
X_test_scaled = Fit.transform(X_test)
X_val_scaled = Fit.transform(X_val)



In [14]:
neigh=KNeighborsClassifier()

In [15]:
neigh.fit(X_train_scaled,y_train)

KNeighborsClassifier()

In [16]:
# neigh.predict

y_pred=neigh.predict(X_test_scaled)

In [17]:
neigh.score(X_train_scaled,y_train)

0.9753673292999135

In [18]:
neigh.score(X_test_scaled,y_test)

0.9451166810717373

In [19]:
neigh.predict_proba(X_test_scaled)

array([[0.8, 0.2],
       [0. , 1. ],
       [0. , 1. ],
       ...,
       [1. , 0. ],
       [1. , 0. ],
       [0. , 1. ]])

In [20]:
confusion_matrix(y_test, y_pred)

array([[1120,   68],
       [  59, 1067]])

In [21]:
k_values=list(range(1, 100, 2))

In [22]:
for k in k_values:
    neigh=KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train_scaled,y_train)
    y_pred=neigh.predict(X_test_scaled)
    train_score=neigh.score(X_train_scaled,y_train)
    test_score=neigh.score(X_test_scaled,y_test)
    print('K= '+str(k))
    print('Train Score= '+str(train_score))
    print('Test Score= '+str(test_score))
    print('')

K= 1
Train Score= 1.0
Test Score= 0.9165946413137425

K= 3
Train Score= 0.977384039181792
Test Score= 0.9377700950734659

K= 5
Train Score= 0.9753673292999135
Test Score= 0.9451166810717373

K= 7
Train Score= 0.9740708729472775
Test Score= 0.9520311149524633

K= 9
Train Score= 0.9740708729472775
Test Score= 0.9515989628349178

K= 11
Train Score= 0.9727744165946414
Test Score= 0.9589455488331893

K= 13
Train Score= 0.9733506194180351
Test Score= 0.958513396715644

K= 15
Train Score= 0.9724863151829444
Test Score= 0.9619706136560069

K= 17
Train Score= 0.9726303658887928
Test Score= 0.9602420051858254

K= 19
Train Score= 0.9724863151829444
Test Score= 0.9619706136560069

K= 21
Train Score= 0.971766061653702
Test Score= 0.9636992221261884

K= 23
Train Score= 0.9721982137712475
Test Score= 0.9589455488331893

K= 25
Train Score= 0.9716220109478536
Test Score= 0.958513396715644

K= 27
Train Score= 0.9709017574186114
Test Score= 0.958513396715644

K= 29
Train Score= 0.9710458081244598
Test Sc

In [23]:
# Select new group of features
# model 2
X=df[['R_b','H_b','BA_RISP','H_RISP','HR_b','BB_b','BA','OBP','SLG','OPS','Pit_b','RE24_b','H_p','R_p','ER','BB_p','HR_p','ERA','Pit_p','Str_p','Ctct','StL','FB','LD','GSc','IR','IS','RE24_p','E']]

In [27]:
# train test val split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [28]:
# Scaling data

# creating object 
stand= StandardScaler()

# fit data
Fit= stand.fit(X_train)

# transform data
X_train_scaled = Fit.transform(X_train)
X_test_scaled = Fit.transform(X_test)
X_val_scaled = Fit.transform(X_val)



In [29]:
for k in k_values:
    neigh=KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train_scaled,y_train)
    y_pred=neigh.predict(X_test_scaled)
    train_score=neigh.score(X_train_scaled,y_train)
    test_score=neigh.score(X_test_scaled,y_test)
    print('K= '+str(k))
    print('Train Score= '+str(train_score))
    print('Test Score= '+str(test_score))
    print('')

K= 1
Train Score= 1.0
Test Score= 0.9204840103716508

K= 3
Train Score= 0.975799481417459
Test Score= 0.9386343993085566

K= 5
Train Score= 0.9707577067127628
Test Score= 0.9377700950734659

K= 7
Train Score= 0.9721982137712475
Test Score= 0.9442523768366465

K= 9
Train Score= 0.9709017574186114
Test Score= 0.9481417458945549

K= 11
Train Score= 0.9711898588303083
Test Score= 0.9511668107173725

K= 13
Train Score= 0.9684528954191876
Test Score= 0.9572169403630078

K= 15
Train Score= 0.9696053010659752
Test Score= 0.9567847882454624

K= 17
Train Score= 0.9697493517718236
Test Score= 0.9576490924805532

K= 19
Train Score= 0.9690290982425814
Test Score= 0.9589455488331893

K= 21
Train Score= 0.9671564390665515
Test Score= 0.958513396715644

K= 23
Train Score= 0.9675885911840968
Test Score= 0.9624027657735523

K= 25
Train Score= 0.9670123883607029
Test Score= 0.9606741573033708

K= 27
Train Score= 0.9688850475367329
Test Score= 0.958513396715644

K= 29
Train Score= 0.9674445404782483
Test 

In [33]:
# Select new group of features
# model 3
X=df[['R_b','H_b','BA_RISP','H_RISP','HR_b','BB_b','BA','OBP','SLG','OPS','RE24_b','H_p','R_p','ER','BB_p','HR_p','ERA','Pit_p','Str_p','GSc','IR','IS','RE24_p','E']]

In [34]:
# train test val split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [35]:
# Scaling data

# creating object 
stand= StandardScaler()

# fit data
Fit= stand.fit(X_train)

# transform data
X_train_scaled = Fit.transform(X_train)
X_test_scaled = Fit.transform(X_test)
X_val_scaled = Fit.transform(X_val)



In [36]:
for k in k_values:
    neigh=KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train_scaled,y_train)
    y_pred=neigh.predict(X_test_scaled)
    train_score=neigh.score(X_train_scaled,y_train)
    test_score=neigh.score(X_test_scaled,y_test)
    print('K= '+str(k))
    print('Train Score= '+str(train_score))
    print('Test Score= '+str(test_score))
    print('')

K= 1
Train Score= 1.0
Test Score= 0.9416594641313742

K= 3
Train Score= 0.9772399884759435
Test Score= 0.956352636127917

K= 5
Train Score= 0.9719101123595506
Test Score= 0.9559204840103717

K= 7
Train Score= 0.9742149236531259
Test Score= 0.9593777009507347

K= 9
Train Score= 0.9740708729472775
Test Score= 0.9606741573033708

K= 11
Train Score= 0.9749351771823682
Test Score= 0.9611063094209161

K= 13
Train Score= 0.9737827715355806
Test Score= 0.9606741573033708

K= 15
Train Score= 0.973638720829732
Test Score= 0.9606741573033708

K= 17
Train Score= 0.972054163065399
Test Score= 0.9641313742437337

K= 19
Train Score= 0.9723422644770959
Test Score= 0.9654278305963699

K= 21
Train Score= 0.971766061653702
Test Score= 0.9658599827139153

K= 23
Train Score= 0.9716220109478536
Test Score= 0.9662921348314607

K= 25
Train Score= 0.9719101123595506
Test Score= 0.9654278305963699

K= 27
Train Score= 0.9697493517718236
Test Score= 0.966724286949006

K= 29
Train Score= 0.9703255545952175
Test Sc

In [43]:
# Select new group of features
# model 4
X=df[['R_b','H_b','BA_RISP','H_RISP','HR_b','BB_b','BA','OBP','SLG','OPS','Pit_b','RE24_b','H_p','R_p','ER','BB_p','HR_p','ERA','Pit_p','Str_p','GSc','IR','IS','RE24_p','E']]

In [44]:
# train test val split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [45]:
# Scaling data

# creating object 
stand= StandardScaler()

# fit data
Fit= stand.fit(X_train)

# transform data
X_train_scaled = Fit.transform(X_train)
X_test_scaled = Fit.transform(X_test)
X_val_scaled = Fit.transform(X_val)



In [46]:
for k in k_values:
    neigh=KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train_scaled,y_train)
    y_pred=neigh.predict(X_test_scaled)
    train_score=neigh.score(X_train_scaled,y_train)
    test_score=neigh.score(X_test_scaled,y_test)
    print('K= '+str(k))
    print('Train Score= '+str(train_score))
    print('Test Score= '+str(test_score))
    print('')

K= 1
Train Score= 1.0
Test Score= 0.9446845289541919

K= 3
Train Score= 0.9794007490636704
Test Score= 0.9490060501296457

K= 5
Train Score= 0.9729184673004898
Test Score= 0.9533275713050994

K= 7
Train Score= 0.9753673292999135
Test Score= 0.9602420051858254

K= 9
Train Score= 0.9733506194180351
Test Score= 0.9636992221261884

K= 11
Train Score= 0.9737827715355806
Test Score= 0.9632670700086431

K= 13
Train Score= 0.9734946701238836
Test Score= 0.9628349178910977

K= 15
Train Score= 0.973638720829732
Test Score= 0.9619706136560069

K= 17
Train Score= 0.9726303658887928
Test Score= 0.9606741573033708

K= 19
Train Score= 0.9727744165946414
Test Score= 0.9641313742437337

K= 21
Train Score= 0.972054163065399
Test Score= 0.9636992221261884

K= 23
Train Score= 0.9716220109478536
Test Score= 0.9636992221261884

K= 25
Train Score= 0.9714779602420052
Test Score= 0.9649956784788245

K= 27
Train Score= 0.971766061653702
Test Score= 0.9671564390665515

K= 29
Train Score= 0.9710458081244598
Test 

In [48]:
# Select new group of features
# model 5
X=df[['R_b','H_b','BA_RISP','H_RISP','HR_b','BB_b','BA','OBP','SLG','OPS','Pit_b','RE24_b','H_p','BB_p','HR_p','ERA','Pit_p','Str_p','GSc','IR','IS','RE24_p','E']]

# train test val split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Scaling data

# creating object 
stand= StandardScaler()

# fit data
Fit= stand.fit(X_train)

# transform data
X_train_scaled = Fit.transform(X_train)
X_test_scaled = Fit.transform(X_test)
X_val_scaled = Fit.transform(X_val)

for k in k_values:
    neigh=KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train_scaled,y_train)
    y_pred=neigh.predict(X_test_scaled)
    train_score=neigh.score(X_train_scaled,y_train)
    test_score=neigh.score(X_test_scaled,y_test)
    print('K= '+str(k))
    print('Train Score= '+str(train_score))
    print('Test Score= '+str(test_score))
    print('')




K= 1
Train Score= 1.0
Test Score= 0.9243733794295592

K= 3
Train Score= 0.9733506194180351
Test Score= 0.9343128781331028

K= 5
Train Score= 0.9670123883607029
Test Score= 0.939066551426102

K= 7
Train Score= 0.9675885911840968
Test Score= 0.9464131374243734

K= 9
Train Score= 0.9655718813022184
Test Score= 0.9524632670700086

K= 11
Train Score= 0.9662921348314607
Test Score= 0.9533275713050994

K= 13
Train Score= 0.9654278305963699
Test Score= 0.9537597234226448

K= 15
Train Score= 0.9649956784788245
Test Score= 0.9498703543647364

K= 17
Train Score= 0.9622587150677039
Test Score= 0.9503025064822818

K= 19
Train Score= 0.96168251224431
Test Score= 0.9515989628349178

K= 21
Train Score= 0.9622587150677039
Test Score= 0.9507346585998271

K= 23
Train Score= 0.9619706136560069
Test Score= 0.9507346585998271

K= 25
Train Score= 0.9611063094209161
Test Score= 0.9515989628349178

K= 27
Train Score= 0.9613944108326131
Test Score= 0.9537597234226448

K= 29
Train Score= 0.9619706136560069
Test 

In [50]:
# Select new group of features
# model 6
X=df[['R_b','BA_RISP','H_RISP','HR_b','BB_b','BA','OBP','SLG','OPS','Pit_b','RE24_b','H_p','BB_p','HR_p','ERA','Pit_p','Str_p','GSc','IR','IS','RE24_p','E']]

# train test val split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Scaling data

# creating object 
stand= StandardScaler()

# fit data
Fit= stand.fit(X_train)

# transform data
X_train_scaled = Fit.transform(X_train)
X_test_scaled = Fit.transform(X_test)
X_val_scaled = Fit.transform(X_val)

for k in k_values:
    neigh=KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train_scaled,y_train)
    y_pred=neigh.predict(X_test_scaled)
    train_score=neigh.score(X_train_scaled,y_train)
    test_score=neigh.score(X_test_scaled,y_test)
    print('K= '+str(k))
    print('Train Score= '+str(train_score))
    print('Test Score= '+str(test_score))
    print('')




K= 1
Train Score= 1.0
Test Score= 0.9256698357821953

K= 3
Train Score= 0.9733506194180351
Test Score= 0.9369057908383751

K= 5
Train Score= 0.9694612503601268
Test Score= 0.9420916162489196

K= 7
Train Score= 0.9684528954191876
Test Score= 0.9477095937770095

K= 9
Train Score= 0.9649956784788245
Test Score= 0.9515989628349178

K= 11
Train Score= 0.9664361855373091
Test Score= 0.952895419187554

K= 13
Train Score= 0.9664361855373091
Test Score= 0.9541918755401901

K= 15
Train Score= 0.9645635263612792
Test Score= 0.9515989628349178

K= 17
Train Score= 0.9638432728320369
Test Score= 0.9507346585998271

K= 19
Train Score= 0.9632670700086431
Test Score= 0.9511668107173725

K= 21
Train Score= 0.96355517142034
Test Score= 0.9520311149524633

K= 23
Train Score= 0.9644194756554307
Test Score= 0.9546240276577356

K= 25
Train Score= 0.9621146643618553
Test Score= 0.9524632670700086

K= 27
Train Score= 0.9644194756554307
Test Score= 0.9546240276577356

K= 29
Train Score= 0.9632670700086431
Test 

In [52]:
# Select new group of features
# model 7
X=df[['BA_RISP','H_RISP','HR_b','RBI','BA','OBP','SLG','OPS','Pit_b','RE24_b','H_p','BB_p','HR_p','ERA','Pit_p','Str_p','GSc','IR','IS','RE24_p','E']]

# train test val split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Scaling data

# creating object 
stand= StandardScaler()

# fit data
Fit= stand.fit(X_train)

# transform data
X_train_scaled = Fit.transform(X_train)
X_test_scaled = Fit.transform(X_test)
X_val_scaled = Fit.transform(X_val)

for k in k_values:
    neigh=KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train_scaled,y_train)
    y_pred=neigh.predict(X_test_scaled)
    train_score=neigh.score(X_train_scaled,y_train)
    test_score=neigh.score(X_test_scaled,y_test)
    print('K= '+str(k))
    print('Train Score= '+str(train_score))
    print('Test Score= '+str(test_score))
    print('')




K= 1
Train Score= 1.0
Test Score= 0.9291270527225584

K= 3
Train Score= 0.973638720829732
Test Score= 0.9373379429559204

K= 5
Train Score= 0.9683088447133391
Test Score= 0.9442523768366465

K= 7
Train Score= 0.9678766925957937
Test Score= 0.9442523768366465

K= 9
Train Score= 0.9652837798905215
Test Score= 0.9554883318928262

K= 11
Train Score= 0.9662921348314607
Test Score= 0.956352636127917

K= 13
Train Score= 0.9651397291846731
Test Score= 0.9515989628349178

K= 15
Train Score= 0.96355517142034
Test Score= 0.9498703543647364

K= 17
Train Score= 0.9618265629501584
Test Score= 0.9537597234226448

K= 19
Train Score= 0.9615384615384616
Test Score= 0.9511668107173725

K= 21
Train Score= 0.9631230193027945
Test Score= 0.9511668107173725

K= 23
Train Score= 0.9639873235378853
Test Score= 0.9515989628349178

K= 25
Train Score= 0.9638432728320369
Test Score= 0.9541918755401901

K= 27
Train Score= 0.9624027657735523
Test Score= 0.9520311149524633

K= 29
Train Score= 0.9625468164794008
Test S

In [55]:
# Select new group of features
# model 8
X=df[['BA_RISP','HR_b','RBI','BA','OBP','SLG','OPS','Pit_b','RE24_b','H_p','BB_p','HR_p','ERA','Pit_p','GSc','IR','IS','RE24_p','E']]

# train test val split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Scaling data

# creating object 
stand= StandardScaler()

# fit data
Fit= stand.fit(X_train)

# transform data
X_train_scaled = Fit.transform(X_train)
X_test_scaled = Fit.transform(X_test)
X_val_scaled = Fit.transform(X_val)

for k in k_values:
    neigh=KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train_scaled,y_train)
    y_pred=neigh.predict(X_test_scaled)
    train_score=neigh.score(X_train_scaled,y_train)
    test_score=neigh.score(X_test_scaled,y_test)
    print('K= '+str(k))
    print('Train Score= '+str(train_score))
    print('Test Score= '+str(test_score))
    print('')




K= 1
Train Score= 1.0
Test Score= 0.9334485738980121

K= 3
Train Score= 0.9750792278882167
Test Score= 0.9369057908383751

K= 5
Train Score= 0.973638720829732
Test Score= 0.945980985306828

K= 7
Train Score= 0.9713339095361567
Test Score= 0.9498703543647364

K= 9
Train Score= 0.9701815038893691
Test Score= 0.9524632670700086

K= 11
Train Score= 0.9687409968308844
Test Score= 0.9546240276577356

K= 13
Train Score= 0.9674445404782483
Test Score= 0.9537597234226448

K= 15
Train Score= 0.9670123883607029
Test Score= 0.9550561797752809

K= 17
Train Score= 0.9655718813022184
Test Score= 0.9541918755401901

K= 19
Train Score= 0.9665802362431576
Test Score= 0.9554883318928262

K= 21
Train Score= 0.966724286949006
Test Score= 0.9550561797752809

K= 23
Train Score= 0.9660040334197637
Test Score= 0.9546240276577356

K= 25
Train Score= 0.9655718813022184
Test Score= 0.9546240276577356

K= 27
Train Score= 0.9657159320080668
Test Score= 0.956352636127917

K= 29
Train Score= 0.9651397291846731
Test S

In [57]:
# Select new group of features
# model 9
X=df[['BA_RISP','HR_b','RBI','BA','OBP','SLG','OPS','Pit_b','RE24_b']]

# train test val split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Scaling data

# creating object 
stand= StandardScaler()

# fit data
Fit= stand.fit(X_train)

# transform data
X_train_scaled = Fit.transform(X_train)
X_test_scaled = Fit.transform(X_test)
X_val_scaled = Fit.transform(X_val)

for k in k_values:
    neigh=KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train_scaled,y_train)
    y_pred=neigh.predict(X_test_scaled)
    train_score=neigh.score(X_train_scaled,y_train)
    test_score=neigh.score(X_test_scaled,y_test)
    print('K= '+str(k))
    print('Train Score= '+str(train_score))
    print('Test Score= '+str(test_score))
    print('')




K= 1
Train Score= 1.0
Test Score= 0.7026793431287813

K= 3
Train Score= 0.8614232209737828
Test Score= 0.742437337942956

K= 5
Train Score= 0.8287237107461827
Test Score= 0.7515125324114088

K= 7
Train Score= 0.8199366176894267
Test Score= 0.7584269662921348

K= 9
Train Score= 0.8131662345145492
Test Score= 0.766637856525497

K= 11
Train Score= 0.8063958513396715
Test Score= 0.7683664649956785

K= 13
Train Score= 0.8014981273408239
Test Score= 0.7631806395851339

K= 15
Train Score= 0.8002016709881878
Test Score= 0.7657735522904062

K= 17
Train Score= 0.796168251224431
Test Score= 0.7722558340535869

K= 19
Train Score= 0.7917026793431288
Test Score= 0.7679343128781331

K= 21
Train Score= 0.792711034284068
Test Score= 0.7687986171132238

K= 23
Train Score= 0.7915586286372803
Test Score= 0.7683664649956785

K= 25
Train Score= 0.7917026793431288
Test Score= 0.773552290406223

K= 27
Train Score= 0.7941515413425526
Test Score= 0.7744165946413137

K= 29
Train Score= 0.7901181215787957
Test Sc

In [58]:
# Select new group of features
# model 10
X=df[['H_p','BB_p','HR_p','ERA','Pit_p','GSc','IR','IS','RE24_p','E']]

# train test val split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Scaling data

# creating object 
stand= StandardScaler()

# fit data
Fit= stand.fit(X_train)

# transform data
X_train_scaled = Fit.transform(X_train)
X_test_scaled = Fit.transform(X_test)
X_val_scaled = Fit.transform(X_val)

for k in k_values:
    neigh=KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train_scaled,y_train)
    y_pred=neigh.predict(X_test_scaled)
    train_score=neigh.score(X_train_scaled,y_train)
    test_score=neigh.score(X_test_scaled,y_test)
    print('K= '+str(k))
    print('Train Score= '+str(train_score))
    print('Test Score= '+str(test_score))
    print('')




K= 1
Train Score= 1.0
Test Score= 0.7234226447709594

K= 3
Train Score= 0.8537885335638145
Test Score= 0.7497839239412273

K= 5
Train Score= 0.8239700374531835
Test Score= 0.7532411408815903

K= 7
Train Score= 0.8131662345145492
Test Score= 0.7571305099394987

K= 9
Train Score= 0.8066839527513685
Test Score= 0.7588591184096802

K= 11
Train Score= 0.7997695188706425
Test Score= 0.7592912705272256

K= 13
Train Score= 0.795880149812734
Test Score= 0.7657735522904062

K= 15
Train Score= 0.7931431864016134
Test Score= 0.7675021607605877

K= 17
Train Score= 0.7919907807548257
Test Score= 0.7670700086430423

K= 19
Train Score= 0.7931431864016134
Test Score= 0.7687986171132238

K= 21
Train Score= 0.7905502736963411
Test Score= 0.7726879861711322

K= 23
Train Score= 0.791126476519735
Test Score= 0.7709593777009507

K= 25
Train Score= 0.7860847018150389
Test Score= 0.7731201382886776

K= 27
Train Score= 0.7868049553442812
Test Score= 0.7765773552290406

K= 29
Train Score= 0.7842120426390089
Test

In [60]:
# Selecting Model 8
# K = 41

# Select new group of features
# model 8
X=df[['BA_RISP','HR_b','RBI','BA','OBP','SLG','OPS','Pit_b','RE24_b','H_p','BB_p','HR_p','ERA','Pit_p','GSc','IR','IS','RE24_p','E']]

# train test val split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Scaling data

# creating object 
stand= StandardScaler()

# fit data
Fit= stand.fit(X_train)

# transform data
X_train_scaled = Fit.transform(X_train)
X_test_scaled = Fit.transform(X_test)
X_val_scaled = Fit.transform(X_val)

k=41
neigh=KNeighborsClassifier(n_neighbors=k)
neigh.fit(X_train_scaled,y_train)
y_pred=neigh.predict(X_test_scaled)
train_score=neigh.score(X_train_scaled,y_train)
test_score=neigh.score(X_test_scaled,y_test)
print('K= '+str(k))
print('Train Score= '+str(train_score))
print('Test Score= '+str(test_score))
print('')




K= 41
Train Score= 0.9631230193027945
Test Score= 0.9589455488331893



In [108]:
stat_list=[]
for i in X.columns:
    stat_list.append(df['%s' % i].mean())

In [109]:
stat_list=np.array(stat_list)

In [110]:
stat_list.reshape(-1,1)

array([[  0.19774894],
       [  0.65605096],
       [  3.22929936],
       [  0.22436943],
       [  0.27782803],
       [  0.33498726],
       [  0.61289809],
       [137.9044586 ],
       [ -0.80955414],
       [  8.23566879],
       [  3.14012739],
       [  1.01910828],
       [  4.09280255],
       [143.89808917],
       [ 48.65605096],
       [  1.17834395],
       [  0.33757962],
       [ -0.25477707],
       [  0.57324841]])

In [111]:
stat_list=Fit.transform(stat_list.reshape(1,-1))

In [112]:
y_pred=neigh.predict(stat_list)

In [113]:
y_pred

array([0])

In [105]:
df=df[df.Team=='Detroit Tigers']

In [106]:
df=df[df.Date>'2021-12-31']

In [107]:
df.HR_b.mean()

0.6560509554140127

In [133]:
# make prediction by team and year

df=pd.concat(all_years)
df=df[df.Team=='Chicago White Sox']
df=df[df.Date>'2021-12-31']
stat_list=[]
for i in X.columns:
    stat_list.append(df['%s' % i].mean())
stat_list=np.array(stat_list)
stat_list=Fit.transform(stat_list.reshape(1,-1))
y_pred=neigh.predict(stat_list)
y_pred

array([1])

In [134]:
X.columns

Index(['BA_RISP', 'HR_b', 'RBI', 'BA', 'OBP', 'SLG', 'OPS', 'Pit_b', 'RE24_b',
       'H_p', 'BB_p', 'HR_p', 'ERA', 'Pit_p', 'GSc', 'IR', 'IS', 'RE24_p',
       'E'],
      dtype='object')