In [14]:
# Logistic Regression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
# SQL setup
from sqlalchemy import create_engine
# connect to a local database
engine = create_engine("sqlite:///boxscores.db")

In [4]:
# load in data
df_2020=pd.read_sql('2020',engine)
df_2021=pd.read_sql('2021',engine)
df_2022=pd.read_sql('2022',engine)

In [5]:
# concatenate them together
all_years=[df_2020,df_2021,df_2022]
df=pd.concat(all_years)

In [48]:
# Select new group of features
# model 9 batting only
X=df[['BA_RISP','HR_b','RBI','BA','OBP','SLG','OPS','Pit_b','RE24_b']]
y=df['Won']

print('Features: ')
print(X.columns.tolist())
print('')

# train test val split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Scaling data

# creating object 
stand= StandardScaler()

# fit data
Fit= stand.fit(X_train)

# transform data
X_train_scaled = Fit.transform(X_train)
X_test_scaled = Fit.transform(X_test)
X_val_scaled = Fit.transform(X_val)

logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)

y_pred_test=logreg.predict(X_test_scaled)
y_pred_train=logreg.predict(X_train_scaled)

train_acc_score=logreg.score(X_train_scaled, y_train)
train_prec_score=precision_score(y_train,y_pred_train)
train_recall_score=recall_score(y_train,y_pred_train)
confusion_matrix_train=confusion_matrix(y_train, y_pred_train)


test_acc_score=logreg.score(X_test_scaled, y_test)
test_prec_score=precision_score(y_test,y_pred_test)
test_recall_score=recall_score(y_test,y_pred_test)
confusion_matrix_test=confusion_matrix(y_test,y_pred_test)




print('Train Acc Score= '+str(train_acc_score))
print('Train Precision Score= '+str(train_prec_score))
print('Train Recall Score= '+str(train_recall_score))
print(confusion_matrix_train)
print('')
print('Test Acc Score= '+str(test_acc_score))
print('Test Precision Score= '+str(test_prec_score))
print('Test Recall Score= '+str(test_recall_score))
print(confusion_matrix_test)
print('')

print('Beta 0:')
print(logreg.intercept_)

print('Betas:')
print(logreg.coef_.tolist())
print('')



# make prediction by team and year average stats
team='New York Yankees'
print(team)
df_team=pd.concat(all_years)
df_team=df_team[df_team.Team==team]
df_team=df_team[df_team.Date>'2021-12-31']
stat_list=[]
for i in X.columns:
    stat_list.append(df_team['%s' % i].mean())
stat_list=np.array(stat_list)
stat_list=Fit.transform(stat_list.reshape(1,-1))
print('Standard Scaled Stats:')
print(stat_list)
print('')
y_pred=logreg.predict(stat_list)
if y_pred[0]==1:
    print('Winner')
else:
    print('Loser')


# calculate probability
beta_0=logreg.intercept_.tolist()[0]
beta_x_dot=np.dot(logreg.coef_.tolist()[0],stat_list.tolist()[0])
exponent = -(beta_0 + beta_x_dot)
e_exponent = np.exp(exponent)
prob = 1/(1+e_exponent)
print('Probability of Winner: '+str(prob))

Features: 
['BA_RISP', 'HR_b', 'RBI', 'BA', 'OBP', 'SLG', 'OPS', 'Pit_b', 'RE24_b']

Train Acc Score= 0.789253817343705
Train Precision Score= 0.8022768124625524
Train Recall Score= 0.7693191611605861
[[2801  660]
 [ 803 2678]]

Test Acc Score= 0.7869490060501296
Test Precision Score= 0.8046198267564967
Test Recall Score= 0.7424511545293073
[[985 203]
 [290 836]]

Beta 0:
[0.17598176]
Betas:
[[0.12924300907510944, -0.22024423069326368, -0.30908311056811116, -0.5837297386062872, 0.5657877484488042, 0.03082926222459857, 0.38798622711668673, -0.4744143342403532, 2.3044768810071945]]

New York Yankees
Standard Scaled Stats:
[[-0.00514377  0.32788429  0.17362313 -0.02103893  0.10786945  0.11716728
   0.12196641  0.24334261  0.24850521]]

Winner
Probability of Winner: 0.6526965382939313




In [49]:
# Select new group of features

# model 10 pitching/defense only
X=df[['H_p','BB_p','HR_p','ERA','Pit_p','GSc','IR','IS','RE24_p','E']]
y=df['Won']

print('Features: ')
print(X.columns.tolist())
print('')

# train test val split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Scaling data

# creating object 
stand= StandardScaler()

# fit data
Fit= stand.fit(X_train)

# transform data
X_train_scaled = Fit.transform(X_train)
X_test_scaled = Fit.transform(X_test)
X_val_scaled = Fit.transform(X_val)

logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)

y_pred_test=logreg.predict(X_test_scaled)
y_pred_train=logreg.predict(X_train_scaled)

train_acc_score=logreg.score(X_train_scaled, y_train)
train_prec_score=precision_score(y_train,y_pred_train)
train_recall_score=recall_score(y_train,y_pred_train)
confusion_matrix_train=confusion_matrix(y_train, y_pred_train)


test_acc_score=logreg.score(X_test_scaled, y_test)
test_prec_score=precision_score(y_test,y_pred_test)
test_recall_score=recall_score(y_test,y_pred_test)
confusion_matrix_test=confusion_matrix(y_test,y_pred_test)




print('Train Acc Score= '+str(train_acc_score))
print('Train Precision Score= '+str(train_prec_score))
print('Train Recall Score= '+str(train_recall_score))
print(confusion_matrix_train)
print('')
print('Test Acc Score= '+str(test_acc_score))
print('Test Precision Score= '+str(test_prec_score))
print('Test Recall Score= '+str(test_recall_score))
print(confusion_matrix_test)
print('')

print('Beta 0:')
print(logreg.intercept_)

print('Betas:')
print(logreg.coef_.tolist())
print('')



# make prediction by team and year average stats
team='New York Yankees'
print(team)
df_team=pd.concat(all_years)
df_team=df_team[df_team.Team==team]
df_team=df_team[df_team.Date>'2021-12-31']
stat_list=[]
for i in X.columns:
    stat_list.append(df_team['%s' % i].mean())
stat_list=np.array(stat_list)
stat_list=Fit.transform(stat_list.reshape(1,-1))
print('Standard Scaled Stats:')
print(stat_list)
print('')
y_pred=logreg.predict(stat_list)
if y_pred[0]==1:
    print('Winner')
else:
    print('Loser')


# calculate probability
beta_0=logreg.intercept_.tolist()[0]
beta_x_dot=np.dot(logreg.coef_.tolist()[0],stat_list.tolist()[0])
exponent = -(beta_0 + beta_x_dot)
e_exponent = np.exp(exponent)
prob = 1/(1+e_exponent)
print('Probability of Winner: '+str(prob))

Features: 
['H_p', 'BB_p', 'HR_p', 'ERA', 'Pit_p', 'GSc', 'IR', 'IS', 'RE24_p', 'E']

Train Acc Score= 0.7963123019302795
Train Precision Score= 0.7813776204737272
Train Recall Score= 0.824475725366274
[[2658  803]
 [ 611 2870]]

Test Acc Score= 0.7895419187554019
Test Precision Score= 0.7660283097418817
Test Recall Score= 0.8170515097690941
[[907 281]
 [206 920]]

Beta 0:
[-0.20646666]
Betas:
[[-0.05068288295147053, -0.3939377293283889, 0.19627088985708327, 0.33073484147721505, 0.5633285995566644, -0.08991818199456131, 0.06930415609361179, 0.09399237625312407, 2.809894626421619, -0.034586771926667946]]

New York Yankees
Standard Scaled Stats:
[[-0.2473399  -0.22872232 -0.17077404 -0.274268   -0.12138151  0.30174553
   0.09882692 -0.00637259  0.24913402 -0.10921253]]

Winner
Probability of Winner: 0.595440184604127




In [50]:
# Select new group of features

# model 11 Batting + Pitching/Defense, no RE24_b and no RE24_p
X=df[['BA_RISP','HR_b','RBI','BA','OBP','SLG','OPS','Pit_b','H_p','BB_p','HR_p','ERA','Pit_p','GSc','IR','IS','E']]
y=df['Won']

print('Features: ')
print(X.columns.tolist())
print('')

# train test val split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Scaling data

# creating object 
stand= StandardScaler()

# fit data
Fit= stand.fit(X_train)

# transform data
X_train_scaled = Fit.transform(X_train)
X_test_scaled = Fit.transform(X_test)
X_val_scaled = Fit.transform(X_val)

logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)

y_pred_test=logreg.predict(X_test_scaled)
y_pred_train=logreg.predict(X_train_scaled)

train_acc_score=logreg.score(X_train_scaled, y_train)
train_prec_score=precision_score(y_train,y_pred_train)
train_recall_score=recall_score(y_train,y_pred_train)
confusion_matrix_train=confusion_matrix(y_train, y_pred_train)


test_acc_score=logreg.score(X_test_scaled, y_test)
test_prec_score=precision_score(y_test,y_pred_test)
test_recall_score=recall_score(y_test,y_pred_test)
confusion_matrix_test=confusion_matrix(y_test,y_pred_test)




print('Train Acc Score= '+str(train_acc_score))
print('Train Precision Score= '+str(train_prec_score))
print('Train Recall Score= '+str(train_recall_score))
print(confusion_matrix_train)
print('')
print('Test Acc Score= '+str(test_acc_score))
print('Test Precision Score= '+str(test_prec_score))
print('Test Recall Score= '+str(test_recall_score))
print(confusion_matrix_test)
print('')

print('Beta 0:')
print(logreg.intercept_)

print('Betas:')
print(logreg.coef_.tolist())
print('')



# make prediction by team and year average stats
team='New York Yankees'
print(team)
df_team=pd.concat(all_years)
df_team=df_team[df_team.Team==team]
df_team=df_team[df_team.Date>'2021-12-31']
stat_list=[]
for i in X.columns:
    stat_list.append(df_team['%s' % i].mean())
stat_list=np.array(stat_list)
stat_list=Fit.transform(stat_list.reshape(1,-1))
print('Standard Scaled Stats:')
print(stat_list)
print('')
y_pred=logreg.predict(stat_list)
if y_pred[0]==1:
    print('Winner')
else:
    print('Loser')


# calculate probability
beta_0=logreg.intercept_.tolist()[0]
beta_x_dot=np.dot(logreg.coef_.tolist()[0],stat_list.tolist()[0])
exponent = -(beta_0 + beta_x_dot)
e_exponent = np.exp(exponent)
prob = 1/(1+e_exponent)
print('Probability of Winner: '+str(prob))

Features: 
['BA_RISP', 'HR_b', 'RBI', 'BA', 'OBP', 'SLG', 'OPS', 'Pit_b', 'H_p', 'BB_p', 'HR_p', 'ERA', 'Pit_p', 'GSc', 'IR', 'IS', 'E']

Train Acc Score= 0.968596946125036
Train Precision Score= 0.9668097281831187
Train Recall Score= 0.9706980752657283
[[3345  116]
 [ 102 3379]]

Test Acc Score= 0.9645635263612792
Test Precision Score= 0.9694244604316546
Test Recall Score= 0.9573712255772646
[[1154   34]
 [  48 1078]]

Beta 0:
[-0.38367089]
Betas:
[[0.4565701714163065, -0.3512968115946787, 6.598609119453835, -0.7545613757817118, 1.2654124974104042, 0.40489918513611867, 0.7399635710502476, -0.9719009536500544, -0.9070688064966833, -0.433747908723298, -0.22251070583076923, -6.563251871608499, 0.11436965109661268, 0.12005468119018635, 0.23143999119062197, -0.5561314229056061, -0.9934078919645746]]

New York Yankees
Standard Scaled Stats:
[[-0.00514377  0.32788429  0.17362313 -0.02103893  0.10786945  0.11716728
   0.12196641  0.24334261 -0.2473399  -0.22872232 -0.17077404 -0.274268
  -0.1



In [90]:
# Select new group of features
# model 12 RE24_b only
X=df[['RE24_b']]
y=df['Won']

print('Features: ')
print(X.columns.tolist())
print('')

# train test val split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Scaling data

# creating object 
stand= StandardScaler()

# fit data
Fit= stand.fit(X_train)

# transform data
X_train_scaled = Fit.transform(X_train)
X_test_scaled = Fit.transform(X_test)
X_val_scaled = Fit.transform(X_val)

logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)

y_pred_test=logreg.predict(X_test_scaled)
y_pred_train=logreg.predict(X_train_scaled)

train_acc_score=logreg.score(X_train_scaled, y_train)
train_prec_score=precision_score(y_train,y_pred_train)
train_recall_score=recall_score(y_train,y_pred_train)
confusion_matrix_train=confusion_matrix(y_train, y_pred_train)


test_acc_score=logreg.score(X_test_scaled, y_test)
test_prec_score=precision_score(y_test,y_pred_test)
test_recall_score=recall_score(y_test,y_pred_test)
confusion_matrix_test=confusion_matrix(y_test,y_pred_test)




print('Train Acc Score= '+str(train_acc_score))
print('Train Precision Score= '+str(train_prec_score))
print('Train Recall Score= '+str(train_recall_score))
print(confusion_matrix_train)
print('')
print('Test Acc Score= '+str(test_acc_score))
print('Test Precision Score= '+str(test_prec_score))
print('Test Recall Score= '+str(test_recall_score))
print(confusion_matrix_test)
print('')

print('Beta 0:')
print(logreg.intercept_)

print('Betas:')
print(logreg.coef_.tolist())
print('')



# make prediction by team and year average stats
team='New York Yankees'
print(team)
df_team=pd.concat(all_years)
df_team=df_team[df_team.Team==team]
df_team=df_team[df_team.Date>'2021-12-31']
stat_list=[]
for i in X.columns:
    stat_list.append(df_team['%s' % i].mean())
stat_list=np.array(stat_list)
stat_list=Fit.transform(stat_list.reshape(1,-1))
print('Standard Scaled Stats:')
print(stat_list)
print('')
y_pred=logreg.predict(stat_list)
if y_pred[0]==1:
    print('Winner')
else:
    print('Loser')


# calculate probability
beta_0=logreg.intercept_.tolist()[0]
beta_x_dot=np.dot(logreg.coef_.tolist()[0],stat_list.tolist()[0])
exponent = -(beta_0 + beta_x_dot)
e_exponent = np.exp(exponent)
prob = 1/(1+e_exponent)
print('Probability of Winner: '+str(prob))

Features: 
['RE24_b']

Train Acc Score= 0.7814750792278882
Train Precision Score= 0.8003058103975536
Train Recall Score= 0.7517954610744039
[[2808  653]
 [ 864 2617]]

Test Acc Score= 0.7765773552290406
Test Precision Score= 0.7964946445959105
Test Recall Score= 0.7264653641207816
[[979 209]
 [308 818]]

Beta 0:
[0.20417052]
Betas:
[[2.0701515991788964]]

New York Yankees
Standard Scaled Stats:
[[0.24850521]]

Winner
Probability of Winner: 0.6723017310902992




In [98]:
# Select new group of features
# model 13 RE24_p only
X=df[['RE24_p']]
y=df['Won']

print('Features: ')
print(X.columns.tolist())
print('')

# train test val split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Scaling data

# creating object 
stand= StandardScaler()

# fit data
Fit= stand.fit(X_train)

# transform data
X_train_scaled = Fit.transform(X_train)
X_test_scaled = Fit.transform(X_test)
X_val_scaled = Fit.transform(X_val)

logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)

y_pred_test=logreg.predict(X_test_scaled)
y_pred_train=logreg.predict(X_train_scaled)

train_acc_score=logreg.score(X_train_scaled, y_train)
train_prec_score=precision_score(y_train,y_pred_train)
train_recall_score=recall_score(y_train,y_pred_train)
confusion_matrix_train=confusion_matrix(y_train, y_pred_train)


test_acc_score=logreg.score(X_test_scaled, y_test)
test_prec_score=precision_score(y_test,y_pred_test)
test_recall_score=recall_score(y_test,y_pred_test)
confusion_matrix_test=confusion_matrix(y_test,y_pred_test)




print('Train Acc Score= '+str(train_acc_score))
print('Train Precision Score= '+str(train_prec_score))
print('Train Recall Score= '+str(train_recall_score))
print(confusion_matrix_train)
print('')
print('Test Acc Score= '+str(test_acc_score))
print('Test Precision Score= '+str(test_prec_score))
print('Test Recall Score= '+str(test_recall_score))
print(confusion_matrix_test)
print('')

print('Beta 0:')
print(logreg.intercept_)

print('Betas:')
print(logreg.coef_.tolist())
print('')



# make prediction by team and year average stats
team='Detroit Tigers'
print(team)
df_team=pd.concat(all_years)
df_team=df_team[df_team.Team==team]
df_team=df_team[df_team.Date>'2021-12-31']
stat_list=[]
for i in X.columns:
    stat_list.append(df_team['%s' % i].mean())
stat_list=np.array(stat_list)
stat_list=Fit.transform(stat_list.reshape(1,-1))
print('Standard Scaled Stats:')
print(stat_list)
print('')
y_pred=logreg.predict(stat_list)
if y_pred[0]==1:
    print('Winner')
else:
    print('Loser')


# calculate probability
beta_0=logreg.intercept_.tolist()[0]
beta_x_dot=np.dot(logreg.coef_.tolist()[0],stat_list.tolist()[0])
exponent = -(beta_0 + beta_x_dot)
e_exponent = np.exp(exponent)
prob = 1/(1+e_exponent)
print('Probability of Winner: '+str(prob))

Features: 
['RE24_p']

Train Acc Score= 0.7808988764044944
Train Precision Score= 0.763724434876211
Train Recall Score= 0.815282964665326
[[2583  878]
 [ 643 2838]]

Test Acc Score= 0.7865168539325843
Test Precision Score= 0.7594417077175698
Test Recall Score= 0.8214920071047958
[[895 293]
 [201 925]]

Beta 0:
[-0.2047495]
Betas:
[[2.1023097019014925]]

Detroit Tigers
Standard Scaled Stats:
[[-0.0955924]]

Loser
Probability of Winner: 0.39994018816627736




In [97]:
# Select new group of features
# model 14 RE24_b and RE24_p
X=df[['RE24_b','RE24_p']]
y=df['Won']

print('Features: ')
print(X.columns.tolist())
print('')

# train test val split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Scaling data

# creating object 
stand= StandardScaler()

# fit data
Fit= stand.fit(X_train)

# transform data
X_train_scaled = Fit.transform(X_train)
X_test_scaled = Fit.transform(X_test)
X_val_scaled = Fit.transform(X_val)

logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)

y_pred_test=logreg.predict(X_test_scaled)
y_pred_train=logreg.predict(X_train_scaled)

train_acc_score=logreg.score(X_train_scaled, y_train)
train_prec_score=precision_score(y_train,y_pred_train)
train_recall_score=recall_score(y_train,y_pred_train)
confusion_matrix_train=confusion_matrix(y_train, y_pred_train)


test_acc_score=logreg.score(X_test_scaled, y_test)
test_prec_score=precision_score(y_test,y_pred_test)
test_recall_score=recall_score(y_test,y_pred_test)
confusion_matrix_test=confusion_matrix(y_test,y_pred_test)




print('Train Acc Score= '+str(train_acc_score))
print('Train Precision Score= '+str(train_prec_score))
print('Train Recall Score= '+str(train_recall_score))
print(confusion_matrix_train)
print('')
print('Test Acc Score= '+str(test_acc_score))
print('Test Precision Score= '+str(test_prec_score))
print('Test Recall Score= '+str(test_recall_score))
print(confusion_matrix_test)
print('')

print('Beta 0:')
print(logreg.intercept_)

print('Betas:')
print(logreg.coef_.tolist())
print('')



# make prediction by team and year average stats
team='Detriot Tigers'
print(team)
df_team=pd.concat(all_years)
df_team=df_team[df_team.Team==team]
df_team=df_team[df_team.Date>'2021-12-31']
stat_list=[]
for i in X.columns:
    stat_list.append(df_team['%s' % i].mean())
stat_list=np.array(stat_list)
stat_list=Fit.transform(stat_list.reshape(1,-1))
print('Standard Scaled Stats:')
print(stat_list)
print('')
y_pred=logreg.predict(stat_list)
if y_pred[0]==1:
    print('Winner')
else:
    print('Loser')


# calculate probability
beta_0=logreg.intercept_.tolist()[0]
beta_x_dot=np.dot(logreg.coef_.tolist()[0],stat_list.tolist()[0])
exponent = -(beta_0 + beta_x_dot)
e_exponent = np.exp(exponent)
prob = 1/(1+e_exponent)
print('Probability of Winner: '+str(prob))

Features: 
['RE24_b', 'RE24_p']

Train Acc Score= 1.0
Train Precision Score= 1.0
Train Recall Score= 1.0
[[3461    0]
 [   0 3481]]

Test Acc Score= 1.0
Test Precision Score= 1.0
Test Recall Score= 1.0
[[1188    0]
 [   0 1126]]

Beta 0:
[-0.02726511]
Betas:
[[9.523998308024082, 9.707784249276719]]

Detriot Tigers
Standard Scaled Stats:
[[nan nan]]





ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [117]:
# Select new group of features
# model 15 batting with no RE24
#X=df[['OPS']] #72
#X=df[['RBI']] #75
#X=df[['OPS','RBI']] #75
#X=df[['OPS','RBI','BA']] #75
X=df[['OPS','RBI','BA_RISP']] #75
y=df['Won']

print('Features: ')
print(X.columns.tolist())
print('')

# train test val split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Scaling data

# creating object 
stand= StandardScaler()

# fit data
Fit= stand.fit(X_train)

# transform data
X_train_scaled = Fit.transform(X_train)
X_test_scaled = Fit.transform(X_test)
X_val_scaled = Fit.transform(X_val)

logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)

y_pred_test=logreg.predict(X_test_scaled)
y_pred_train=logreg.predict(X_train_scaled)

train_acc_score=logreg.score(X_train_scaled, y_train)
train_prec_score=precision_score(y_train,y_pred_train)
train_recall_score=recall_score(y_train,y_pred_train)
confusion_matrix_train=confusion_matrix(y_train, y_pred_train)


test_acc_score=logreg.score(X_test_scaled, y_test)
test_prec_score=precision_score(y_test,y_pred_test)
test_recall_score=recall_score(y_test,y_pred_test)
confusion_matrix_test=confusion_matrix(y_test,y_pred_test)




print('Train Acc Score= '+str(train_acc_score))
print('Train Precision Score= '+str(train_prec_score))
print('Train Recall Score= '+str(train_recall_score))
print(confusion_matrix_train)
print('')
print('Test Acc Score= '+str(test_acc_score))
print('Test Precision Score= '+str(test_prec_score))
print('Test Recall Score= '+str(test_recall_score))
print(confusion_matrix_test)
print('')

print('Beta 0:')
print(logreg.intercept_)

print('Betas:')
print(logreg.coef_.tolist())
print('')



# make prediction by team and year average stats
team='Chicago White Sox'
print(team)
df_team=pd.concat(all_years)
df_team=df_team[df_team.Team==team]
df_team=df_team[df_team.Date>'2021-12-31']
stat_list=[]
for i in X.columns:
    stat_list.append(df_team['%s' % i].mean())
stat_list=np.array(stat_list)
stat_list=Fit.transform(stat_list.reshape(1,-1))
print('Standard Scaled Stats:')
print(stat_list)
print('')
y_pred=logreg.predict(stat_list)
if y_pred[0]==1:
    print('Winner')
else:
    print('Loser')


# calculate probability
beta_0=logreg.intercept_.tolist()[0]
beta_x_dot=np.dot(logreg.coef_.tolist()[0],stat_list.tolist()[0])
exponent = -(beta_0 + beta_x_dot)
e_exponent = np.exp(exponent)
prob = 1/(1+e_exponent)
print('Probability of Winner: '+str(prob))

Features: 
['OPS', 'RBI', 'BA_RISP']

Train Acc Score= 0.757850763468741
Train Precision Score= 0.7661147250147842
Train Recall Score= 0.7443263430048837
[[2670  791]
 [ 890 2591]]

Test Acc Score= 0.7592912705272256
Test Precision Score= 0.7671361502347418
Test Recall Score= 0.7255772646536413
[[940 248]
 [309 817]]

Beta 0:
[0.12078849]
Betas:
[[0.4705303205986223, 1.1971528153791424, 0.2806182206417707]]

Chicago White Sox
Standard Scaled Stats:
[[-0.08877306 -0.06556793  0.09407451]]

Winner
Probability of Winner: 0.5067301586946198




In [137]:
# Select new group of features

# model 16 pitching/defense only no RE24_p
#X=df[['ERA','BB_p']] #75
X=df[['ERA','BB_p','E']]
y=df['Won']

print('Features: ')
print(X.columns.tolist())
print('')

# train test val split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Scaling data

# creating object 
stand= StandardScaler()

# fit data
Fit= stand.fit(X_train)

# transform data
X_train_scaled = Fit.transform(X_train)
X_test_scaled = Fit.transform(X_test)
X_val_scaled = Fit.transform(X_val)

logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)

y_pred_test=logreg.predict(X_test_scaled)
y_pred_train=logreg.predict(X_train_scaled)

train_acc_score=logreg.score(X_train_scaled, y_train)
train_prec_score=precision_score(y_train,y_pred_train)
train_recall_score=recall_score(y_train,y_pred_train)
confusion_matrix_train=confusion_matrix(y_train, y_pred_train)


test_acc_score=logreg.score(X_test_scaled, y_test)
test_prec_score=precision_score(y_test,y_pred_test)
test_recall_score=recall_score(y_test,y_pred_test)
confusion_matrix_test=confusion_matrix(y_test,y_pred_test)




print('Train Acc Score= '+str(train_acc_score))
print('Train Precision Score= '+str(train_prec_score))
print('Train Recall Score= '+str(train_recall_score))
print(confusion_matrix_train)
print('')
print('Test Acc Score= '+str(test_acc_score))
print('Test Precision Score= '+str(test_prec_score))
print('Test Recall Score= '+str(test_recall_score))
print(confusion_matrix_test)
print('')

print('Beta 0:')
print(logreg.intercept_)

print('Betas:')
print(logreg.coef_.tolist())
print('')



# make prediction by team and year average stats
team='New York Yankees'
print(team)
df_team=pd.concat(all_years)
df_team=df_team[df_team.Team==team]
df_team=df_team[df_team.Date>'2021-12-31']
stat_list=[]
for i in X.columns:
    stat_list.append(df_team['%s' % i].mean())
stat_list=np.array(stat_list)
stat_list=Fit.transform(stat_list.reshape(1,-1))
print('Standard Scaled Stats:')
print(stat_list)
print('')
y_pred=logreg.predict(stat_list)
if y_pred[0]==1:
    print('Winner')
else:
    print('Loser')


# calculate probability
beta_0=logreg.intercept_.tolist()[0]
beta_x_dot=np.dot(logreg.coef_.tolist()[0],stat_list.tolist()[0])
exponent = -(beta_0 + beta_x_dot)
e_exponent = np.exp(exponent)
prob = 1/(1+e_exponent)
print('Probability of Winner: '+str(prob))

Features: 
['ERA', 'BB_p', 'E']

Train Acc Score= 0.7634687409968309
Train Precision Score= 0.7451346307651293
Train Recall Score= 0.8029301924734272
[[2505  956]
 [ 686 2795]]

Test Acc Score= 0.759723422644771
Test Precision Score= 0.7309562398703403
Test Recall Score= 0.8010657193605684
[[856 332]
 [224 902]]

Beta 0:
[-0.19777864]
Betas:
[[-1.8596814595212459, -0.09890385939944472, -0.28781945394690633]]

New York Yankees
Standard Scaled Stats:
[[-0.274268   -0.22872232 -0.10921253]]

Winner
Probability of Winner: 0.590571272496818




In [147]:
# Select new group of features

# model 17 = model 15 +16
#X=df[['ERA','BB_p']] #75
X=df[['OPS','RBI','BA_RISP','ERA','BB_p','E']]
y=df['Won']

print('Features: ')
print(X.columns.tolist())
print('')

# train test val split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Scaling data

# creating object 
stand= StandardScaler()

# fit data
Fit= stand.fit(X_train)

# transform data
X_train_scaled = Fit.transform(X_train)
X_test_scaled = Fit.transform(X_test)
X_val_scaled = Fit.transform(X_val)

logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)

y_pred_test=logreg.predict(X_test_scaled)
y_pred_train=logreg.predict(X_train_scaled)

train_acc_score=logreg.score(X_train_scaled, y_train)
train_prec_score=precision_score(y_train,y_pred_train)
train_recall_score=recall_score(y_train,y_pred_train)
confusion_matrix_train=confusion_matrix(y_train, y_pred_train)


test_acc_score=logreg.score(X_test_scaled, y_test)
test_prec_score=precision_score(y_test,y_pred_test)
test_recall_score=recall_score(y_test,y_pred_test)
confusion_matrix_test=confusion_matrix(y_test,y_pred_test)




print('Train Acc Score= '+str(train_acc_score))
print('Train Precision Score= '+str(train_prec_score))
print('Train Recall Score= '+str(train_recall_score))
print(confusion_matrix_train)
print('')
print('Test Acc Score= '+str(test_acc_score))
print('Test Precision Score= '+str(test_prec_score))
print('Test Recall Score= '+str(test_recall_score))
print(confusion_matrix_test)
print('')

print('Beta 0:')
print(logreg.intercept_)

print('Betas:')
print(logreg.coef_.tolist())
print('')



# make prediction by team and year average stats
team='Chicago White Sox'
print(team)
df_team=pd.concat(all_years)
df_team=df_team[df_team.Team==team]
df_team=df_team[df_team.Date>'2021-12-31']
stat_list=[]
for i in X.columns:
    stat_list.append(df_team['%s' % i].mean())
stat_list=np.array(stat_list)
stat_list=Fit.transform(stat_list.reshape(1,-1))
print('Standard Scaled Stats:')
print(stat_list)
print('')
y_pred=logreg.predict(stat_list)
if y_pred[0]==1:
    print('Winner')
else:
    print('Loser')


# calculate probability
beta_0=logreg.intercept_.tolist()[0]
beta_x_dot=np.dot(logreg.coef_.tolist()[0],stat_list.tolist()[0])
exponent = -(beta_0 + beta_x_dot)
e_exponent = np.exp(exponent)
prob = 1/(1+e_exponent)
print('Probability of Winner: '+str(prob))

Features: 
['OPS', 'RBI', 'BA_RISP', 'ERA', 'BB_p', 'E']

Train Acc Score= 0.9608182080092192
Train Precision Score= 0.9567321377739824
Train Recall Score= 0.965527147371445
[[3309  152]
 [ 120 3361]]

Test Acc Score= 0.9593777009507347
Test Precision Score= 0.954225352112676
Test Recall Score= 0.9626998223801065
[[1136   52]
 [  42 1084]]

Beta 0:
[-0.53687134]
Betas:
[[1.2456460176519921, 5.311655041033923, 0.6168193743502194, -6.983550801217074, -0.3455074522545595, -1.004439984637013]]

Chicago White Sox
Standard Scaled Stats:
[[-0.08877306 -0.06556793  0.09407451 -0.08336666  0.04546515  0.12513447]]

Loser
Probability of Winner: 0.37826905756656054




In [142]:
# model 17 validation

y_pred_val=logreg.predict(X_val_scaled)

val_acc_score=logreg.score(X_val_scaled,y_val)
val_prec_score=precision_score(y_val,y_pred_val)
val_recall_score=recall_score(y_val,y_pred_val)
confusion_matrix_val=confusion_matrix(y_val,y_pred_val)
    
    

print('Val Acc Score= '+str(val_acc_score))
print('Val Precision Score= '+str(val_prec_score))
print('Val Recall Score= '+str(val_recall_score))
print(confusion_matrix_val)

print('')

Val Acc Score= 0.9671564390665515
Val Precision Score= 0.962248322147651
Val Recall Score= 0.9736842105263158
[[1091   45]
 [  31 1147]]

