In [62]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# One parquet file per season: 2022 and 2023 are pooled for training,
# 2024 is kept apart as the validation set.
train_paths = ['../data/final/2022_data.parquet', '../data/final/2023_data.parquet']
df1_train, df2_train = (pd.read_parquet(path) for path in train_paths)
df_val = pd.read_parquet('../data/final/2024_data.parquet')

# ignore_index=True gives the stacked training frame a fresh 0..n-1 index.
df_train = pd.concat([df1_train, df2_train], ignore_index=True)

In [3]:
# Columns fed to the model, grouped by how they are meant to be treated.
# NOTE(review): 'date' is vectorized together with the other features while the
# validation season (2024) is disjoint from the training seasons (2022-2023).
# If 'date' is stored as a string, none of its encoded levels can match at
# validation time -- confirm it is intended as a feature.
categorical = [
    'date',
    'dayofweek',
    'away_league',
    'home_league',
    'park_id',
]

# Blended team/pitcher stats, home side first and then the away side.
numerical = [
    'home_OPS_blend',
    'home_FIP_blend',
    'home_FPCT_blend',
    'away_OPS_blend',
    'away_FIP_blend',
    'away_FPCT_blend',
]

# Kept as a one-element list so df[target] selects a single-column DataFrame.
target = ['home_won']

In [4]:
# Turn each row into a {column: value} dict so DictVectorizer can build the
# dense feature matrix (sparse=False).
feature_columns = categorical + numerical
train_records = df_train[feature_columns].to_dict(orient='records')
val_records = df_val[feature_columns].to_dict(orient='records')

vec = DictVectorizer(sparse=False)
# The vocabulary is learned from the training rows only; validation rows are
# only transformed with it.
X_train = vec.fit_transform(train_records)
all_features = vec.get_feature_names_out()
X_val = vec.transform(val_records)

# df[target] is a one-column frame; ravel() flattens it to the 1-D label vector.
y_train = df_train[target].to_numpy().ravel()
y_val = df_val[target].to_numpy().ravel()

In [5]:
# Shallow random forest (depth 3, at least 7 samples per leaf); random_state is
# fixed so re-running the cell reproduces the same model.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=3,
    min_samples_leaf=7,
    random_state=42,
).fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the second entry in
# rf.classes_ -- presumably home_won == 1, given the 0/1 labels; confirm.
y_pred = rf.predict_proba(X_val)[:, 1]

# Ranking quality on the held-out 2024 games.
auc = roc_auc_score(y_val, y_pred)
auc

0.5796628842577436

In [63]:
# Probability cut-offs for turning y_pred into a call. Probabilities that fall
# between the two thresholds are treated as "no call".
WIN_THRESHOLD, LOSE_THRESHOLD = 0.51, 0.49

# Labels written into the classified array; DISCARD_VALUE marks the "no call" rows.
WIN_VALUE, LOSE_VALUE = 1, 0
DISCARD_VALUE = -1

In [64]:
# Three-way call per game: win above WIN_THRESHOLD, loss at or below
# LOSE_THRESHOLD, and DISCARD_VALUE for everything in between.
# np.select takes the first matching condition, like the nested form it replaces.
classified_results = np.select(
    [y_pred > WIN_THRESHOLD, y_pred <= LOSE_THRESHOLD],
    [WIN_VALUE, LOSE_VALUE],
    default=DISCARD_VALUE,
)
classified_results

array([1, 1, 1, ..., 1, 1, 1], shape=(2429,))

In [65]:
# Score only the games the model made a call on: rows marked DISCARD_VALUE are
# dropped from both the truth and the prediction vectors, so the matrix
# describes the kept subset, not the whole validation set.
decided = classified_results != DISCARD_VALUE

# Explicit labels pin the layout to [LOSE_VALUE, WIN_VALUE] (rows = actual,
# columns = predicted) even if one class is absent after filtering.
conf_matrix = confusion_matrix(
    y_val[decided],
    classified_results[decided],
    labels=[LOSE_VALUE, WIN_VALUE],
)
conf_matrix

array([[214, 748],
       [138, 943]])

In [66]:
# F1 over the games the model made a call on; rows marked DISCARD_VALUE are
# excluded, so this score is conditional on the kept subset.
kept = classified_results != DISCARD_VALUE

# pos_label ties the positive class to WIN_VALUE instead of relying on the
# library default.
f1 = f1_score(y_val[kept], classified_results[kept], pos_label=WIN_VALUE)
f1

0.6803751803751804

In [70]:
# Spot check: per-team season rows for 2024 from the intermediate table.
team_df = pd.read_parquet('../data/intermediate/team_season.parquet')

# 'season' is compared against the string '2024', not the integer.
is_2024 = team_df['season'].eq('2024')
team_df.loc[is_2024]

Unnamed: 0,team,game_number,OPS,FPCT,season
90,ANA,162,0.748721,0.983411,2024
91,ARI,162,0.867344,0.989332,2024
92,ATL,162,0.816675,0.988278,2024
93,BAL,162,0.846496,0.985822,2024
94,BOS,162,0.838079,0.980612,2024
95,CHA,162,0.685766,0.983903,2024
96,CHN,162,0.796006,0.986617,2024
97,CIN,162,0.780174,0.982718,2024
98,CLE,161,0.787078,0.984947,2024
99,COL,162,0.793331,0.986076,2024


In [79]:
# Spot check: the 2024 season row for a single pitcher id.
pitcher_df = pd.read_parquet('../data/intermediate/pitcher_season.parquet')

in_2024 = pitcher_df['season'].eq('2024')
is_pitcher = pitcher_df['P_id'].eq('yamay001')
pitcher_df.loc[in_2024 & is_pitcher]

Unnamed: 0,P_id,game_number,FIP,season
1514,yamay001,161,3.003226,2024


In [78]:
# Validation games where LAN is the home team, for eyeballing the blended features.
df_val.loc[df_val['home_team'].eq('LAN')]

Unnamed: 0,date,dayofweek,away_team,away_game_number,away_league,home_team,home_game_number,home_league,home_score,away_score,park_id,away_P_id,home_P_id,home_OPS_blend,home_FIP_blend,home_FPCT_blend,away_OPS_blend,away_FIP_blend,away_FPCT_blend,home_won
1,20240321,Thu,SDN,2,NL,LAN,2,NL,11,15,SEO01,musgj001,yamay001,0.871884,4.283379,0.988210,0.792875,3.891155,0.983787,0
4,20240328,Thu,SLN,1,NL,LAN,3,NL,7,1,LOS03,mikom001,glast001,0.894992,2.767667,0.985197,0.827524,4.072340,0.988514,1
16,20240329,Fri,SLN,2,NL,LAN,4,NL,6,3,LOS03,thomz002,millb005,0.905249,3.548901,0.984385,0.786007,3.792789,0.989701,1
27,20240330,Sat,SLN,3,NL,LAN,5,NL,5,6,LOS03,lynnl001,yamay001,0.915014,4.312121,0.985308,0.777320,5.037872,0.990149,0
42,20240331,Sun,SLN,4,NL,LAN,6,NL,5,4,LOS03,matzs001,stong001,0.910303,3.649132,0.984838,0.770537,3.572174,0.990559,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2313,20240921,Sat,COL,155,NL,LAN,155,NL,3,6,LOS03,quanc001,buehw001,0.877231,4.124390,0.984852,0.791026,5.017679,0.985969,0
2327,20240922,Sun,COL,156,NL,LAN,156,NL,6,5,LOS03,senza001,yamay001,0.876363,3.387597,0.984784,0.792121,4.127508,0.985898,1
2348,20240924,Tue,SDN,157,NL,LAN,157,NL,2,4,LOS03,kingm002,knacl001,0.878610,4.541304,0.984864,0.832724,3.215778,0.987259,0
2362,20240925,Wed,SDN,158,NL,LAN,158,NL,4,3,LOS03,ceasd001,flahj002,0.878018,3.663824,0.984943,0.832356,3.126801,0.987174,1
