In [None]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats

In [None]:
START = 2000
END = 2024

In [None]:
batting = batting_stats(START, END, qual=150)

In [None]:
batting.to_csv("batting.csv")

In [None]:
## removing players that only have one qualified season may switch to MLBAM id
## at least two seasons of data
batting = batting.groupby("IDfg", group_keys=False).filter(lambda x: x.shape[0] >1)

In [None]:
batting

In [None]:
def next_season(player):
    player = player.sort_values("Season")
    player["Next_WAR"] = player["WAR"].shift(-1)
    return player
batting = batting.groupby("IDfg", group_keys=False).apply(next_season)

In [None]:
batting[["Name", "Season", "WAR", "Next_WAR"]]

In [None]:
null_count = batting.isnull().sum()

In [None]:
null_count

In [None]:
complete_cols = list(batting.columns[null_count ==0])

In [None]:
complete_cols

In [None]:
batting = batting[complete_cols + ["Next_WAR"]].copy()

In [None]:
batting

In [None]:
batting.dtypes

In [None]:
batting.dtypes[batting.dtypes =="object"]

In [None]:
batting['Age Rng']

In [None]:
del batting["Age Rng"]

In [None]:
batting["team_code"] = batting["Team"].astype("category").cat.codes

In [None]:
batting_full = batting.copy()
## avoiding setting with copy error (makes a new copy)
batting = batting.dropna().copy()

In [None]:
## feature selector 
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

## lamda is reserved in python so its alpha
rr = Ridge(alpha=1)
split = TimeSeriesSplit(n_splits=3)
## Will keep going untill it gets 40
sfs= SequentialFeatureSelector(rr, n_features_to_select=40, direction="forward", cv=split, n_jobs=4)

In [None]:
removed_columns =["Next_WAR", "Name", "Team", "IDfg", "Season"]
selected_columns = batting.columns[~batting.columns.isin(removed_columns)]

In [None]:
## forcing ratios between 0 and 1 to avoid problemns
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])

In [None]:
batting


In [None]:
## The data has been scaled between 0 and 1 now making a fit for the data, it will pick the 40 predictors 
sfs.fit(batting[selected_columns], batting["Next_WAR"])

In [None]:
predictors = list(selected_columns[sfs.get_support()])

In [197]:
def backtest(data, model, predictors, start=5, step=1):
    all_predictions = []
    
    years = sorted(data["Season"].unique())
    
    for i in range(start, len(years) - step, step):
        current_year = years[i]
        next_year = current_year + step  # Predict the next season following the current year
                    
        train = data[data["Season"] < current_year]
        test = data[data["Season"] == current_year]
        next_season_data = data[data["Season"] == next_year]
        
        model.fit(train[predictors], train["Next_WAR"])
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["Next_WAR"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        combined["Prediction_Year"] = next_year

        all_predictions.append(combined)
    
    return pd.concat(all_predictions)

    
    
    

In [218]:
predictions = backtest(batting, rr, predictors)


In [219]:
predictions.shape

(5412, 3)

In [200]:
predictions

Unnamed: 0,actual,prediction,Prediction_Year
5200,0.1,0.425564,2006
3199,1.5,2.244963,2006
6021,0.7,0.909899,2006
820,3.4,2.648500,2006
4520,1.0,1.735574,2006
...,...,...,...
7382,0.9,1.404527,2022
9091,0.7,0.810776,2022
6077,-0.5,1.005102,2022
6498,-1.3,0.414302,2022


In [200]:
predictions

Unnamed: 0,actual,prediction,Prediction_Year
5200,0.1,0.425564,2006
3199,1.5,2.244963,2006
6021,0.7,0.909899,2006
820,3.4,2.648500,2006
4520,1.0,1.735574,2006
...,...,...,...
7382,0.9,1.404527,2022
9091,0.7,0.810776,2022
6077,-0.5,1.005102,2022
6498,-1.3,0.414302,2022


In [201]:
## time to find out if it's any good
from sklearn.metrics import mean_squared_error
## Will take the difference between the acutal next seasons war and what we predicted
## square that and take the average of those squares
mean_squared_error(predictions["actual"], predictions["prediction"])

2.738835421126018

In [202]:
batting["Next_WAR"].describe()

count    7376.000000
mean        1.620214
std         1.977068
min        -3.400000
25%         0.200000
50%         1.200000
75%         2.700000
max        12.700000
Name: Next_WAR, dtype: float64

In [203]:
def player_history(df):
    df = df.sort_values("Season")
        
    df["player_season"] = range(0, df.shape[0])
    ## Giving the model information on player and WAR
    df["war_corr"] = list(df[["player_season", "WAR"]].expanding().corr().loc[(slice(None), "player_season"),"WAR"])
    ## filling missing data with 1 to 1
    df["war_corr"].fillna(0, inplace=True)
    
    df["war_diff"] = df["WAR"] / df["WAR"].shift(1)
    ## better than filling it in with a 0? 
    df["war_diff"].fillna(1, inplace=True)
    ## removing divide by zero errors
    df["war_diff"][df["war_diff"] == np.inf] = 1
    
    return df

batting = batting.groupby("IDfg", group_keys=False).apply(player_history)



In [204]:
def group_averages(df):
    return df["WAR"] /df["WAR"].mean()

In [205]:
batting["war_season"] = batting.groupby("Season", group_keys=False).apply(group_averages)

In [206]:
new_predictors = predictors + ["player_season", "war_corr", "war_season", "war_diff"]

In [207]:
predictions = backtest(batting, rr, new_predictors)

In [208]:
mean_squared_error(predictions["actual"], predictions["prediction"])

2.6529216649379332

In [209]:
predictions

Unnamed: 0,actual,prediction,Prediction_Year
5200,0.1,0.582392,2006
3199,1.5,2.325475,2006
6021,0.7,1.224592,2006
820,3.4,2.659142,2006
4520,1.0,1.980723,2006
...,...,...,...
7382,0.9,1.333965,2022
9091,0.7,0.604278,2022
6077,-0.5,0.833374,2022
6498,-1.3,0.384970,2022


In [210]:
pd.Series(rr.coef_, index=new_predictors).sort_values()


Age             -2.533951
SLG             -1.488283
WAR             -1.444341
RAR             -1.424518
G               -1.353970
BABIP           -1.095635
Def             -1.021356
Off             -0.962349
SB              -0.914946
Bat             -0.905551
OBP+            -0.781304
SH              -0.696207
H               -0.651075
war_diff        -0.623788
OPS             -0.381337
+WPA            -0.336817
BsR             -0.291577
Events          -0.193606
3B              -0.104676
war_corr        -0.060737
team_code        0.002907
player_season    0.009926
pLI              0.060443
Clutch           0.069801
-WPA             0.100624
PH               0.110810
AB               0.183946
BABIP+           0.194926
Pos              0.226412
Rep              0.350830
wOBA             0.369558
SF               0.372593
L-WAR            0.529489
Spd              0.705229
GDP              0.767731
BB/K             0.768814
PA               1.033198
REW              1.067276
ISO+        

In [211]:
diff = predictions["actual"] - predictions["prediction"]

In [212]:
merged = predictions.merge(batting, left_index=True, right_index=True)

In [215]:
ArmChairWarv1 = merged[["IDfg", "Name", "WAR", "Next_WAR", "Prediction_Year", "prediction"]].sort_values(["Prediction_Year"])

In [216]:
ArmChairWarv1


Unnamed: 0,IDfg,Name,WAR,Next_WAR,Prediction_Year,prediction
5200,2,Garret Anderson,0.198758,0.1,2006,0.582392
2019,1398,Brady Clark,0.385093,0.2,2006,1.908079
2477,1397,Jason Lane,0.279503,0.4,2006,1.371894
3462,1392,Mark DeRosa,0.254658,2.9,2006,0.430760
8505,1386,Alex Cora,0.254658,-0.1,2006,1.261107
...,...,...,...,...,...,...
6113,12552,Eugenio Suarez,0.204969,4.1,2022,0.888923
697,12546,C.J. Cron,0.347826,1.2,2022,1.099035
1273,12533,Marcus Semien,0.602484,4.0,2022,2.814657
7399,12510,Curt Casali,0.242236,0.4,2022,0.523192


In [217]:
ArmChairWarv1.to_csv('future.csv')