In [84]:
import pandas as pd
import numpy as np
from pybaseball import batting_stats

# Data docs at https://www.fangraphs.com/players/shohei-ohtani/19755/stats?position=DH

In [85]:
START = 2002
END = 2022

In [86]:
batting = batting_stats(START, END, qual = 200)
batting.to_csv("batting.csv")

In [87]:
batting = batting.groupby("IDfg", group_keys=False).filter(lambda x: x.shape[0] > 1)

In [88]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA
0,1109,2002,Barry Bonds,SFG,37,143,403,612,149,70,...,,,,,0,0.127,0.191,,,
1,1109,2004,Barry Bonds,SFG,39,147,373,617,135,60,...,,,,,0,0.124,0.164,,,
6,15640,2022,Aaron Judge,NYY,30,149,547,662,172,84,...,0.270,118.4,236.0,0.607,389,0.173,0.290,,,
15,13611,2018,Mookie Betts,BOS,25,136,520,614,180,96,...,0.131,110.6,217.0,0.500,434,0.220,0.270,,,
2,1109,2003,Barry Bonds,SFG,38,130,390,550,133,65,...,,,,,0,0.135,0.223,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6874,1698,2010,Gerald Laird,DET,30,89,270,299,56,40,...,,,0.0,,0,0.166,0.252,,,
7033,9272,2018,Chris Davis,BAL,32,128,470,522,79,51,...,0.096,111.8,113.0,0.401,282,0.174,0.316,,,
6666,319,2011,Adam Dunn,CHW,31,122,415,496,66,39,...,,,0.0,,0,0.169,0.295,,,
6978,620,2002,Neifi Perez,KCR,29,145,554,585,131,104,...,,,,,0,0.130,0.187,,,


In [89]:
def next_season(player):
    player = player.sort_values("Season")
    player["Next_WAR"] = player["WAR"].shift(-1)
    return player

batting = batting.groupby("IDfg", group_keys=False).apply(next_season)

In [90]:
batting[["Name", "Season", "WAR", "Next_WAR"]]

Unnamed: 0,Name,Season,WAR,Next_WAR
5558,Alfredo Amezaga,2006,1.1,2.0
5003,Alfredo Amezaga,2007,2.0,1.2
5244,Alfredo Amezaga,2008,1.2,
1165,Garret Anderson,2002,3.7,5.1
865,Garret Anderson,2003,5.1,0.8
...,...,...,...,...
5946,Owen Miller,2022,0.6,
4878,Andrew Vaughn,2021,-0.3,0.0
2718,Andrew Vaughn,2022,0.0,
6615,Ha-seong Kim,2021,0.5,3.3


In [91]:
null_count = batting.isnull().sum()

In [92]:
null_count

IDfg           0
Season         0
Name           0
Team           0
Age            0
            ... 
CSW%           0
xBA         6746
xSLG        6746
xwOBA       6746
Next_WAR    1176
Length: 320, dtype: int64

In [93]:
complete_cols = list(batting.columns[null_count == 0])
complete_cols

['IDfg',
 'Season',
 'Name',
 'Team',
 'Age',
 'G',
 'AB',
 'PA',
 'H',
 '1B',
 '2B',
 '3B',
 'HR',
 'R',
 'RBI',
 'BB',
 'IBB',
 'SO',
 'HBP',
 'SF',
 'SH',
 'GDP',
 'SB',
 'CS',
 'AVG',
 'GB',
 'FB',
 'LD',
 'IFFB',
 'Pitches',
 'Balls',
 'Strikes',
 'IFH',
 'BU',
 'BUH',
 'BB%',
 'K%',
 'BB/K',
 'OBP',
 'SLG',
 'OPS',
 'ISO',
 'BABIP',
 'GB/FB',
 'LD%',
 'GB%',
 'FB%',
 'IFFB%',
 'HR/FB',
 'IFH%',
 'BUH%',
 'wOBA',
 'wRAA',
 'wRC',
 'Bat',
 'Rep',
 'Pos',
 'RAR',
 'WAR',
 'Dol',
 'Spd',
 'wRC+',
 'WPA',
 '-WPA',
 '+WPA',
 'RE24',
 'REW',
 'pLI',
 'PH',
 'WPA/LI',
 'Clutch',
 'FB% (Pitch)',
 'FBv',
 'SL%',
 'SLv',
 'CB%',
 'CBv',
 'CH%',
 'CHv',
 'wFB',
 'wSL',
 'wCB',
 'wCH',
 'wFB/C',
 'wSL/C',
 'wCB/C',
 'wCH/C',
 'O-Swing%',
 'Z-Swing%',
 'Swing%',
 'O-Contact%',
 'Z-Contact%',
 'Contact%',
 'Zone%',
 'F-Strike%',
 'SwStr%',
 'BsR',
 'Def',
 'wSB',
 'UBR',
 'Age Rng',
 'Off',
 'Lg',
 'wGDP',
 'Pull%',
 'Cent%',
 'Oppo%',
 'Soft%',
 'Med%',
 'Hard%',
 'TTO%',
 'AVG+',
 'BB%+',
 'K

In [94]:
batting = batting[complete_cols + ["Next_WAR"]].copy()

In [95]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Pull%+,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,Next_WAR
5558,1,2006,Alfredo Amezaga,FLA,28,132,334,378,87,72,...,86,107,113,143,109,63,0,0.188,0.256,2.0
5003,1,2007,Alfredo Amezaga,FLA,29,133,400,448,105,80,...,92,101,112,109,113,75,0,0.175,0.227,1.2
5244,1,2008,Alfredo Amezaga,FLA,30,125,311,337,82,61,...,99,101,101,123,111,64,0,0.178,0.244,
1165,2,2002,Garret Anderson,ANA,30,158,638,678,195,107,...,118,91,80,65,97,129,0,0.137,0.232,5.1
865,2,2003,Garret Anderson,ANA,31,159,638,673,201,119,...,112,101,80,90,99,109,0,0.164,0.252,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5946,24655,2022,Owen Miller,CLE,25,124,409,456,101,70,...,92,111,98,131,101,81,330,0.188,0.266,
4878,26197,2021,Andrew Vaughn,CHW,23,127,417,469,98,61,...,87,104,116,84,99,110,321,0.185,0.285,0.0
2718,26197,2022,Andrew Vaughn,CHW,24,127,488,530,136,90,...,88,108,109,93,99,106,404,0.200,0.287,
6615,27506,2021,Ha-seong Kim,SDP,25,117,267,298,54,32,...,126,99,59,137,96,88,201,0.216,0.303,3.3


In [96]:
batting.dtypes

IDfg          int64
Season        int64
Name         object
Team         object
Age           int64
             ...   
Hard%+        int64
Events        int64
CStr%       float64
CSW%        float64
Next_WAR    float64
Length: 132, dtype: object

In [97]:
batting.dtypes[batting.dtypes == "object"]

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [98]:
batting["Dol"]

5558      $5.5
5003     $11.2
5244      $7.2
1165     $14.6
865      $22.0
         ...  
5946      $5.1
4878    ($2.6)
2718      $0.3
6615      $3.9
4528     $26.3
Name: Dol, Length: 6746, dtype: object

In [99]:
del batting["Dol"]

In [100]:
del batting["Age Rng"]

In [101]:
batting["team_code"] = batting["Team"].astype("category").cat.codes

In [102]:
batting_full = batting.copy()
batting = batting.dropna().copy()

In [103]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

rr = Ridge(alpha = 1)

split = TimeSeriesSplit(n_splits = 3)

sfs = SequentialFeatureSelector(rr, n_features_to_select = 20, direction = "forward", cv = split, n_jobs = 4)

In [104]:
removed_columns = ["Next_WAR", "Name", "Team", "IDfg", "Season"]
selected_columns = batting.columns[~batting.columns.isin(removed_columns)]

In [105]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])

In [106]:
batting.describe()

Unnamed: 0,IDfg,Season,Age,G,AB,PA,H,1B,2B,3B,...,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,Next_WAR,team_code
count,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,...,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0
mean,5359.664093,2011.155296,0.36062,0.653001,0.478936,0.48124,0.366178,0.290625,0.39944,0.103481,...,0.457531,0.403209,0.410851,0.510998,0.478705,0.172861,0.498872,0.545798,1.791454,0.474226
std,5127.583904,5.608152,0.147484,0.255867,0.24238,0.262176,0.182514,0.138752,0.171712,0.10588,...,0.114032,0.131231,0.121099,0.130398,0.134035,0.273932,0.1372,0.120679,1.99274,0.305046
min,1.0,2002.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.4,0.0
25%,1129.25,2006.0,0.269231,0.478632,0.276978,0.259516,0.211207,0.179245,0.258621,0.043478,...,0.382022,0.315789,0.331461,0.42029,0.387755,0.0,0.408511,0.46696,0.3,0.205882
50%,3531.0,2011.0,0.346154,0.709402,0.507194,0.508651,0.37069,0.287736,0.37931,0.086957,...,0.460674,0.398496,0.404494,0.507246,0.489796,0.0,0.493617,0.546256,1.5,0.470588
75%,9009.0,2016.0,0.461538,0.871795,0.688849,0.711073,0.508621,0.391509,0.517241,0.130435,...,0.52809,0.488722,0.483146,0.594203,0.564626,0.346828,0.591489,0.625551,2.9,0.735294
max,27506.0,2021.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,11.9,1.0


In [107]:
sfs.fit(batting[selected_columns], batting["Next_WAR"])

In [108]:
predictors = list(selected_columns[sfs.get_support()])

In [109]:
def backtest(data, model, predictors, start = 5, step = 1):
    all_predictions = []

    years = sorted(data["Season"].unique())

    for i in range(start, len(years), step):
        current_year = years[i]

        train = data[data["Season"] < current_year]
        test = data[data["Season"] == current_year]

        model.fit(train[predictors], train["Next_WAR"])

        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index = test.index)
        combined = pd.concat([test["Next_WAR"], preds], axis = 1)
        combined.columns = ["actual", "prediction"]

        all_predictions.append(combined)

    return pd.concat(all_predictions)

In [110]:
predictions = backtest(batting, rr, predictors)

In [111]:
predictions

Unnamed: 0,actual,prediction
5003,1.2,1.556063
1926,1.4,0.811510
3108,-0.1,0.627641
5794,0.6,0.888410
1103,4.8,2.222727
...,...,...
1916,1.8,2.751347
5875,1.0,1.909538
7024,0.6,1.544166
4878,0.0,1.799315


In [112]:
from sklearn.metrics import mean_squared_error

mean_squared_error(predictions["actual"], predictions["prediction"])

2.7820934401390667

In [113]:
batting["Next_WAR"].describe()

count    5570.000000
mean        1.791454
std         1.992740
min        -3.400000
25%         0.300000
50%         1.500000
75%         2.900000
max        11.900000
Name: Next_WAR, dtype: float64

In [114]:
2.7820934401390667 ** 0.5

1.6679608628918925

In [115]:
def player_history(df):
    df = df.sort_values("Season")

    df["player_season"] = range(0, df.shape[0])
    df["war_corr"] = list(df[["player_season", "WAR"]].expanding().corr().loc[(slice(None), "player_season"), "WAR"])
    df["war_corr"].fillna(1, inplace = True)

    df["war_diff"] = df["WAR"] / df["WAR"].shift(1)
    df["war_diff"].fillna(1, inplace = True)

    df["war_diff"][df["war_diff"] == np.inf] = 1

    return df

batting = batting.groupby("IDfg", group_keys = False).apply(player_history)

In [116]:
def group_averages(df):
    return df["WAR"] / df["WAR"].mean()

In [117]:
batting["war_season"] = batting.groupby("Season", group_keys = False).apply(group_averages)

In [118]:
new_predictiors = predictors + ["player_season", "war_corr", "war_season", "war_diff"]

In [119]:
predictors = backtest(batting, rr, new_predictiors)

In [120]:
mean_squared_error(predictions["actual"], predictions["prediction"])

2.7820934401390667

In [121]:
pd.Series(rr.coef_, index = new_predictiors).sort_values()

Age             -2.615190
WAR             -1.723439
BABIP           -1.695069
SLG+            -1.488778
Soft%+          -1.323723
BU              -0.934587
PH              -0.728663
SO              -0.698345
war_diff        -0.584686
wGDP            -0.420642
CB%             -0.292811
LD+%            -0.239131
Pull%+          -0.173336
war_corr        -0.126101
player_season   -0.007652
O-Contact%       0.263641
IFH%             0.362367
OBP+             0.484609
Oppo%            0.727917
Spd              0.740824
SB               1.024539
IBB              1.645553
Hard%+           2.361554
war_season       3.433124
dtype: float64

In [122]:
diff = predictions["actual"] - predictions["prediction"]

In [123]:
merged = predictions.merge(batting, left_index = True, right_index = True)

In [124]:
merged["diff"] = (predictions["actual"] - predictions["prediction"]).abs()

In [126]:
merged[["IDfg", "Season", "Name", "WAR", "Next_WAR", "diff"]].sort_values(["diff"])

Unnamed: 0,IDfg,Season,Name,WAR,Next_WAR,diff
6486,1679,2015,Chase Utley,0.217391,1.0,0.000070
3222,5497,2015,Marwin Gonzalez,0.304348,1.0,0.000262
3285,13066,2018,Teoscar Hernandez,0.267081,1.3,0.001131
3952,1572,2009,Coco Crisp,0.291925,3.2,0.002774
4701,4616,2009,Russell Martin,0.403727,3.1,0.002981
...,...,...,...,...,...,...
3168,4810,2007,Brian McCann,0.304348,8.6,6.455557
3827,1875,2009,Josh Hamilton,0.291925,8.4,6.542358
873,9166,2010,Buster Posey,0.459627,10.1,6.818013
451,15640,2021,Aaron Judge,0.552795,10.8,7.120200
