In [1]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats

In [2]:
## first and last season passed to batting_stats when downloading the data
START, END = 2000, 2023

In [3]:
## download season-level batting rows for START..END via pybaseball
## NOTE(review): qual=150 presumably is the minimum plate appearances for a
## season to be included -- confirm against the pybaseball documentation
batting = batting_stats(
    START,
    END,
    qual=150,
)

In [4]:
## save the raw download to disk (the row index is written as well, which is
## the to_csv default)
batting.to_csv("batting.csv")

In [5]:
## keep only players with at least two qualified seasons: a season can only be
## paired with a following one when the player has more than one row
## (players are keyed on "IDfg" for now; may switch to the MLBAM id)
batting = (
    batting
    .groupby("IDfg", group_keys=False)
    .filter(lambda seasons: len(seasons) > 1)
)

In [6]:
## preview the frame after dropping single-season players
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,1109,2002,Barry Bonds,SFG,37,143,403,612,149,70,...,,,,0,0.127,0.191,,,,12.7
2,1109,2001,Barry Bonds,SFG,36,153,476,664,156,49,...,,,,0,,,,,,12.5
1,1109,2004,Barry Bonds,SFG,39,147,373,617,135,60,...,,,,0,0.124,0.164,,,,11.9
19,15640,2022,Aaron Judge,NYY,30,157,570,696,177,87,...,118.4,246.0,0.609,404,0.169,0.287,,,,11.3
30,13611,2018,Mookie Betts,BOS,25,136,520,614,180,96,...,110.6,217.0,0.500,434,0.220,0.270,,,,10.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8939,763,2001,Peter Bergeron,MON,23,102,375,416,79,61,...,,,,0,,,,,,-2.4
9108,9272,2018,Chris Davis,BAL,32,128,470,522,79,51,...,111.8,113.0,0.401,282,0.174,0.316,,,,-3.1
8496,319,2011,Adam Dunn,CHW,31,122,415,496,66,39,...,,0.0,,0,0.170,0.295,,,,-2.9
9002,620,2002,Neifi Perez,KCR,29,145,554,585,131,104,...,,,,0,0.130,0.187,,,,-2.9


In [7]:
def next_season(player):
    """Attach the following season's WAR to each row of one player's data.

    Parameters
    ----------
    player : pandas.DataFrame
        Rows for a single player; must contain "Season" and "WAR" columns.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by "Season" with an added "Next_WAR" column holding
        the "WAR" of the next row; the last row gets NaN.  "Next" means the
        player's next recorded row, which is not necessarily the following
        calendar year.
    """
    return player.sort_values("Season").assign(
        Next_WAR=lambda ordered: ordered["WAR"].shift(-1)
    )
## Build the per-player frames explicitly instead of groupby(...).apply(...):
## letting apply operate on the grouping column is deprecated (pandas >= 2.2),
## and once that is enforced the function no longer receives "IDfg", so the
## column would vanish from the result.
## Iterating a groupby yields the groups in sorted key order, so the layout is
## unchanged: players ordered by IDfg, seasons ascending within each player,
## original row index preserved.
batting = pd.concat(
    [next_season(player) for _, player in batting.groupby("IDfg")]
)

In [8]:
## check the target: each row's Next_WAR should match the WAR on the player's next row
batting.loc[:, ["Name", "Season", "WAR", "Next_WAR"]]

Unnamed: 0,Name,Season,WAR,Next_WAR
6965,Alfredo Amezaga,2006,1.1,2.0
6236,Alfredo Amezaga,2007,2.0,1.2
6552,Alfredo Amezaga,2008,1.2,
2638,Garret Anderson,2000,2.2,2.7
3493,Garret Anderson,2001,2.7,3.7
...,...,...,...,...
8159,Cal Ripken,2001,-0.5,
992,Quilvio Veras,2000,2.7,1.1
6170,Quilvio Veras,2001,1.1,
5997,Devon White,2000,-0.3,0.8


In [9]:
## number of missing values in every column
null_count = batting.isna().sum()

In [10]:
## show the per-column missing-value counts
null_count

IDfg           0
Season         0
Name           0
Team           0
Age            0
            ... 
xBA         8863
xSLG        8863
xwOBA       8863
L-WAR          0
Next_WAR    1487
Length: 321, dtype: int64

In [11]:
## names of the columns that have no missing values at all
complete_cols = batting.columns[null_count.eq(0)].tolist()

In [12]:
## list the fully populated columns
complete_cols

['IDfg',
 'Season',
 'Name',
 'Team',
 'Age',
 'G',
 'AB',
 'PA',
 'H',
 '1B',
 '2B',
 '3B',
 'HR',
 'R',
 'RBI',
 'BB',
 'IBB',
 'SO',
 'HBP',
 'SF',
 'SH',
 'GDP',
 'SB',
 'CS',
 'AVG',
 'BB%',
 'K%',
 'BB/K',
 'OBP',
 'SLG',
 'OPS',
 'ISO',
 'BABIP',
 'wOBA',
 'wRAA',
 'wRC',
 'Bat',
 'Rep',
 'Pos',
 'RAR',
 'WAR',
 'Spd',
 'wRC+',
 'WPA',
 '-WPA',
 '+WPA',
 'RE24',
 'REW',
 'pLI',
 'PH',
 'WPA/LI',
 'Clutch',
 'BsR',
 'Def',
 'wSB',
 'Age Rng',
 'Off',
 'Lg',
 'TTO%',
 'AVG+',
 'BB%+',
 'K%+',
 'OBP+',
 'SLG+',
 'ISO+',
 'BABIP+',
 'Events',
 'L-WAR']

In [13]:
## keep the fully populated columns plus the target; Next_WAR is added back
## explicitly because its missing values exclude it from complete_cols
batting = batting.loc[:, [*complete_cols, "Next_WAR"]].copy()

In [14]:
## preview the frame restricted to complete columns plus Next_WAR
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,AVG+,BB%+,K%+,OBP+,SLG+,ISO+,BABIP+,Events,L-WAR,Next_WAR
6965,1,2006,Alfredo Amezaga,FLA,28,132,334,378,87,72,...,96,98,74,97,75,42,97,0,1.1,2.0
6236,1,2007,Alfredo Amezaga,FLA,29,133,400,448,105,80,...,96,88,71,95,82,58,96,0,2.0,1.2
6552,1,2008,Alfredo Amezaga,FLA,30,125,311,337,82,61,...,99,62,81,92,86,65,100,0,1.2,
2638,2,2000,Garret Anderson,ANA,28,159,647,681,185,107,...,104,37,82,88,117,139,93,0,2.2,2.7
3493,2,2001,Garret Anderson,ANA,29,161,672,704,194,125,...,108,46,86,94,111,117,102,0,2.7,3.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8159,1010978,2001,Cal Ripken,BAL,40,128,477,516,114,84,...,89,61,74,83,84,75,82,0,-0.5,
992,1013404,2000,Quilvio Veras,ATL,29,84,298,364,92,72,...,113,140,85,118,92,58,117,0,2.7,1.1
6170,1013404,2001,Quilvio Veras,ATL,30,71,258,295,65,46,...,94,91,103,97,81,61,101,0,1.1,
5997,1013862,2000,Devon White,LAD,37,47,158,168,42,32,...,97,54,110,88,86,69,102,0,-0.3,0.8


In [15]:
## inspect column dtypes to spot non-numeric columns
batting.dtypes

IDfg          int64
Season        int64
Name         object
Team         object
Age           int64
             ...   
ISO+          int64
BABIP+        int64
Events        int64
L-WAR       float64
Next_WAR    float64
Length: 69, dtype: object

In [16]:
## list only the object-dtype (non-numeric) columns
batting.dtypes.loc[lambda kinds: kinds == "object"]

Name       object
Team       object
Age Rng    object
dtype: object

In [17]:
## look at the "Age Rng" text column before deciding what to do with it
batting['Age Rng']

6965    28 - 28
6236    29 - 29
6552    30 - 30
2638    28 - 28
3493    29 - 29
         ...   
8159    40 - 40
992     29 - 29
6170    30 - 30
5997    37 - 37
2458    38 - 38
Name: Age Rng, Length: 8863, dtype: object

In [19]:
## drop the "Age Rng" text column
batting = batting.drop(columns=["Age Rng"])

In [20]:
## encode the team abbreviation as an integer category code
batting["team_code"] = (
    batting["Team"]
    .astype("category")
    .cat.codes
)

In [21]:
## batting_full keeps every row, including rows whose Next_WAR is missing
batting_full = batting.copy()
## working frame: only rows with no missing values; the explicit copy avoids
## pandas' SettingWithCopy warning when columns are assigned later
batting = batting.dropna(how="any").copy()

In [22]:
## feature selector
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

## the regularisation strength is called alpha because lambda is a reserved
## word in Python
rr = Ridge(alpha=1)
## NOTE(review): at this point the rows are ordered by player id, then season;
## confirm that TimeSeriesSplit's positional folds are what is intended
split = TimeSeriesSplit(n_splits=3)
## forward selection: keeps adding features until 40 have been selected
sfs = SequentialFeatureSelector(
    rr,
    n_features_to_select=40,
    direction="forward",
    cv=split,
    n_jobs=4,
)

In [23]:
## the target and the identifier/label columns are not used as features
removed_columns = ["Next_WAR", "Name", "Team", "IDfg", "Season"]
## every remaining column, in its original order, is a candidate feature
selected_columns = batting.columns[
    [column not in removed_columns for column in batting.columns]
]

In [24]:
## rescale every feature column to between 0 and 1 so that features measured
## on very different scales do not cause problems for the model
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
## Assign with batting[cols] = ... rather than batting.loc[:, cols] = ...:
## the .loc form tries to write the float results into the existing int64
## columns in place, an upcast that is deprecated (FutureWarning in
## pandas >= 2.1 and slated to become an error).  Plain column assignment
## replaces the columns, so they simply take the scaler's float dtype.
batting[selected_columns] = scaler.fit_transform(batting[selected_columns])

In [25]:
## preview the frame after scaling; Next_WAR and the id/label columns are unscaled
batting


Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,BB%+,K%+,OBP+,SLG+,ISO+,BABIP+,Events,L-WAR,Next_WAR,team_code
6965,1,2006,Alfredo Amezaga,FLA,0.333333,0.757812,0.355818,0.363057,0.282787,0.294931,...,0.222494,0.268868,0.349206,0.182432,0.122977,0.483871,0.0,0.279503,2.0,0.352941
6236,1,2007,Alfredo Amezaga,FLA,0.370370,0.765625,0.467116,0.474522,0.356557,0.331797,...,0.198044,0.254717,0.333333,0.229730,0.174757,0.473118,0.0,0.335404,1.2,0.352941
2638,2,2000,Garret Anderson,ANA,0.333333,0.968750,0.883642,0.845541,0.684426,0.456221,...,0.073350,0.306604,0.277778,0.466216,0.436893,0.440860,0.0,0.347826,2.7,0.029412
3493,2,2001,Garret Anderson,ANA,0.370370,0.984375,0.925801,0.882166,0.721311,0.539171,...,0.095355,0.325472,0.325397,0.425676,0.365696,0.537634,0.0,0.378882,3.7,0.029412
1447,2,2002,Garret Anderson,ANA,0.407407,0.960938,0.868465,0.840764,0.725410,0.456221,...,0.112469,0.264151,0.373016,0.533784,0.456311,0.569892,0.0,0.440994,5.1,0.029412
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4508,1009268,2000,James Mouton,MIL,0.444444,0.406250,0.060708,0.074841,0.077869,0.087558,...,0.354523,0.556604,0.396825,0.168919,0.161812,0.526882,0.0,0.236025,0.0,0.529412
3802,1009818,2000,Paul O'Neill,NYY,0.666667,0.835938,0.747049,0.761146,0.581967,0.497696,...,0.190709,0.353774,0.341270,0.317568,0.258900,0.516129,0.0,0.304348,0.6,0.647059
4067,1010978,2000,Cal Ripken,BAL,0.740741,0.375000,0.313659,0.300955,0.250000,0.184332,...,0.156479,0.250000,0.285714,0.364865,0.365696,0.311828,0.0,0.291925,-0.5,0.117647
992,1013404,2000,Quilvio Veras,ATL,0.370370,0.382812,0.295110,0.340764,0.303279,0.294931,...,0.325183,0.320755,0.515873,0.297297,0.174757,0.698925,0.0,0.378882,1.1,0.088235
