# MLB 2020 WAR Predictions

Using a download of all available batter statistics on [Fangraphs](https://www.fangraphs.com) from 2006-2019

In [1]:
import numpy as np
import pandas as pd

# Import Data
data = pd.read_csv('input/FanGraphsData.csv')

# Format some of the columns
for col in data.columns:
    if (data[col].dtype == object) & ('%' in col):
        data[col] = data[col].str.strip('%').astype('float64')
data['Dol'] = data['Dol'].str.strip('$()').astype('float64')

data.shape

  interactivity=interactivity, compiler=compiler, result=result)


(5381, 305)

In [4]:
data.head()

Unnamed: 0,Season,Name,Team,Age,G,AB,PA,H,1B,2B,...,GB%+,FB%+,HR/FB%+,Pull%+,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,playerid
0,2008,Albert Pujols,Cardinals,28,148,524,641,187,106,44,...,92,105,193,117,94,81,92,75,154,1177
1,2006,Travis Hafner,Indians,29,129,454,564,140,66,31,...,88,110,280,102,105,89,67,85,146,1573
2,2007,David Ortiz,Red Sox,31,149,549,667,182,94,52,...,85,120,183,118,83,95,58,87,143,745
3,2006,Albert Pujols,Cardinals,26,143,535,634,177,94,33,...,87,120,204,111,100,82,70,94,125,1177
4,2008,Chipper Jones,Braves,36,128,439,534,160,113,24,...,99,92,163,95,108,97,66,102,116,97


Since we are going to predict a player's WAR based on previous seasons we need to get rid of the rookies.

In [5]:
value_counts = data['playerid'].value_counts()
to_remove = value_counts[value_counts == 1].index
data = data[~data.playerid.isin(to_remove)]

We will then create a column for the WAR from the next season. We need to fill this with the actual next season WAR for seasons prior to 2019 and leave it to be predicted for the 2019 season. This will be our label for the data

In [6]:
data = data.merge(data.assign(Season=lambda x: x.Season - 1),
                 on=['Season', 'playerid'],
                 suffixes = ['','_new'],
                 how='left')
data = data.rename(columns={'WAR_new': 'WAR_label'})
data = data.drop(data.filter(regex='_new').columns, axis=1)
test = data[data.Season == 2019].reset_index(drop=True)
train = data.dropna(subset=['WAR_label'])
print(train.shape)
print(test.shape)

(3881, 306)
(324, 306)


We will then drop the columns with > 30% nulls while imputing the rest of the nulls using the mean.

In [7]:
null_percent = train.isnull().sum()/train.shape[0]*100
cols_to_drop = np.array(null_percent[null_percent > 30].index)
train = train.drop(cols_to_drop, axis=1)
test = test.drop(cols_to_drop,axis=1)
null_percent = train.isnull().sum()/train.shape[0]*100
null_cols = list(null_percent[null_percent > 0].index.values)

for i in null_cols:
    train[i] = train[i].replace(np.nan, train[i].mean())
    test[i] = test[i].replace(np.nan, train[i].mean())

We will save these as our train and test sets.

In [8]:
train.to_csv('input/Batter1Train.csv', index=False, header=True)
test.to_csv('input/Batter1Test.csv', index=False, header=True)

Now we will complete the same process but instead of using the previous year to predict with, we will use the previous 2 years of data.

In [9]:
data2 = data.copy()
data2 = data2.merge(data2.assign(Season=lambda x: x.Season + 1),
                 on=['Season', 'playerid'],
                 suffixes = ['','_lastyr'],
                 how='left')
column_list = list(data2)

for i in range(4,304):
    data2[column_list[i]+'_diff'] = data2[column_list[i]] - data2[column_list[i+305]]
    
data2 = data2.drop(data2.filter(regex='_lastyr').columns, axis=1)
data2 = data2.dropna(thresh=295)

test2 = data2[data2.Season == 2019].reset_index(drop=True)
train2 = data2.dropna(subset=['WAR_label'])
print(train2.shape)
print(test2.shape)

(2914, 606)
(298, 606)


In [10]:
null_percent = train2.isnull().sum()/train2.shape[0]*100
cols_to_drop = np.array(null_percent[null_percent > 30].index)
train2 = train2.drop(cols_to_drop, axis=1)
test2 = test2.drop(cols_to_drop,axis=1)
null_percent = train2.isnull().sum()/train2.shape[0]*100
null_cols = list(null_percent[null_percent > 0].index.values)

for i in null_cols:
    train2[i] = train2[i].replace(np.nan, train2[i].mean())
    test2[i] = test2[i].replace(np.nan, train2[i].mean())
    
train2.to_csv('input/Batter2Train.csv', index=False, header=True)
test2.to_csv('input/Batter2Test.csv', index=False, header=True)

## Pitchers

We will complete the same steps to prepare our pitcher data.

In [11]:
pitcher_data = pd.read_csv('input/FanGraphsPitcherData.csv')
for col in pitcher_data.columns:
    if (pitcher_data[col].dtype == object) & ('%' in col):
        pitcher_data[col] = pitcher_data[col].str.strip('%').astype('float64')
pitcher_data['Dollars'] = pitcher_data['Dollars'].str.strip('$()').astype('float64')
print(pitcher_data.shape)
pitcher_data.head()

  interactivity=interactivity, compiler=compiler, result=result)


(4642, 322)


Unnamed: 0,Season,Name,Team,Age,W,L,ERA,G,GS,CG,...,GB%+,FB%+,HR/FB%+,Pull%+,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,playerid
0,2006,Dennys Reyes,Twins,29,5,0,0.89,66,0,0,...,160,55,108,101,110,86,114,115,62,444
1,2006,Jonathan Papelbon,Red Sox,25,4,2,0.92,59,0,0,...,86,124,37,84,87,142,88,113,76,5975
2,2008,Brad Ziegler,Athletics,28,3,0,1.06,47,0,0,...,148,51,65,104,98,96,86,109,91,7293
3,2006,Cla Meredith,Padres,23,5,1,1.07,45,0,0,...,157,42,130,104,118,70,148,103,72,7613
4,2008,Joe Nathan,Twins,33,1,2,1.33,68,0,0,...,108,90,94,95,125,77,72,95,127,1122


In [12]:
value_counts = pitcher_data['playerid'].value_counts()
to_remove = value_counts[value_counts == 1].index
pitcher_data = pitcher_data[~pitcher_data.playerid.isin(to_remove)]

pitcher_data = pitcher_data.merge(pitcher_data.assign(Season=lambda x: x.Season - 1),
                 on=['Season', 'playerid'],
                 suffixes = ['','_new'],
                 how='left')

pitcher_data = pitcher_data.rename(columns={'WAR_new': 'WAR_label'})
pitcher_data = pitcher_data.drop(pitcher_data.filter(regex='_new').columns, axis=1)

pitcher_test = pitcher_data[pitcher_data.Season == 2019].reset_index(drop=True)
pitcher_train = pitcher_data.dropna(subset=['WAR_label'])
print(pitcher_train.shape)
print(pitcher_test.shape)

(2898, 323)
(272, 323)


In [13]:
null_percent = pitcher_train.isnull().sum()/pitcher_train.shape[0]*100
cols_to_drop = np.array(null_percent[null_percent > 30].index)
pitcher_train = pitcher_train.drop(cols_to_drop, axis=1)
pitcher_test = pitcher_test.drop(cols_to_drop,axis=1)

null_percent = pitcher_train.isnull().sum()/pitcher_train.shape[0]*100
null_cols = list(null_percent[null_percent > 0].index.values)

for i in null_cols:
    pitcher_train[i] = pitcher_train[i].replace(np.nan, pitcher_train[i].mean())
    pitcher_test[i] = pitcher_test[i].replace(np.nan, pitcher_train[i].mean())
    
pitcher_train.to_csv('input/Pitcher1Train.csv', index=False, header=True)
pitcher_test.to_csv('input/Pitcher1Test.csv', index=False, header=True)

In [14]:
pitcher_data2 = pitcher_data.copy()
pitcher_data2 = pitcher_data2.merge(pitcher_data2.assign(Season=lambda x: x.Season + 1),
                 on=['Season', 'playerid'],
                 suffixes = ['','_lastyr'],
                 how='left')

column_list = list(pitcher_data2)

for i in range(4,321):
    pitcher_data2[column_list[i]+'_diff'] = pitcher_data2[column_list[i]] - pitcher_data2[column_list[i+322]]
    
pitcher_data2 = pitcher_data2.drop(pitcher_data2.filter(regex='_lastyr').columns, axis=1)
pitcher_data2 = pitcher_data2.dropna(thresh=295)

pitcher_test2 = pitcher_data2[pitcher_data2.Season == 2019].reset_index(drop=True)
pitcher_train2 = pitcher_data2.dropna(subset=['WAR_label'])
print(pitcher_train2.shape)
print(pitcher_test2.shape)

(1904, 640)
(223, 640)


In [15]:
null_percent = pitcher_train2.isnull().sum()/pitcher_train2.shape[0]*100
cols_to_drop = np.array(null_percent[null_percent > 30].index)
pitcher_train2 = pitcher_train2.drop(cols_to_drop, axis=1)
pitcher_test2 = pitcher_test2.drop(cols_to_drop,axis=1)

null_percent = pitcher_train2.isnull().sum()/pitcher_train2.shape[0]*100
null_cols = list(null_percent[null_percent > 0].index.values)

for i in null_cols:
    pitcher_train2[i] = pitcher_train2[i].replace(np.nan, pitcher_train2[i].mean())
    pitcher_test2[i] = pitcher_test2[i].replace(np.nan, pitcher_train2[i].mean())
    
pitcher_train2.to_csv('input/Pitcher2Train.csv', index=False, header=True)
pitcher_test2.to_csv('input/Pitcher2Test.csv', index=False, header=True)

# Modeling

We will begin our modeling by exploring which is the best model from each category of models (Linear, Tree, Support Vector Machine, Nearest Neighbor, Neural Network, and Boosting).

## Data Prep

In [2]:
import pickle
import operator
import time
import itertools
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import Lasso, LinearRegression, Ridge, ElasticNet, LassoLars, BayesianRidge, SGDRegressor, PassiveAggressiveRegressor
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor, RandomForestRegressor
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
seed=1215

In [17]:
# Import the data files
player = ['Batter', 'Pitcher']
partition = ['Train', 'Test']

for k in player:
    for j in range(1,3):
        for i in partition:
            j=str(j)
            exec(k+j+'_'+i+' = pd.read_csv("input/'+k+j+i+'.csv")')
        exec(k+j+'_Target = '+k+j+'_Train["WAR_label"]')
        exec(k+j+"_Train = "+k+j+"_Train.drop(['Name', 'WAR_label', 'playerid', 'Season', 'Team'], axis=1)")
        exec("predictions_"+k[0].lower()+j+" = "+k+j+"_Test[['Name']].copy()")
        exec(k+j+"_Test = "+k+j+"_Test.drop(['Name', 'WAR_label', 'playerid', 'Season', 'Team'], axis=1)")

In [18]:
# Scale the features
scaler = StandardScaler()
for i in player:
    for j in range(1,3):
        j=str(j)
        exec(i+j+'_Train_scaled = scaler.fit_transform('+i+j+'_Train)')
        exec(i+j+'_Test_scaled = scaler.transform('+i+j+'_Test)')

In [19]:
# From https://github.com/codiply/blog-ipython-notebooks/blob/master/scikit-learn-estimator-selection-helper.ipynb
# An easy way to compare grid searched models

class EstimatorSelectionHelper:
    
    def __init__(self, models, params):
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}
    
    def fit(self, X, y, **grid_kwargs):
        for key in self.keys:
            print('Running GridSearchCV for %s.' % key)
            model = self.models[key]
            params = self.params[key]
            grid_search = GridSearchCV(model, params, **grid_kwargs)
            grid_search.fit(X, y)
            self.grid_searches[key] = grid_search
        print('Done.')
    
    def score_summary(self, sort_by='mean_test_score'):
        frames = []
        for name, grid_search in self.grid_searches.items():
            frame = pd.DataFrame(grid_search.cv_results_)
            frame = frame.filter(regex='^(?!.*param_).*$')
            frame['estimator'] = len(frame)*[name]
            frames.append(frame)
        df = pd.concat(frames)
        
        df = df.sort_values([sort_by], ascending=False)
        df = df.reset_index()
        df = df.drop(['rank_test_score', 'index'], 1)
        
        columns = df.columns.tolist()
        columns.remove('estimator')
        columns = ['estimator']+columns
        df = df[columns]
        return df

In [20]:
# Grid of parameters for each of the models

params = { 
    'Linear Regression': {},
    'Lasso': {'alpha': np.logspace(-3,2,23)},
    'LassoLars': {'alpha': np.logspace(-3,2,23)},
    'Ridge': {'alpha': np.logspace(-3,2,11), 'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg']},
    'ElasticNet': {'alpha': np.logspace(-3,2,11), 'l1_ratio': np.linspace(0.05,0.9,18)},
    'BayesianRidge': {'alpha_1': np.logspace(-8,-4,5), 'alpha_2': np.logspace(-8,-4,5), 'lambda_1': np.logspace(-8,-4,5), 
                      'lambda_2': np.logspace(-8,-4,5)},
    'SGDRegressor': {'loss': ['squared_loss','huber','epsilon_insensitive','squared_epsilon_insensitive'], 
                     'penalty': ['l2','l1','elasticnet'], 'alpha': np.logspace(-3,-1,3),
                     'l1_ratio': np.linspace(0.15,0.85,3), 'learning_rate': ['invscaling','adaptive']},
    'PassiveAggressiveRegressor': {'C': np.logspace(-6,2,17)},
    'SVR': [{'kernel': ['rbf','sigmoid'], 'C': np.logspace(-3,4,8)}, 
            {'kernel': ['poly'], 'degree': range(2,6), 'C': np.logspace(-3,4,8)}],
    'KNeighborsRegressor': {'n_neighbors': range(3,20,1), 'weights': ['uniform','distance'], 'p': [1,2]},
    'GradientBoostingRegressor': {'n_estimators': [800,1000,1200,1400], 'max_depth': [4,8,16,32],
                                  'max_features': ['auto','log2']},
    'AdaBoostRegressor': {'learning_rate': np.logspace(-3,0,5), 'n_estimators': [400,800,1000,1200,1400]},
    'ExtraTreesRegressor': {'n_estimators': [400,800,1000,1200,1400], 'max_depth': [4,8,16,32], 
                            'max_features': ['auto','log2']},
    'RandomForestRegressor': {'n_estimators': [800,1000,1200,1400], 'max_depth': [4,8,16,32], 'max_features': ['auto','log2']},
    'DecisionTreeRegressor': {'min_samples_split': np.linspace(0.001,0.02,6), 
                                    'max_depth': [*range(3,21),50,150], 'max_features': ['auto','log2']},
    'ExtraTreeRegressor': {'min_samples_split': np.linspace(0.001,0.02,6), 
                                    'max_depth': [*range(3,21),50,150], 'max_features': ['auto','log2']},
    'MLPRegressor': {'activation': ['logistic', 'relu', 'tanh'], 'alpha': np.logspace(-5,2,4),
    'hidden_layer_sizes': [(10,),(20,),(40,),(20,10),(40,10),(40,20),(40,20,10)]}
}

## Linear Models

In [21]:
LinearModels = { 
    'Linear Regression': LinearRegression(),
    'Lasso': Lasso(random_state=seed),
    'LassoLars': LassoLars(),
    'Ridge': Ridge(random_state=seed),
    'ElasticNet': ElasticNet(random_state=seed),
    'BayesianRidge': BayesianRidge(),
    'SGDRegressor': SGDRegressor(),
    'PassiveAggressiveRegressor': PassiveAggressiveRegressor(random_state=seed)
}

In [22]:
# Will select the best model for each dataset

LinearModelsGS = EstimatorSelectionHelper(LinearModels, params)

Lin_Mods={}
for i in player:
    for j in range(1,3):
        j=str(j)
        print("*** Evaluating %s%s ***" % (i, j))
        exec("LinearModelsGS.fit("+i+j+"_Train_scaled, "+i+j+"_Target, scoring='neg_mean_squared_error', n_jobs=-1, cv=6, verbose=0)")
        best_model = LinearModels[LinearModelsGS.score_summary().estimator[0]]
        best_params = LinearModelsGS.score_summary().params[0]
        Lin_Mods[i+j] = best_model.set_params(**best_params)

*** Evaluating Batter1 ***
Running GridSearchCV for Linear Regression.
Running GridSearchCV for Lasso.
Running GridSearchCV for LassoLars.
Running GridSearchCV for Ridge.
Running GridSearchCV for ElasticNet.
Running GridSearchCV for BayesianRidge.
Running GridSearchCV for SGDRegressor.
Running GridSearchCV for PassiveAggressiveRegressor.
Done.
*** Evaluating Batter2 ***
Running GridSearchCV for Linear Regression.
Running GridSearchCV for Lasso.
Running GridSearchCV for LassoLars.
Running GridSearchCV for Ridge.
Running GridSearchCV for ElasticNet.
Running GridSearchCV for BayesianRidge.
Running GridSearchCV for SGDRegressor.
Running GridSearchCV for PassiveAggressiveRegressor.
Done.
*** Evaluating Pitcher1 ***
Running GridSearchCV for Linear Regression.
Running GridSearchCV for Lasso.
Running GridSearchCV for LassoLars.
Running GridSearchCV for Ridge.
Running GridSearchCV for ElasticNet.
Running GridSearchCV for BayesianRidge.
Running GridSearchCV for SGDRegressor.
Running GridSearchCV

In [23]:
# Save and print the best linear models/parameters for each dataset

with open('lin.p', 'wb') as fp:
    pickle.dump(Lin_Mods, fp, protocol=pickle.HIGHEST_PROTOCOL)
    
Lin_Mods

{'Batter1': ElasticNet(alpha=0.01, copy_X=True, fit_intercept=True, l1_ratio=0.7,
            max_iter=1000, normalize=False, positive=False, precompute=False,
            random_state=1215, selection='cyclic', tol=0.0001, warm_start=False),
 'Batter2': Lasso(alpha=0.038986037025490715, copy_X=True, fit_intercept=True,
       max_iter=1000, normalize=False, positive=False, precompute=False,
       random_state=1215, selection='cyclic', tol=0.0001, warm_start=False),
 'Pitcher1': ElasticNet(alpha=0.01, copy_X=True, fit_intercept=True, l1_ratio=0.7,
            max_iter=1000, normalize=False, positive=False, precompute=False,
            random_state=1215, selection='cyclic', tol=0.0001, warm_start=False),
 'Pitcher2': LassoLars(alpha=0.001, copy_X=True, eps=2.220446049250313e-16,
           fit_intercept=True, fit_path=True, max_iter=500, normalize=True,
           positive=False, precompute='auto', verbose=False)}

## Tree Models

In [24]:
TreeModels = { 
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=seed),
    'ExtraTreeRegressor': ExtraTreeRegressor(random_state=seed)
}

TreeModelsGS = EstimatorSelectionHelper(TreeModels, params)

Tree_Mods={}
for i in player:
    for j in range(1,3):
        j=str(j)
        print("*** Evaluating %s%s ***" % (i, j))
        exec("TreeModelsGS.fit("+i+j+"_Train_scaled, "+i+j+"_Target, scoring='neg_mean_squared_error', n_jobs=-1, cv=6, verbose=0)")
        best_model = TreeModels[TreeModelsGS.score_summary().estimator[0]]
        best_params = TreeModelsGS.score_summary().params[0]
        Tree_Mods[i+j] = best_model.set_params(**best_params)

*** Evaluating Batter1 ***
Running GridSearchCV for DecisionTreeRegressor.
Running GridSearchCV for ExtraTreeRegressor.
Done.
*** Evaluating Batter2 ***
Running GridSearchCV for DecisionTreeRegressor.
Running GridSearchCV for ExtraTreeRegressor.
Done.
*** Evaluating Pitcher1 ***
Running GridSearchCV for DecisionTreeRegressor.
Running GridSearchCV for ExtraTreeRegressor.
Done.
*** Evaluating Pitcher2 ***
Running GridSearchCV for DecisionTreeRegressor.
Running GridSearchCV for ExtraTreeRegressor.
Done.


In [25]:
# Save and print the best tree models/parameters for each dataset

with open('tree.p', 'wb') as fp:
    pickle.dump(Tree_Mods, fp, protocol=pickle.HIGHEST_PROTOCOL)
    
Tree_Mods

{'Batter1': DecisionTreeRegressor(criterion='mse', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=0.001, min_weight_fraction_leaf=0.0,
                       presort=False, random_state=1215, splitter='best'),
 'Batter2': DecisionTreeRegressor(criterion='mse', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=0.001, min_weight_fraction_leaf=0.0,
                       presort=False, random_state=1215, splitter='best'),
 'Pitcher1': ExtraTreeRegressor(criterion='mse', max_depth=4, max_features='auto',
                    max_leaf_nodes=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_sample

## Support Vector Machine Models

In [26]:
SVRModels = { 
    'SVR': SVR()
}

SVRModelsGS = EstimatorSelectionHelper(SVRModels, params)

SVR_Mods={}
for i in player:
    for j in range(1,3):
        j=str(j)
        print("*** Evaluating %s%s ***" % (i, j))
        exec("SVRModelsGS.fit("+i+j+"_Train_scaled, "+i+j+"_Target, scoring='neg_mean_squared_error', n_jobs=-1, cv=6, verbose=0)")
        best_model = SVRModels[SVRModelsGS.score_summary().estimator[0]]
        best_params = SVRModelsGS.score_summary().params[0]
        SVR_Mods[i+j] = best_model.set_params(**best_params)

*** Evaluating Batter1 ***
Running GridSearchCV for SVR.
Done.
*** Evaluating Batter2 ***
Running GridSearchCV for SVR.
Done.
*** Evaluating Pitcher1 ***
Running GridSearchCV for SVR.
Done.
*** Evaluating Pitcher2 ***
Running GridSearchCV for SVR.
Done.


In [27]:
# Save and print the best tree models/parameters for each dataset

with open('svr.p', 'wb') as fp:
    pickle.dump(SVR_Mods, fp, protocol=pickle.HIGHEST_PROTOCOL)
    
SVR_Mods

{'Batter1': SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
     gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
     tol=0.001, verbose=False),
 'Batter2': SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
     gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
     tol=0.001, verbose=False),
 'Pitcher1': SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
     gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
     tol=0.001, verbose=False),
 'Pitcher2': SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
     gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
     tol=0.001, verbose=False)}

## Nearest Neighbor Models

In [28]:
KNNModels = { 
    'KNeighborsRegressor': KNeighborsRegressor()
}

KNNModelsGS = EstimatorSelectionHelper(KNNModels, params)

KNN_Mods={}
for i in player:
    for j in range(1,3):
        j=str(j)
        print("*** Evaluating %s%s ***" % (i, j))
        exec("KNNModelsGS.fit("+i+j+"_Train_scaled, "+i+j+"_Target, scoring='neg_mean_squared_error', n_jobs=-1, cv=6, verbose=0)")
        best_model = KNNModels[KNNModelsGS.score_summary().estimator[0]]
        best_params = KNNModelsGS.score_summary().params[0]
        KNN_Mods[i+j] = best_model.set_params(**best_params)

*** Evaluating Batter1 ***
Running GridSearchCV for KNeighborsRegressor.
Done.
*** Evaluating Batter2 ***
Running GridSearchCV for KNeighborsRegressor.
Done.
*** Evaluating Pitcher1 ***
Running GridSearchCV for KNeighborsRegressor.
Done.
*** Evaluating Pitcher2 ***
Running GridSearchCV for KNeighborsRegressor.
Done.


In [29]:
# Save and print the best kNN models/parameters for each dataset

with open('knn.p', 'wb') as fp:
    pickle.dump(KNN_Mods, fp, protocol=pickle.HIGHEST_PROTOCOL)
    
KNN_Mods

{'Batter1': KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=16, p=1,
                     weights='distance'),
 'Batter2': KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=16, p=1,
                     weights='distance'),
 'Pitcher1': KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=16, p=1,
                     weights='distance'),
 'Pitcher2': KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=16, p=1,
                     weights='distance')}

## Neural Network Models

In [30]:
NNModels = { 
    'MLPRegressor': MLPRegressor(random_state=seed)
}

NNModelsGS = EstimatorSelectionHelper(NNModels, params)

NN_Mods={}
for i in player:
    for j in range(1,3):
        j=str(j)
        print("*** Evaluating %s%s ***" % (i, j))
        exec("NNModelsGS.fit("+i+j+"_Train_scaled, "+i+j+"_Target, scoring='neg_mean_squared_error', n_jobs=-1, cv=6, verbose=0)")
        best_model = NNModels[NNModelsGS.score_summary().estimator[0]]
        best_params = NNModelsGS.score_summary().params[0]
        NN_Mods[i+j] = best_model.set_params(**best_params)

*** Evaluating Batter1 ***
Running GridSearchCV for MLPRegressor.
Done.
*** Evaluating Batter2 ***
Running GridSearchCV for MLPRegressor.
Done.
*** Evaluating Pitcher1 ***
Running GridSearchCV for MLPRegressor.




Done.
*** Evaluating Pitcher2 ***
Running GridSearchCV for MLPRegressor.
Done.


In [31]:
# Save and print the best Neural Network models/parameters for each dataset

with open('nn.p', 'wb') as fp:
    pickle.dump(NN_Mods, fp, protocol=pickle.HIGHEST_PROTOCOL)
    
NN_Mods

{'Batter1': MLPRegressor(activation='relu', alpha=100.0, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(20,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1215, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False),
 'Batter2': MLPRegressor(activation='relu', alpha=100.0, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(20,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1215, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_

## Boosting Models

In [32]:
BoostModels = { 
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=seed),
    'AdaBoostRegressor': AdaBoostRegressor(random_state=seed),
    'ExtraTreesRegressor': ExtraTreesRegressor(random_state=seed),
    'RandomForestRegressor': RandomForestRegressor(random_state=seed)
}

BoostModelsGS = EstimatorSelectionHelper(BoostModels, params)

Boost_Mods={}
for i in player:
    for j in range(1,3):
        j=str(j)
        print("*** Evaluating %s%s ***" % (i, j))
        exec("BoostModelsGS.fit("+i+j+"_Train_scaled, "+i+j+"_Target, scoring='neg_mean_squared_error', n_jobs=-1, cv=6, verbose=1)")
        best_model = BoostModels[BoostModelsGS.score_summary().estimator[0]]
        best_params = BoostModelsGS.score_summary().params[0]
        Boost_Mods[i+j] = best_model.set_params(**best_params)

*** Evaluating Batter1 ***
Running GridSearchCV for GradientBoostingRegressor.
Fitting 6 folds for each of 32 candidates, totalling 192 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed: 43.7min finished


Running GridSearchCV for AdaBoostRegressor.
Fitting 6 folds for each of 25 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed: 15.1min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 50.4min finished


Running GridSearchCV for ExtraTreesRegressor.
Fitting 6 folds for each of 40 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 188 tasks      | elapsed: 24.1min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 36.4min finished


Running GridSearchCV for RandomForestRegressor.
Fitting 6 folds for each of 32 candidates, totalling 192 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed: 88.7min finished


Done.
*** Evaluating Batter2 ***
Running GridSearchCV for GradientBoostingRegressor.
Fitting 6 folds for each of 32 candidates, totalling 192 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed: 11.5min
[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed: 58.4min finished


Running GridSearchCV for AdaBoostRegressor.
Fitting 6 folds for each of 25 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed: 24.0min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 85.9min finished


Running GridSearchCV for ExtraTreesRegressor.
Fitting 6 folds for each of 40 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 188 tasks      | elapsed: 41.6min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 60.0min finished


Running GridSearchCV for RandomForestRegressor.
Fitting 6 folds for each of 32 candidates, totalling 192 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed: 13.4min
[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed: 135.2min finished


Done.
*** Evaluating Pitcher1 ***
Running GridSearchCV for GradientBoostingRegressor.
Fitting 6 folds for each of 32 candidates, totalling 192 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed: 17.6min finished


Running GridSearchCV for AdaBoostRegressor.
Fitting 6 folds for each of 25 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 29.6min finished


Running GridSearchCV for ExtraTreesRegressor.
Fitting 6 folds for each of 40 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 188 tasks      | elapsed: 13.5min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 20.3min finished


Running GridSearchCV for RandomForestRegressor.
Fitting 6 folds for each of 32 candidates, totalling 192 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed: 50.3min finished


Done.
*** Evaluating Pitcher2 ***
Running GridSearchCV for GradientBoostingRegressor.
Fitting 6 folds for each of 32 candidates, totalling 192 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed: 20.0min finished


Running GridSearchCV for AdaBoostRegressor.
Fitting 6 folds for each of 25 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 46.0min finished


Running GridSearchCV for ExtraTreesRegressor.
Fitting 6 folds for each of 40 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 188 tasks      | elapsed: 22.4min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 32.4min finished


Running GridSearchCV for RandomForestRegressor.
Fitting 6 folds for each of 32 candidates, totalling 192 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed: 74.6min finished


Done.


In [33]:
# Save and print the best Boosting models/parameters for each dataset

with open('boost.p', 'wb') as fp:
    pickle.dump(Boost_Mods, fp, protocol=pickle.HIGHEST_PROTOCOL)
    
Boost_Mods

{'Batter1': ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=8,
                     max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=None,
                     oob_score=False, random_state=1215, verbose=0,
                     warm_start=False),
 'Batter2': ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=8,
                     max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=None,
                     oob_score=False, random_state=1215, verbose=0,
                     warm_start=False),
 'Pitcher1': ExtraTreesRegressor(bootstrap=False, criterion='mse',

## Stacking and Blending Models

In [24]:
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

seed = 1215

train = pd.read_csv('input/Batter1Train.csv')
test = pd.read_csv('input/Batter1Test.csv')

#Save the 'playerid' and 'Name' column
train_ID = train['playerid']
test_ID = test['playerid']
train_Name = train['Name']
test_Name = test['Name']

#Now drop the columns unnecessary for  the prediction process.
train.drop(['Name', 'playerid', 'Season', 'Team'], axis = 1, inplace = True)
test.drop(['Name', 'WAR_label', 'playerid', 'Season', 'Team'], axis = 1, inplace = True)

y_train = train.WAR_label.values
train.drop(['WAR_label'], axis=1, inplace=True)

In [10]:
X_train, X_val, y_train, y_val = train_test_split(train, y_train, test_size=0.2, random_state=seed)

In [11]:
#Validation function
n_folds = 6

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=seed).get_n_splits(X_train.values)
    rmse= np.sqrt(-cross_val_score(model, X_train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [12]:
lasso = make_pipeline(StandardScaler(), Lasso(alpha=0.03, random_state=seed))

ENet = make_pipeline(StandardScaler(), ElasticNet(alpha=0.1, l1_ratio=0.3, random_state=seed))

Ridge = make_pipeline(StandardScaler(), Ridge(alpha=1000.0, solver='svd'))

GBoost = make_pipeline(StandardScaler(), GradientBoostingRegressor(n_estimators=1550, learning_rate=0.01, max_depth=3, 
                                                                   max_features='log2', min_samples_split=0.01, 
                                                                   random_state=seed))

model_xgb = make_pipeline(StandardScaler(), xgb.XGBRegressor(colsample_bytree=0.7207, gamma=0.9, learning_rate=0.004912, 
                                                             max_depth=3, n_estimators=1100, reg_alpha=0.745, reg_lambda=0.9815, 
                                                             subsample=0.5213, silent=1, random_state=seed, nthread=-1))

model_lgb = make_pipeline(StandardScaler(), lgb.LGBMRegressor(num_leaves=2, colsample_bytree=0.775, learning_rate=0.1, 
                                                              n_estimators=272, max_depth=3, random_state=seed))

In [13]:
score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

score = rmsle_cv(Ridge)
print("Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

score = rmsle_cv(model_xgb)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

score = rmsle_cv(model_lgb)
print("LGBM score: {:.4f} ({:.4f})\n" .format(score.mean(), score.std()))


Lasso score: 1.6518 (0.0614)

ElasticNet score: 1.6523 (0.0618)

Ridge score: 1.6606 (0.0602)

Gradient Boosting score: 1.6767 (0.0631)

Xgboost score: 1.6710 (0.0593)

LGBM score: 1.6742 (0.0624)



In [14]:
# From Kaggle
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)   

In [15]:
averaged_models = AveragingModels(models = (ENet, GBoost, Ridge, lasso))

score = rmsle_cv(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

 Averaged base models score: 1.6518 (0.0614)



In [16]:
# From Kaggle
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=6):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
   
    # We again fit the data on clones of the original models
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=seed)
        
        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
                
        # Now train the cloned  meta-model using the out-of-fold predictions as new feature
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
   
    #Do the predictions of all base models on the test data and use the averaged predictions as 
    #meta-features for the final prediction which is done by the meta-model
    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

In [17]:
stacked_averaged_models = StackingAveragedModels(base_models = (ENet, Ridge, GBoost, lasso),
                                                 meta_model = model_xgb)

score = rmsle_cv(stacked_averaged_models)
print("Stacking Averaged models score: {:.4f} ({:.4f})".format(score.mean(), score.std()))

Stacking Averaged models score: 1.6559 (0.0537)


In [18]:
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

In [19]:
stacked_averaged_models.fit(X_train.values, y_train)
stacked_val_pred = stacked_averaged_models.predict(X_val.values)
stacked_pred = stacked_averaged_models.predict(test.values)
print(rmsle(y_val, stacked_val_pred))

1.728524016128155


In [20]:
model_xgb.fit(X_train, y_train)
xgb_val_pred = model_xgb.predict(X_val)
xgb_pred = model_xgb.predict(test)
print(rmsle(y_val, xgb_val_pred))

1.7533468622447985


In [21]:
model_lgb.fit(X_train, y_train)
lgb_val_pred = model_lgb.predict(X_val)
lgb_pred = model_lgb.predict(test.values)
print(rmsle(y_val, lgb_val_pred))

1.7618312190085428


In [22]:
'''RMSE on the validation data when averaging'''

print('RMSLE score on validation data:')
print(rmsle(y_val,stacked_val_pred*0.90 + xgb_val_pred*0.06 + lgb_val_pred*0.04 ))

RMSLE score on validation data:
1.7292254943035485


In [25]:
b1 = pd.DataFrame()
b1['playerid'] = test_ID
b1['Name'] = test_Name
b1['WAR_pred'] = stacked_pred
b1.sort_values(by='WAR_pred', ascending=False).reset_index(drop=True).head(15)

Unnamed: 0,playerid,Name,WAR_pred
0,10155,Mike Trout,6.144684
1,18401,Ronald Acuna Jr.,5.932176
2,12861,Anthony Rendon,5.879208
3,12533,Marcus Semien,5.707011
4,13611,Mookie Betts,5.667087
5,15998,Cody Bellinger,5.40762
6,11477,Christian Yelich,5.368747
7,17678,Alex Bregman,5.231559
8,13613,Ketel Marte,4.93457
9,20123,Juan Soto,4.582609


Finally repeat this for the other 3 datasets...