In [1]:
## 17 - August - 2023

In [2]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

In [3]:
# Load the per-player, per-season statistics table used by every later cell.
stats_csv_path = 'Data/stats_DataFrame_updated.csv'
stats_DataFrame = pd.read_csv(stats_csv_path)

In [4]:
# Estimator shared by the feature selector and the later backtest.
regression_instance = Ridge(alpha=1)

# NOTE(review): TimeSeriesSplit folds by row position, so this presumes the rows of the
# feature matrix are in chronological order -- confirm against the CSV's ordering.
split_instance = TimeSeriesSplit(n_splits=3)

# Forward selection: start from no features and greedily add one at a time until 20 are kept.
sequential_feature_selector_instance = SequentialFeatureSelector(
    regression_instance,
    n_features_to_select=20,
    direction='forward',
    cv=split_instance,
    n_jobs=2,
)

In [5]:
# Column roles: what we predict, free-text columns, and identifier/bookkeeping columns.
target_column = 'Next_season_WAR'
text_columns = ['Name', 'Team']
meta_data_columns = ['IDfg', 'Season']

# Everything that is not the target, text or metadata is a candidate feature.
# The boolean mask keeps the columns in their original order and yields a pandas Index.
excluded_columns = [target_column, *text_columns, *meta_data_columns]
is_candidate_feature = ~stats_DataFrame.columns.isin(excluded_columns)
keeper_columns = stats_DataFrame.columns[is_candidate_feature]

In [6]:
# Rescale every candidate feature column with MinMaxScaler (default settings) so that the
# Ridge penalty treats features on a comparable scale.
#
# The result is assigned through plain column selection instead of `.loc[:, cols] = ...`:
# `.loc` writes into the existing columns in place, so integer-typed columns that receive
# the scaler's float output trigger pandas' "incompatible dtype" deprecation. Column
# assignment replaces the columns outright and lets them become float.
#
# NOTE(review): the scaler is fit on all seasons at once, so min/max values from later
# seasons influence the features of earlier ones used for training in the backtest --
# consider fitting the scaler inside each training fold if strict out-of-sample
# evaluation matters.
scaler_instance = MinMaxScaler()
stats_DataFrame[keeper_columns] = scaler_instance.fit_transform(stats_DataFrame[keeper_columns])

In [7]:
# Run the sequential feature search on the candidate features against the target.
selection_features = stats_DataFrame[keeper_columns]
selection_target = stats_DataFrame['Next_season_WAR']
sequential_feature_selector_instance.fit(selection_features, selection_target)

SequentialFeatureSelector(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                          estimator=Ridge(alpha=1), n_features_to_select=20,
                          n_jobs=2)

In [8]:
# Boolean mask over keeper_columns marking the features the selector kept.
selected_feature_mask = sequential_feature_selector_instance.get_support()
optimal_features = keeper_columns[selected_feature_mask].tolist()

In [9]:
def backtest(stats, model_instance, features, start=5, step=1, target='Next_season_WAR'):
    """Walk-forward backtest over seasons.

    For each evaluated season, the model is fit on every row from strictly earlier
    seasons and then used to predict the target for the rows of that season.

    Parameters
    ----------
    stats : pandas.DataFrame
        Must contain a 'Season' column, the ``target`` column and every column
        named in ``features``.
    model_instance : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refit on every
        iteration, so after the call it holds the fit from the last evaluated season.
    features : list of str
        Column names used as model inputs.
    start : int, default 5
        Position, within the sorted unique seasons, of the first season to predict.
        Seasons before that position are only ever used for training.
    step : int, default 1
        Stride between evaluated season positions.
    target : str, default 'Next_season_WAR'
        Name of the column holding the value to predict.

    Returns
    -------
    pandas.DataFrame
        Columns 'actual_next_season_WAR' and 'predicted_next_season_WAR', one row per
        predicted row of ``stats`` (original index labels preserved), concatenated in
        the order the seasons were evaluated.

    Raises
    ------
    ValueError
        If ``start``/``step`` leave no season to evaluate (for example, fewer unique
        seasons than ``start + 1``).
    """
    seasons = sorted(stats['Season'].unique())
    season_positions = range(start, len(seasons), step)
    if len(season_positions) == 0:
        # Without this guard pd.concat([]) fails with an opaque "No objects to concatenate".
        raise ValueError(
            f'No seasons to backtest: found {len(seasons)} unique season(s) '
            f'but start={start} and step={step}.'
        )

    WAR_predictions = []
    for iteration in season_positions:
        current_season = seasons[iteration]
        training_data = stats[stats['Season'] < current_season]
        testing_data = stats[stats['Season'] == current_season]

        model_instance.fit(training_data[features], training_data[target])
        next_season_WAR_predictions = model_instance.predict(testing_data[features])

        # Re-attach the test rows' index so predictions line up with the actual values.
        next_season_WAR_predictions = pd.Series(next_season_WAR_predictions, index=testing_data.index)
        actual_and_predicted_WAR = pd.concat([testing_data[target], next_season_WAR_predictions], axis=1)
        actual_and_predicted_WAR.columns = ['actual_next_season_WAR', 'predicted_next_season_WAR']
        WAR_predictions.append(actual_and_predicted_WAR)
    return pd.concat(WAR_predictions)

In [10]:
# Baseline: walk-forward evaluation of the Ridge model on the selected features.
WAR_predictions_DataFrame = backtest(
    stats=stats_DataFrame,
    model_instance=regression_instance,
    features=optimal_features,
)

In [11]:
# Root-mean-squared error of the backtest predictions (square root of the MSE).
actual_WAR = WAR_predictions_DataFrame['actual_next_season_WAR']
predicted_WAR = WAR_predictions_DataFrame['predicted_next_season_WAR']
model_error = mean_squared_error(actual_WAR, predicted_WAR) ** 0.5

In [12]:
# Report the baseline error so later model variants can be compared against it.
error_report = f'The error for our initial regression model is {model_error}'
print(error_report)

The error for our initial regression model is 1.6408451058026605


In [13]:
def players_WAR_trend(players_group):
    """Add career-trend features to the season rows of a single player.

    Parameters
    ----------
    players_group : pandas.DataFrame
        Rows for one player; must contain 'Season' and 'WAR' columns.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows sorted by 'Season' with three added columns:

        * 'Seasons_in_MLB' -- 1-based running count of the player's rows.
        * 'WAR_and_season_correlation' -- expanding correlation between
          'Seasons_in_MLB' and 'WAR' up to and including each row; undefined
          values (e.g. the first row) are filled with 1.
        * 'WAR_movement' -- ratio of this season's WAR to the previous row's WAR;
          undefined or infinite ratios (first row, or previous WAR of 0) are set
          to the neutral value 1.

    The input frame is not modified (``sort_values`` returns a new frame).
    """
    players_group = players_group.sort_values('Season')
    players_group['Seasons_in_MLB'] = range(1, players_group.shape[0] + 1)

    # expanding().corr() returns a (row, column-name)-indexed frame; keep only the
    # Seasons_in_MLB-vs-WAR entry for each row.
    correlations = (players_group[['Seasons_in_MLB', 'WAR']].expanding()
                                                            .corr()
                                                            .loc[(slice(None), 'Seasons_in_MLB'), 'WAR'])
    players_group['WAR_and_season_correlation'] = list(correlations.fillna(1))

    WAR_movement = players_group['WAR'] / players_group['WAR'].shift(1)
    # A previous-season WAR of 0 yields +/-inf, which fillna() does not catch and which
    # downstream estimators reject; treat it like any other undefined ratio.
    WAR_movement = WAR_movement.replace([float('inf'), float('-inf')], float('nan'))
    players_group['WAR_movement'] = WAR_movement.fillna(1)

    return players_group

In [14]:
# Compute the trend features player by player; group_keys=False keeps the original
# row index instead of prepending the player id as an extra index level.
seasons_by_player = stats_DataFrame.groupby('IDfg', group_keys=False)
stats_DataFrame = seasons_by_player.apply(players_WAR_trend)