In [1]:
## 26 - July - 2023

In [2]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import pandas as pd

In [3]:
# Load the AL and NL CSV files from the sibling Data directory.
AL_DataFrame = pd.read_csv(filepath_or_buffer='../Data/AL_data.csv')
NL_DataFrame = pd.read_csv(filepath_or_buffer='../Data/NL_data.csv')

In [4]:
# Names of the columns handed to the regression as predictors (37 entries).
features = [
    'Age', 'W', 'L', 'W-L%', 'W-L%_undefined', 'ERA', 'G', 'GS', 'GF', 'CG',
    'SHO', 'SV', 'IP', 'H', 'R', 'ER', 'HR', 'BB', 'IBB', 'SO',
    'HBP', 'BK', 'WP', 'BF', 'ERA+', 'FIP', 'WHIP', 'H9', 'HR9', 'BB9',
    'SO9', 'SO/W', 'Season', 'Team_W', 'Team_L', 'Team_W-L%', 'GB',
]

In [5]:
# Rows from seasons before 2022 are used for fitting; the 2022 rows are held out.
AL_training_data = AL_DataFrame.loc[AL_DataFrame['Season'].lt(2022)]
AL_testing_data = AL_DataFrame.loc[AL_DataFrame['Season'].eq(2022)]

NL_training_data = NL_DataFrame.loc[NL_DataFrame['Season'].lt(2022)]
NL_testing_data = NL_DataFrame.loc[NL_DataFrame['Season'].eq(2022)]

In [6]:
# One independent Ridge estimator (alpha=0.1) per league.
AL_regression_instance, NL_regression_instance = Ridge(alpha=0.1), Ridge(alpha=0.1)

In [7]:
# Fit each league's estimator on its own training rows, with 'Share' as the target.
AL_regression_instance.fit(X=AL_training_data[features], y=AL_training_data['Share'])
NL_regression_instance.fit(X=NL_training_data[features], y=NL_training_data['Share'])

Ridge(alpha=0.1)

In [8]:
# Predict 'Share' for the held-out rows of each league.
AL_cy_young_predictions = AL_regression_instance.predict(X=AL_testing_data[features])
NL_cy_young_predictions = NL_regression_instance.predict(X=NL_testing_data[features])

In [9]:
# Wrap the predictions in a one-column frame that reuses the test rows' index,
# so they can be lined up with the test data afterwards.
AL_cy_young_predictions = pd.DataFrame(data=AL_cy_young_predictions,
                                       index=AL_testing_data.index,
                                       columns=['Predictions'])
NL_cy_young_predictions = pd.DataFrame(data=NL_cy_young_predictions,
                                       index=NL_testing_data.index,
                                       columns=['Predictions'])

In [10]:
# Put 'Name', the actual 'Share' and the predicted value side by side (aligned on the index).
AL_cy_young_actual_and_predictions = pd.concat(
    [AL_testing_data[['Name', 'Share']], AL_cy_young_predictions], axis='columns')
NL_cy_young_actual_and_predictions = pd.concat(
    [NL_testing_data[['Name', 'Share']], NL_cy_young_predictions], axis='columns')

In [11]:
# Report the mean squared error between actual and predicted 'Share', one league per line.
print('\n'.join([
    'Mean Squared Error for the AL: '
    + str(mean_squared_error(AL_cy_young_actual_and_predictions['Share'],
                             AL_cy_young_actual_and_predictions['Predictions'])),
    'Mean Squared Error for the NL: '
    + str(mean_squared_error(NL_cy_young_actual_and_predictions['Share'],
                             NL_cy_young_actual_and_predictions['Predictions'])),
]))

Mean Squared Error for the AL: 0.002674765484298228
Mean Squared Error for the NL: 0.001931364665880398


In [12]:
# Order each league by actual 'Share' (highest first) and number the rows 1..n as 'Rank'.
AL_cy_young_actual_and_predictions = AL_cy_young_actual_and_predictions.sort_values(by='Share', ascending=False)
AL_cy_young_actual_and_predictions['Rank'] = list(range(1, len(AL_cy_young_actual_and_predictions) + 1))

NL_cy_young_actual_and_predictions = NL_cy_young_actual_and_predictions.sort_values(by='Share', ascending=False)
NL_cy_young_actual_and_predictions['Rank'] = list(range(1, len(NL_cy_young_actual_and_predictions) + 1))

In [13]:
# Re-order each league by predicted value (highest first) and number the rows 1..n as 'Predicted_Rank'.
AL_cy_young_actual_and_predictions = AL_cy_young_actual_and_predictions.sort_values(by='Predictions', ascending=False)
AL_cy_young_actual_and_predictions['Predicted_Rank'] = list(range(1, len(AL_cy_young_actual_and_predictions) + 1))

NL_cy_young_actual_and_predictions = NL_cy_young_actual_and_predictions.sort_values(by='Predictions', ascending=False)
NL_cy_young_actual_and_predictions['Predicted_Rank'] = list(range(1, len(NL_cy_young_actual_and_predictions) + 1))

In [14]:
def average_precision_finder(rankings_DataFrame):
    """Average precision of the predicted ordering against the actual top 5.

    The five rows with the highest 'Share' form the relevant set. Rows are
    then visited in descending 'Predictions' order; every time a row whose
    'Name' belongs to the relevant set is reached, the precision at that
    position (hits so far / rows visited so far) is recorded. The result is
    the mean of the recorded precisions.

    Parameters
    ----------
    rankings_DataFrame : pandas.DataFrame
        Must contain the columns 'Name', 'Share' and 'Predictions'.
        The frame is not modified.

    Returns
    -------
    float
        Mean of the recorded precisions, or 0.0 when nothing was recorded
        (for example when the frame has no rows) instead of raising
        ZeroDivisionError.
    """
    actual_top_5 = rankings_DataFrame.sort_values('Share', ascending=False).head(5)
    predictions = rankings_DataFrame.sort_values('Predictions', ascending=False)

    # Resolve membership once for the whole column rather than rebuilding the
    # top-5 name array for every row.
    is_relevant = predictions['Name'].isin(actual_top_5['Name']).tolist()

    precisions = []
    found = 0
    for checked, hit in enumerate(is_relevant, start=1):
        if hit:
            found += 1
            precisions.append(found / checked)

    if not precisions:
        # No relevant row encountered: define the score as 0.0.
        return 0.0
    return sum(precisions) / len(precisions)

In [15]:
# Report the average precision for each league's held-out frame, one league per line.
print('\n'.join([
    'Average precision for the AL: ' + str(average_precision_finder(AL_cy_young_actual_and_predictions)),
    'Average precision for the NL: ' + str(average_precision_finder(NL_cy_young_actual_and_predictions)),
]))

Average precision for the AL: 0.7433333333333334
Average precision for the NL: 0.6238095238095238


In [16]:
# Seasons 2003 through 2022 inclusive, in ascending order.
seasons = [*range(2003, 2023)]

In [17]:
# Walk-forward evaluation for the AL: for every season from seasons[5] onwards,
# refit on all earlier seasons, predict that season, and record both the
# per-season frame and its average precision.
all_predictions_AL = []
average_precisions_AL = []
for season in seasons[5:]:
    training_data = AL_DataFrame.loc[AL_DataFrame['Season'].lt(season)]
    testing_data = AL_DataFrame.loc[AL_DataFrame['Season'].eq(season)]
    AL_regression_instance.fit(X=training_data[features], y=training_data['Share'])
    predictions = pd.DataFrame(data=AL_regression_instance.predict(X=testing_data[features]),
                               index=testing_data.index,
                               columns=['Predictions'])
    actual_and_predictions = pd.concat([testing_data[['Name', 'Share']], predictions], axis='columns')
    all_predictions_AL.append(actual_and_predictions)
    average_precisions_AL.append(average_precision_finder(actual_and_predictions))

In [18]:
# Walk-forward evaluation for the NL: for every season from seasons[5] onwards,
# refit on all earlier seasons, predict that season, and record both the
# per-season frame and its average precision.
all_predictions_NL = []
average_precisions_NL = []
for season in seasons[5:]:
    training_data = NL_DataFrame.loc[NL_DataFrame['Season'].lt(season)]
    testing_data = NL_DataFrame.loc[NL_DataFrame['Season'].eq(season)]
    NL_regression_instance.fit(X=training_data[features], y=training_data['Share'])
    predictions = pd.DataFrame(data=NL_regression_instance.predict(X=testing_data[features]),
                               index=testing_data.index,
                               columns=['Predictions'])
    actual_and_predictions = pd.concat([testing_data[['Name', 'Share']], predictions], axis='columns')
    all_predictions_NL.append(actual_and_predictions)
    average_precisions_NL.append(average_precision_finder(actual_and_predictions))

In [19]:
# Arithmetic mean of the per-season average precisions for each league.
AL_average_ap = sum(average_precisions_AL) / len(average_precisions_AL)
NL_average_ap = sum(average_precisions_NL) / len(average_precisions_NL)

In [20]:
# Compare the multi-season mean against the score of the 2022 hold-out frame, one league per line.
print('\n'.join([
    'Average average precision for the AL vs 2022 average precision: ' + str(AL_average_ap)
    + ' vs ' + str(average_precision_finder(AL_cy_young_actual_and_predictions)),
    'Average average precision for the NL vs 2022 average precision: ' + str(NL_average_ap)
    + ' vs ' + str(average_precision_finder(NL_cy_young_actual_and_predictions)),
]))

Average average precision for the AL vs 2022 average precision: 0.7781341602247853 vs 0.7433333333333334
Average average precision for the NL vs 2022 average precision: 0.7309073099392792 vs 0.6238095238095238


In [21]:
def ranking_difference_calculator(share_DataFrame):
    """Add actual rank, predicted rank and their difference to a frame.

    Rows are numbered 1..n by descending 'Share' ('Rank') and again by
    descending 'Predictions' ('Predicted_Rank'); 'Rank_Difference' is
    'Rank' minus 'Predicted_Rank'.

    Parameters
    ----------
    share_DataFrame : pandas.DataFrame
        Must contain the columns 'Share' and 'Predictions'. The input frame
        itself is left untouched because sorting produces new frames.

    Returns
    -------
    pandas.DataFrame
        A new frame ordered by descending 'Predictions' with the three
        additional columns.
    """
    positions = list(range(1, share_DataFrame.shape[0] + 1))

    by_share = share_DataFrame.sort_values('Share', ascending=False)
    by_share['Rank'] = positions

    by_prediction = by_share.sort_values('Predictions', ascending=False)
    by_prediction['Predicted_Rank'] = positions

    by_prediction['Rank_Difference'] = by_prediction['Rank'] - by_prediction['Predicted_Rank']
    return by_prediction

In [22]:
def backtest(league_stats, model_instance, seasons, features):
    """Walk-forward evaluation of a model over the given seasons.

    For each season the model is refit (in place) on every row whose
    'Season' is strictly earlier, used to predict that season's rows, and
    the resulting frame is ranked and scored.

    Parameters
    ----------
    league_stats : pandas.DataFrame
        Must contain 'Season', 'Name', 'Share' and every column in `features`.
    model_instance : estimator
        Object exposing fit(X, y), predict(X) and, after fitting, `coef_`.
    seasons : iterable
        Season values to evaluate, each compared against the 'Season' column.
    features : list
        Column names used as predictors.

    Returns
    -------
    tuple
        (mean of the per-season average precisions,
         list of per-season average precisions,
         all per-season ranked frames concatenated,
         `model_instance.coef_` as left by the last fit).
    """
    season_column = league_stats['Season']
    per_season_scores = []
    per_season_frames = []

    for season in seasons:
        history = league_stats[season_column < season]
        holdout = league_stats[season_column == season]

        model_instance.fit(history[features], history['Share'])
        predicted = pd.DataFrame(model_instance.predict(holdout[features]),
                                 columns=['Predictions'], index=holdout.index)

        # Line the predictions up with the actual values, then add rank columns.
        combined = pd.concat([holdout[['Name', 'Share']], predicted], axis=1)
        combined = ranking_difference_calculator(combined)

        per_season_frames.append(combined)
        per_season_scores.append(average_precision_finder(combined))

    mean_score = sum(per_season_scores) / len(per_season_scores)
    return (mean_score, per_season_scores, pd.concat(per_season_frames), model_instance.coef_)

In [23]:
# Single Ridge estimator (alpha=0.1) that is passed to backtest for both leagues;
# backtest refits it on every call, so it always holds the most recent fit.
regression_instance = Ridge(alpha=0.1)

In [24]:
# Backtest each league over seasons[5:] with the original feature list.
(AL_mean_average_precision, AL_average_precisions,
 AL_all_predictions, AL_coefficients) = backtest(league_stats=AL_DataFrame,
                                                 model_instance=regression_instance,
                                                 seasons=seasons[5:],
                                                 features=features)
(NL_mean_average_precision, NL_average_precisions,
 NL_all_predictions, NL_coefficients) = backtest(league_stats=NL_DataFrame,
                                                 model_instance=regression_instance,
                                                 seasons=seasons[5:],
                                                 features=features)

In [25]:
# Pair each coefficient (column 0) with the feature name at the same position (column 1).
AL_regression_coefficients = pd.concat(
    [pd.Series(AL_coefficients), pd.Series(features)], axis='columns')
NL_regression_coefficients = pd.concat(
    [pd.Series(NL_coefficients), pd.Series(features)], axis='columns')

In [26]:
# Largest coefficient first; column 0 holds the coefficient values.
AL_regression_coefficients = AL_regression_coefficients.sort_values(by=0, ascending=False)
NL_regression_coefficients = NL_regression_coefficients.sort_values(by=0, ascending=False)

In [27]:
# Express W, ERA, SO and IP relative to the mean of the same statistic within
# the row's season. `transform` is used instead of `groupby(...).apply(...)`
# because it always returns a frame carrying the original row index (apply adds
# the group key as an extra index level on recent pandas, which breaks aligned
# assignment back onto the source frame) and it never operates on the grouping
# column itself.
AL_ratios = (AL_DataFrame.groupby('Season')[['W', 'ERA', 'SO', 'IP']]
                         .transform(lambda column: column / column.mean()))
NL_ratios = (NL_DataFrame.groupby('Season')[['W', 'ERA', 'SO', 'IP']]
                         .transform(lambda column: column / column.mean()))

In [28]:
# New column names: the source statistic followed by '_ratios'.
ratio_columns = [f'{column}_ratios' for column in ('W', 'ERA', 'SO', 'IP')]

# Store the per-season ratios on the league frames under the new names.
AL_DataFrame[ratio_columns] = AL_ratios[['W', 'ERA', 'SO', 'IP']]
NL_DataFrame[ratio_columns] = NL_ratios[['W', 'ERA', 'SO', 'IP']]

In [29]:
# Add the ratio columns to the predictor list (in place). Only names that are
# not already present are appended, so re-running this cell does not
# duplicate columns in `features`.
features.extend([column for column in ratio_columns if column not in features])

In [30]:
# Repeat the backtest for each league with the extended feature list.
(AL_mean_average_precision_upd, AL_average_precisions_upd,
 AL_all_predictions_upd, AL_coefficients_upd) = backtest(league_stats=AL_DataFrame,
                                                         model_instance=regression_instance,
                                                         seasons=seasons[5:],
                                                         features=features)
(NL_mean_average_precision_upd, NL_average_precisions_upd,
 NL_all_predictions_upd, NL_coefficients_upd) = backtest(league_stats=NL_DataFrame,
                                                         model_instance=regression_instance,
                                                         seasons=seasons[5:],
                                                         features=features)

In [31]:
# Show the backtest mean before and after adding the ratio features, one league per line.
print('\n'.join([
    'Average average precision for the AL --> Original: ' + str(AL_mean_average_precision)
    + ' --> Updated ' + str(AL_mean_average_precision_upd),
    'Average average precision for the NL --> Original: ' + str(NL_mean_average_precision)
    + ' --> Updated ' + str(NL_mean_average_precision_upd),
]))

Average average precision for the AL --> Original: 0.7781341602247853 --> Updated 0.7783606572402303
Average average precision for the NL --> Original: 0.7309073099392792 --> Updated 0.7110870364950906


In [32]:
# One row per league: label, backtest mean with the original features,
# backtest mean with the extended features.
average_precision_data = [
    ['American League', AL_mean_average_precision, AL_mean_average_precision_upd],
    ['National League', NL_mean_average_precision, NL_mean_average_precision_upd],
]
average_precision_DataFrame = pd.DataFrame(
    data=average_precision_data,
    columns=['League', 'Ridge_regression', 'Ridge_regression_upd'],
)

In [33]:
# Persist the league frames (now including the ratio columns) without the index.
AL_DataFrame.to_csv(path_or_buf='../Data/AL_data_updated.csv', index=False)
NL_DataFrame.to_csv(path_or_buf='../Data/NL_data_updated.csv', index=False)

# Persist the per-league summary of backtest means without the index.
average_precision_DataFrame.to_csv(path_or_buf='../Data/error_metrics.csv', index=False)