In [1]:
## 2 - August - 2023

In [2]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

In [3]:
# Table of evaluation scores, one row per league; a 'Random_Forest' column is added to it later
# in this notebook. Path is relative to the notebook's working directory.
error_metrics_DataFrame = pd.read_csv('../Data/error_metrics.csv')

# Per-league tables used for training and evaluation in the backtest below.
# NOTE(review): AL / NL presumably stand for American League / National League — confirm.
AL_DataFrame = pd.read_csv('../Data/AL_data_updated.csv')
NL_DataFrame = pd.read_csv('../Data/NL_data_updated.csv')

In [4]:
# Integer-encode the 'Team' label so it can be used as a numeric feature.
# Codes are derived separately for each league frame, so the same code may denote
# different teams in the two frames; missing team values receive the code -1.
AL_DataFrame['Team_ID'] = pd.Categorical(AL_DataFrame['Team']).codes
NL_DataFrame['Team_ID'] = pd.Categorical(NL_DataFrame['Team']).codes

In [5]:
# Predictor columns handed to the model; backtest selects them from the league frame in this order.
# 'Team_ID' is the integer code derived from 'Team', and 'Season' is also the column that
# backtest uses to split training rows from testing rows.
features = ['Age', 'W', 'L', 'W-L%', 'W-L%_undefined', 'ERA', 'G', 'GS', 'GF', 'CG', 'SHO', 'SV',
            'IP', 'H', 'R', 'ER', 'HR', 'BB', 'IBB', 'SO', 'HBP', 'BK', 'WP', 'BF', 'ERA+', 'FIP',
            'WHIP', 'H9', 'HR9', 'BB9', 'SO9', 'SO/W', 'Season', 'Team_W', 'Team_L', 'Team_W-L%',
            'GB', 'W_ratios', 'ERA_ratios', 'SO_ratios', 'IP_ratios', 'Team_ID']

In [6]:
# Single estimator object shared by every backtest run below; it is refit for each season.
# random_state is pinned so that repeated runs build the same forest.
random_forest_instance = RandomForestRegressor(
    n_estimators=50,
    min_samples_split=5,
    random_state=28,
)

In [7]:
def ranking_difference_calculator(share_DataFrame):
    """Attach actual rank, predicted rank and their difference to a scored frame.

    Parameters
    ----------
    share_DataFrame : pandas.DataFrame
        Must contain a ``Share`` column (actual values) and a ``Predictions``
        column (model output). The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame ordered by ``Predictions`` (highest first) with three extra
        columns: ``Rank`` (1 = highest ``Share``), ``Predicted_Rank``
        (1 = highest ``Predictions``) and ``Rank_Difference``
        (``Rank - Predicted_Rank``).
    """
    # Ranks are plain 1..n positions in the sorted order, so tied values still
    # receive distinct consecutive ranks.
    positions = list(range(1, len(share_DataFrame) + 1))

    ranked_by_share = (
        share_DataFrame
        .sort_values('Share', ascending=False)
        .assign(Rank=positions)
    )
    ranked_by_prediction = (
        ranked_by_share
        .sort_values('Predictions', ascending=False)
        .assign(Predicted_Rank=positions)
    )
    return ranked_by_prediction.assign(
        Rank_Difference=lambda frame: frame['Rank'] - frame['Predicted_Rank']
    )

In [8]:
def average_precision_finder(rankings_DataFrame, top_n=5):
    """Average precision of the predicted ordering against the actual top finishers.

    The ``top_n`` rows with the highest ``Share`` form the relevant set. Rows are
    then walked in descending ``Predictions`` order; every time a row's ``Name``
    belongs to the relevant set, the precision at that position (relevant rows
    seen so far divided by rows examined so far) is recorded. The result is the
    mean of the recorded precisions.

    Matching is done on ``Name``, so any row that shares a name with a relevant
    row counts as a hit.

    Parameters
    ----------
    rankings_DataFrame : pandas.DataFrame
        Must contain ``Name``, ``Share`` and ``Predictions`` columns.
    top_n : int, default 5
        How many of the highest-``Share`` rows count as relevant.

    Returns
    -------
    float
        Mean of the recorded precisions, or ``0.0`` when nothing was recorded
        (for example when the frame has no rows) instead of dividing by zero.

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError("top_n must be a positive integer")

    # Computed once up front: the relevant names do not change while iterating.
    relevant_names = (
        rankings_DataFrame
        .sort_values('Share', ascending=False)
        .head(top_n)['Name']
        .values
    )
    predicted_order = rankings_DataFrame.sort_values('Predictions', ascending=False)['Name']

    precisions = []
    found = 0
    # `checked` is the 1-based position in the predicted ordering.
    for checked, name in enumerate(predicted_order, start=1):
        if name in relevant_names:
            found += 1
            precisions.append(found / checked)

    if not precisions:
        return 0.0
    return sum(precisions) / len(precisions)

In [9]:
def backtest(league_stats, model_instance, seasons, features):
    """Evaluate a model season by season, training only on earlier seasons.

    For each entry in ``seasons`` the model is fit on the rows whose ``Season``
    is strictly smaller, then used to predict ``Share`` for the rows of that
    season. The predictions are ranked against the actual values and scored
    with ``average_precision_finder``.

    Parameters
    ----------
    league_stats : pandas.DataFrame
        Must contain ``Season``, ``Name``, ``Share`` and every column in ``features``.
    model_instance : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refit in place
        on every iteration, so after the call it holds the fit for the last season.
    seasons : iterable
        Season values to evaluate, in the order they should be processed.
    features : list of str
        Column names used as model inputs.

    Returns
    -------
    tuple
        ``(mean_average_precision, per_season_average_precisions, all_predictions)``
        where the last item is the concatenation of the ranked frames of every season.

    Raises
    ------
    ZeroDivisionError
        If ``seasons`` is empty.
    """
    season_of_row = league_stats['Season']
    season_scores = []
    season_frames = []

    for season in seasons:
        history = league_stats[season_of_row < season]
        current = league_stats[season_of_row == season]

        model_instance.fit(history[features], history['Share'])
        predicted_share = pd.DataFrame(
            model_instance.predict(current[features]),
            columns=['Predictions'],
            index=current.index,
        )

        # Side-by-side frame of actual and predicted values, then ranked.
        comparison = pd.concat([current[['Name', 'Share']], predicted_share], axis=1)
        comparison = ranking_difference_calculator(comparison)

        season_frames.append(comparison)
        season_scores.append(average_precision_finder(comparison))

    mean_score = sum(season_scores) / len(season_scores)
    return (mean_score, season_scores, pd.concat(season_frames))

In [10]:
# Seasons evaluated by the backtest: 2008 through 2022 inclusive.
backtest_seasons = list(range(2008, 2023))

# Both leagues are scored with the same estimator object, features and seasons;
# backtest refits the estimator for every season it evaluates.
(AL_mean_average_precision,
 AL_average_precisions,
 AL_all_predictions) = backtest(AL_DataFrame, random_forest_instance, backtest_seasons, features)

(NL_mean_average_precision,
 NL_average_precisions,
 NL_all_predictions) = backtest(NL_DataFrame, random_forest_instance, backtest_seasons, features)

In [11]:
# Add this model's mean average precision as a new column. The Series carries the default
# index 0, 1, and pandas aligns it to the frame by index label, not by position.
# NOTE(review): assumes the row labelled 0 is the American League and the row labelled 1 is the
# National League; any other index on error_metrics_DataFrame would produce NaN — confirm.
error_metrics_DataFrame['Random_Forest'] = pd.Series([AL_mean_average_precision, NL_mean_average_precision])

In [12]:
# Show the metrics table, now including the Random_Forest column, as the cell output.
error_metrics_DataFrame

Unnamed: 0,League,Ridge_regression,Ridge_regression_upd,Random_Forest
0,American League,0.778134,0.778361,0.864881
1,National League,0.730907,0.711087,0.864228
