In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from functools import reduce
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

In [2]:
# Read one season CSV per year from the statcast/ directory (paths are
# relative to the notebook's working directory).
stat_2017, stat_2018, stat_2019, stat_2020 = map(
    pd.read_csv,
    (
        'statcast/statcast2017.csv',
        'statcast/statcast2018.csv',
        'statcast/statcast2019.csv',
        'statcast/statcast2020.csv',
    ),
)


In [3]:
# Keep only the rows whose `b_total_pa` meets the per-season minimum.
# NOTE(review): 502 and 186 look like qualified-hitter cut-offs for a full
# season and for the shortened 2020 season respectively — confirm with the author.
df_2017 = stat_2017[stat_2017['b_total_pa'].ge(502)]
df_2018 = stat_2018[stat_2018['b_total_pa'].ge(502)]
df_2019 = stat_2019[stat_2019['b_total_pa'].ge(502)]
df_2020 = stat_2020[stat_2020['b_total_pa'].ge(186)]

# Season frames in chronological order.
df_list = [df_2017, df_2018, df_2019, df_2020]

In [5]:
# Stack the four filtered seasons into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
df_merged = pd.concat([df_2017, df_2018, df_2019, df_2020], ignore_index=True)
df_merged

Unnamed: 0,last_name,first_name,year,player_age,b_ab,b_total_pa,b_total_hits,b_single,b_double,b_triple,...,woba,xwoba,xobp,xiso,exit_velocity_avg,launch_angle_avg,sweet_spot_percent,barrel_batted_rate,whiff_percent,swing_percent
0,Beltran,Carlos,2017,40,467,509,108,65,29,0,...,0.283,0.293,0.291,0.143,87.3,13.4,29.1,4.6,19.8,47.8
1,Pujols,Albert,2017,37,593,636,143,103,17,0,...,0.286,0.322,0.305,0.190,88.7,13.4,31.5,5.4,20.6,47.1
2,Mauer,Joe,2017,34,525,597,160,116,36,1,...,0.349,0.379,0.391,0.170,90.6,6.2,35.7,4.5,12.9,36.4
3,Cabrera,Miguel,2017,34,469,529,117,79,22,0,...,0.313,0.377,0.361,0.247,91.3,12.5,41.7,10.5,23.1,50.3
4,Phillips,Brandon,2017,36,572,604,163,115,34,1,...,0.316,0.307,0.308,0.115,84.7,8.3,31.3,2.4,20.2,56.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
557,Longoria,Evan,2020,35,193,209,49,31,10,1,...,0.303,0.354,0.331,0.246,91.7,10.7,29.9,11.5,21.3,45.0
558,Cabrera,Miguel,2020,37,204,231,51,37,4,0,...,0.318,0.361,0.368,0.231,93.2,12.1,36.8,9.7,31.6,47.7
559,Martinez,J.D.,2020,33,211,237,45,22,16,0,...,0.285,0.304,0.307,0.216,89.5,14.7,39.6,11.0,28.1,51.9
560,Abreu,Jose,2020,33,240,262,76,42,15,0,...,0.404,0.379,0.358,0.285,92.9,10.9,37.4,14.3,30.1,48.5


In [6]:
# Correlation of every numeric column with home runs. It is bound to a name
# because only a cell's last expression is displayed, so the bare call was
# silently discarded. Non-numeric columns (e.g. the name columns) are dropped
# first: DataFrame.corr() raises on them in pandas >= 2.0.
hr_correlations = df_merged.select_dtypes(include='number').corr()['b_home_run']

# First entry is the target; the remaining six columns are the model inputs.
features = ['b_home_run', 'player_age', 'b_total_hits', 'exit_velocity_avg', 'whiff_percent', 'swing_percent', 'launch_angle_avg']

# Modelling frame: target plus selected inputs only.
ml_df = df_merged[features]

In [7]:
# Display the modelling frame (the target column plus the six selected inputs).
ml_df

Unnamed: 0,b_home_run,player_age,b_total_hits,exit_velocity_avg,whiff_percent,swing_percent,launch_angle_avg
0,14,40,108,87.3,19.8,47.8,13.4
1,23,37,143,88.7,20.6,47.1,13.4
2,7,34,160,90.6,12.9,36.4,6.2
3,16,34,117,91.3,23.1,50.3,12.5
4,13,36,163,84.7,20.2,56.1,8.3
...,...,...,...,...,...,...,...
557,7,35,49,91.7,21.3,45.0,10.7
558,10,37,51,93.2,31.6,47.7,12.1
559,7,33,45,89.5,28.1,51.9,14.7
560,19,33,76,92.9,30.1,48.5,10.9


In [8]:
# Target is the home-run count; every other column of ml_df is an input.
y = ml_df['b_home_run']
X = ml_df.drop(columns=['b_home_run'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [10]:
# Fit a random forest with default hyper-parameters (seeded for
# reproducibility) on the training split; fit() returns the estimator itself.
regressor = RandomForestRegressor(random_state=1).fit(X_train, y_train)

# Predictions for the held-out rows only.
preds = regressor.predict(X_test)

In [11]:
# Side-by-side view of the held-out targets and the model's predictions;
# the index is inherited from y_test, so it holds the original row labels.
df = y_test.to_frame(name='Actual').assign(Predicted=preds)
df

Unnamed: 0,Actual,Predicted
505,11,7.44
408,31,26.03
66,38,19.79
338,8,17.49
233,23,31.12
...,...,...
159,23,24.84
12,26,22.30
343,49,36.79
142,24,19.75


In [12]:
# Error of the forest on the held-out split. The MSE is computed once and
# reused for the RMSE.
mae = metrics.mean_absolute_error(y_test, preds)
mse = metrics.mean_squared_error(y_test, preds)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

Mean Absolute Error: 4.827079646017698
Mean Squared Error: 40.97563185840707
Root Mean Squared Error: 6.401221122442738


In [13]:
# Start the results table from the two name columns. Selecting the columns
# directly replaces a row-by-row iterrows() loop and yields the same frame.
# reset_index(drop=True) returns an independent frame with a 0..n-1 index, so
# columns can be added to forest_df later without touching df_merged.
forest_df = df_merged[['first_name', 'last_name']].reset_index(drop=True)

forest_df


Unnamed: 0,first_name,last_name
0,Carlos,Beltran
1,Albert,Pujols
2,Joe,Mauer
3,Miguel,Cabrera
4,Brandon,Phillips
...,...,...
557,Evan,Longoria
558,Miguel,Cabrera
559,J.D.,Martinez
560,Jose,Abreu


In [14]:
# Target + input columns for every row (same column selection as ml_df).
new_input = df_merged.loc[:, features]

# Inputs of the first row only, with the target column removed.
test = new_input.iloc[:1].drop(columns=['b_home_run'])

In [15]:
# Input columns for every row of ml_df (target removed).
input_df = ml_df.drop(columns=['b_home_run'])

In [16]:
# Predict home runs for every row of input_df.
# NOTE(review): input_df is built from all of ml_df, so it includes the rows
# the regressor was fitted on (X_train is a split of the same frame); these
# predictions are therefore partly in-sample and will look better than the
# held-out metrics suggest.
outputs = regressor.predict(input_df)

In [26]:
# Attach the observed and predicted home-run counts to the name table.
forest_df['Actual_HR'] = df_merged['b_home_run']
# NOTE(review): astype(int) truncates the float predictions toward zero
# instead of rounding them — confirm that is intended.
forest_df['Predicted_HR'] = outputs.astype(int)

# Positive diff = the model under-predicted; negative = over-predicted.
forest_df['diff'] = forest_df['Actual_HR'].sub(forest_df['Predicted_HR'])

forest_df

Unnamed: 0,first_name,last_name,playerValues,Actual_HR,Predicted_HR,diff
0,Carlos,Beltran,CarlosBeltran,14,14,0
1,Albert,Pujols,AlbertPujols,23,17,6
2,Joe,Mauer,JoeMauer,7,9,-2
3,Miguel,Cabrera,MiguelCabrera,16,18,-2
4,Brandon,Phillips,BrandonPhillips,13,12,1
...,...,...,...,...,...,...
557,Evan,Longoria,EvanLongoria,7,10,-3
558,Miguel,Cabrera,MiguelCabrera,10,10,0
559,J.D.,Martinez,J.D.Martinez,7,9,-2
560,Jose,Abreu,JoseAbreu,19,17,2


In [51]:
# Average each player's predicted home runs over all of their rows, giving
# one value per player.
#
# The player key is rebuilt here from the name columns: the `playerValues`
# column is only added to forest_df in a later cell, so reading it at this
# point fails on a fresh Restart & Run All.
player_keys = (forest_df['first_name'] + forest_df['last_name']).rename('players')

# sort=False keeps players in first-appearance order, which is the order the
# former unique()-plus-loop implementation produced.
projected_2021_forest = (
    forest_df.groupby(player_keys, sort=False)['Predicted_HR']
    .mean()
    .reset_index()
)

# Highest average prediction first.
forest_pred_df = projected_2021_forest.sort_values(by='Predicted_HR', ascending=False)

forest_pred_df.head(10).reset_index(drop=True)
    


Unnamed: 0,players,Predicted_HR
0,AaronJudge,49.0
1,JorgeSoler,46.0
2,GiancarloStanton,43.5
3,JoshDonaldson,36.0
4,KhrisDavis,35.666667
5,LoganMorrison,35.0
6,JayBruce,33.0
7,JustinUpton,33.0
8,AustinMeadows,32.0
9,DannySantana,32.0


In [18]:
# Per-row player key: first and last name concatenated with no separator
# (e.g. 'Carlos' + 'Beltran' -> 'CarlosBeltran').
playerValues = forest_df['first_name'] + forest_df['last_name']
forest_df['playerValues'] = playerValues

In [22]:
# Display the results table in its current state.
forest_df

Unnamed: 0,first_name,last_name,playerValues,Actual_HR,Predicted_HR,diff
0,Carlos,Beltran,CarlosBeltran,14,14,0
1,Albert,Pujols,AlbertPujols,23,17,6
2,Joe,Mauer,JoeMauer,7,9,-2
3,Miguel,Cabrera,MiguelCabrera,16,18,-2
4,Brandon,Phillips,BrandonPhillips,13,12,1
...,...,...,...,...,...,...
557,Evan,Longoria,EvanLongoria,7,10,-3
558,Miguel,Cabrera,MiguelCabrera,10,10,0
559,J.D.,Martinez,J.D.Martinez,7,9,-2
560,Jose,Abreu,JoseAbreu,19,17,2


In [21]:
# NOTE(review): the next three assignments repeat an earlier cell verbatim;
# re-running them is harmless but they could live in one place.
forest_df['Actual_HR'] = df_merged['b_home_run']
forest_df['Predicted_HR'] = outputs.astype(int)  # truncates, does not round
forest_df['diff'] = forest_df['Actual_HR'].sub(forest_df['Predicted_HR'])

# Copy the integer predictions onto the full stats frame (aligned on index).
df_merged['Predicted_HR'] = forest_df['Predicted_HR']

df_merged

Unnamed: 0,last_name,first_name,year,player_age,b_ab,b_total_pa,b_total_hits,b_single,b_double,b_triple,...,xwoba,xobp,xiso,exit_velocity_avg,launch_angle_avg,sweet_spot_percent,barrel_batted_rate,whiff_percent,swing_percent,Predicted_HR
0,Beltran,Carlos,2017,40,467,509,108,65,29,0,...,0.293,0.291,0.143,87.3,13.4,29.1,4.6,19.8,47.8,14
1,Pujols,Albert,2017,37,593,636,143,103,17,0,...,0.322,0.305,0.190,88.7,13.4,31.5,5.4,20.6,47.1,17
2,Mauer,Joe,2017,34,525,597,160,116,36,1,...,0.379,0.391,0.170,90.6,6.2,35.7,4.5,12.9,36.4,9
3,Cabrera,Miguel,2017,34,469,529,117,79,22,0,...,0.377,0.361,0.247,91.3,12.5,41.7,10.5,23.1,50.3,18
4,Phillips,Brandon,2017,36,572,604,163,115,34,1,...,0.307,0.308,0.115,84.7,8.3,31.3,2.4,20.2,56.1,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
557,Longoria,Evan,2020,35,193,209,49,31,10,1,...,0.354,0.331,0.246,91.7,10.7,29.9,11.5,21.3,45.0,10
558,Cabrera,Miguel,2020,37,204,231,51,37,4,0,...,0.361,0.368,0.231,93.2,12.1,36.8,9.7,31.6,47.7,10
559,Martinez,J.D.,2020,33,211,237,45,22,16,0,...,0.304,0.307,0.216,89.5,14.7,39.6,11.0,28.1,51.9,9
560,Abreu,Jose,2020,33,240,262,76,42,15,0,...,0.379,0.358,0.285,92.9,10.9,37.4,14.3,30.1,48.5,17


In [None]:
# Predicted home runs divided by b_total_pa, computed row by row.
df_merged['predHR_PA'] = df_merged['Predicted_HR'].div(df_merged['b_total_pa'])

df_merged
    

In [None]:
# Mean predicted-HR-per-PA for each player.
#
# Fixes relative to the earlier draft of this cell:
#   * df_merged is never given a `playerValues` column in this notebook, so
#     the player key is built from the name columns instead;
#   * results go into `test_df` rather than `test`, which is the one-row
#     feature frame created in an earlier cell;
#   * player names are taken from the groupby index, so each name is paired
#     with its own mean (unique() returns first-appearance order, while
#     groupby returns sorted keys).
player_keys = (df_merged['first_name'] + df_merged['last_name']).rename('players')

predicted_HR = df_merged.groupby(player_keys)['predHR_PA'].mean()
players = predicted_HR.index.to_numpy()

test_df = predicted_HR.rename('predicted_HR').reset_index()

test_df.head()


In [None]:
# Show the current value of `players`.
players

In [None]:
# Distinct player keys in first-appearance order. The key is built from the
# name columns because df_merged is never given a `playerValues` column in
# this notebook (only forest_df is).
players = (df_merged['first_name'] + df_merged['last_name']).unique()

In [None]:
# Show the current value of `players`.
players

In [None]:
# Iterating over a DataFrame yields its column labels, so this prints the
# column names of `test`, one per line (not its rows).
for each in test:
    print(each)

In [None]:
# Walk the per-player groups and print each key with its number of rows.
# The original line was an unfinished `for` statement (no colon or body) and
# grouped on a `playerValues` column that df_merged does not have; the key is
# rebuilt from the name columns here.
# NOTE(review): the loop body is a placeholder — replace it with the intended
# per-player work.
player_keys = df_merged['first_name'] + df_merged['last_name']
for each, player_rows in df_merged.groupby(player_keys):
    print(each, len(player_rows))

In [None]:
# Empty placeholder frame; nothing in the visible notebook fills it yet.
forest_pred_2021 = pd.DataFrame()



In [None]:
# List the column labels currently present on df_merged.
df_merged.columns

In [None]:
# Display the results table in its current state.
forest_df

In [None]:
def regression_results(y_test, y_pred):
    """Print a summary of regression metrics for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_test``.

    Returns
    -------
    None
        Each metric is printed, rounded to four decimal places.

    Notes
    -----
    The mean squared log error is only defined for a restricted range of
    values; scikit-learn is expected to raise ``ValueError`` for negative
    inputs — check the documentation of the installed version.
    """
    # Regression metrics
    explained_variance = metrics.explained_variance_score(y_test, y_pred)
    mean_absolute_error = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    mean_squared_log_error = metrics.mean_squared_log_error(y_test, y_pred)
    median_absolute_error = metrics.median_absolute_error(y_test, y_pred)
    r2 = metrics.r2_score(y_test, y_pred)

    print('explained_variance: ', round(explained_variance, 4))
    print('mean_squared_log_error: ', round(mean_squared_log_error, 4))
    print('r2: ', round(r2, 4))
    print('MAE: ', round(mean_absolute_error, 4))
    # Previously computed but never reported.
    print('MedAE: ', round(median_absolute_error, 4))
    print('MSE: ', round(mse, 4))
    print('RMSE: ', round(np.sqrt(mse), 4))