# Dependencies

In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import statsmodels.api as sm

from functools import reduce
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Read in Data

In [2]:
advanced_df = pd.read_csv('bball_data/FanGraphs/FanGraphsAdvanced2019_Batting.csv')
battedBall_df = pd.read_csv('bball_data/FanGraphs/FanGraphsBattedBall2019_Batting.csv')
pitchValue_df = pd.read_csv('bball_data/FanGraphs/FanGraphsPitchVal2019_Batting.csv')
plateDis_df = pd.read_csv('bball_data/FanGraphs/FanGraphsPlateDis2019_Batting.csv')
standard_df = pd.read_csv('bball_data/FanGraphs/FanGraphsStandard2019_Batting.csv')
statcast_df = pd.read_csv('bball_data/FanGraphs/FanGraphsStatcast2019_Batting.csv')
statsPlus_df = pd.read_csv('bball_data/FanGraphs/FanGraphsStatsPlus2019_Batting.csv')
value_df = pd.read_csv('bball_data/FanGraphs/FanGraphsValue2019_Batting.csv')
winProb_df = pd.read_csv('bball_data/FanGraphs/FanGraphsWinProb2019_Batting.csv')

df_list = [advanced_df, battedBall_df, pitchValue_df, plateDis_df, standard_df, statcast_df,
           statsPlus_df, value_df, winProb_df]

In [3]:
df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['playerid'],
                                            how='outer'), df_list)

df_merged = df_merged.rename(columns={"Name_x": "Name", "Team_x": "Team"}, errors="raise")
df_merged = df_merged.loc[:,~df_merged.columns.duplicated()]

df_merged

Unnamed: 0,Name,Team,PA_x,BB%,K%,BB/K,AVG_x,OBP,SLG,OPS,...,WPA,-WPA,+WPA,RE24,REW,pLI,phLI,PH,WPA/LI,Clutch
0,Will Smith,Giants,2,0.00%,0.00%,0.0,1.0,1.0,1.0,2.0,...,0.09,-0.01,0.10,1.41,0.15,1.09,,0,0.08,0.00
1,Oliver Drake,Rays,1,0.00%,0.00%,0.0,1.0,1.0,1.0,2.0,...,0.00,0.00,0.00,0.94,0.10,0.03,,0,0.06,0.00
2,Ruben Alaniz,- - -,1,0.00%,0.00%,0.0,1.0,1.0,1.0,2.0,...,0.09,0.00,0.09,0.91,0.09,0.99,,0,0.09,0.00
3,Seth Lugo,Mets,1,0.00%,0.00%,0.0,1.0,1.0,1.0,2.0,...,0.05,0.00,0.05,0.86,0.07,0.78,,0,0.07,0.00
4,John Brebbia,Cardinals,1,0.00%,0.00%,0.0,1.0,1.0,1.0,2.0,...,0.00,0.00,0.00,0.39,0.04,0.01,,0,0.05,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1405,Huascar Ynoa,Braves,0,0.00%,0.00%,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,0,,
1406,Elvis Luciano,Blue Jays,1,0.00%,100.00%,0.0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.00,-0.30,-0.02,0.13,,0,-0.03,0.00
1407,Yusei Kikuchi,Mariners,2,0.00%,50.00%,0.0,0.0,0.0,0.0,0.0,...,0.03,-0.01,0.04,0.56,0.06,0.99,,0,0.00,0.03
1408,Emmanuel Clase,Rangers,0,0.00%,0.00%,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,0,,


In [4]:
df_reduced = df_merged.loc[df_merged.Events >= 100]
df_reduced = df_reduced.reset_index(drop=True)

cols = ['Name', 'Team', 'BB%', 'K%', 'BB/K', 'OBP', 'SLG', 'OPS', 'ISO', 'playerid', 'GB/FB', 'LD%', 'GB%',
        'FB%', 'IFFB%', 'HR/FB', 'Pull%','Cent%', 'Oppo%','Soft%','Med%','Hard%','O-Swing%','Z-Swing%','Swing%',
        'O-Contact%','Z-Contact%','Contact%','Zone%','F-Strike%','SwStr%', 'G','AB','PA_y','H','1B','2B','3B','HR',
        'R','RBI', 'BB','IBB','SO','HBP','SF','SH','GDP','SB','CS','Events','EV','maxEV','LA','Barrels','Barrel%',
        'HardHit','HardHit%',]

In [5]:
df_master = df_reduced[cols]

s = (df_master.dtypes == 'object')
object_cols = list(s[s].index)
object_cols.remove('Name')
object_cols.remove('Team')

print(object_cols)

['BB%', 'K%', 'LD%', 'GB%', 'FB%', 'IFFB%', 'HR/FB', 'Pull%', 'Cent%', 'Oppo%', 'Soft%', 'Med%', 'Hard%', 'O-Swing%', 'Z-Swing%', 'Swing%', 'O-Contact%', 'Z-Contact%', 'Contact%', 'Zone%', 'F-Strike%', 'SwStr%', 'Barrel%', 'HardHit%']


### Convert Percentages to Float

In [6]:
df_objects = df_master[object_cols]

for x in object_cols:
    df_master[x] = df_objects[x].str.rstrip('%').astype('float') / 100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_master[x] = df_objects[x].str.rstrip('%').astype('float') / 100


In [7]:
df_master.head()

Unnamed: 0,Name,Team,BB%,K%,BB/K,OBP,SLG,OPS,ISO,playerid,...,SB,CS,Events,EV,maxEV,LA,Barrels,Barrel%,HardHit,HardHit%
0,Christian Yelich,Brewers,0.138,0.203,0.68,0.429,0.671,1.1,0.342,11477,...,30,2,374,93.3,117.9,11.3,59.0,0.158,182.0,0.487
1,Mike Trout,Angels,0.183,0.2,0.92,0.438,0.645,1.083,0.353,10155,...,11,2,354,90.9,116.6,22.2,62.0,0.175,155.0,0.438
2,Yordan Alvarez,Astros,0.141,0.255,0.55,0.412,0.655,1.067,0.342,19556,...,0,0,221,92.2,117.9,13.4,36.0,0.163,107.0,0.484
3,Alex Bregman,Astros,0.172,0.12,1.43,0.423,0.592,1.015,0.296,17678,...,5,1,479,89.3,107.4,19.6,22.0,0.046,178.0,0.372
4,Nelson Cruz,Twins,0.107,0.251,0.43,0.392,0.639,1.031,0.328,2434,...,0,1,327,93.7,117.0,13.1,61.0,0.187,168.0,0.514


In [8]:
df_master.describe()

Unnamed: 0,BB%,K%,BB/K,OBP,SLG,OPS,ISO,playerid,GB/FB,LD%,...,SB,CS,Events,EV,maxEV,LA,Barrels,Barrel%,HardHit,HardHit%
count,406.0,406.0,406.0,406.0,406.0,406.0,406.0,406.0,406.0,406.0,...,406.0,406.0,406.0,406.0,406.0,406.0,406.0,406.0,406.0,406.0
mean,0.084998,0.224554,0.405246,0.324483,0.43816,0.762611,0.184025,11965.147783,1.256281,0.213303,...,5.179803,1.896552,281.82266,88.710591,110.289655,12.932512,19.85468,0.068884,105.162562,0.367653
std,0.030772,0.061204,0.183605,0.037132,0.080601,0.109708,0.062101,4658.072785,0.443659,0.032927,...,7.11803,2.134754,120.582017,2.15858,3.356739,4.338615,14.42635,0.039485,52.634994,0.076129
min,0.019,0.039,0.07,0.209,0.221,0.433,0.048,785.0,0.49,0.119,...,0.0,0.0,100.0,80.7,99.9,-0.1,0.0,0.0,17.0,0.073
25%,0.062,0.183,0.28,0.304,0.38325,0.68825,0.141,9059.75,0.94,0.19,...,1.0,0.0,173.25,87.3,107.8,9.8,8.0,0.039,62.0,0.321
50%,0.082,0.2235,0.38,0.325,0.434,0.754,0.1805,12857.0,1.18,0.2135,...,3.0,1.0,270.0,88.8,110.4,13.0,17.0,0.064,99.0,0.3715
75%,0.104,0.265,0.5,0.348,0.49925,0.83475,0.2265,15459.75,1.4975,0.236,...,7.0,3.0,379.75,90.1,112.775,15.9,29.0,0.09475,148.0,0.42
max,0.183,0.456,1.43,0.438,0.671,1.1,0.357,20123.0,3.26,0.339,...,46.0,10.0,559.0,96.0,118.9,24.5,68.0,0.256,257.0,0.582


In [9]:
df_master.corr()['HR']

BB%           0.324951
K%           -0.023103
BB/K          0.261463
OBP           0.506814
SLG           0.777426
OPS           0.742752
ISO           0.800279
playerid     -0.002363
GB/FB        -0.383262
LD%           0.006348
GB%          -0.376687
FB%           0.378199
IFFB%        -0.055249
HR/FB         0.711453
Pull%         0.314883
Cent%        -0.166198
Oppo%        -0.301631
Soft%        -0.285688
Med%         -0.504848
Hard%         0.588636
O-Swing%     -0.062933
Z-Swing%      0.149653
Swing%       -0.032979
O-Contact%   -0.096300
Z-Contact%   -0.131752
Contact%     -0.134768
Zone%        -0.385830
F-Strike%    -0.245599
SwStr%        0.099168
G             0.649408
AB            0.726745
PA_y          0.751897
H             0.717421
1B            0.507214
2B            0.643904
3B            0.172860
HR            1.000000
R             0.829066
RBI           0.903960
BB            0.719856
IBB           0.483622
SO            0.710276
HBP           0.354780
SF         

In [10]:
# High Correlated Features
features = ['HR', 'FB%', 'Pull%', 'BB%', 'Hard%', 'EV', 'LA', 'Barrel%', 'HardHit%']
ml_df = df_master[features]

X = ml_df.drop(['HR'], axis=1)
y = ml_df.HR

## Train & Test Model

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [12]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

In [13]:
coeff_df = pd.DataFrame(regressor.coef_, X.columns, columns=['Coefficient'])
coeff_df

Unnamed: 0,Coefficient
FB%,10.070029
Pull%,22.101157
BB%,7.925791
Hard%,26.040217
EV,0.84056
LA,0.007217
Barrel%,110.097204
HardHit%,-0.924033


In [14]:
y_pred = regressor.predict(X_test)

In [15]:
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df

Unnamed: 0,Actual,Predicted
175,13,32.226448
239,15,16.071522
60,22,14.489644
231,15,14.314348
134,11,13.440140
...,...,...
20,38,24.303725
220,18,24.644314
14,34,24.671763
274,10,11.976473


In [16]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 6.410148425145429
Mean Squared Error: 71.91240256903455
Root Mean Squared Error: 8.480118075182359


In [17]:
mask = df_master.loc[df_master.Name == 'Christian Yelich']
new_input = mask[features].drop(['HR'], axis=1)

In [18]:
new_input

Unnamed: 0,FB%,Pull%,BB%,Hard%,EV,LA,Barrel%,HardHit%
0,0.359,0.393,0.138,0.508,93.3,11.3,0.158,0.487


In [19]:
new_output = regressor.predict(new_input)
new_output

array([32.81662666])

In [20]:
names = []

for x in df_master.Name:
    name = x
    names.append(name)
    
name_df = pd.DataFrame()
name_df['Name'] = names

name_df.head()

Unnamed: 0,Name
0,Christian Yelich
1,Mike Trout
2,Yordan Alvarez
3,Alex Bregman
4,Nelson Cruz


In [21]:
outputs = []

for x in df_master.Name:
    test = df_master.loc[df_master.Name == x]
    new_input = test[features].drop(['HR'], axis=1)
    output = regressor.predict(new_input)
    outputs.append(output)
    
    
# outputs

In [22]:
values = []

for x in outputs:
    value = round(x[0], 0)
    values.append(value)
    
# values

In [23]:
name_df['Actual_HR'] = df_master['HR']
name_df['Pred_HR'] = values

name_df['Pred_HR'] = name_df['Pred_HR'].astype(int)

name_df

Unnamed: 0,Name,Actual_HR,Pred_HR
0,Christian Yelich,44,33
1,Mike Trout,45,33
2,Yordan Alvarez,27,33
3,Alex Bregman,41,18
4,Nelson Cruz,41,37
...,...,...,...
401,Tomas Nido,4,7
402,Josh Harrison,1,9
403,Lewis Brinson,0,11
404,Chris Owings,3,14


In [24]:
name_df.loc[name_df['Pred_HR'] == name_df['Pred_HR'].max()]

Unnamed: 0,Name,Actual_HR,Pred_HR
10,Joey Gallo,22,50


In [25]:
HR_df = name_df.sort_values(by=['Pred_HR'], ascending=False).reset_index(drop=True)

In [26]:
HR_df.head(10)

Unnamed: 0,Name,Actual_HR,Pred_HR
0,Joey Gallo,22,50
1,Miguel Sano,34,43
2,Aaron Judge,27,40
3,Nelson Cruz,41,37
4,Gary Sanchez,34,35
5,Matt Olson,36,34
6,Jorge Soler,48,34
7,Josh Donaldson,37,33
8,Christian Yelich,44,33
9,Mike Trout,45,33


In [28]:
def regression_results(y_true, y_pred):

    # Regression metrics
    explained_variance=metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) 
    mse=metrics.mean_squared_error(y_true, y_pred) 
    mean_squared_log_error=metrics.mean_squared_log_error(y_true, y_pred)
    median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance,4))    
    print('mean_squared_log_error: ', round(mean_squared_log_error,4))
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [33]:
regression_results(y_test, abs(y_pred))

explained_variance:  0.4063
mean_squared_log_error:  0.2246
r2:  0.4027
MAE:  6.3502
MSE:  71.5527
RMSE:  8.4589


In [34]:
HR_df.loc[HR_df['Name'] == 'Alex Bregman']

Unnamed: 0,Name,Actual_HR,Pred_HR
156,Alex Bregman,41,18


In [58]:
X2 = df_master[features].drop(['HR'], axis=1)
X2 = X2.drop(['HardHit%'], axis=1)
X2 = X2.drop(['Hard%'], axis=1)
X2 = X2.drop(['LA'], axis=1)

In [69]:
# X2 = X2
# y2 = df_master.HR
# model = sm.OLS(y2,X2)
# results = model.fit()
# print(results.summary())

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

In [68]:
X2 = df_master[test_features].drop(['HR'], axis=1)
X2 = df_master.drop(['HardHit', 'HardHit%'], axis=1)

In [64]:
test_features = ['BB%', 'K%', 'BB/K', 'OBP', 'SLG', 'OPS', 'ISO',
       'playerid', 'GB/FB', 'LD%', 'GB%', 'FB%', 'IFFB%', 'HR/FB', 'Pull%',
       'Cent%', 'Oppo%', 'Soft%', 'Med%', 'Hard%', 'O-Swing%', 'Z-Swing%',
       'Swing%', 'O-Contact%', 'Z-Contact%', 'Contact%', 'Zone%', 'F-Strike%',
       'SwStr%', 'G', 'AB', 'PA_y', 'H', '1B', '2B', '3B', 'HR', 'R', 'RBI',
       'BB', 'IBB', 'SO', 'HBP', 'SF', 'SH', 'GDP', 'SB', 'CS', 'Events', 'EV',
       'maxEV', 'LA', 'Barrels', 'Barrel%', 'HardHit', 'HardHit%']