In [12]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [13]:

file = pd.read_csv('../data/Inplaydata19.csv', error_bad_lines=False)
dataFrame = pd.DataFrame(file)



In [14]:
# Single, Double, Triple, HomeRun, (field_out, force_out, fielders_choice) = out, 
# grounded_into_double_play, sac_fly, sac_bunt,

#Transform values into binary classification
d = {
    'single': 1, 
    'double': 1, 
    'triple': 1, 
    'home_run': 1, 
    'field_out': 0,
    'force_out': 0,
    'fielders_choice': 1,
    'grounded_into_double_play': 0,
    'sac_fly': 0,
    'sac_bunt': 0,
    'double_play': 0,
    'fielders_choice_out': 0,
    'catcher_interf': 1,
    'triple_play': 0,
    'field_error': 1,
    'sac_fly_double_play': 0,
    'sac_bunt_double_play': 0
}

dataFrame = dataFrame.replace(d)
#dataFrame=dataFrame.sample(frac=1).reset_index(drop=True)

In [15]:
X = pd.DataFrame()
dataFrame=dataFrame[dataFrame['hc_y']!=198]
dataFrame= dataFrame.dropna(subset=['launch_speed','launch_angle','hc_x','hc_y'])
dataFrame=dataFrame.sample(frac=1).reset_index()
batterlist=dataFrame['batter']
indexlist=dataFrame['index']
X['launch_speed'] = dataFrame['launch_speed']
X['launch_angle'] = dataFrame['launch_angle']*np.pi/180
X['launch_speed_angle'] = dataFrame['launch_speed_angle']
X['estimated_ba_using_speedangle'] = dataFrame['estimated_ba_using_speedangle']
X['hc_x']=125-dataFrame['hc_x']
X['hc_y']=198-dataFrame['hc_y']
X['events'] = dataFrame['events']
X.insert(0, 'Ones', 1)


y = X['events']

X = X.iloc[:, :-1]


In [16]:


X['hor_rad']=np.arctan(X['hc_x']/X['hc_y'])
X['hor_deg']=X['hor_rad']*(180/np.pi)

In [17]:
meanspd=np.mean(X['launch_speed'])
stdspd=np.std(X['launch_speed'])
meanang=np.mean(X['launch_angle'])
stdang=np.std(X['launch_angle'])
stdhor=np.std(X['hor_rad'])


In [18]:
normX=pd.DataFrame()
normX['launch_speed']=(X['launch_speed']-meanspd)/stdspd
normX['launch_angle']=(X['launch_angle']-meanang)/stdang
normX['hor_angle']=(X['hor_rad'])/stdhor

## SVM Stuff

In [39]:
#Trim data
normX=normX.iloc[:,:3]
sliceX = normX.iloc[0:10000]
slicey = y.iloc[0:10000]
sliceX2 = normX.iloc[10000:20000]
slicey2 = y.iloc[10000:20000]

In [None]:
hyperparams = { 
    'C': np.linspace(1, 100, 100),
    'gamma': np.linspace(0.00, 5, 100),
    'class_weight': ['balanced', None],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

rs = RandomizedSearchCV(svm.SVC(), hyperparams, cv=10, n_iter=25, verbose=2, scoring='accuracy',n_jobs=-1)
rs.fit(sliceX, slicey)
print(rs.best_params_)

Fitting 10 folds for each of 25 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 20.7min


In [None]:

rbf_svc = svm.SVC(kernel='rbf', C=68, gamma=0.55, class_weight='balanced')
scoresRBF = cross_val_score(rbf_svc, sliceX2, slicey2, cv=10, scoring='accuracy')
print(np.mean(scoresRBF))

## Random Forest

In [40]:
rf_hyperparams = { 
    'n_estimators': np.arange(100, 1000, 10),
    'max_features': ['auto', 'sqrt'],
    'max_depth': np.arange(10,100,10),
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'bootstrap':[True,False]
    
}
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(rf, rf_hyperparams, n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(sliceX,slicey)
print(rf_random.best_params_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  5.0min finished


{'n_estimators': 160, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 40, 'bootstrap': True}


In [41]:

bestpar = rf_random.best_params_
rf=RandomForestClassifier(n_estimators=bestpar['n_estimators'],min_samples_split=bestpar['min_samples_split'],
                         min_samples_leaf=bestpar['min_samples_leaf'],max_depth=bestpar['max_depth'],max_features=bestpar['max_features'],
                        bootstrap=bestpar['bootstrap'],n_jobs=-1, random_state=42)


In [42]:
scores_rf = cross_val_score(rf, sliceX2, slicey2, cv=10, scoring='accuracy')
classifier=rf.fit(sliceX,slicey)
print(np.mean(scores_rf))

0.817393491193491


In [43]:
results=pd.DataFrame()
predictions=classifier.predict_proba(normX)
results['launch_speed']=normX['launch_speed']*stdspd+meanspd
results['launch_angle']=normX['launch_angle']*stdang+meanang
results['hor_angle']=normX['hor_angle']*stdhor
results['predictions']=predictions[:,1]

## Aggregation by Player

In [51]:
normX=normX.iloc[:,:3]
normX['predictions']=classifier.predict_proba(normX)[:,1]


In [52]:
normX['batter']=batterlist
#normX['regspeed']=normX['launch_speed']*stdspd+meanspd
#normX['reglang']=normX['launch_angle']*stdang+meanang
#normX['reghang']=normX['hor_angle']*stdhor

In [53]:
pred=pd.DataFrame()
pred['batter']=normX['batter']
pred['predictions']=normX['predictions']
pred['actba']=y

In [54]:
grouped=pred.groupby('batter')

In [55]:
predictor=pd.DataFrame()
#predictor['batter']=grouped.describe()['batter']
predictor['count']=grouped.describe()['predictions']['count']
predictor['exba']=grouped.describe()['predictions']['mean']
predictor['BABIP']=grouped.describe()['actba']['mean']
predictor=predictor.reset_index()

In [56]:

batters=pd.read_csv('../data/batterdat1.csv')
strikeouts=pd.read_csv('../data/Strikeouts.csv')
batters['Name']=batters['player_name']
full=strikeouts.merge(batters,on='Name',how='outer')
full=full.dropna()
#joiner=pd.DataFrame()
#joiner['batter']=full['player_id']
#joiner['Name']=full['Name']
#joiner['Team']=full['Team']
#joiner['AB']=full['AB']
#joiner['Batting_Avg']=full['ba']
full['batter']=full['player_id']
predictor=predictor.merge(full,on='batter',how='inner')
#predictor['Ex ABs']=predictor['count']

In [58]:
summary=pd.DataFrame()
summary['Name']=predictor['Name']
summary['Team']=predictor['Team']
summary['Balls in Play']=predictor['count']
summary['Average on Balls in Play']=predictor['BABIP']
summary['Ex Average on Balls in Play']=predictor['exba']
summary['Difference']=predictor['BABIP']-predictor['exba']
Error=abs(predictor['BABIP']-predictor['exba'])

In [59]:
np.mean(Error)

0.02056565845609552

In [60]:
summary.sort_values(by=['Difference']).head(10)


Unnamed: 0,Name,Team,Balls in Play,Average on Balls in Play,Ex Average on Balls in Play,Difference
89,Marcell Ozuna,Cardinals,370.0,0.324324,0.396693,-0.072369
272,Danny Jansen,Blue Jays,269.0,0.275093,0.328619,-0.053526
72,Derek Dietrich,Reds,177.0,0.282486,0.335516,-0.05303
49,Justin Smoak,Blue Jays,309.0,0.291262,0.342309,-0.051046
4,Robinson Cano,Mets,323.0,0.315789,0.36613,-0.050341
266,Rowdy Tellez,Blue Jays,257.0,0.330739,0.37914,-0.048401
291,Ryan O'Hearn,Royals,231.0,0.277056,0.323127,-0.046071
12,Kurt Suzuki,Nationals,247.0,0.307692,0.352859,-0.045167
78,Jonathan Lucroy,- - -,245.0,0.277551,0.321792,-0.044241
28,Welington Castillo,White Sox,158.0,0.316456,0.359089,-0.042633


In [61]:
summary.sort_values(by=['Difference'],ascending=False).head(10)

Unnamed: 0,Name,Team,Balls in Play,Average on Balls in Play,Ex Average on Balls in Play,Difference
305,Fernando Tatis Jr.,Padres,226.0,0.486726,0.388463,0.098263
253,Tim Anderson,White Sox,389.0,0.442159,0.372693,0.069467
240,Victor Reyes,Tigers,214.0,0.406542,0.345916,0.060627
300,Brandon Lowe,Rays,183.0,0.448087,0.388098,0.059989
261,Garrett Hampson,Rockies,215.0,0.362791,0.31043,0.052361
158,Delino DeShields,Rangers,267.0,0.355805,0.30393,0.051875
94,Jon Berti,Marlins,184.0,0.407609,0.356161,0.051448
230,Kevin Newman,Pirates,433.0,0.374134,0.322937,0.051197
233,David Dahl,Rockies,270.0,0.425926,0.375238,0.050688
133,Brian Goodwin,Angels,287.0,0.38676,0.336299,0.05046


In [63]:
summary.to_csv('../aggdata.csv')


## Charts

### Launch Speed v Vert Angle

In [None]:
testX=np.linspace(-2,2,40) # (minsd,maxsd,numpoints) Launch Speed
testy=np.linspace(-2,2,40) # (minsd,maxsd,numpoints) Launch Angle
a=[]
b=[]
for i in testX:
    for z in testy:
        a.append(i)
        b.append(z)

In [None]:
defhor=10 #default Horizontal Angle
samp=pd.DataFrame()
samp['launch_speed']=a
samp['launch_angle']=b
samp['hor_angle']=np.pi*defhor/(stdhor*180)
predictionsb=classifier.predict_proba(samp)
test=pd.DataFrame()
test['predictions']=predictionsb[:,1]
test['regspeed']=samp['launch_speed']*stdspd+meanspd
test['reglang']=samp['launch_angle']*stdang+meanang

In [None]:
plt.figure()
color = [[0,0,0]]
for i in range(0,800):
    colored=test['predictions'][i]
    color=[colored,0,colored] # where we change color plots
    plt.polar(test.loc[i, 'reglang'], test.loc[i, 'regspeed'],c=color,marker='o')
plt.show()

In [None]:

plt.figure()
color = [[0,0,0]]
for i in range(2400,2800):
    colored=results['predictions'][i]
    color=[colored,0,colored] # where we change color plots
    plt.polar(results.loc[i, 'launch_angle'], results.loc[i, 'launch_speed'],c=color,marker='o')
plt.show()

### Launch Speed v Horizontal Angle

In [None]:
testX=np.linspace(-2,2,40)# (minsd,maxsd,numpoints) Launch Speed
testy=np.linspace(-1.5,1.5,30)# (minsd,maxsd,numpoints) Horizontal Angle
a=[]
b=[]
for i in testX:
    for z in testy:
        a.append(i)
        b.append(z)

In [None]:
deflaunch=20 #default Vertical Launch Angle
samp2=pd.DataFrame()
samp2['launch_speed']=a
samp2['launch_angle']=((np.pi*deflaunch/180)-meanang)/stdang
samp2['hor_angle']=b
predictionsb=classifier.predict_proba(samp2)
test2=pd.DataFrame()
test2['predictions']=predictionsb[:,1]
test2['regspeed']=samp2['launch_speed']*stdspd+meanspd
test2['reghang']=samp2['hor_angle']*stdhor

In [None]:
plt.figure()
color = [[0,0,0]]
for i in range(0,800):
    colored=test2['predictions'][i]
    color=[0,colored,colored] #where we change color plots
    plt.polar(test2.loc[i, 'reghang']+np.pi/2, test2.loc[i, 'regspeed'],c=color,marker='o')
plt.show()

In [None]:
plt.figure()
color = [[0,0,0]]
for i in range(2400,2800):
    colored=results['predictions'][i]
    color=[0,colored,colored] # where we change color plots
    plt.polar(results.loc[i, 'hor_angle']+np.pi/2, results.loc[i, 'launch_speed'],c=color,marker='o')
plt.show()