## A Step-by-Step Walkthrough of the Next Pitch Prediction Model. 


#### Importing the necessary libraries

In [75]:
import next_pitch
from next_pitch import library as lib
from next_pitch import pitch_functions
from next_pitch import data_collection
import os

In [76]:
os.environ['KMP_DUPLICATE_LIB_OK']='True'

## Load Data

The data used in this model is too large to store as a ```csv``` file on Github, so for the purpose of the walkthrough the data must be collected fresh using ```statsapi```. The function ```get_clean_data``` uses several other functions found in the ```data_collection.py``` file to pull in data, clean it and return a dataframe of all pitches thrown in every Major League Baseball game during the specified period.

In [77]:
# pitch_data = data_collection.get_clean_data(start_date='03/28/2018', end_date='10/02/2018')

In [78]:
# len(pitch_data)

## Create Binary Labels for Pitch Prediction: Fastball = 1 and Offspeed = 0.

Using the ```binarize_target``` as the final cleaning measure for our data. This function turns the target variable 'pitch_type' into a a binary outcome. A pitcher's main goal to confuse the hitters timing, and by helping a hitter recognize fastball vs non-fastball that will go along way as to helping them become a better hitter.

This step wasn't added to the intial cleaning because for future models, I am looking to create an accurate that model that can specify three different types of pitches.

In [79]:
# final_df = data_collection.binarize_target(pitch_data)

In [80]:
# final_df.to_csv(r'raw_data/all_2018_pitches.csv', sep=',', encoding='utf-8')

In [81]:
# final_df.head()

## Model Creation

Define the classifier that will be used to run the model. A gradient boosted trees model was selected because it performed the highest during the intial EDA period. 

In [82]:
classifier = lib.GradientBoostingClassifier(n_estimators=200, max_depth=10)

Create a testing parameter so model example. This uses a line from a unseen data source that will test the outcome of the model for example purposes. For the purposes of this test, the line is taken from game data from the 2019 season. 

# Collection of testing data

One of the issues with predicitng the next pitch live is that the MLB doesn't immeadtiley update various columns that are used in the original model. To work around this, I have created 

In [4]:
final_df_test = lib.pd.read_csv('raw_data/all_2018_pitches.csv')

In [5]:
final_df_test = final_df_test.drop(['Unnamed: 0', 'about.atBatIndex', 'details.call.description', 'details.description', 
                                    'matchup.pitcher.id'], axis=1)

In [6]:
# test_data = data_collection.get_clean_data(start_date='05/06/2019', end_date='07/06/2019')

In [7]:
# test_data.to_csv(r'raw_data/2019_test_pitches.csv', sep=',', encoding='utf-8')

In [8]:
test_data1 = lib.pd.read_csv('raw_data/2019_test_pitches.csv')

In [9]:
test_data = test_data1.drop(['Unnamed: 0', 'about.atBatIndex', 'details.call.description', 'details.description', 
                                    'matchup.pitcher.id'], axis=1)

In [10]:
final_df_test.head()

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.halfInning,about.inning,matchup.batSide.code,matchup.pitchHand.code,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,pitch_type,prior_pitch_type,count
0,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,top,9,R,R,Men_On,32.94,1.0,2.0,0.0,0.0,1.0-1.0
1,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,top,9,R,R,Men_On,31.44,4.0,3.0,0.0,0.0,1.0-2.0
2,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,top,9,R,R,Men_On,2.66,14.0,4.0,0.0,0.0,2.0-2.0
3,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,top,9,R,R,Men_On,3.82,14.0,5.0,0.0,0.0,3.0-2.0
4,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,top,9,R,R,Men_On,35.64,11.0,6.0,0.0,0.0,4.0-2.0


In [11]:
test_data.head()

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.halfInning,about.inning,matchup.batSide.code,matchup.pitchHand.code,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,pitch_type,prior_pitch_type,count
0,Pablo Sandoval,0.0,0.0,0.0,0,Jose Peraza,0.416,0.742,2.3,bottom,8,R,R,Men_On,35.326612,12.0,2.0,0.0,0.0,1.0-1.0
1,Pablo Sandoval,0.0,0.0,0.0,0,Jose Peraza,0.416,0.742,2.3,bottom,8,R,R,Men_On,35.326612,14.0,3.0,0.0,0.0,1.0-2.0
2,Pablo Sandoval,0.0,0.0,0.0,0,Jose Peraza,0.416,0.742,2.3,bottom,8,R,R,Men_On,35.326612,11.0,4.0,0.0,0.0,2.0-2.0
3,Aaron Brooks,0.1,1.13,0.0,1,Joey Wendle,0.435,0.789,4.3,top,7,L,R,Men_On,35.326612,11.0,2.0,1.0,1.0,0.0-2.0
4,Aaron Brooks,0.1,1.13,0.0,1,Joey Wendle,0.435,0.789,4.3,top,7,L,R,Men_On,35.326612,13.0,3.0,0.0,1.0,0.0-2.0


In [12]:
#nasty_factor_mean = final_df_test['pitchData.nastyFactor'].mean()

In [13]:
#test_data['pitchData.nastyFactor'] = test_data['pitchData.nastyFactor'].fillna(value=nasty_factor_mean)

In [169]:
test_target = test_data['pitch_type']

In [170]:
test_predictors = test_data.drop(['pitch_type'], axis=1).copy()

In [171]:
testing = test_predictors[-442:-441]
testing

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.halfInning,about.inning,matchup.batSide.code,matchup.pitchHand.code,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,prior_pitch_type,count
147932,John Means,-0.2,1.8,13.5,4,Jeff Mathis,0.272,0.544,0.2,bottom,5,R,L,Men_On,35.326612,6.0,3.0,0.0,2.0-1.0


In [17]:
# final = pitch_functions.final_model(X_test=test_predictors, y_test=test_target, dataframe=final_df_test, classifier=classifier)

Accuracy:0.694
F1-Score: 0.693
AUC: 0.682
None


In [19]:
import pickle

In [20]:
with open('final_test.pkl', 'wb') as f:
    pickle.dump(final, f)

In [234]:
# user_input = {'pitcher' : 'Justin Verlander',
# 'hitter' :'Dee Gordon',
# 'about.halfInning' : 'top',
# 'about.inning' : [8],
# 'matchup.batSide.code' : 'R',
# 'count_false' : '2.0-2.0',
# 'matchup.pitchHand.code' : 'L',
# 'matchup.splits.menOnBase' : 'Men_On',
# 'pitchData.nastyFactor': 0,
#  'pitchData.zone' : 0,
# 'pitchNumber' : 44,
# 'pitch_type' : 0,
# 'prior_pitch_type' : [0]}

In [280]:
user_input = {'prior_pitch_type': '1', 'pitchNumber': 88.0, 'matchup.splits.menOnBase': 'Men_On', 'matchup.pitchHand.code': 'R', 'matchup.batSide.code': 'L', 'about.halfInning': 'top', 'about.inning': '6', 'hitter': 'Dee Gordon', 'pitcher': 'Justin Verlander', 'pitch_type': 1, 'pitchData.nastyFactor': 35.326, 'pitchData.zone': 9.8751, 'count': '1.0-1.0'}


In [275]:
new_dict = {'pitcher' : user_input['pitcher'],
'Dee Gordon': user_input['hitter'] ,
'about.halfInning' : user_input['about.halfInning'],
'about.inning' :user_input['about.inning'],
'matchup.batSide.code' : user_input['matchup.batSide.code'],
'matchup.pitchHand.code' : user_input['matchup.pitchHand.code'],
'matchup.splits.menOnBase' : user_input['matchup.splits.menOnBase'],
'pitchData.nastyFactor': user_input['pitchData.nastyFactor'],
'pitchData.zone' : user_input['pitchData.zone'],
'pitchNumber' :user_input['pitchNumber'],
'pitch_type' :user_input['pitch_type'],
'prior_pitch_type' : [user_input['prior_pitch_type'],
'count' : user_input['count_false']
                      }

SyntaxError: invalid syntax (<ipython-input-275-c6a365d6ae90>, line 13)

In [265]:
user_input['count'] = user_input['count_false']

TypeError: 'set' object is not subscriptable

In [281]:
user_input

{'prior_pitch_type': '1',
 'pitchNumber': 88.0,
 'matchup.splits.menOnBase': 'Men_On',
 'matchup.pitchHand.code': 'R',
 'matchup.batSide.code': 'L',
 'about.halfInning': 'top',
 'about.inning': '6',
 'hitter': 'Dee Gordon',
 'pitcher': 'Justin Verlander',
 'pitch_type': 1,
 'pitchData.nastyFactor': 35.326,
 'pitchData.zone': 9.8751,
 'count': '1.0-1.0'}

In [282]:
type(user_input)

dict

In [238]:
live_df = lib.pd.DataFrame.from_dict(user_input)

In [239]:
created_test = data_collection.merge_player_stats(live_df)

In [328]:
final_df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 553609 entries, 0 to 553608
Data columns (total 20 columns):
pitcher                     553609 non-null object
WAR_x                       553609 non-null float64
WHIP                        553609 non-null float64
ERA                         553609 non-null float64
SO                          553609 non-null int64
hitter                      553609 non-null object
SLG                         553609 non-null float64
OPS                         553609 non-null float64
WAR_y                       553609 non-null float64
about.halfInning            553609 non-null object
about.inning                553609 non-null int64
matchup.batSide.code        553609 non-null object
matchup.pitchHand.code      553609 non-null object
matchup.splits.menOnBase    553609 non-null object
pitchData.nastyFactor       553609 non-null float64
pitchData.zone              553609 non-null float64
pitchNumber                 553609 non-null float64
pitch_type     

In [240]:
final_df_test.head(1)

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.halfInning,about.inning,matchup.batSide.code,matchup.pitchHand.code,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,pitch_type,prior_pitch_type,count
0,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,top,9,R,R,Men_On,32.94,1.0,2.0,0.0,0.0,1.0-1.0


In [241]:
created_test

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.halfInning,about.inning,matchup.batSide.code,count_false,matchup.pitchHand.code,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,pitch_type,prior_pitch_type,count
0,Justin Verlander,6.2,0.9,2.52,290,Dee Gordon,0.349,0.637,0.6,top,8,R,2.0-2.0,L,Men_On,0,0,44,0,0,2.0-2.0


In [210]:
# def format_user_input(pitcher, hitter, inning_type, inning_number, 
#                       pitcher_side, hitter_side, runners, prior_pitch, count):
    

In [308]:
with open('next_pitch/web_app/final.pkl', 'rb') as f:
    model = pickle.load(f)

In [309]:
model.steps

[('preprocessor',
  ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
           transformer_weights=None,
           transformers=[('num', Pipeline(memory=None, steps=[('keeper', None)]), ['WAR_x', 'WHIP', 'ERA', 'SO', 'SLG', 'OPS', 'WAR_y', 'pitchData.nastyFactor', 'pitchNumber', 'prior_pitch_type']), ('cat', Pipeline(memory=None,
       steps=[('onehot', OneHotEncoder(categorical_features=None, categories='auto',
  ...p.pitchHand.code', 'matchup.splits.menOnBase', 'count', 'about.inning', 'pitchData.zone', 'count'])])),
 ('classifier', GradientBoostingClassifier(criterion='friedman_mse', init=None,
                learning_rate=0.1, loss='deviance', max_depth=10,
                max_features=None, max_leaf_nodes=None,
                min_impurity_decrease=0.0, min_impurity_split=None,
                min_samples_leaf=1, min_samples_split=2,
                min_weight_fraction_leaf=0.0, n_estimators=200,
                n_iter_no_change=None, presort='auto', rand

In [310]:
final_df_test.head(1)

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.halfInning,about.inning,matchup.batSide.code,matchup.pitchHand.code,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,pitch_type,prior_pitch_type,count
0,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,top,9,R,R,Men_On,32.94,1.0,2.0,0.0,0.0,1.0-1.0


In [311]:
testing

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.halfInning,about.inning,matchup.batSide.code,matchup.pitchHand.code,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,prior_pitch_type,count
147932,John Means,-0.2,1.8,13.5,4,Jeff Mathis,0.272,0.544,0.2,bottom,5,R,L,Men_On,35.326612,6.0,3.0,0.0,2.0-1.0


In [312]:
created_test

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.halfInning,about.inning,matchup.batSide.code,count_false,matchup.pitchHand.code,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,pitch_type,prior_pitch_type,count
0,Justin Verlander,6.2,0.9,2.52,290,Dee Gordon,0.349,0.637,0.6,top,8,R,2.0-2.0,L,Men_On,0,0,44,0,0,2.0-2.0


In [313]:
preds = model.predict_proba(created_test)

In [314]:
off_speed = preds[0][0]
off_speed

0.10468301726178753

In [315]:
fastball = preds[0][1]
fastball

0.8953169827382125

In [324]:
users = {'count': '1.0-1.0', 'prior_pitch_type': 1.0, 'pitchNumber': 67.0, 'matchup.splits.menOnBase': 'Men_On', 'matchup.pitchHand.code': 'R', 'matchup.batSide.code': 'L', 'about.halfInning': 'top', 'about.inning': 6, 'hitter': 'Dee Gordon', 'pitcher': 'Justin Verlander', 'pitch_type': 1.0, 'pitchData.nastyFactor': 35.326, 'pitchData.zone': 9.8751}

In [325]:
def format_user_input(user_dict):
    live_df = lib.pd.DataFrame([user_dict])
    created_test = data_collection.merge_player_stats(live_df)
    created_test = created_test[test_list]
    return created_test
    
    

In [326]:
t = format_user_input(users)

In [318]:
final_df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 553609 entries, 0 to 553608
Data columns (total 20 columns):
pitcher                     553609 non-null object
WAR_x                       553609 non-null float64
WHIP                        553609 non-null float64
ERA                         553609 non-null float64
SO                          553609 non-null int64
hitter                      553609 non-null object
SLG                         553609 non-null float64
OPS                         553609 non-null float64
WAR_y                       553609 non-null float64
about.halfInning            553609 non-null object
about.inning                553609 non-null int64
matchup.batSide.code        553609 non-null object
matchup.pitchHand.code      553609 non-null object
matchup.splits.menOnBase    553609 non-null object
pitchData.nastyFactor       553609 non-null float64
pitchData.zone              553609 non-null float64
pitchNumber                 553609 non-null float64
pitch_type     

In [321]:
t.columns == final_df_test.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

In [327]:
t

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.halfInning,about.inning,matchup.batSide.code,matchup.pitchHand.code,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,pitch_type,prior_pitch_type,count
0,Justin Verlander,6.2,0.9,2.52,290,Dee Gordon,0.349,0.637,0.6,top,6,L,R,Men_On,35.326,9.8751,67.0,1.0,1.0,1.0-1.0


In [320]:
model.predict_proba(t)

array([[0.3719817, 0.6280183]])