## A Step-by-Step Walkthrough of the Next Pitch Prediction Model. 


#### Importing the necessary libraries

In [75]:
import next_pitch
from next_pitch import library as lib
from next_pitch import pitch_functions
from next_pitch import data_collection
import os

In [76]:
# Presumably a workaround for the "duplicate OpenMP runtime" abort that some
# scientific-Python installs hit -- TODO confirm it is still required.
os.environ.update({"KMP_DUPLICATE_LIB_OK": "True"})

## Load Data

The data used in this model is too large to store as a ```csv``` file on GitHub, so for the purposes of this walkthrough the data must be collected fresh using ```statsapi```. The function ```get_clean_data``` uses several other functions found in the ```data_collection.py``` file to pull in the data, clean it, and return a dataframe of all pitches thrown in every Major League Baseball game during the specified period.

In [77]:
# pitch_data = data_collection.get_clean_data(start_date='03/28/2018', end_date='10/02/2018')

In [78]:
# len(pitch_data)

## Create Binary Labels for Pitch Prediction: Fastball = 1 and Offspeed = 0.

```binarize_target``` is used as the final cleaning step for our data. This function turns the target variable 'pitch_type' into a binary outcome. A pitcher's main goal is to disrupt the hitter's timing, so helping a hitter recognize fastball vs. non-fastball goes a long way toward making them a better hitter.

This step wasn't added to the initial cleaning because, for future models, I am looking to create an accurate model that can distinguish three different types of pitches.

In [79]:
# final_df = data_collection.binarize_target(pitch_data)

In [80]:
# final_df.to_csv(r'raw_data/all_2018_pitches.csv', sep=',', encoding='utf-8')

In [81]:
# final_df.head()

## Model Creation

Define the classifier that will be used to run the model. A gradient boosted trees model was selected because it performed best during the initial EDA period.

In [82]:
# Gradient-boosted trees with 200 estimators and a maximum tree depth of 10.
# random_state is pinned so that Restart & Run All rebuilds the same model
# (assumes lib.GradientBoostingClassifier is scikit-learn's estimator, which
# accepts random_state -- confirm against next_pitch.library).
classifier = lib.GradientBoostingClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42,
)

Create a test example for the model. This uses a row from an unseen data source to demonstrate the model's output. For the purposes of this test, the row is taken from game data from the 2019 season.

# Collection of testing data

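One of the issues with predicting the next pitch live is that the MLB doesn't immediately update various columns that are used in the original model. To work around this, I have created a helper, ```format_user_input``` (defined at the end of this notebook), that fills the unavailable columns (```pitchData.nastyFactor``` and ```pitchData.zone```) with their mean values from the 2018 data.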

In [4]:
# Load the 2018 pitch data from the CSV in raw_data/.
final_df_test = lib.pd.read_csv(filepath_or_buffer='raw_data/all_2018_pitches.csv')

In [5]:
# Drop the saved index column ('Unnamed: 0') plus four play/ID columns.
# This cell reassigns final_df_test from itself, so without errors='ignore'
# a second run would raise KeyError because the columns are already gone.
final_df_test = final_df_test.drop(
    columns=[
        'Unnamed: 0',
        'about.atBatIndex',
        'details.call.description',
        'details.description',
        'matchup.pitcher.id',
    ],
    errors='ignore',
)

In [6]:
# test_data = data_collection.get_clean_data(start_date='05/06/2019', end_date='07/06/2019')

In [7]:
# test_data.to_csv(r'raw_data/2019_test_pitches.csv', sep=',', encoding='utf-8')

In [8]:
# Load the 2019 hold-out pitches from the CSV in raw_data/.
test_data1 = lib.pd.read_csv(filepath_or_buffer='raw_data/2019_test_pitches.csv')

In [9]:
# Remove the same five columns that were dropped from the 2018 frame so both
# frames share one schema. test_data1 itself is left untouched.
_unused_columns = [
    'Unnamed: 0',
    'about.atBatIndex',
    'details.call.description',
    'details.description',
    'matchup.pitcher.id',
]
test_data = test_data1.drop(columns=_unused_columns)

In [10]:
final_df_test.head()

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.halfInning,about.inning,matchup.batSide.code,matchup.pitchHand.code,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,pitch_type,prior_pitch_type,count
0,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,top,9,R,R,Men_On,32.94,1.0,2.0,0.0,0.0,1.0-1.0
1,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,top,9,R,R,Men_On,31.44,4.0,3.0,0.0,0.0,1.0-2.0
2,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,top,9,R,R,Men_On,2.66,14.0,4.0,0.0,0.0,2.0-2.0
3,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,top,9,R,R,Men_On,3.82,14.0,5.0,0.0,0.0,3.0-2.0
4,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,top,9,R,R,Men_On,35.64,11.0,6.0,0.0,0.0,4.0-2.0


In [11]:
test_data.head()

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.halfInning,about.inning,matchup.batSide.code,matchup.pitchHand.code,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,pitch_type,prior_pitch_type,count
0,Pablo Sandoval,0.0,0.0,0.0,0,Jose Peraza,0.416,0.742,2.3,bottom,8,R,R,Men_On,35.326612,12.0,2.0,0.0,0.0,1.0-1.0
1,Pablo Sandoval,0.0,0.0,0.0,0,Jose Peraza,0.416,0.742,2.3,bottom,8,R,R,Men_On,35.326612,14.0,3.0,0.0,0.0,1.0-2.0
2,Pablo Sandoval,0.0,0.0,0.0,0,Jose Peraza,0.416,0.742,2.3,bottom,8,R,R,Men_On,35.326612,11.0,4.0,0.0,0.0,2.0-2.0
3,Aaron Brooks,0.1,1.13,0.0,1,Joey Wendle,0.435,0.789,4.3,top,7,L,R,Men_On,35.326612,11.0,2.0,1.0,1.0,0.0-2.0
4,Aaron Brooks,0.1,1.13,0.0,1,Joey Wendle,0.435,0.789,4.3,top,7,L,R,Men_On,35.326612,13.0,3.0,0.0,1.0,0.0-2.0


In [12]:
#nasty_factor_mean = final_df_test['pitchData.nastyFactor'].mean()

In [13]:
#test_data['pitchData.nastyFactor'] = test_data['pitchData.nastyFactor'].fillna(value=nasty_factor_mean)

In [169]:
# Target labels for the 2019 hold-out set.
test_target = test_data.loc[:, 'pitch_type']

In [170]:
# Every column except the target; .copy() detaches the result from test_data.
test_predictors = test_data.drop(columns=['pitch_type']).copy()

In [171]:
# One-row frame: the row sitting 442 positions from the end of the predictors,
# used as a single worked example.
testing = test_predictors.iloc[-442:-441]
testing

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.halfInning,about.inning,matchup.batSide.code,matchup.pitchHand.code,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,prior_pitch_type,count
147932,John Means,-0.2,1.8,13.5,4,Jeff Mathis,0.272,0.544,0.2,bottom,5,R,L,Men_On,35.326612,6.0,3.0,0.0,2.0-1.0


In [17]:
# Build the fitted model object. This assignment must run: a later cell
# pickles `final`, and with this line commented out a fresh kernel fails there
# with NameError. The recorded output shows the call printing accuracy, F1 and
# AUC. NOTE(review): likely slow on the full 2018 frame -- consider caching.
final = pitch_functions.final_model(
    X_test=test_predictors,
    y_test=test_target,
    dataframe=final_df_test,
    classifier=classifier,
)

Accuracy:0.694
F1-Score: 0.693
AUC: 0.682
None


In [19]:
import pickle

In [20]:
# Serialise `final` to final_test.pkl in the working directory.
with open('final_test.pkl', mode='wb') as pickle_out:
    pickle.dump(final, pickle_out)

In [175]:
# Hand-built description of one live game situation. Key order matters because
# it becomes the column order of the DataFrame built from this dict.
# 'about.inning' and 'prior_pitch_type' are one-element lists: pandas needs at
# least one non-scalar value to infer the row count when no index is given.
user_input = {
    # who is involved
    'pitcher': 'Austin Adams',
    'hitter': 'Michael Brantley',
    # game state
    'about.halfInning': 'top',
    'about.inning': [8],
    'matchup.batSide.code': 'R',
    'matchup.pitchHand.code': 'L',
    'matchup.splits.menOnBase': 'Men_On',
    # placeholders that later cells overwrite with 2018 means
    'pitchData.nastyFactor': 0,
    'pitchData.zone': 0,
    # pitch sequence context
    'pitchNumber': 44,
    'pitch_type': 0,
    'prior_pitch_type': [0],
    'count': '2.0-2.0',
}

In [176]:
type(user_input)

dict

In [177]:
# Turn the dict into a one-row frame; each key becomes a column.
live_df = lib.pd.DataFrame.from_dict(user_input, orient='columns')

In [178]:
# Attach player statistics to the hand-built row. Judging by the displayed
# result, this adds WAR_x, WHIP, ERA, SO, SLG, OPS and WAR_y for the named
# pitcher and hitter.
created_test = data_collection.merge_player_stats(live_df)

In [179]:
final_df_test.head(1)

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.halfInning,about.inning,matchup.batSide.code,matchup.pitchHand.code,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,pitch_type,prior_pitch_type,count
0,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,top,9,R,R,Men_On,32.94,1.0,2.0,0.0,0.0,1.0-1.0


In [180]:
created_test

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.halfInning,about.inning,matchup.batSide.code,matchup.pitchHand.code,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,pitch_type,prior_pitch_type,count
0,Austin Adams,0.1,4.0,0.0,0,Michael Brantley,0.468,0.832,3.6,top,8,R,L,Men_On,0,0,44,0,0,2.0-2.0


In [181]:
# def format_user_input(pitcher, hitter, inning_type, inning_number, 
#                       pitcher_side, hitter_side, runners, prior_pitch, count):
    

In [182]:
# Read the pickled object back from final_test.pkl.
# NOTE: unpickling can execute arbitrary code -- only load files you created.
with open('final_test.pkl', mode='rb') as pickle_in:
    model = pickle.load(pickle_in)

In [183]:
# Mean nastyFactor over the 2018 frame; used below as a stand-in value for the
# hand-built row.
nasty_factor_mean = final_df_test['pitchData.nastyFactor'].mean()

In [184]:
# Mean zone over the 2018 frame; used below as a stand-in value for the
# hand-built row.
# NOTE(review): zone looks like a categorical code, so a mean may not be
# meaningful -- confirm.
zone_mean = final_df_test['pitchData.zone'].mean()

In [185]:
# Overwrite the placeholder 0 with the 2018 mean (mutates created_test in place).
created_test['pitchData.nastyFactor'] = nasty_factor_mean

In [186]:
# Overwrite the placeholder 0 with the 2018 mean (mutates created_test in place).
created_test['pitchData.zone'] = zone_mean

In [187]:
created_test.shape

(1, 20)

In [188]:
final_df_test.shape

(553609, 20)

In [161]:
final_df_test.head(1)

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.halfInning,about.inning,matchup.batSide.code,matchup.pitchHand.code,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,pitch_type,prior_pitch_type,count
0,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,top,9,R,R,Men_On,32.94,1.0,2.0,0.0,0.0,1.0-1.0


In [173]:
testing

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.halfInning,about.inning,matchup.batSide.code,matchup.pitchHand.code,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,prior_pitch_type,count
147932,John Means,-0.2,1.8,13.5,4,Jeff Mathis,0.272,0.544,0.2,bottom,5,R,L,Men_On,35.326612,6.0,3.0,0.0,2.0-1.0


In [174]:
created_test

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,inning_type,inning_number,hitter_side,pitcher_side,runners,pitchData.nastyFactor,pitchData.zone,pitch_number,pitch_type,prior_pitch,count
0,Austin Adams,0.1,4.0,0.0,0,Michael Brantley,0.468,0.832,3.6,top,8,R,L,Men_On,35.326612,9.875101,44,0,0,2.0-2.0


In [167]:
# NOTE(review): `tester` is not referenced by any other visible cell; the
# prediction below is made on created_test directly. Candidate for removal.
tester = lib.np.array(created_test)

In [200]:
# Class probabilities for the single hand-built row.
# NOTE(review): created_test still carries the 'pitch_type' target column
# (shape (1, 20)), while the `testing` example row does not -- confirm which
# schema the pickled model expects.
# NOTE(review): the recorded execution counts in this part of the notebook are
# non-sequential; re-verify with Restart & Run All.
preds = model.predict_proba(created_test)

In [201]:
# First probability of the first (only) row, read as the off-speed probability
# (this notebook labels Offspeed = 0). Assumes predict_proba orders its columns
# by class label -- confirm against the model's classes.
off_speed = preds[0][0]
off_speed

0.722212034694945

In [203]:
# Second probability of the first (only) row, read as the fastball probability
# (this notebook labels Fastball = 1). Assumes predict_proba orders its columns
# by class label -- confirm against the model's classes.
fastball = preds[0][1]
fastball

0.27778796530505495

In [191]:
zone_mean

9.875101380216002

In [None]:
def create_user_dict():
    """Placeholder for a helper that will build the live-prediction input dict.

    A ``def`` line with no body is a SyntaxError and stops Restart & Run All,
    so the stub now has a body that fails loudly if it is called.

    Raises:
        NotImplementedError: always, until the helper is written.
    """
    raise NotImplementedError("create_user_dict has not been implemented yet")

In [192]:
def format_user_input(user_dict, nasty_factor=35.326, zone=9.8751):
    """Convert a user-supplied game situation into a one-row model input frame.

    The dict is turned into a DataFrame, player statistics are merged in via
    ``data_collection.merge_player_stats``, and the two pitch-tracking columns
    are overwritten with fill values.

    Args:
        user_dict: Mapping of column name to value. At least one value must be
            list-like so pandas can infer the number of rows.
        nasty_factor: Value written to 'pitchData.nastyFactor'. The default is
            the rounded mean that this notebook computes from the 2018 frame.
        zone: Value written to 'pitchData.zone'. The default is the rounded
            mean that this notebook computes from the 2018 frame.

    Returns:
        The frame returned by ``merge_player_stats`` with both columns filled.
    """
    situation_df = lib.pd.DataFrame.from_dict(user_dict)
    model_input = data_collection.merge_player_stats(situation_df)
    # The live feed does not supply these columns in time, so the caller's fill
    # values (by default the 2018 means) stand in for them.
    model_input['pitchData.nastyFactor'] = nasty_factor
    model_input['pitchData.zone'] = zone
    return model_input
    
    

In [193]:
format_user_input(user_input)

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.halfInning,about.inning,matchup.batSide.code,matchup.pitchHand.code,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,pitch_type,prior_pitch_type,count
0,Austin Adams,0.1,4.0,0.0,0,Michael Brantley,0.468,0.832,3.6,top,8,R,L,Men_On,35.326,9.8751,44,0,0,2.0-2.0
