# Model Creation: Round 2


#### Importing the necessary libraries

In [1]:
import next_pitch
from next_pitch import library
from next_pitch import pitch_functions

Using TensorFlow backend.


In [1]:
from next_pitch.pitch_functions import run_classifier_models, roc_auc_score, calc_acc_and_f1_score

Using TensorFlow backend.


In [2]:
#DATA WRANGLING
import pandas as pd # Dataframes
from pandas.io.json import json_normalize # JSON wrangler
import statsapi # Python wrapper MLB data API

In [3]:
#DATA MANIPULATION AND MODELLING
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn_pandas import DataFrameMapper, FunctionTransformer, gen_features, pipeline
from sklearn_pandas.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import seaborn as sns
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer
import xgboost as xgb
import os
# import pitch_functions

In [4]:
# NOTE(review): presumably a workaround for the OpenMP "duplicate runtime"
# abort that can occur when several of the libraries imported above each bundle
# their own OpenMP copy — TODO confirm it is still needed in this environment.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

## Load Data

In [57]:
# Pitch-level dataset; this is the frame passed to binarize_target further down.
# The path is relative to the notebook's working directory.
each_pitch = pd.read_csv('raw_data/final_pitches.csv')

In [58]:
# Second CSV, loaded for inspection; in the cells visible here it is
# only cleaned and summarised with .info(), never passed to the model.
test_pitch = pd.read_csv('raw_data/cleaned_pitches1.csv')

In [59]:
# Replace every missing value with the integer 0. This applies to all columns,
# so text columns receive 0 as a placeholder as well.
test_pitch = test_pitch.fillna(value=0)


In [60]:
test_pitch.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 17 columns):
about.atBatIndex            200 non-null int64
about.halfInning            200 non-null object
about.inning                200 non-null int64
details.call.description    200 non-null object
details.description         200 non-null object
matchup.batSide.code        200 non-null object
matchup.batter.fullName     200 non-null object
matchup.pitchHand.code      200 non-null object
matchup.pitcher.fullName    200 non-null object
matchup.pitcher.id          200 non-null int64
matchup.splits.menOnBase    200 non-null object
pitchData.nastyFactor       200 non-null float64
pitchData.zone              200 non-null float64
pitchNumber                 200 non-null float64
pitch_type                  200 non-null object
prior_pitch_type            200 non-null object
count                       200 non-null object
dtypes: float64(3), int64(3), object(11)
memory usage: 26.6+ KB


# Model Data

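Create binary labels for pitch prediction: Fastball = 1 and off-speed (Breaking_Ball, Changeup) = 0. The placeholder value '0' is also mapped to 1. Pitch types that are not in the mapping become NaN, and any row containing a missing value is dropped.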

In [61]:
def binarize_target(dataframe):
    """Encode the pitch-type columns as binary labels and drop incomplete rows.

    Both ``pitch_type`` and ``prior_pitch_type`` are mapped as follows:
    'Fastball' -> 1, 'Breaking_Ball' -> 0, 'Changeup' -> 0, and the string
    placeholder '0' -> 1. Any value that is not a key of the mapping becomes
    NaN. Rows containing NaN in *any* column are then removed.

    The input frame is left untouched, so the function can be called
    repeatedly on the same raw data and always gives the same result.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``pitch_type`` and ``prior_pitch_type`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the two columns encoded and incomplete rows dropped.
    """
    pitch_dicts = {'Fastball': 1,
                   '0': 1,
                   'Breaking_Ball': 0,
                   'Changeup': 0}
    # assign() returns a new frame. Writing the encoded columns back into the
    # caller's frame would destroy the raw labels, and a second call would then
    # map the already-encoded numbers to NaN and drop every row.
    pitch_clean = dataframe.assign(
        pitch_type=dataframe['pitch_type'].map(pitch_dicts),
        prior_pitch_type=dataframe['prior_pitch_type'].map(pitch_dicts),
    )
    return pitch_clean.dropna()
    

In [62]:
# Encode the pitch-type labels; `df` is the modelling dataset used in the cells below.
df = binarize_target(each_pitch)

In [64]:
df.head()

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.atBatIndex,...,matchup.batSide.code,matchup.pitchHand.code,matchup.pitcher.id,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,pitch_type,prior_pitch_type,count
1,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,...,R,R,434778,Men_On,32.94,1.0,2.0,0.0,0.0,1.0-1.0
2,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,...,R,R,434778,Men_On,31.44,4.0,3.0,0.0,0.0,1.0-2.0
3,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,...,R,R,434778,Men_On,2.66,14.0,4.0,0.0,0.0,2.0-2.0
4,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,...,R,R,434778,Men_On,3.82,14.0,5.0,0.0,0.0,3.0-2.0
5,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,...,R,R,434778,Men_On,35.64,11.0,6.0,0.0,0.0,4.0-2.0


## Model Creation

In [81]:
def final_model(dataframe, classifier, new_pitch):
    """Fit ``classifier`` on the pitch data and print the predicted class of ``new_pitch``.

    The target is ``pitch_type``; ``hitter`` and ``pitcher`` are dropped from
    the predictors. Object-dtype columns, plus 'about.inning',
    'pitchData.zone', 'count' and 'matchup.pitcher.id', are one-hot encoded.
    All remaining columns are passed to the classifier unchanged.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Training data containing ``pitch_type``, ``hitter``, ``pitcher`` and
        the categorical columns listed above.
    classifier : estimator
        Object with ``fit``/``predict``. This very instance is placed in the
        pipeline and fitted (it is not copied).
    new_pitch : pandas.DataFrame
        One or more rows to classify; must contain the predictor columns.

    Returns
    -------
    None
        Prints 'Fastball' (prediction equal to 1) or 'Off_Speed' once per row
        of ``new_pitch``.
    """
    target = dataframe['pitch_type']
    predictors = dataframe.drop(['pitch_type', 'hitter', 'pitcher'], axis=1)

    # Only the training part of the split is fitted on. The held-out part is
    # kept so the commented-out evaluation line below can be re-enabled.
    X_train, X_test, y_train, y_test = train_test_split(predictors, target, random_state=10)

    # Columns stored as numbers that are treated as categories rather than
    # as magnitudes.
    coded_categoricals = ['about.inning', 'pitchData.zone', 'count', 'matchup.pitcher.id']

    num_features = [col for col in predictors.select_dtypes(exclude='object')
                    if col not in coded_categoricals]

    cat_features = list(predictors.select_dtypes(include='object'))
    # Skip names that are already present: when 'count' is an object column it
    # was selected above, and listing it twice would one-hot encode it twice.
    cat_features.extend(col for col in coded_categoricals if col not in cat_features)

    preprocessor = ColumnTransformer(transformers=[
        # Numeric features go to the classifier as-is.
        ('num', 'passthrough', num_features),
        # handle_unknown='ignore': categories not seen in training encode as all zeros.
        ('cat', OneHotEncoder(handle_unknown='ignore', categories='auto'), cat_features)])
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', classifier)])
    pipe.fit(X_train, y_train)
    #print(next_pitch.pitch_functions.calc_acc_and_f1_score(y_test, pipe.predict(X_test)))

    # predict() returns one label per row. Testing the whole array in a single
    # `if` raises ValueError as soon as new_pitch has more than one row, so
    # report each row separately.
    for label in pipe.predict(new_pitch):
        if label == 1:
            print('Fastball')
        else:
            print('Off_Speed')

In [82]:
# Gradient boosting with the library's default hyper-parameters. No random_state
# is passed here, so reproducibility presumably relies on the global
# np.random.seed(0) call in the imports cell — TODO confirm.
classifier = GradientBoostingClassifier()

In [83]:
# Keep only the first 10,000 rows (positional slice). This overwrites `df`, so
# the full encoded frame is no longer reachable under this name afterwards.
# NOTE(review): presumably done to keep fitting time manageable — confirm.
df = df[:10000]

In [84]:
# NOTE(review): `testing` is only assigned in a cell that appears further down
# this notebook (and was executed earlier, see the execution counts). On a
# fresh Restart & Run All this call raises NameError; the cell defining
# `testing` needs to be moved above this one.
final_model(df, classifier, testing)

Fastball


In [68]:
# Last row of `df`, taken with a slice so it stays a one-row DataFrame; it is
# the `new_pitch` argument passed to final_model. It still carries the
# pitch_type label column.
testing = df[-1:]

In [69]:
testing

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.atBatIndex,...,matchup.batSide.code,matchup.pitchHand.code,matchup.pitcher.id,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,pitch_type,prior_pitch_type,count
791876,Enrique Hernandez,-0.4,9.0,81.0,0,Jesmuel Valentin,0.304,0.562,-0.7,123,...,L,R,571771,Men_On,53.23,11.0,5.0,0.0,0.0,4.0-1.0


In [86]:
import pickle

In [87]:
# NOTE(review): `final_model` is a plain function, and pickle serialises
# functions by reference (module and qualified name), not by value. The file
# written here therefore contains no fitted estimator, and loading it only works
# where a function with the same name is importable. To persist a trained model,
# the fitted pipeline object itself has to be dumped.
with open('final_test.pkl', 'wb') as f:
    pickle.dump(final_model, f)