# Model Creation: Round 2


#### Importing the necessary libraries

In [1]:
import next_pitch
from next_pitch import library
from next_pitch import pitch_functions

Using TensorFlow backend.


In [2]:
from next_pitch.pitch_functions import run_classifier_models, roc_auc_score, calc_acc_and_f1_score

In [3]:
#DATA WRANGLING
import pandas as pd # Dataframes
from pandas.io.json import json_normalize # JSON wrangler
import statsapi # Python wrapper MLB data API

In [4]:
#DATA MANIPULATION AND MODELLING
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn_pandas import DataFrameMapper, FunctionTransformer, gen_features, pipeline
from sklearn_pandas.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import seaborn as sns
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer
import xgboost as xgb
import os
# import pitch_functions

In [5]:
os.environ['KMP_DUPLICATE_LIB_OK']='True'

## Load Data

In [6]:
each_pitch = pd.read_csv('raw_data/final_pitches.csv')

In [7]:
test_pitch = pd.read_csv('raw_data/cleaned_pitches1.csv')

In [8]:
test_pitch = test_pitch.fillna(value=0)


In [9]:
test_pitch.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 17 columns):
about.atBatIndex            200 non-null int64
about.halfInning            200 non-null object
about.inning                200 non-null int64
details.call.description    200 non-null object
details.description         200 non-null object
matchup.batSide.code        200 non-null object
matchup.batter.fullName     200 non-null object
matchup.pitchHand.code      200 non-null object
matchup.pitcher.fullName    200 non-null object
matchup.pitcher.id          200 non-null int64
matchup.splits.menOnBase    200 non-null object
pitchData.nastyFactor       200 non-null float64
pitchData.zone              200 non-null float64
pitchNumber                 200 non-null float64
pitch_type                  200 non-null object
prior_pitch_type            200 non-null object
count                       200 non-null object
dtypes: float64(3), int64(3), object(11)
memory usage: 26.6+ KB


# Model Data

Create Binary Labels for Pitch Prediction: Fastball = 1 and Offspeed = 0.

In [94]:
pitch_dicts = {'Fastball': 1,
               '0': 1,
              'Breaking_Ball': 0,
              'Changeup': 0}

In [95]:
pitch_clean = each_pitch.dropna().copy()

In [96]:
pitch_clean = pitch_clean[:10000]

In [97]:
len(pitch_clean)

10000

In [98]:
pitch_clean['pitch_type'] = pitch_clean['pitch_type'].map(pitch_dicts)

In [99]:
pitch_clean['prior_pitch_type'] = pitch_clean['prior_pitch_type'].map(pitch_dicts)

### Model Targets and Predictors

In [100]:
target = pitch_clean['pitch_type']

In [101]:
predictors = pitch_clean.drop(['pitch_type', 'hitter', 'pitcher' ], axis=1)

### MODEL NUM AND CAT FEATURES

In [102]:
num_features = list(predictors.select_dtypes(exclude='object'))
num_features = [i for i in num_features if i not in {'about.inning', 'pitchData.zone', 'matchup.pitcher.id'}]

In [103]:
num_features = [i for i in num_features if i not in {'about.inning', 'pitchData.zone', 'matchup.pitcher.id'}]

In [104]:
cat_features = list(predictors.select_dtypes(include='object'))
cat_features.extend(['about.inning', 'pitchData.zone', 'count', 'matchup.pitcher.id'])

# LIVE DATA TEST

In [105]:
new_pitch = test_pitch.copy()

In [106]:
new_pitch['pitch_type'] = new_pitch['pitch_type'].map(pitch_dicts)

In [107]:
new_pitch['prior_pitch_type'] = new_pitch['prior_pitch_type'].map(pitch_dicts)

### Create target and predictor data frames

In [108]:
new_target = new_pitch['pitch_type']

In [109]:
new_preds = new_pitch.drop(['matchup.batter.fullName', 'matchup.pitcher.fullName'], axis=1)

### Create categorical and numerical lists for the data pipeline. 

In [110]:
new_n = list(new_preds.select_dtypes(exclude='object'))

In [111]:
num_features1 = [i for i in new_n if i not in {'about.inning', 'pitchData.zone', 'matchup.pitcher.id'}]

In [112]:
cat_features1 = list(new_preds.select_dtypes(include='object',))
cat_features1.extend(['about.inning', 'count', 'matchup.pitcher.id'])

### Create steps for data pipeline using cells above

In [113]:
numeric_transformer = Pipeline(steps=[('keeper', None)])

In [114]:
cat_transfomer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore', categories='auto'))])

In [115]:
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, num_features),
                                              ('cat', cat_transfomer, cat_features)])

In [116]:
preprocessor1 = ColumnTransformer(transformers=[('num', numeric_transformer, num_features1),
                                              ('cat', cat_transfomer, cat_features1)])

In [117]:
classifiers = [GradientBoostingClassifier(n_estimators=200, max_depth=10, learning_rate=.1), 
               xgb.XGBClassifier(n_estimators=200, max_depth=10, learning_rate=.1)]

### New TEST ROW FORMATION

In [118]:
new_preds = new_preds

In [119]:
last_pred = new_preds[-1:]

In [120]:
last_pred

Unnamed: 0,about.atBatIndex,about.halfInning,about.inning,details.call.description,details.description,matchup.batSide.code,matchup.pitchHand.code,matchup.pitcher.id,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,pitch_type,prior_pitch_type,count
199,45,bottom,5,Strike - Swinging,Swinging Strike,R,L,596057,Empty,0.0,0.0,4.0,1.0,0.0,1.0-3.0


In [121]:
last_pred_pre = preprocessor1.fit(last_pred).transform(last_pred)

In [122]:
last_pred

Unnamed: 0,about.atBatIndex,about.halfInning,about.inning,details.call.description,details.description,matchup.batSide.code,matchup.pitchHand.code,matchup.pitcher.id,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,pitch_type,prior_pitch_type,count
199,45,bottom,5,Strike - Swinging,Swinging Strike,R,L,596057,Empty,0.0,0.0,4.0,1.0,0.0,1.0-3.0


## Model Creation

In [152]:
def final_model(predictors, target, new_pitch):
    
    target = pitch_clean['pitch_type']
    predictors = pitch_clean.drop(['pitch_type', 'hitter', 'pitcher' ], axis=1)
    
    X_train, X_test, y_train, y_test = train_test_split(predictors, target, random_state=10)
    
    num_features = list(predictors.select_dtypes(exclude='object'))
    num_features = [i for i in num_features if i not in {'about.inning', 'pitchData.zone', 'matchup.pitcher.id'}]
    numeric_transformer = Pipeline(steps=[('keeper', None)])
    cat_transfomer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore', categories='auto'))])
    
    preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, num_features),
                                              ('cat', cat_transfomer, cat_features)])
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', classifiers_test)])
    pipe.fit(X_train, y_train)
    print(pitch_functions.calc_acc_and_f1_score(y_test, pipe.predict(X_test)))
    prediction = pipe.predict(new_pitch)
    
    
    if prediction == 1:
        print ('Fastball')
    else: 
        print ('Off_Speed')
    
    

In [153]:
final_model(predictors, target, testing)

Accuracy:0.782
F1-Score: 0.779
AUC: 0.762
None
Fastball


In [154]:
testing = X_test[-1:]