# Model Creation: Round 2


#### Importing the necessary libraries

In [5]:
import library as lib
import pitch_functions

In [50]:
#DATA WRANGLING
import pandas as pd # Dataframes
from pandas.io.json import json_normalize # JSON wrangler
import statsapi # Python wrapper MLB data API

In [51]:
#DATA MANIPULATION AND MODELLING
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn_pandas import DataFrameMapper, FunctionTransformer, gen_features, pipeline
from sklearn_pandas.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import seaborn as sns
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer
import xgboost as xgb
import os
import pitch_functions

In [52]:
lib.os.environ['KMP_DUPLICATE_LIB_OK']='True'

## Load Data

In [119]:
each_pitch = lib.pd.read_csv('raw_data/cleaned_pitches.csv')

## Final Model

In [148]:
pitch_clean = each_pitch.copy()

In [149]:
# pitch_clean = pitch_clean[:10000]

Create Binary Labels for Pitch Prediction: Fastball = 1 and Offspeed = 0.

In [150]:
pitch_dicts = {'Fastball': 1,
              'Breaking_Ball': 0,
              'Changeup': 0}

In [151]:
pitch_clean['pitch_type'] = pitch_clean['pitch_type'].map(pitch_dicts)

In [152]:
pitch_clean['prior_pitch_type'] = pitch_clean['prior_pitch_type'].map(pitch_dicts)

In [153]:
target = pitch_clean['pitch_type']

In [154]:
predictors = pitch_clean.drop(['pitch_type', 'hitter', 'pitcher'], axis=1)

In [155]:
num_features = list(predictors.select_dtypes(exclude='object'))

In [156]:
num_features = [i for i in num_features if i not in {'about.inning', 'pitchData.zone', 'matchup.pitcher.id'}]

In [157]:
num_features

['WAR_x',
 'WHIP',
 'ERA',
 'SO',
 'SLG',
 'OPS',
 'WAR_y',
 'about.atBatIndex',
 'pitchData.nastyFactor',
 'pitchNumber',
 'prior_pitch_type']

In [158]:
numeric_transformer = Pipeline(steps=[('keeper', None)])

In [159]:
cat_features = list(predictors.select_dtypes(include='object'))
cat_features.extend(['about.inning', 'pitchData.zone', 'count', 'matchup.pitcher.id'])

In [160]:
cat_transfomer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore', categories='auto'))])

In [161]:
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, num_features),
                                              ('cat', cat_transfomer, cat_features)])

In [162]:
classifiers = [GradientBoostingClassifier(n_estimators=200, max_depth=10, learning_rate=.1), 
               xgb.XGBClassifier(n_estimators=200, max_depth=10, learning_rate=.1)]

## Model Creation

In [163]:
X_train, X_test, y_train, y_test = train_test_split(predictors, target, random_state=10)

In [164]:
def run_classifier_models(classifiers, X_train, X_test, y_train, y_test):
    for classifier in classifiers:
        #Intialize classifier pipeline
        clf1 = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', classifier)])
        clf1.fit(X_train, y_train)
        
        # Print accuracy metrics for each model using pitch_functions.py
        print(classifier)
        print('\n')
        print('Training Metrics')
        pitch_functions.calc_acc_and_f1_score(y_train, clf1.predict(X_train))
        print('\n')
        print('Testing Metrics')
        pitch_functions.calc_acc_and_f1_score(y_test, clf1.predict(X_test))
        print('\n')

- Need to get back to the drawing board
- lets try dividing this up a little bit
- national league teams
- american league teams
- start with the houston astros 

In [165]:
run_classifier_models(classifiers, X_train, X_test, y_train, y_test)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=10,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)


Training Metrics
Accuracy:0.818
F1-Score: 0.815
AUC: 0.797


Testing Metrics
Accuracy:0.788
F1-Score: 0.784
AUC: 0.766


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=200,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_

In [80]:
from sklearn.metrics import confusion_matrix

cnf_matrix = confusion_matrix(clf1.predict(X_test), y_test)

plt.imshow(cnf_matrix,  cmap=plt.cm.Blues) #Create the basic matrix.

plt.title('Confusion Matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')

#Add appropriate Axis Scales
class_names = set(y_test) #Get class labels to add to matrix
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names, rotation=45)
plt.yticks(tick_marks, class_names)

#Add Labels to Each Cell
thresh = cnf_matrix.max() / 2. #Used for text coloring below
#Here we iterate through the confusion matrix and append labels to our visualization.
for i, j in itertools.product(range(cnf_matrix.shape[0]), range(cnf_matrix.shape[1])):
        plt.text(j, i, cnf_matrix[i, j],
                 horizontalalignment="center",
                 color="white" if cnf_matrix[i, j] > thresh else "black")

#Add a Side Bar Legend Showing Colors
plt.colorbar()

NameError: name 'clf1' is not defined