# Model Creation: Round 2


#### Importing the necessary libraries

In [1]:
import library as lib
import pitch_functions

Using TensorFlow backend.


In [2]:
#DATA WRANGLING
import pandas as pd # Dataframes
from pandas.io.json import json_normalize # JSON wrangler
import statsapi # Python wrapper MLB data API

In [72]:
#DATA MANIPULATION AND MODELLING
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn_pandas import DataFrameMapper, FunctionTransformer, gen_features, pipeline
from sklearn_pandas.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import seaborn as sns
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer
import xgboost as xgb
import os
import pitch_functions

In [4]:
lib.os.environ['KMP_DUPLICATE_LIB_OK']='True'

## Load Data

In [5]:
each_pitch = lib.pd.read_csv('raw_data/cleaned_pitches.csv')

## Final Model

In [6]:
pitch_clean = each_pitch.copy()

In [7]:
pitch_clean = pitch_clean[:10000]

Create Binary Labels for Pitch Prediction: Fastball = 1 and Offspeed = 0.

In [8]:
pitch_dicts = {'Fastball': 1,
              'Breaking_Ball': 0,
              'Changeup': 0}

In [9]:
pitch_clean['pitch_type'] = pitch_clean['pitch_type'].map(pitch_dicts)

In [10]:
pitch_clean['prior_pitch_type'] = pitch_clean['prior_pitch_type'].map(pitch_dicts)

### Create target and predictor data frames

In [11]:
target = pitch_clean['pitch_type']

In [12]:
predictors = pitch_clean.drop(['pitch_type', 'hitter', 'pitcher'], axis=1)

### Create categorical and numerical lists for the data pipeline. 

In [13]:
num_features = list(predictors.select_dtypes(exclude='object'))

In [14]:
num_features = [i for i in num_features if i not in {'about.inning', 'pitchData.zone', 'matchup.pitcher.id'}]

In [16]:
cat_features = list(predictors.select_dtypes(include='object'))
cat_features.extend(['about.inning', 'pitchData.zone', 'count', 'matchup.pitcher.id'])

### Create steps for data pipeline using cells above

In [15]:
numeric_transformer = Pipeline(steps=[('keeper', None)])

In [17]:
cat_transfomer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore', categories='auto'))])

In [18]:
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, num_features),
                                              ('cat', cat_transfomer, cat_features)])

In [19]:
classifiers = [GradientBoostingClassifier(n_estimators=200, max_depth=10, learning_rate=.1), 
               xgb.XGBClassifier(n_estimators=200, max_depth=10, learning_rate=.1)]

## Model Creation

In [103]:
X_train, X_test, y_train, y_test = train_test_split(predictors, target, random_state=10)

In [97]:
X_train_pre = preprocessor.fit_transform(X_train)

In [98]:
X_test_pre = preprocessor.fit_transform(X_test)


In [99]:
type(X_test_pre)

scipy.sparse.csr.csr_matrix

In [100]:
clf = GradientBoostingClassifier(n_estimators=200, max_depth=10)

In [101]:
clf.fit(X_train_pre, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=10,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [102]:
clf.predict(X_test_pre)

ValueError: Number of features of the model must match the input. Model n_features is 153 and input n_features is 148 

## TO DO TOMORROW
- Use Model to Make Predictions on new data for Flask app
- Functionize this new method 
- Make slides
- Model is good

In [75]:
from pitch_functions import run_classifier_models

In [76]:
run_classifier_models(classifiers, X_train, X_test, y_train, y_test)

NameError: name 'Pipeline' is not defined