# Model Creation: First Models using data created in data frame creation


#### Importing the necessary libraries

In [1]:
import pitch_functions 
import library as lib
import pandas as pd
import os

Using TensorFlow backend.


In [2]:
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [3]:
each_pitch = pd.read_csv('raw_data/cleaned_pitches.csv')

## Pipeline Preparation

In [4]:
pitch_clean = each_pitch.dropna().copy()

In [5]:
pitch_clean = pitch_clean[:100000]

In [6]:
pitch_clean.head()

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.atBatIndex,...,matchup.batSide.code,matchup.pitchHand.code,matchup.pitcher.id,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,pitch_type,prior_pitch_type,count
0,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,...,R,R,434778,Men_On,32.94,1.0,2.0,Changeup,Changeup,1.0-1.0
1,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,...,R,R,434778,Men_On,31.44,4.0,3.0,Breaking_Ball,Changeup,1.0-2.0
2,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,...,R,R,434778,Men_On,2.66,14.0,4.0,Breaking_Ball,Breaking_Ball,2.0-2.0
3,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,...,R,R,434778,Men_On,3.82,14.0,5.0,Breaking_Ball,Breaking_Ball,3.0-2.0
4,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,...,R,R,434778,Men_On,35.64,11.0,6.0,Breaking_Ball,Breaking_Ball,4.0-2.0


In [71]:
target = pitch_clean['pitch_type']

In [72]:
predictors = pitch_clean.drop(['pitch_type', 'hitter', 'pitcher'], axis=1)

In [73]:
num_features = list(predictors.select_dtypes(exclude='object'))

In [74]:
num_features

['WAR_x',
 'WHIP',
 'ERA',
 'SO',
 'SLG',
 'OPS',
 'WAR_y',
 'about.atBatIndex',
 'about.inning',
 'matchup.pitcher.id',
 'pitchData.nastyFactor',
 'pitchData.zone',
 'pitchNumber']

In [75]:
num_features = [i for i in num_features if i not in {'about.inning', 'pitchData.zone'}]

In [76]:
num_features

['WAR_x',
 'WHIP',
 'ERA',
 'SO',
 'SLG',
 'OPS',
 'WAR_y',
 'about.atBatIndex',
 'matchup.pitcher.id',
 'pitchData.nastyFactor',
 'pitchNumber']

In [77]:
numeric_transformer = lib.Pipeline(steps=[('keeper', None)])

In [78]:
cat_features = list(predictors.select_dtypes(include='object'))
cat_features.extend(['about.inning', 'pitchData.zone', 'count'])

In [79]:
cat_transfomer = lib.Pipeline(steps=[('onehot', lib.OneHotEncoder(handle_unknown='ignore', categories='auto'))])

In [80]:
preprocessor = lib.ColumnTransformer(transformers=[('num', numeric_transformer, num_features),
                                              ('cat', cat_transfomer, cat_features)])

In [81]:
classifiers = [lib.DecisionTreeClassifier(), lib.RandomForestClassifier(n_estimators=100, max_depth=5), 
               lib.GradientBoostingClassifier(n_estimators=100), lib.xgb.XGBClassifier()]

## Model Creation

In [82]:
X_train, X_test, y_train, y_test = lib.train_test_split(predictors, target, random_state=10)

In [83]:
def run_classifier_models(classifiers, X_train, X_test, y_train, y_test):
    for classifier in classifiers:
        #Intialize classifier pipeline
        clf1 = lib.Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', classifier)])
        clf1.fit(X_train, y_train)

        # Print accuracy metrics for each model using pitch_functions.py
        print(classifier)
        print('\n')
        print('Training Metrics')
        pitch_functions.calc_acc_and_f1_score(y_train, clf1.predict(X_train))
        print('\n')
        print('Testing Metrics')
        pitch_functions.calc_acc_and_f1_score(y_test, clf1.predict(X_test))
        print('\n')

- Need to get back to the drawing board
- lets try dividing this up a little bit
- national league teams
- american league teams
- start with the houston astros 

In [84]:
run_classifier_models(classifiers, X_train, X_test, y_train, y_test)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


Training Metrics
Accuracy:1.000
F1-Score: 1.000
AUC: 1.000


Testing Metrics
Accuracy:0.669
F1-Score: 0.670
AUC: 0.678


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


Training Metrics


  'precision', 'predicted', average, warn_for)


Accuracy:0.656
F1-Score: 0.564
AUC: 0.551


Testing Metrics


  'precision', 'predicted', average, warn_for)


Accuracy:0.654
F1-Score: 0.562
AUC: 0.552


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)


Training Metrics
Accuracy:0.736
F1-Score: 0.711
AUC: 0.674


Testing Metrics
Accuracy:0.732
F1-Score: 0.707
AUC: 0.671


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,