# Exploratory Data Analysis/Data Preparation/Data Exportation

#### The purpose of this notebook is to collect and clean data, perform preliminary EDA, and output dataframes as CSV files for later combination in another notebook.

#### Importing the necessary libraries

In [6]:
#DATA WRANGLING
import pandas as pd # Dataframes
from pandas.io.json import json_normalize # JSON wrangler
import statsapi # Python wrapper MLB data API

In [7]:
#DATA STORAGE
#from sqlalchemy import create_engine # SQL helper
import psycopg2 as psql #PostgreSQL DBs

In [8]:
#DATA MANIPULATION AND MODELLING
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn_pandas import DataFrameMapper, FunctionTransformer, gen_features, pipeline
from sklearn_pandas.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import seaborn as sns
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer
import xgboost as xgb
import os
import pitch_functions

In [9]:
# Set the KMP_DUPLICATE_LIB_OK flag for this process before any model is trained.
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" abort
# seen when several compiled libraries are loaded together — confirm it is still needed.
os.environ.update(KMP_DUPLICATE_LIB_OK='True')

In [108]:
# Show every column when a DataFrame is rendered (no truncation of wide frames).
pd.options.display.max_columns = None

## Data Extraction


Import the team codes from the TSV file in the `public_data` directory. This data is used to build a data frame of each team's identifier for later use.

In [10]:
# Load the tab-separated team lookup table.
teams = pd.read_csv('public_data/team_codes.tsv', sep='\t')

# Normalise the team names: lower-case, with spaces and hyphens turned into
# underscores.
teams['full_name'] = (
    teams['full_name']
    .str.lower()
    .str.replace(r'[ -]', '_', regex=True)
)

In [11]:
# Preview the first five rows of the cleaned team table.
teams.head(n=5)

Unnamed: 0,code,short_name,full_name
0,108,LAA,angels
1,109,ARI,d_backs
2,110,BAL,orioles
3,111,BOS,red_sox
4,112,CHC,cubs


In [16]:
# Build one descriptive sentence per team, pairing its name with its numeric code.
team_code = [
    f'The {team_name} has code {code}'
    for code, team_name in zip(teams['code'], teams['full_name'])
]

# Wrap the sentences in a single-column DataFrame and show the first one.
team_code_df = pd.DataFrame(team_code)
team_code_df.head(n=1)

Unnamed: 0,0
0,The angels has code 108


## Visuals

## Pipeline Preparation

In [128]:
# Keep only fully populated rows; .copy() makes pitch_clean an independent frame
# so later edits do not touch each_pitch_clean.
pitch_clean = each_pitch_clean.dropna(axis=0, how='any').copy()

In [129]:
# Dropping incomplete rows cost roughly 600 records, which is acceptable
# because this is only a trial run.
pitch_clean.head(n=2)

Unnamed: 0,about.atBatIndex,about.halfInning,about.inning,count.balls,count.strikes,details.call.code,details.call.description,details.description,matchup.batSide.code,matchup.batter.fullName,matchup.pitchHand.code,matchup.pitcher.fullName,matchup.pitcher.id,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,previous_pitch_code,pitch_type,prior_pitch_type
2,1,top,1,2.0,0.0,B,Ball - Called,Ball,R,Kris Bryant,R,Jose Urena,570632,Men_On,29.02,13.0,2.0,FT,Fastball,Fastball
3,1,top,1,2.0,1.0,S,Strike - Swinging,Swinging Strike,R,Kris Bryant,R,Jose Urena,570632,Men_On,41.63,13.0,3.0,FT,Fastball,Fastball


In [130]:
# Label to predict: the pitch_type column of the cleaned pitch data.
target = pitch_clean.loc[:, 'pitch_type']

In [184]:
# Feature frame: everything except the label and the two free-text player names.
# NOTE(review): the remaining columns include details.call.* and pitchData.*,
# which look like outcomes of the pitch being predicted — confirm they are
# meant to be predictors.
predictors = pitch_clean.drop(
    columns=['pitch_type', 'matchup.pitcher.fullName', 'matchup.batter.fullName']
)

In [185]:
# Row-by-row values of the pitchNumber column (values, not column names).
# NOTE(review): not referenced by the visible modelling cells — confirm it is still needed.
numerical_values = [value for value in predictors['pitchNumber']]

In [186]:
# Names of every non-object column; these pass through the 'num' branch of the
# preprocessor unchanged.
# NOTE(review): this may also pick up identifier-like columns such as
# matchup.pitcher.id if they are stored as numbers — confirm that is intended.
num_features = predictors.select_dtypes(exclude='object').columns.tolist()

In [187]:
# Placeholder pipeline for the numeric columns: its single step is None, so no
# transformer is applied to them.
numeric_transformer = Pipeline(
    steps=[
        ('keeper', None),
    ]
)

In [188]:
# Predictor frame without the pitchNumber column.
categorical_value_df = predictors.drop(columns=['pitchNumber'])

In [189]:
# Column labels of every predictor except pitchNumber.
# NOTE(review): this is every remaining column regardless of dtype; the
# ColumnTransformer is built from cat_features instead — confirm this list is still needed.
categorial_values = categorical_value_df.columns.tolist()

In [190]:
# Names of the object-dtype columns; these are one-hot encoded by the 'cat' branch.
cat_features = predictors.select_dtypes(include='object').columns.tolist()

In [191]:
# One-hot encode the categorical columns.
# handle_unknown='ignore' makes the encoder emit all zeros for a category it did
# not see during fit, instead of raising when the held-out split (or new data)
# contains a value absent from the training split. Output is unchanged for
# categories that were seen during fit.
cat_transfomer = Pipeline(
    steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [192]:
# Route each column group to its own transformer: numeric columns go through
# numeric_transformer, object columns through the one-hot pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', cat_transfomer, cat_features),
    ]
)

In [193]:
# Candidate models, evaluated one after another on the same train/test split.
classifiers = [
    DecisionTreeClassifier(),
    RandomForestClassifier(max_depth=5, n_estimators=100),
    GradientBoostingClassifier(n_estimators=100),
    xgb.XGBClassifier(),
]

## Model Creation

In [194]:
# Hold out a test set using train_test_split's default proportions; the fixed
# random_state keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    predictors,
    target,
    random_state=10,
)

In [197]:
#X_test[X_test['matchup.batter.fullName'].str.contains("Lucas Sims")==True]

In [196]:
# Fit and score every candidate model behind the shared preprocessing step.
for classifier in classifiers:
    # Initialize the classifier pipeline: column preprocessing followed by the
    # current estimator, so the encoder is fitted on the training split only.
    clf1 = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', classifier)])
    clf1.fit(X_train, y_train)

    # Recover the expanded feature names (numeric columns first, then the
    # one-hot columns, matching the transformer order in `preprocessor`).
    # The fitted encoder is looked up by its name rather than by position.
    onehot = (clf1.named_steps['preprocessor']
                  .named_transformers_['cat']
                  .named_steps['onehot'])
    # `get_feature_names` was removed in scikit-learn 1.2 in favour of
    # `get_feature_names_out`; support both so the cell runs on old and new
    # installations.
    if hasattr(onehot, 'get_feature_names_out'):
        one_hot_names = list(onehot.get_feature_names_out())
    else:
        one_hot_names = list(onehot.get_feature_names())
    final_feats = num_features + one_hot_names

    # Print accuracy metrics for each model using pitch_functions.py
    print(classifier)
    print('\n')
    print('Training Metrics')
    pitch_functions.calc_acc_and_f1_score(y_train, clf1.predict(X_train))
    print('\n')
    print('Testing Metrics')
    pitch_functions.calc_acc_and_f1_score(y_test, clf1.predict(X_test))
    print('\n')
    

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


Training Metrics
Accuracy:1.000
F1-Score: 1.000
AUC: 1.000


Testing Metrics
Accuracy:0.627
F1-Score: 0.629
AUC: 0.646


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


Training Metrics
Accuracy:0.648
F1-Score: 0.547
AUC: 0.544


Testing Metrics
Accuracy:0.652
F1-Score: 0