# Exploratory Data Analysis/Data Preparation/Data Exportation

#### The purpose of this notebook is to collect and clean data, perform preliminary eda and output dataframes as CSV for later combination in another notebook

#### Importing the necessary libraries

In [2]:
#DATA WRANGLING
import pandas as pd # Dataframes
from pandas.io.json import json_normalize # JSON wrangler
import statsapi # Python wrapper MLB data API

In [3]:
#DATA STORAGE
#from sqlalchemy import create_engine # SQL helper
import psycopg2 as psql #PostgreSQL DBs

In [4]:
#DATA MANIPULATION AND MODELLING
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn_pandas import DataFrameMapper, FunctionTransformer, gen_features, pipeline
from sklearn_pandas.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import seaborn as sns
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer
import xgboost as xgb
import os
import pitch_functions

Using TensorFlow backend.


ModuleNotFoundError: No module named 'tensorflow'

In [None]:
os.environ['KMP_DUPLICATE_LIB_OK']='True'

## Data Extraction
Calling the StatsApi ```https://pypi.org/project/MLB-StatsAPI/``` to collect infomation about all MLB games played between 03/28/18 and 10/03/18. The schulde function will be used in order to collect specific game identifiers, those game identifiers will be used inside of a For Loop that will extract play-by-play data from each game and create a brand new data frame with game data.

Importing team codes from TSV file in ```public_data direcrtory```. Using this data to construct a data frame of each teams identifier for use later. 

In [15]:
teams = pd.read_csv('public_data/team_codes.tsv', sep='\t')

teams['full_name'] = teams['full_name'].str.lower().str.replace(' ', '_').str.replace('-','_')

In [16]:
teams.head()

Unnamed: 0,code,short_name,full_name
0,108,LAA,angels
1,109,ARI,d_backs
2,110,BAL,orioles
3,111,BOS,red_sox
4,112,CHC,cubs


In [17]:
team_code = []
for code, team_name in zip(teams['code'],teams['full_name']):
    text = (f'The {team_name} has code {code}')
    team_code.append(text)
team_code_df = pd.DataFrame(team_code)
team_code_df.head(1)

Unnamed: 0,0
0,The angels has code 108


In [18]:
schedule = statsapi.schedule(start_date="07/03/2019", end_date="07/03/2019")

In [19]:
full = json_normalize(schedule)
gamepks= full['game_id']

In [20]:
full.head(2)

Unnamed: 0,away_id,away_name,away_pitcher_note,away_probable_pitcher,away_score,current_inning,doubleheader,game_date,game_datetime,game_id,game_num,game_type,home_id,home_name,home_pitcher_note,home_probable_pitcher,home_score,inning_state,status,summary
0,116,Detroit Tigers,Norris had the makings of a quality start last...,"Norris, Daniel",3,5.0,S,2019-07-03,2019-07-03T18:10:00Z,567273,1,R,145,Chicago White Sox,The long-awaited Major League debut of MLB Pip...,"Cease, Dylan",4,Bottom,In Progress,2019-07-03 - Detroit Tigers (3) @ Chicago Whit...
1,116,Detroit Tigers,Tyler Alexander will make his Major League deb...,"Alexander, Tyler",0,,S,2019-07-03,2019-07-04T00:10:00Z,567305,2,R,145,Chicago White Sox,The veteran left-hander picked up his first vi...,"Detwiler, Ross",0,,Scheduled,2019-07-03 - Detroit Tigers @ Chicago White So...


In [21]:
gamepks_2018 = list(gamepks.unique())
len(gamepks_2018)

16

In [22]:
test_pk = gamepks_2018

In [23]:
pd.set_option('display.max_columns', None)

## Data Frame Creation

This loop iterates through the game_pks from the above list, grabs the game data assoicated with each game pk and breaks it out into a custom human usuable pandas dataframe. The column names are defined manually so that only necessary columns are added to the data frame.
In the second loop, it iterates through play events in the current plays dataframe, this normalizes the ```.json``` nested in this data frame in the play events column. A dictionary is then defined, and several columns are added to the dictionary. This columns are used to add the prior pitch to the data frame. The prior pitch is important when determining patterns in pitcher tendencies. If/Else statements are then used to add each column and row to the dictionary which is then appended to a list, which is used to create a final dataframe.

In [38]:
list_for_final_df = []
for game in test_pk:
    curr_game = statsapi.get('game_playByPlay',{'gamePk':game})
    curr_plays = curr_game.get('allPlays')
    curr_plays_df = pd.DataFrame(curr_plays)
    curr_plays_norm = json_normalize(curr_plays)
    
    all_plays_cols = ['about.atBatIndex', 'about.halfInning', 'about.inning', 'count.balls', 'count.strikes', 'matchup.batSide.code', 
                     'matchup.batter.fullName', 'matchup.batter.id', 'matchup.pitchHand.code', 'matchup.splits.menOnBase', 'matchup.pitcher.fullName',
                     'matchup.pitcher.id', 'result.eventType']
    
    play_events_cols = ['count.balls', 'count.strikes', 'details.ballColor', 'details.call.code', 'details.call.description', 'details.type.description'
                        ,'details.call.code', 'details.description', 'details.code', 'details.type.code', 'index', 'pitchData.nastyFactor',
                       'pitchData.zone', 'pitchNumber', 'type']
    i = 1
    for index, row in curr_plays_norm.iterrows():
            play_events = json_normalize(row['playEvents'])
            
            for play_events_idx, play_events_row in play_events.iterrows():
                
                game_dict = {}
                game_dict['gamepk'] = game
                game_dict['pitch_id']  = str(game) + '_' + str(row['about.atBatIndex']) + '_' + str(i)
                game_dict['prior_pitch'] = str(game) + '_' + (str(row['about.atBatIndex']) + '_' + str(i - 1))
                
                
                for col_all_plays in all_plays_cols:
                    if col_all_plays in curr_plays_norm.columns:
                        game_dict[col_all_plays] = row[col_all_plays]
                    else:
                        game_dict[col_all_plays] = np.nan
                for col_play_events in play_events_cols:
                    if col_play_events in play_events.columns:
                        game_dict[col_play_events] = play_events_row[col_play_events]
                    else: 
                        game_dict[col_play_events] = np.nan
                
                list_for_final_df.append(game_dict)
                i += 1
                                                              
                                                              
                
                                                            
            
            

        
    


In [40]:
each_pitch = pd.DataFrame(list_for_final_df)
each_pitch

Unnamed: 0,about.atBatIndex,about.halfInning,about.inning,count.balls,count.strikes,details.ballColor,details.call.code,details.call.description,details.code,details.description,details.type.code,details.type.description,gamepk,index,matchup.batSide.code,matchup.batter.fullName,matchup.batter.id,matchup.pitchHand.code,matchup.pitcher.fullName,matchup.pitcher.id,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,pitch_id,prior_pitch,result.eventType,type
0,0,top,1,0.0,0.0,"rgba(26, 86, 190, 1.0)",X,Hit Into Play - Out(s),X,"In play, out(s)",FF,Four-Seam Fastball,567273,0,R,JaCoby Jones,592444,R,Dylan Cease,656302,Empty,,,1.0,567273_0_1,567273_0_0,field_out,pitch
1,1,top,1,1.0,0.0,"rgba(39, 161, 39, 1.0)",B,Ball - Called,B,Ball,FF,Four-Seam Fastball,567273,0,L,Christin Stewart,621514,R,Dylan Cease,656302,Empty,,,1.0,567273_1_2,567273_1_1,field_out,pitch
2,1,top,1,2.0,0.0,"rgba(39, 161, 39, 1.0)",B,Ball - Called,B,Ball,FF,Four-Seam Fastball,567273,1,L,Christin Stewart,621514,R,Dylan Cease,656302,Empty,,,2.0,567273_1_3,567273_1_2,field_out,pitch
3,1,top,1,2.0,1.0,"rgba(170, 21, 11, 1.0)",S,Strike - Swinging,C,Called Strike,FF,Four-Seam Fastball,567273,2,L,Christin Stewart,621514,R,Dylan Cease,656302,Empty,,,3.0,567273_1_4,567273_1_3,field_out,pitch
4,1,top,1,2.0,2.0,"rgba(170, 21, 11, 1.0)",S,Strike - Swinging,F,Foul,FF,Four-Seam Fastball,567273,3,L,Christin Stewart,621514,R,Dylan Cease,656302,Empty,,,4.0,567273_1_5,567273_1_4,field_out,pitch
5,1,top,1,2.0,2.0,"rgba(26, 86, 190, 1.0)",X,Hit Into Play - Out(s),X,"In play, out(s)",FF,Four-Seam Fastball,567273,4,L,Christin Stewart,621514,R,Dylan Cease,656302,Empty,,,5.0,567273_1_6,567273_1_5,field_out,pitch
6,2,top,1,0.0,1.0,"rgba(170, 21, 11, 1.0)",S,Strike - Swinging,C,Called Strike,SL,Slider,567273,0,R,Nicholas Castellanos,592206,R,Dylan Cease,656302,Men_On,,,1.0,567273_2_7,567273_2_6,walk,pitch
7,2,top,1,1.0,1.0,"rgba(39, 161, 39, 1.0)",B,Ball - Called,B,Ball,FF,Four-Seam Fastball,567273,1,R,Nicholas Castellanos,592206,R,Dylan Cease,656302,Men_On,,,2.0,567273_2_8,567273_2_7,walk,pitch
8,2,top,1,1.0,1.0,,,,,On-field Delay.,,,567273,2,R,Nicholas Castellanos,592206,R,Dylan Cease,656302,Men_On,,,,567273_2_9,567273_2_8,walk,action
9,2,top,1,1.0,2.0,"rgba(170, 21, 11, 1.0)",S,Strike - Swinging,F,Foul,FF,Four-Seam Fastball,567273,3,R,Nicholas Castellanos,592206,R,Dylan Cease,656302,Men_On,,,3.0,567273_2_10,567273_2_9,walk,pitch


### NEXT STEPS:
- Add a previous pitch column 
- Scrape June 2018 and Sept 2018 hitting and pitching data
- Match the player stats to the name column in the pitch data
- Modify men on base 
- Drop play result and type column

    

In [41]:
pitch_id_df = each_pitch[['pitch_id', 'details.type.code']].copy()
pitch_id_df.head()

Unnamed: 0,pitch_id,details.type.code
0,567273_0_1,FF
1,567273_1_2,FF
2,567273_1_3,FF
3,567273_1_4,FF
4,567273_1_5,FF


In [42]:
merged_df = pd.merge(each_pitch, pitch_id_df,how='left', left_on='prior_pitch', right_on='pitch_id')

In [43]:
merged_df.head(2)

Unnamed: 0,about.atBatIndex,about.halfInning,about.inning,count.balls,count.strikes,details.ballColor,details.call.code,details.call.description,details.code,details.description,details.type.code_x,details.type.description,gamepk,index,matchup.batSide.code,matchup.batter.fullName,matchup.batter.id,matchup.pitchHand.code,matchup.pitcher.fullName,matchup.pitcher.id,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,pitch_id_x,prior_pitch,result.eventType,type,pitch_id_y,details.type.code_y
0,0,top,1,0.0,0.0,"rgba(26, 86, 190, 1.0)",X,Hit Into Play - Out(s),X,"In play, out(s)",FF,Four-Seam Fastball,567273,0,R,JaCoby Jones,592444,R,Dylan Cease,656302,Empty,,,1.0,567273_0_1,567273_0_0,field_out,pitch,,
1,1,top,1,1.0,0.0,"rgba(39, 161, 39, 1.0)",B,Ball - Called,B,Ball,FF,Four-Seam Fastball,567273,0,L,Christin Stewart,621514,R,Dylan Cease,656302,Empty,,,1.0,567273_1_2,567273_1_1,field_out,pitch,,


In [44]:
each_pitch_merged = merged_df

In [45]:
each_pitch_merged = each_pitch_merged.rename({'pitch_id_y': 'previous_pitch_in_ab', 'details.type.code_y': 'previous_pitch_code'}, axis=1)

In [46]:
each_pitch_merged.head(2)

Unnamed: 0,about.atBatIndex,about.halfInning,about.inning,count.balls,count.strikes,details.ballColor,details.call.code,details.call.description,details.code,details.description,details.type.code_x,details.type.description,gamepk,index,matchup.batSide.code,matchup.batter.fullName,matchup.batter.id,matchup.pitchHand.code,matchup.pitcher.fullName,matchup.pitcher.id,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,pitch_id_x,prior_pitch,result.eventType,type,previous_pitch_in_ab,previous_pitch_code
0,0,top,1,0.0,0.0,"rgba(26, 86, 190, 1.0)",X,Hit Into Play - Out(s),X,"In play, out(s)",FF,Four-Seam Fastball,567273,0,R,JaCoby Jones,592444,R,Dylan Cease,656302,Empty,,,1.0,567273_0_1,567273_0_0,field_out,pitch,,
1,1,top,1,1.0,0.0,"rgba(39, 161, 39, 1.0)",B,Ball - Called,B,Ball,FF,Four-Seam Fastball,567273,0,L,Christin Stewart,621514,R,Dylan Cease,656302,Empty,,,1.0,567273_1_2,567273_1_1,field_out,pitch,,


In [47]:
each_pitch_clean = each_pitch_merged.drop(['result.eventType', 'type', 'pitch_id_x', 'previous_pitch_in_ab', 'prior_pitch', 'details.ballColor'], axis=1)

In [48]:
each_pitch_clean.head(2)

Unnamed: 0,about.atBatIndex,about.halfInning,about.inning,count.balls,count.strikes,details.call.code,details.call.description,details.code,details.description,details.type.code_x,details.type.description,gamepk,index,matchup.batSide.code,matchup.batter.fullName,matchup.batter.id,matchup.pitchHand.code,matchup.pitcher.fullName,matchup.pitcher.id,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,previous_pitch_code
0,0,top,1,0.0,0.0,X,Hit Into Play - Out(s),X,"In play, out(s)",FF,Four-Seam Fastball,567273,0,R,JaCoby Jones,592444,R,Dylan Cease,656302,Empty,,,1.0,
1,1,top,1,1.0,0.0,B,Ball - Called,B,Ball,FF,Four-Seam Fastball,567273,0,L,Christin Stewart,621514,R,Dylan Cease,656302,Empty,,,1.0,


## Pitch Modification 
There are three types of pitches(fastballs, breaking balls, and change-ups). 
Hitters really only need to know which one of these types will be coming next. I think by narrowing the options, it will help the model make better predictions. 

In [49]:
pitch_dict = {'FF': 'Fastball'}

In [50]:
pitch_dict['FT'] = 'Fastball'
pitch_dict['FC']= 'Fastball'
pitch_dict['FS'] = 'Fastball'
pitch_dict['CH'] = 'Changeup'
pitch_dict['SI'] = 'Fastball'
pitch_dict['FT'] = 'Fastball'
pitch_dict['CU'] = 'Breaking_Ball'
pitch_dict['SL'] = 'Breaking_Ball'
pitch_dict['KC'] = 'Breaking_Ball'
pitch_dict['nan'] = 'NA'

In [51]:
each_pitch_clean['pitch_type'] = each_pitch_clean['details.type.code_x'].map(pitch_dict)

In [52]:
each_pitch_clean['prior_pitch_type'] = each_pitch_clean['previous_pitch_code'].map(pitch_dict)

In [53]:
each_pitch_clean = each_pitch_clean.drop(['details.type.code_x', 'details.type.description', 'details.code', 'gamepk', 'index', 'matchup.batter.id'],axis=1)

In [54]:
each_pitch_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 0 to 199
Data columns (total 20 columns):
about.atBatIndex            200 non-null int64
about.halfInning            200 non-null object
about.inning                200 non-null int64
count.balls                 197 non-null float64
count.strikes               197 non-null float64
details.call.code           189 non-null object
details.call.description    189 non-null object
details.description         200 non-null object
matchup.batSide.code        200 non-null object
matchup.batter.fullName     200 non-null object
matchup.pitchHand.code      200 non-null object
matchup.pitcher.fullName    200 non-null object
matchup.pitcher.id          200 non-null int64
matchup.splits.menOnBase    200 non-null object
pitchData.nastyFactor       0 non-null float64
pitchData.zone              0 non-null float64
pitchNumber                 189 non-null float64
previous_pitch_code         143 non-null object
pitch_type                  189 n

In [221]:
each_pitch_clean.to_csv(r'public_data/standard_pitch.csv', index=False, sep=',', encoding='utf-8')

In [55]:
each_pitch_clean.to_csv(r'public_data/tigers.csv', index=False, sep=',', encoding='utf-8')

## Pipeline Preparation

In [128]:
pitch_clean = each_pitch_clean.dropna().copy()

In [129]:
# Lost about 600, but this is just a test
pitch_clean.head(2)

Unnamed: 0,about.atBatIndex,about.halfInning,about.inning,count.balls,count.strikes,details.call.code,details.call.description,details.description,matchup.batSide.code,matchup.batter.fullName,matchup.pitchHand.code,matchup.pitcher.fullName,matchup.pitcher.id,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,previous_pitch_code,pitch_type,prior_pitch_type
2,1,top,1,2.0,0.0,B,Ball - Called,Ball,R,Kris Bryant,R,Jose Urena,570632,Men_On,29.02,13.0,2.0,FT,Fastball,Fastball
3,1,top,1,2.0,1.0,S,Strike - Swinging,Swinging Strike,R,Kris Bryant,R,Jose Urena,570632,Men_On,41.63,13.0,3.0,FT,Fastball,Fastball


In [130]:
target = pitch_clean['pitch_type']

In [184]:
predictors = pitch_clean.drop(['pitch_type', 'matchup.pitcher.fullName', 'matchup.batter.fullName'], axis=1)

In [185]:
numerical_values = list(predictors['pitchNumber'])

In [186]:
num_features = list(predictors.select_dtypes(exclude='object'))

In [187]:
numeric_transformer = Pipeline(steps=[('keeper', None)])

In [188]:
categorical_value_df = predictors.drop(['pitchNumber'], axis=1)

In [189]:
categorial_values = list(categorical_value_df)

In [190]:
cat_features = list(predictors.select_dtypes(include='object'))

In [191]:
cat_transfomer = Pipeline(steps=[('onehot', OneHotEncoder())])

In [192]:
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, num_features),
                                              ('cat', cat_transfomer, cat_features)])

In [193]:
classifiers = [DecisionTreeClassifier(), RandomForestClassifier(n_estimators=100, max_depth=5), 
               GradientBoostingClassifier(n_estimators=100), xgb.XGBClassifier()]

## Model Creation

In [194]:
X_train, X_test, y_train, y_test = train_test_split(predictors, target, random_state=10)

In [197]:
#X_test[X_test['matchup.batter.fullName'].str.contains("Lucas Sims")==True]

In [196]:
for classifier in classifiers:
    #Intialize classifier pipeline
    clf1 = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', classifier)])
    clf1.fit(X_train, y_train)
    one_hot_names = list(clf1.named_steps['preprocessor'].transformers_[1][1].named_steps['onehot'].get_feature_names())
    final_feats = num_features + one_hot_names
    # Print accuracy metrics for each model using pitch_functions.py
    print(classifier)
    print('\n')
    print('Training Metrics')
    pitch_functions.calc_acc_and_f1_score(y_train, clf1.predict(X_train))
    print('\n')
    print('Testing Metrics')
    pitch_functions.calc_acc_and_f1_score(y_test, clf1.predict(X_test))
    print('\n')
    

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


Training Metrics
Accuracy:1.000
F1-Score: 1.000
AUC: 1.000


Testing Metrics
Accuracy:0.627
F1-Score: 0.629
AUC: 0.646


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


Training Metrics
Accuracy:0.648
F1-Score: 0.547
AUC: 0.544


Testing Metrics
Accuracy:0.652
F1-Score: 0