## Data Frame Creation: clean the raw pitch data and build the final datasets, which are then exported as CSV.

In [4]:
import pandas as pd # Dataframes
from pandas.io.json import json_normalize # JSON wrangler
import statsapi # Python wrapper MLB data API

In [5]:
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn_pandas import DataFrameMapper, FunctionTransformer, gen_features, pipeline
from sklearn_pandas.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import seaborn as sns
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer
import xgboost as xgb
import os
import pitch_functions

In [6]:
# Do not truncate columns when a DataFrame is displayed; the pitch tables are wide.
pd.options.display.max_columns = None

In [7]:
# Fetch the MLB schedule between 03/28/2018 and 10/03/2018 through the statsapi wrapper.
# NOTE(review): presumably this window is meant to cover the 2018 season — confirm whether
# any spring-training or postseason games that fall inside it should be filtered out.
schedule = statsapi.schedule(start_date="03/28/2018", end_date="10/03/2018")

In [8]:
# Flatten the schedule records into a DataFrame and keep only the game-id column.
full = json_normalize(schedule)
gamepks= full['game_id']

In [9]:
# De-duplicate the game ids; the bare last expression displays how many games were found.
gamepks_2018 = list(gamepks.unique())
len(gamepks_2018)

2435

In [10]:
# Games to scrape play-by-play for; currently the full list of unique game ids.
test_pk = gamepks_2018

In [11]:
# Build one record (dict) per play event for every game in `test_pk`.
# The records accumulate in `list_for_final_df` and are converted to a DataFrame later.

# At-bat level fields (from `allPlays`) copied onto every event of that at-bat.
# NOTE: 'count.balls' / 'count.strikes' are also listed in `play_events_cols`, so the
# at-bat level values written first are always overwritten by the per-event pass.
all_plays_cols = ['about.atBatIndex', 'about.halfInning', 'about.inning', 'count.balls', 'count.strikes',
                  'matchup.batSide.code', 'matchup.batter.fullName', 'matchup.batter.id',
                  'matchup.pitchHand.code', 'matchup.splits.menOnBase', 'matchup.pitcher.fullName',
                  'matchup.pitcher.id', 'result.eventType']

# Event level fields (from each at-bat's `playEvents`).
# NOTE(review): the sample output suggests the per-event count already includes the outcome
# of that event — confirm before treating it as a pre-pitch feature.
play_events_cols = ['count.balls', 'count.strikes', 'details.ballColor', 'details.call.code',
                    'details.call.description', 'details.type.description', 'details.description',
                    'details.code', 'details.type.code', 'index', 'pitchData.nastyFactor',
                    'pitchData.zone', 'pitchNumber', 'type']

list_for_final_df = []
for game in test_pk:
    curr_game = statsapi.get('game_playByPlay', {'gamePk': game})
    curr_plays = curr_game.get('allPlays')
    if not curr_plays:
        # No play-by-play came back for this game, so there is nothing to record.
        continue
    curr_plays_norm = json_normalize(curr_plays)
    # Column membership is fixed for the whole game: resolve it once, not per cell.
    all_plays_present = set(curr_plays_norm.columns)

    # `i` numbers events across the whole game (it is NOT reset per at-bat).
    # Because the at-bat index is part of the id, `prior_pitch` only matches an existing
    # `pitch_id` when the previous event belongs to the same at-bat.
    i = 1
    for index, row in curr_plays_norm.iterrows():
        play_events = json_normalize(row['playEvents'])
        play_events_present = set(play_events.columns)
        at_bat_prefix = str(game) + '_' + str(row['about.atBatIndex']) + '_'

        for play_events_idx, play_events_row in play_events.iterrows():
            game_dict = {
                'gamepk': game,
                'pitch_id': at_bat_prefix + str(i),
                'prior_pitch': at_bat_prefix + str(i - 1),
            }

            # Missing columns are recorded as NaN so every record has the same keys.
            for col_all_plays in all_plays_cols:
                if col_all_plays in all_plays_present:
                    game_dict[col_all_plays] = row[col_all_plays]
                else:
                    game_dict[col_all_plays] = np.nan
            for col_play_events in play_events_cols:
                if col_play_events in play_events_present:
                    game_dict[col_play_events] = play_events_row[col_play_events]
                else:
                    game_dict[col_play_events] = np.nan

            list_for_final_df.append(game_dict)
            i += 1
                                                              
                                                              
                
                                                            
            
            

        
    


In [14]:
# One row per collected play event; columns come from the keys of the event dicts.
each_pitch = pd.DataFrame(list_for_final_df)

In [15]:
# Lookup table (event id -> pitch type code) used to attach the previous event's code.
pitch_id_df = each_pitch[['pitch_id', 'details.type.code']].copy()

In [16]:
# Self-join: look up each row's `prior_pitch` id in the lookup table. Rows whose prior id
# does not exist (e.g. the first event of an at-bat) keep NaN in the looked-up columns.
# Overlapping column names receive the default _x (left) / _y (right) suffixes.
merged_df = each_pitch.merge(
    pitch_id_df,
    how='left',
    left_on='prior_pitch',
    right_on='pitch_id',
)

In [17]:
# Alias only (no copy is made); the next cell rebinds this name to a renamed frame.
each_pitch_merged = merged_df

In [18]:
# Give the columns brought in by the self-join descriptive names.
each_pitch_merged = each_pitch_merged.rename(
    columns={
        'pitch_id_y': 'previous_pitch_in_ab',
        'details.type.code_y': 'previous_pitch_code',
    }
)

In [19]:
# Drop the id / bookkeeping columns that were only needed to build the prior-pitch link,
# plus a few event fields that are not kept in the cleaned table.
each_pitch_clean = each_pitch_merged.drop(
    columns=[
        'result.eventType',
        'type',
        'pitch_id_x',
        'previous_pitch_in_ab',
        'prior_pitch',
        'details.ballColor',
    ]
)

In [20]:
# Start the pitch-code -> pitch-family lookup; the remaining codes are added in the next cell.
pitch_dict = {'FF': 'Fastball'}

In [21]:
# Add the remaining pitch codes, grouped by the family they are bucketed into.
pitch_dict.update({
    # Fastball family
    'FT': 'Fastball',
    'FC': 'Fastball',
    'FS': 'Fastball',
    # Off-speed
    'CH': 'Changeup',
    'SI': 'Fastball',
    # Breaking balls
    'CU': 'Breaking_Ball',
    'SL': 'Breaking_Ball',
    'KC': 'Breaking_Ball',
    # NOTE(review): this key is the literal string 'nan'. A missing code stored as a float
    # NaN will not match it when the dict is used with Series.map — confirm the intent.
    'nan': 'NA',
})

In [22]:
# Bucket the current event's pitch code into a pitch family; codes that are not keys of
# pitch_dict become NaN.
each_pitch_clean['pitch_type'] = each_pitch_clean['details.type.code_x'].map(pitch_dict)

In [23]:
# Same bucketing for the previous event's pitch code; NaN when there is no previous event
# in the at-bat or its code is not a key of pitch_dict.
each_pitch_clean['prior_pitch_type'] = each_pitch_clean['previous_pitch_code'].map(pitch_dict)

In [24]:
# The raw pitch code/description are superseded by `pitch_type`; the remaining columns
# listed here are identifiers that are not kept in the cleaned table.
each_pitch_clean = each_pitch_clean.drop(
    columns=[
        'details.type.code_x',
        'details.type.description',
        'details.code',
        'gamepk',
        'index',
        'matchup.batter.id',
    ]
)

#### Read in the hitter and pitcher stat tables used for the merge

In [26]:
# Load the hitter stats table from the local public_data folder.
hitter_df = pd.read_csv('public_data/hitter_data.csv')

In [27]:
# Load the pitcher stats table from the local public_data folder.
pitcher_df = pd.read_csv('public_data/pitcher_data.csv')

In [28]:
# Alias only (no copy is made): main_df and each_pitch_clean refer to the same frame here.
main_df = each_pitch_clean

In [29]:
# Preview the raw hitter table before trimming its columns.
hitter_df.head(5)

Unnamed: 0,RK,PLAYER,TEAM,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,AVG,OBP,SLG,OPS,WAR
0,1.0,Kyle Gibson,MIN,2,2,2,0,0,0,0,0,0,0,0,1.0,1.0,1.0,2.0,0.1
1,,Enny Romero,KC/WSH/PIT,1,1,1,1,0,0,0,0,0,0,0,1.0,1.0,2.0,3.0,0.1
2,,Vidal Nuno,TB,2,0,2,0,0,0,1,0,0,0,0,1.0,1.0,1.0,2.0,0.1
3,,Derek Law,SF,1,1,1,0,0,0,0,0,0,0,0,1.0,1.0,1.0,2.0,0.1
4,,Randy Rosario,CHC,1,1,1,0,0,0,1,0,0,1,0,1.0,1.0,1.0,2.0,0.1


In [30]:
# Keep only the name plus SLG / OPS / WAR, and rename PLAYER -> hitter so the column
# matches the join key used when this table is merged with the pitch data.
hitter_df = (
    hitter_df
    .loc[:, ['PLAYER', 'SLG', 'OPS', 'WAR']]
    .rename(columns={'PLAYER': 'hitter'})
)
hitter_df.head(2)

Unnamed: 0,hitter,SLG,OPS,WAR
0,Kyle Gibson,1.0,2.0,0.1
1,Enny Romero,2.0,3.0,0.1


In [31]:
# Preview the raw pitcher table before trimming its columns.
pitcher_df.head(5)

Unnamed: 0,RK,PLAYER,TEAM,GP,GS,IP,H,R,ER,BB,SO,W,L,SV,BLSV,WAR,WHIP,ERA
0,1.0,Kendrys Morales,TOR,1,0,1.0,0,0,0,1,0,0,0,0,0,0.0,1.0,0.0
1,,Mark Reynolds,WSH,1,0,0.1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0
2,,Pablo Sandoval,SF,1,0,1.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0
3,,Danny Valencia,BAL,1,0,0.1,0,0,0,0,1,0,0,0,0,0.0,0.0,0.0
4,,Alex Avila,ARI,1,0,2.0,1,0,0,0,0,0,0,0,0,0.1,0.5,0.0


In [32]:
# Keep only the name plus WAR / WHIP / ERA / SO, and rename PLAYER -> pitcher so the
# column matches the join key used when this table is merged with the pitch data.
pitcher_df = (
    pitcher_df
    .loc[:, ['PLAYER', 'WAR', 'WHIP', 'ERA', 'SO']]
    .rename(columns={'PLAYER': 'pitcher'})
)
pitcher_df.head(2)

Unnamed: 0,pitcher,WAR,WHIP,ERA,SO
0,Kendrys Morales,0.0,1.0,0.0,0
1,Mark Reynolds,0.0,0.0,0.0,0


In [33]:
# Preview the cleaned per-event table before joining the player stats onto it.
main_df.head(5)

Unnamed: 0,about.atBatIndex,about.halfInning,about.inning,count.balls,count.strikes,details.call.code,details.call.description,details.description,matchup.batSide.code,matchup.batter.fullName,matchup.pitchHand.code,matchup.pitcher.fullName,matchup.pitcher.id,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,previous_pitch_code,pitch_type,prior_pitch_type
0,0,top,1,0.0,0.0,X,Hit Into Play - Out(s),"In play, run(s)",L,Ian Happ,R,Jose Urena,570632,Empty,32.89,6.0,1.0,,Fastball,
1,1,top,1,1.0,0.0,B,Ball - Called,Ball,R,Kris Bryant,R,Jose Urena,570632,Men_On,24.17,13.0,1.0,,Fastball,
2,1,top,1,2.0,0.0,B,Ball - Called,Ball,R,Kris Bryant,R,Jose Urena,570632,Men_On,29.02,13.0,2.0,FT,Fastball,Fastball
3,1,top,1,2.0,1.0,S,Strike - Swinging,Swinging Strike,R,Kris Bryant,R,Jose Urena,570632,Men_On,41.63,13.0,3.0,FT,Fastball,Fastball
4,1,top,1,3.0,1.0,B,Ball - Called,Ball,R,Kris Bryant,R,Jose Urena,570632,Men_On,59.33,13.0,4.0,FT,Changeup,Fastball


In [34]:
# Rename the player-name columns to 'hitter' / 'pitcher' so they match the join keys of
# hitter_df and pitcher_df.
main_df = main_df.rename(columns={'matchup.batter.fullName': 'hitter', 'matchup.pitcher.fullName': 'pitcher'})

In [35]:
# Attach hitter stats to every event row by full name. This is an inner join, so rows
# whose hitter name has no match in hitter_df are dropped.
# NOTE(review): joining on names assumes each name appears once in hitter_df — duplicated
# names would duplicate event rows; verify against the CSV.
merged = hitter_df.merge(main_df, how='inner', on='hitter')

In [36]:
# Attach pitcher stats the same way (inner join on the pitcher's full name).
# Both stat tables carry a WAR column, so the result has WAR_x (from pitcher_df, the left
# side) and WAR_y (the hitter's WAR already present in `merged`).
full_merge = pitcher_df.merge(merged, how='inner', on='pitcher')

In [37]:
# Preview the fully merged table (pitcher stats + hitter stats + per-event fields).
full_merge.head(10)

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.atBatIndex,about.halfInning,about.inning,count.balls,count.strikes,details.call.code,details.call.description,details.description,matchup.batSide.code,matchup.pitchHand.code,matchup.pitcher.id,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,previous_pitch_code,pitch_type,prior_pitch_type
0,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,top,9,1.0,0.0,B,Ball - Called,Ball,R,R,434778,Men_On,34.47,14.0,1.0,,Changeup,
1,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,top,9,1.0,1.0,S,Strike - Swinging,Foul,R,R,434778,Men_On,32.94,1.0,2.0,CH,Changeup,Changeup
2,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,top,9,1.0,2.0,S,Strike - Swinging,Foul,R,R,434778,Men_On,31.44,4.0,3.0,CH,Breaking_Ball,Changeup
3,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,top,9,2.0,2.0,B,Ball - Called,Ball,R,R,434778,Men_On,2.66,14.0,4.0,SL,Breaking_Ball,Breaking_Ball
4,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,top,9,3.0,2.0,B,Ball - Called,Ball,R,R,434778,Men_On,3.82,14.0,5.0,CU,Breaking_Ball,Breaking_Ball
5,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,top,9,4.0,2.0,B,Ball - Called,Ball,R,R,434778,Men_On,35.64,11.0,6.0,CU,Breaking_Ball,Breaking_Ball
6,Kendrys Morales,0.0,1.0,0.0,0,Stephen Piscotty,0.491,0.821,2.8,71,top,9,0.0,0.0,X,Hit Into Play - Out(s),"In play, out(s)",R,R,434778,Men_On,75.87,12.0,1.0,,Breaking_Ball,
7,Kendrys Morales,0.0,1.0,0.0,0,Matt Olson,0.453,0.788,4.3,69,top,9,0.0,0.0,,,Pitching Change: Kendrys Morales replaces Deck...,L,R,434778,Empty,,,,,,
8,Kendrys Morales,0.0,1.0,0.0,0,Matt Olson,0.453,0.788,4.3,69,top,9,1.0,0.0,B,Ball - Called,Ball,L,R,434778,Empty,69.03,14.0,1.0,,Breaking_Ball,
9,Kendrys Morales,0.0,1.0,0.0,0,Matt Olson,0.453,0.788,4.3,69,top,9,2.0,0.0,B,Ball - Called,Ball,L,R,434778,Empty,10.87,11.0,2.0,SL,Changeup,Breaking_Ball


In [82]:
# Export of the merged table is currently disabled; uncomment the line below to write it
# to public_data/master.csv.
#full_merge.to_csv(r'public_data/master.csv', index=False, sep=',', encoding='utf-8')

In [38]:
# Work on an independent copy: the following cells assign columns on `add_feats` in place,
# and a plain alias would silently rewrite `full_merge` as well (breaking re-runs of
# earlier cells and any later export of the untouched merged table).
add_feats = full_merge.copy()

In [39]:
# Cast the ball count to string so it can be concatenated into a 'count' label.
# The column holds floats at this point, so values render as e.g. '1.0'.
add_feats['count.balls'] = add_feats['count.balls'].astype(str)

In [40]:
# Cast the strike count to string so it can be concatenated into a 'count' label.
# The column holds floats at this point, so values render as e.g. '2.0'.
add_feats['count.strikes'] = add_feats['count.strikes'].astype(str)

In [42]:
# Combine balls and strikes into a single label of the form '<balls>-<strikes>'
# (e.g. '1.0-2.0', because both parts were cast from floats).
add_feats['count'] = add_feats['count.balls'] + '-' + add_feats['count.strikes'] 

In [43]:
# The separate ball/strike columns are now captured by 'count', and the raw previous-pitch
# code by 'prior_pitch_type'; drop them together with the call code.
add_feats = add_feats.drop(
    columns=[
        'previous_pitch_code',
        'details.call.code',
        'count.balls',
        'count.strikes',
    ]
)

In [44]:
# Alias only (no copy is made): final_pitches refers to the same frame as add_feats.
final_pitches = add_feats

In [45]:
# Show only the first rows; displaying the whole per-event frame bloats the notebook.
final_pitches.head(10)

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.atBatIndex,about.halfInning,about.inning,details.call.description,details.description,matchup.batSide.code,matchup.pitchHand.code,matchup.pitcher.id,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,pitch_type,prior_pitch_type,count
0,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,top,9,Ball - Called,Ball,R,R,434778,Men_On,34.47,14.0,1.0,Changeup,,1.0-0.0
1,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,top,9,Strike - Swinging,Foul,R,R,434778,Men_On,32.94,1.0,2.0,Changeup,Changeup,1.0-1.0
2,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,top,9,Strike - Swinging,Foul,R,R,434778,Men_On,31.44,4.0,3.0,Breaking_Ball,Changeup,1.0-2.0
3,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,top,9,Ball - Called,Ball,R,R,434778,Men_On,2.66,14.0,4.0,Breaking_Ball,Breaking_Ball,2.0-2.0
4,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,top,9,Ball - Called,Ball,R,R,434778,Men_On,3.82,14.0,5.0,Breaking_Ball,Breaking_Ball,3.0-2.0
5,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,top,9,Ball - Called,Ball,R,R,434778,Men_On,35.64,11.0,6.0,Breaking_Ball,Breaking_Ball,4.0-2.0
6,Kendrys Morales,0.0,1.0,0.0,0,Stephen Piscotty,0.491,0.821,2.8,71,top,9,Hit Into Play - Out(s),"In play, out(s)",R,R,434778,Men_On,75.87,12.0,1.0,Breaking_Ball,,0.0-0.0
7,Kendrys Morales,0.0,1.0,0.0,0,Matt Olson,0.453,0.788,4.3,69,top,9,,Pitching Change: Kendrys Morales replaces Deck...,L,R,434778,Empty,,,,,,0.0-0.0
8,Kendrys Morales,0.0,1.0,0.0,0,Matt Olson,0.453,0.788,4.3,69,top,9,Ball - Called,Ball,L,R,434778,Empty,69.03,14.0,1.0,Breaking_Ball,,1.0-0.0
9,Kendrys Morales,0.0,1.0,0.0,0,Matt Olson,0.453,0.788,4.3,69,top,9,Ball - Called,Ball,L,R,434778,Empty,10.87,11.0,2.0,Changeup,Breaking_Ball,2.0-0.0
