## Modeling Stage One

In [1]:
import pandas as pd # Dataframes
from pandas.io.json import json_normalize # JSON wrangler
import statsapi # Python wrapper MLB data API

In [2]:
import numpy as np
# Seed NumPy's global random number generator with a fixed value (0).
np.random.seed(0)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn_pandas import DataFrameMapper, FunctionTransformer, gen_features, pipeline
from sklearn_pandas.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import seaborn as sns
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer
import xgboost as xgb
import os
import pitch_functions

In [74]:
# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None

#### Read in dataframes used for merge

In [46]:
# Load the hitter statistics CSV into a DataFrame.
hitter_df = pd.read_csv('public_data/hitter_data.csv')

In [47]:
# Load the pitcher statistics CSV into a DataFrame.
pitcher_df = pd.read_csv('public_data/pitcher_data.csv')

In [48]:
# Load the pitch data CSV; the hitter and pitcher stats are merged onto it below.
main_df = pd.read_csv('public_data/standard_pitch.csv')

In [52]:
# Preview the first rows of the hitter table (head() defaults to 5 rows).
hitter_df.head()

Unnamed: 0,PLAYER,SLG,OPS,WAR
0,Kyle Gibson,1.0,2.0,0.1
1,Enny Romero,2.0,3.0,0.1
2,Vidal Nuno,1.0,2.0,0.1
3,Derek Law,1.0,2.0,0.1
4,Randy Rosario,1.0,2.0,0.1


In [63]:
# Keep only the player name and the batting stats used in the merge, and rename
# PLAYER -> hitter so the column matches the join key in main_df.
# The rename runs first so this cell is safe to re-run: rename() ignores a
# label that is no longer present, whereas selecting 'PLAYER' a second time
# would raise KeyError once the column has already been renamed.
hitter_df = hitter_df.rename(columns={'PLAYER': 'hitter'})[['hitter', 'SLG', 'OPS', 'WAR']]
hitter_df.head(2)

Unnamed: 0,hitter,SLG,OPS,WAR
0,Kyle Gibson,1.0,2.0,0.1
1,Enny Romero,2.0,3.0,0.1


In [20]:
# Preview the first rows of the pitcher table (head() defaults to 5 rows).
pitcher_df.head()

Unnamed: 0,RK,PLAYER,TEAM,GP,GS,IP,H,R,ER,BB,SO,W,L,SV,BLSV,WAR,WHIP,ERA
0,1.0,Kendrys Morales,TOR,1,0,1.0,0,0,0,1,0,0,0,0,0,0.0,1.0,0.0
1,,Mark Reynolds,WSH,1,0,0.1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0
2,,Pablo Sandoval,SF,1,0,1.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0
3,,Danny Valencia,BAL,1,0,0.1,0,0,0,0,1,0,0,0,0,0.0,0.0,0.0
4,,Alex Avila,ARI,1,0,2.0,1,0,0,0,0,0,0,0,0,0.1,0.5,0.0


In [66]:
# Keep only the player name and the pitching stats used in the merge, and rename
# PLAYER -> pitcher so the column matches the join key in main_df.
# The rename runs first so this cell is safe to re-run: rename() ignores a
# label that is no longer present, whereas selecting 'PLAYER' a second time
# would raise KeyError once the column has already been renamed.
pitcher_df = pitcher_df.rename(columns={'PLAYER': 'pitcher'})[['pitcher', 'WAR', 'WHIP', 'ERA', 'SO']]
pitcher_df.head(2)

Unnamed: 0,pitcher,WAR,WHIP,ERA,SO
0,Kendrys Morales,0.0,1.0,0.0,0
1,Mark Reynolds,0.0,0.0,0.0,0


In [41]:
# Preview the first rows of the pitch table (head() defaults to 5 rows).
main_df.head()

Unnamed: 0,about.atBatIndex,about.halfInning,about.inning,count.balls,count.strikes,details.call.code,details.call.description,details.description,matchup.batSide.code,matchup.batter.fullName,matchup.pitchHand.code,matchup.pitcher.fullName,matchup.pitcher.id,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,previous_pitch_code,pitch_type,prior_pitch_type
0,0,top,1,0.0,0.0,X,Hit Into Play - Out(s),"In play, run(s)",L,Ian Happ,R,Jose Urena,570632,Empty,32.89,6.0,1.0,,Fastball,
1,1,top,1,1.0,0.0,B,Ball - Called,Ball,R,Kris Bryant,R,Jose Urena,570632,Men_On,24.17,13.0,1.0,,Fastball,
2,1,top,1,2.0,0.0,B,Ball - Called,Ball,R,Kris Bryant,R,Jose Urena,570632,Men_On,29.02,13.0,2.0,FT,Fastball,Fastball
3,1,top,1,2.0,1.0,S,Strike - Swinging,Swinging Strike,R,Kris Bryant,R,Jose Urena,570632,Men_On,41.63,13.0,3.0,FT,Fastball,Fastball
4,1,top,1,3.0,1.0,B,Ball - Called,Ball,R,Kris Bryant,R,Jose Urena,570632,Men_On,59.33,13.0,4.0,FT,Changeup,Fastball


In [60]:
# Rename the batter/pitcher full-name columns to the join keys used by the
# hitter and pitcher stats tables.
main_df = main_df.rename(
    columns={
        'matchup.batter.fullName': 'hitter',
        'matchup.pitcher.fullName': 'pitcher',
    }
)

Unnamed: 0,about.atBatIndex,about.halfInning,about.inning,count.balls,count.strikes,details.call.code,details.call.description,details.description,matchup.batSide.code,hitter,matchup.pitchHand.code,pitcher,matchup.pitcher.id,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,previous_pitch_code,pitch_type,prior_pitch_type
0,0,top,1,0.0,0.0,X,Hit Into Play - Out(s),"In play, run(s)",L,Ian Happ,R,Jose Urena,570632,Empty,32.89,6.0,1.0,,Fastball,
1,1,top,1,1.0,0.0,B,Ball - Called,Ball,R,Kris Bryant,R,Jose Urena,570632,Men_On,24.17,13.0,1.0,,Fastball,
2,1,top,1,2.0,0.0,B,Ball - Called,Ball,R,Kris Bryant,R,Jose Urena,570632,Men_On,29.02,13.0,2.0,FT,Fastball,Fastball
3,1,top,1,2.0,1.0,S,Strike - Swinging,Swinging Strike,R,Kris Bryant,R,Jose Urena,570632,Men_On,41.63,13.0,3.0,FT,Fastball,Fastball


In [68]:
# Attach hitter stats to the pitch rows by joining on the 'hitter' name.
# merge() defaults to an inner join, so rows whose hitter name is absent from
# either table are dropped.
merged = hitter_df.merge(main_df, on='hitter')

In [70]:
# Attach pitcher stats by joining on the 'pitcher' name (inner join by default).
# Both inputs carry a 'WAR' column, so pandas applies its default suffixes:
# WAR_x comes from pitcher_df (left side) and WAR_y from merged (right side,
# i.e. the hitter's WAR).
full_merge = pitcher_df.merge(merged, on='pitcher')

In [81]:
# Preview the first 10 rows of the fully merged table.
full_merge.head(10)

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.atBatIndex,about.halfInning,about.inning,count.balls,count.strikes,details.call.code,details.call.description,details.description,matchup.batSide.code,matchup.pitchHand.code,matchup.pitcher.id,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,previous_pitch_code,pitch_type,prior_pitch_type
0,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,top,9,1.0,0.0,B,Ball - Called,Ball,R,R,434778,Men_On,34.47,14.0,1.0,,Changeup,
1,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,top,9,1.0,1.0,S,Strike - Swinging,Foul,R,R,434778,Men_On,32.94,1.0,2.0,CH,Changeup,Changeup
2,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,top,9,1.0,2.0,S,Strike - Swinging,Foul,R,R,434778,Men_On,31.44,4.0,3.0,CH,Breaking_Ball,Changeup
3,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,top,9,2.0,2.0,B,Ball - Called,Ball,R,R,434778,Men_On,2.66,14.0,4.0,SL,Breaking_Ball,Breaking_Ball
4,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,top,9,3.0,2.0,B,Ball - Called,Ball,R,R,434778,Men_On,3.82,14.0,5.0,CU,Breaking_Ball,Breaking_Ball
5,Kendrys Morales,0.0,1.0,0.0,0,Matt Chapman,0.508,0.864,8.2,70,top,9,4.0,2.0,B,Ball - Called,Ball,R,R,434778,Men_On,35.64,11.0,6.0,CU,Breaking_Ball,Breaking_Ball
6,Kendrys Morales,0.0,1.0,0.0,0,Stephen Piscotty,0.491,0.821,2.8,71,top,9,0.0,0.0,X,Hit Into Play - Out(s),"In play, out(s)",R,R,434778,Men_On,75.87,12.0,1.0,,Breaking_Ball,
7,Kendrys Morales,0.0,1.0,0.0,0,Matt Olson,0.453,0.788,4.3,69,top,9,0.0,0.0,,,Pitching Change: Kendrys Morales replaces Deck...,L,R,434778,Empty,,,,,,
8,Kendrys Morales,0.0,1.0,0.0,0,Matt Olson,0.453,0.788,4.3,69,top,9,1.0,0.0,B,Ball - Called,Ball,L,R,434778,Empty,69.03,14.0,1.0,,Breaking_Ball,
9,Kendrys Morales,0.0,1.0,0.0,0,Matt Olson,0.453,0.788,4.3,69,top,9,2.0,0.0,B,Ball - Called,Ball,L,R,434778,Empty,10.87,11.0,2.0,SL,Changeup,Breaking_Ball


In [82]:
# Write the merged table to public_data/master.csv as comma-separated UTF-8,
# omitting the DataFrame index.
full_merge.to_csv(
    'public_data/master.csv',
    sep=',',
    encoding='utf-8',
    index=False,
)