# Exploratory Data Analysis One
### In this notebook, I will be bringing in data, arranging it so that it can be used properly, and then exporting it to a .json file so I can use it on the cloud. 

#### Importing the necessary libraries

In [1]:
#DATA WRANGLING
import pandas as pd # Dataframes
from pandas.io.json import json_normalize # JSON wrangler
import statsapi # Python wrapper MLB data API

In [2]:
#DATA STORAGE
#from sqlalchemy import create_engine # SQL helper
import psycopg2 as psql #PostgreSQL DBs

In [3]:
#DATA MANIPULATION AND MODELLING
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn_pandas import DataFrameMapper, FunctionTransformer, gen_features, pipeline
from sklearn_pandas.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import seaborn as sns
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer
import xgboost as xgb
import os

In [4]:
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [5]:
#statsapi.schedule

In [6]:
team_list = ['108','136']

## Data Extraction
Use the StatsApi **insert hyperlink** to collect infomation about all games played between start and ending of the season(use actual) dates. We need to collect .json file per mlb team, so I used a for loop will load and write out the schedule data as a .json file. 


Importing team codes from TSV file in Public Data direcrtory. Then edit the team names so that they are lowercase and seperated by an underscore. 

In [7]:
teams = pd.read_csv('public_data/team_codes.tsv', sep='\t')

teams['full_name'] = teams['full_name'].str.lower().str.replace(' ', '_').str.replace('-','_')

In [8]:
teams.head()

Unnamed: 0,code,short_name,full_name
0,108,LAA,angels
1,109,ARI,d_backs
2,110,BAL,orioles
3,111,BOS,red_sox
4,112,CHC,cubs


In [9]:
for code, team_name in zip(teams['code'],teams['full_name']):
    print(f'The {team_name} has code {code}')

The angels has code 108
The d_backs has code 109
The orioles has code 110
The red_sox has code 111
The cubs has code 112
The reds has code 113
The indians has code 114
The rockies has code 115
The tigers has code 116
The astros has code 117
The royals has code 118
The dodgers has code 119
The nationals has code 120
The mets has code 121
The athletics has code 133
The pirates has code 134
The padres has code 135
The mariners has code 136
The giants has code 137
The cardinals has code 138
The rays has code 139
The rangers has code 140
The blue_jays has code 141
The twins has code 142
The phillies has code 143
The braves has code 144
The white_sox has code 145
The marlins has code 146
The yankees has code 147
The brewers has code 158


In [10]:
schedule = statsapi.schedule(start_date="03/28/2018", end_date="04/01/2018")


In [11]:
full = json_normalize(schedule)
gamepks= full['game_id']

In [12]:
full.head(2)

Unnamed: 0,away_id,away_name,away_pitcher_note,away_probable_pitcher,away_score,current_inning,doubleheader,game_date,game_datetime,game_id,...,home_probable_pitcher,home_score,inning_state,losing_pitcher,losing_team,save_pitcher,status,summary,winning_pitcher,winning_team
0,112,Chicago Cubs,Lester will make his 7th Opening Day start and...,"Lester, Jon",8,9.0,N,2018-03-29,2018-03-29T16:40:00Z,529407,...,"Urena, Jose",4,Bottom,Jose Urena,Miami Marlins,,Final,2018-03-29 - Chicago Cubs (8) @ Miami Marlins ...,Steve Cishek,Chicago Cubs
1,134,Pittsburgh Pirates,Nova will make his first Opening Day start at ...,"Nova, Ivan",0,,N,2018-03-29,2018-03-29T17:10:00Z,529417,...,"Zimmermann, Jordan",0,,,,,Postponed,2018-03-29 - Pittsburgh Pirates @ Detroit Tige...,,


In [13]:
gamepks_2018 = list(gamepks.unique())
len(gamepks_2018)

51

In [14]:
test_pk = gamepks_2018[:6]

In [15]:
pd.set_option('display.max_columns', None)

In [16]:
test_pk

[529407, 529417, 529419, 529414, 529411, 529415]

This for loop is designed to bring in the pitch by pitch data from each game, which is written in .json and it will flatten the .json out in levels. I defined columns names from the original data pulled in and I'm going to use the two flattened levels in order to make one cohesive dataframe with all of the columns that I would like. 

In [17]:
list_for_final_df = []


for game in test_pk:
    print(game)
    curr_game = statsapi.get('game_playByPlay',{'gamePk':game})
    curr_plays = curr_game.get('allPlays')
    curr_plays_df = pd.DataFrame(curr_plays)
    curr_plays_norm = json_normalize(curr_plays)
    
    all_plays_cols = ['about.atBatIndex', 'about.halfInning', 'about.inning', 'count.balls', 'count.strikes', 'matchup.batSide.code', 
                     'matchup.batter.fullName', 'matchup.batter.id', 'matchup.pitchHand.code', 'matchup.splits.menOnBase', 'matchup.pitcher.fullName',
                     'matchup.pitcher.id', 'result.eventType']
    
    play_events_cols = ['count.balls', 'count.strikes', 'details.ballColor', 'details.call.code', 'details.call.description', 'details.type.description'
                        ,'details.call.code', 'details.description', 'details.code', 'details.type.code', 'index', 'pitchData.nastyFactor',
                       'pitchData.zone', 'pitchNumber', 'type']
    i = 1
    for index, row in curr_plays_norm.iterrows():
            play_events = json_normalize(row['playEvents'])
            
            for play_events_idx, play_events_row in play_events.iterrows():
                
                game_dict = {}
                game_dict['gamepk'] = game
                game_dict['pitch_id']  = str(game) + '_' + str(row['about.atBatIndex']) + '_' + str(i)
                game_dict['prior_pitch'] = str(game) + '_' + (str(row['about.atBatIndex']) + '_' + str(i - 1))
                
                
                for col_all_plays in all_plays_cols:
                    if col_all_plays in curr_plays_norm.columns:
                        game_dict[col_all_plays] = row[col_all_plays]
                    else:
                        game_dict[col_all_plays] = np.nan
                for col_play_events in play_events_cols:
                    if col_play_events in play_events.columns:
                        game_dict[col_play_events] = play_events_row[col_play_events]
                    else: 
                        game_dict[col_play_events] = np.nan
                
                list_for_final_df.append(game_dict)
                i += 1
                                                              
                                                              
                
                                                            
            
            

        
    


529407
529417
529419
529414
529411
529415


In [18]:
each_pitch = pd.DataFrame(list_for_final_df)
each_pitch.head(10)

Unnamed: 0,about.atBatIndex,about.halfInning,about.inning,count.balls,count.strikes,details.ballColor,details.call.code,details.call.description,details.code,details.description,details.type.code,details.type.description,gamepk,index,matchup.batSide.code,matchup.batter.fullName,matchup.batter.id,matchup.pitchHand.code,matchup.pitcher.fullName,matchup.pitcher.id,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,pitch_id,prior_pitch,result.eventType,type
0,0,top,1,0.0,0.0,"rgba(26, 86, 190, 1.0)",X,Hit Into Play - Out(s),E,"In play, run(s)",FT,Two-Seam Fastball,529407,0,L,Ian Happ,664023,R,Jose Urena,570632,Empty,32.89,6.0,1.0,529407_0_1,529407_0_0,home_run,pitch
1,1,top,1,1.0,0.0,"rgba(39, 161, 39, 1.0)",B,Ball - Called,B,Ball,FT,Two-Seam Fastball,529407,0,R,Kris Bryant,592178,R,Jose Urena,570632,Men_On,24.17,13.0,1.0,529407_1_2,529407_1_1,walk,pitch
2,1,top,1,2.0,0.0,"rgba(39, 161, 39, 1.0)",B,Ball - Called,B,Ball,FT,Two-Seam Fastball,529407,1,R,Kris Bryant,592178,R,Jose Urena,570632,Men_On,29.02,13.0,2.0,529407_1_3,529407_1_2,walk,pitch
3,1,top,1,2.0,1.0,"rgba(170, 21, 11, 1.0)",S,Strike - Swinging,S,Swinging Strike,FT,Two-Seam Fastball,529407,2,R,Kris Bryant,592178,R,Jose Urena,570632,Men_On,41.63,13.0,3.0,529407_1_4,529407_1_3,walk,pitch
4,1,top,1,3.0,1.0,"rgba(39, 161, 39, 1.0)",B,Ball - Called,B,Ball,CH,Changeup,529407,3,R,Kris Bryant,592178,R,Jose Urena,570632,Men_On,59.33,13.0,4.0,529407_1_5,529407_1_4,walk,pitch
5,1,top,1,3.0,2.0,"rgba(170, 21, 11, 1.0)",S,Strike - Swinging,F,Foul,FT,Two-Seam Fastball,529407,4,R,Kris Bryant,592178,R,Jose Urena,570632,Men_On,39.41,4.0,5.0,529407_1_6,529407_1_5,walk,pitch
6,1,top,1,4.0,2.0,"rgba(39, 161, 39, 1.0)",B,Ball - Called,B,Ball,CH,Changeup,529407,5,R,Kris Bryant,592178,R,Jose Urena,570632,Men_On,14.88,11.0,6.0,529407_1_7,529407_1_6,walk,pitch
7,2,top,1,0.0,1.0,"rgba(170, 21, 11, 1.0)",S,Strike - Swinging,F,Foul,FT,Two-Seam Fastball,529407,0,L,Anthony Rizzo,519203,R,Jose Urena,570632,RISP,45.62,9.0,1.0,529407_2_8,529407_2_7,hit_by_pitch,pitch
8,2,top,1,0.0,2.0,"rgba(170, 21, 11, 1.0)",S,Strike - Swinging,F,Foul,FT,Two-Seam Fastball,529407,1,L,Anthony Rizzo,519203,R,Jose Urena,570632,RISP,31.03,6.0,2.0,529407_2_9,529407_2_8,hit_by_pitch,pitch
9,2,top,1,1.0,2.0,"rgba(39, 161, 39, 1.0)",B,Ball - Called,B,Ball,FT,Two-Seam Fastball,529407,2,L,Anthony Rizzo,519203,R,Jose Urena,570632,RISP,53.95,12.0,3.0,529407_2_10,529407_2_9,hit_by_pitch,pitch


In [19]:
each_pitch.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2183 entries, 0 to 2182
Data columns (total 28 columns):
about.atBatIndex            2183 non-null int64
about.halfInning            2183 non-null object
about.inning                2183 non-null int64
count.balls                 2140 non-null float64
count.strikes               2140 non-null float64
details.ballColor           1998 non-null object
details.call.code           1998 non-null object
details.call.description    1998 non-null object
details.code                2041 non-null object
details.description         2183 non-null object
details.type.code           1979 non-null object
details.type.description    1979 non-null object
gamepk                      2183 non-null int64
index                       2183 non-null int64
matchup.batSide.code        2183 non-null object
matchup.batter.fullName     2183 non-null object
matchup.batter.id           2183 non-null int64
matchup.pitchHand.code      2183 non-null object
matchup.pitche

NEXT STEPS:
- Get previous pitch column figured out
- Scrape June 2018 and Sept 2018 hitting and pitching data
- Match the player stats to the name column in the pitch data
- Modify men on base 
- Drop play result and type column

    

In [23]:
pitch_id_df = each_pitch[['pitch_id', 'details.type.code']].copy()
pitch_id_df.head()

Unnamed: 0,pitch_id,details.type.code
0,529407_0_1,FT
1,529407_1_2,FT
2,529407_1_3,FT
3,529407_1_4,FT
4,529407_1_5,CH


In [24]:
merged_df = pd.merge(each_pitch, pitch_id_df,how='left', left_on='prior_pitch', right_on='pitch_id')

In [26]:
merged_df.head(10)

Unnamed: 0,about.atBatIndex,about.halfInning,about.inning,count.balls,count.strikes,details.ballColor,details.call.code,details.call.description,details.code,details.description,details.type.code_x,details.type.description,gamepk,index,matchup.batSide.code,matchup.batter.fullName,matchup.batter.id,matchup.pitchHand.code,matchup.pitcher.fullName,matchup.pitcher.id,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,pitch_id_x,prior_pitch,result.eventType,type,pitch_id_y,details.type.code_y
0,0,top,1,0.0,0.0,"rgba(26, 86, 190, 1.0)",X,Hit Into Play - Out(s),E,"In play, run(s)",FT,Two-Seam Fastball,529407,0,L,Ian Happ,664023,R,Jose Urena,570632,Empty,32.89,6.0,1.0,529407_0_1,529407_0_0,home_run,pitch,,
1,1,top,1,1.0,0.0,"rgba(39, 161, 39, 1.0)",B,Ball - Called,B,Ball,FT,Two-Seam Fastball,529407,0,R,Kris Bryant,592178,R,Jose Urena,570632,Men_On,24.17,13.0,1.0,529407_1_2,529407_1_1,walk,pitch,,
2,1,top,1,2.0,0.0,"rgba(39, 161, 39, 1.0)",B,Ball - Called,B,Ball,FT,Two-Seam Fastball,529407,1,R,Kris Bryant,592178,R,Jose Urena,570632,Men_On,29.02,13.0,2.0,529407_1_3,529407_1_2,walk,pitch,529407_1_2,FT
3,1,top,1,2.0,1.0,"rgba(170, 21, 11, 1.0)",S,Strike - Swinging,S,Swinging Strike,FT,Two-Seam Fastball,529407,2,R,Kris Bryant,592178,R,Jose Urena,570632,Men_On,41.63,13.0,3.0,529407_1_4,529407_1_3,walk,pitch,529407_1_3,FT
4,1,top,1,3.0,1.0,"rgba(39, 161, 39, 1.0)",B,Ball - Called,B,Ball,CH,Changeup,529407,3,R,Kris Bryant,592178,R,Jose Urena,570632,Men_On,59.33,13.0,4.0,529407_1_5,529407_1_4,walk,pitch,529407_1_4,FT
5,1,top,1,3.0,2.0,"rgba(170, 21, 11, 1.0)",S,Strike - Swinging,F,Foul,FT,Two-Seam Fastball,529407,4,R,Kris Bryant,592178,R,Jose Urena,570632,Men_On,39.41,4.0,5.0,529407_1_6,529407_1_5,walk,pitch,529407_1_5,CH
6,1,top,1,4.0,2.0,"rgba(39, 161, 39, 1.0)",B,Ball - Called,B,Ball,CH,Changeup,529407,5,R,Kris Bryant,592178,R,Jose Urena,570632,Men_On,14.88,11.0,6.0,529407_1_7,529407_1_6,walk,pitch,529407_1_6,FT
7,2,top,1,0.0,1.0,"rgba(170, 21, 11, 1.0)",S,Strike - Swinging,F,Foul,FT,Two-Seam Fastball,529407,0,L,Anthony Rizzo,519203,R,Jose Urena,570632,RISP,45.62,9.0,1.0,529407_2_8,529407_2_7,hit_by_pitch,pitch,,
8,2,top,1,0.0,2.0,"rgba(170, 21, 11, 1.0)",S,Strike - Swinging,F,Foul,FT,Two-Seam Fastball,529407,1,L,Anthony Rizzo,519203,R,Jose Urena,570632,RISP,31.03,6.0,2.0,529407_2_9,529407_2_8,hit_by_pitch,pitch,529407_2_8,FT
9,2,top,1,1.0,2.0,"rgba(39, 161, 39, 1.0)",B,Ball - Called,B,Ball,FT,Two-Seam Fastball,529407,2,L,Anthony Rizzo,519203,R,Jose Urena,570632,RISP,53.95,12.0,3.0,529407_2_10,529407_2_9,hit_by_pitch,pitch,529407_2_9,FT


In [27]:
each_pitch_merged = merged_df

In [29]:
each_pitch_merged.rename({'pitch_id_y': 'previous_pitch_in_ab'})

Unnamed: 0,about.atBatIndex,about.halfInning,about.inning,count.balls,count.strikes,details.ballColor,details.call.code,details.call.description,details.code,details.description,details.type.code_x,details.type.description,gamepk,index,matchup.batSide.code,matchup.batter.fullName,matchup.batter.id,matchup.pitchHand.code,matchup.pitcher.fullName,matchup.pitcher.id,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,pitch_id_x,prior_pitch,result.eventType,type,pitch_id_y,details.type.code_y
0,0,top,1,0.0,0.0,"rgba(26, 86, 190, 1.0)",X,Hit Into Play - Out(s),E,"In play, run(s)",FT,Two-Seam Fastball,529407,0,L,Ian Happ,664023,R,Jose Urena,570632,Empty,32.89,6.0,1.0,529407_0_1,529407_0_0,home_run,pitch,,
1,1,top,1,1.0,0.0,"rgba(39, 161, 39, 1.0)",B,Ball - Called,B,Ball,FT,Two-Seam Fastball,529407,0,R,Kris Bryant,592178,R,Jose Urena,570632,Men_On,24.17,13.0,1.0,529407_1_2,529407_1_1,walk,pitch,,
2,1,top,1,2.0,0.0,"rgba(39, 161, 39, 1.0)",B,Ball - Called,B,Ball,FT,Two-Seam Fastball,529407,1,R,Kris Bryant,592178,R,Jose Urena,570632,Men_On,29.02,13.0,2.0,529407_1_3,529407_1_2,walk,pitch,529407_1_2,FT
3,1,top,1,2.0,1.0,"rgba(170, 21, 11, 1.0)",S,Strike - Swinging,S,Swinging Strike,FT,Two-Seam Fastball,529407,2,R,Kris Bryant,592178,R,Jose Urena,570632,Men_On,41.63,13.0,3.0,529407_1_4,529407_1_3,walk,pitch,529407_1_3,FT
4,1,top,1,3.0,1.0,"rgba(39, 161, 39, 1.0)",B,Ball - Called,B,Ball,CH,Changeup,529407,3,R,Kris Bryant,592178,R,Jose Urena,570632,Men_On,59.33,13.0,4.0,529407_1_5,529407_1_4,walk,pitch,529407_1_4,FT
5,1,top,1,3.0,2.0,"rgba(170, 21, 11, 1.0)",S,Strike - Swinging,F,Foul,FT,Two-Seam Fastball,529407,4,R,Kris Bryant,592178,R,Jose Urena,570632,Men_On,39.41,4.0,5.0,529407_1_6,529407_1_5,walk,pitch,529407_1_5,CH
6,1,top,1,4.0,2.0,"rgba(39, 161, 39, 1.0)",B,Ball - Called,B,Ball,CH,Changeup,529407,5,R,Kris Bryant,592178,R,Jose Urena,570632,Men_On,14.88,11.0,6.0,529407_1_7,529407_1_6,walk,pitch,529407_1_6,FT
7,2,top,1,0.0,1.0,"rgba(170, 21, 11, 1.0)",S,Strike - Swinging,F,Foul,FT,Two-Seam Fastball,529407,0,L,Anthony Rizzo,519203,R,Jose Urena,570632,RISP,45.62,9.0,1.0,529407_2_8,529407_2_7,hit_by_pitch,pitch,,
8,2,top,1,0.0,2.0,"rgba(170, 21, 11, 1.0)",S,Strike - Swinging,F,Foul,FT,Two-Seam Fastball,529407,1,L,Anthony Rizzo,519203,R,Jose Urena,570632,RISP,31.03,6.0,2.0,529407_2_9,529407_2_8,hit_by_pitch,pitch,529407_2_8,FT
9,2,top,1,1.0,2.0,"rgba(39, 161, 39, 1.0)",B,Ball - Called,B,Ball,FT,Two-Seam Fastball,529407,2,L,Anthony Rizzo,519203,R,Jose Urena,570632,RISP,53.95,12.0,3.0,529407_2_10,529407_2_9,hit_by_pitch,pitch,529407_2_9,FT
