In [1]:
#DATA WRANGLING
import pandas as pd # Dataframes
from pandas.io.json import json_normalize # JSON wrangler
import statsapi # Python wrapper MLB data API

In [2]:
#DATA STORAGE
#from sqlalchemy import create_engine # SQL helper
import psycopg2 as psql #PostgreSQL DBs

In [3]:
#DATA MANIPULATION AND MODELLING
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn_pandas import DataFrameMapper, FunctionTransformer, gen_features, pipeline
from sklearn_pandas.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import seaborn as sns
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer
import xgboost as xgb
import os

In [4]:
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [29]:
statsapi.schedule

[0;31mSignature:[0m
[0mstatsapi[0m[0;34m.[0m[0mschedule[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m[[0m[0;34m'date=None'[0m[0;34m,[0m [0;34m'start_date=None'[0m[0;34m,[0m [0;34m'end_date=None'[0m[0;34m,[0m [0;34m"team=''"[0m[0;34m,[0m [0;34m"opponent=''"[0m[0;34m,[0m [0;34m'sportId=1'[0m[0;34m,[0m [0;34m'game_id=None'[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Get list of games for a given date/range and/or team/opponent.

Include a game_id to get data for that game.

Output will be a list containing a dict for each game. Fields in the dict:

'game_id': unique MLB game id (primary key, or gamePk)
'game_datetime': date and timestamp in UTC (be careful if you truncate the time--the date may be the next day for a late game)
'game_date': date of game (YYYY-MM-DD)
'game_type': Preseason, Regular season, Postseason, etc. Look up possible values using the meta endpoint with type=gameType

In [32]:
team_list = ['108','136']

## Data Extraction
Use the StatsApi**inseert hyperlink** to collect infomation about all games played between start and ending of the season(use actual) dates. We need to collect .json file per mlb team, so I used a for loop will load and write out the schedule data as a .json file. 


Importing team codes from TSV file in Public Data direcrtory. Then edit the team names so that they are lowercase and seperated by an underscore. 

In [82]:
teams = pd.read_csv('public_data/team_codes.tsv', sep='\t')

teams['full_name'] = teams['full_name'].str.lower().str.replace(' ', '_').str.replace('-','_')

In [83]:
teams
    

Unnamed: 0,code,short_name,full_name
0,108,LAA,angels
1,109,ARI,d_backs
2,110,BAL,orioles
3,111,BOS,red_sox
4,112,CHC,cubs
5,113,CIN,reds
6,114,CLE,indians
7,115,COL,rockies
8,116,DET,tigers
9,117,HOU,astros


In [69]:
mascots = []

for code, team_name in zip(teams['code'],teams['full_name']):
    print(f'The {team_name} has code {code}')

The Angels has code 108
The D-backs has code 109
The Orioles has code 110
The Red Sox has code 111
The Cubs has code 112
The Reds has code 113
The Indians has code 114
The Rockies has code 115
The Tigers has code 116
The Astros has code 117
The Royals has code 118
The Dodgers has code 119
The Nationals has code 120
The Mets has code 121
The Athletics has code 133
The Pirates has code 134
The Padres has code 135
The Mariners has code 136
The Giants has code 137
The Cardinals has code 138
The Rays has code 139
The Rangers has code 140
The Blue Jays has code 141
The Twins has code 142
The Phillies has code 143
The Braves has code 144
The White Sox has code 145
The Marlins has code 146
The Yankees has code 147
The Brewers has code 158


In [None]:
for team in team_:
    schedule = statsapi.schedule(start_date="03/28/2018", end_date="05/01/2018", team=)


In [36]:
full = json_normalize(schedule)
gamepks= full['game_id']

ValueError: Request failed. Status Code: 400.

In [35]:
gamepks_2018 = list(gamepks.unique())
len(gamepks_2018)

30

In [21]:
# Get one game from API
list_for_new_df = []
#gamepks = [566389]
for game in gamepks_2018:
    #print(game)
    curr_game = statsapi.get('game_playByPlay',{'gamePk':game})

    ### 3. Extract play-by-play data and store into dataframe.

    # Only care about the allPlays key 
    curr_plays = curr_game.get('allPlays')

    # Coerce all plays into a df
    curr_plays_df = json_normalize(curr_plays)

    ###################################
    # Build target table
    ###################################


    # Data from allPlays
    ap_sel_cols = ['about.atBatIndex', 'matchup.batSide.code', 'matchup.pitchHand.code', 'count.balls'
              ,'count.strikes', 'count.outs']

    # Data from playEvents
    plev_sel_cols = ['details.type.code', 'details.type.description', 
            'details.call.code', 'details.call.description', 
            'details.isBall', 'isPitch', 'details.isStrike'
            ,'pitchData.breaks.breakAngle'
            ,'pitchData.breaks.breakLength', 'pitchData.breaks.breakY'
            ,'pitchData.breaks.spinDirection', 'pitchData.breaks.spinRate'
            ,'pitchData.coordinates.aX'
            , 'pitchData.coordinates.aY','pitchData.coordinates.aZ', 'pitchData.coordinates.pX'
            , 'pitchData.coordinates.pZ', 'pitchData.coordinates.pfxX', 'pitchData.coordinates.pfxZ'
            , 'pitchData.coordinates.vX0', 'pitchData.coordinates.vY0', 'pitchData.coordinates.vZ0'
            , 'pitchData.coordinates.x', 'pitchData.coordinates.x0', 'pitchData.coordinates.y'
            , 'pitchData.coordinates.y0','pitchData.coordinates.z0', 'pitchData.endSpeed'
            , 'pitchData.startSpeed', 'pitchNumber', 'pitchData.zone'
           ]

    # Now go through each row. If there is nested list, json_normalize it
    #for index, row in test_df.head(2).iterrows(): #Just using first 2 rows for testing
    for index, row in curr_plays_df.iterrows(): #Just using first 2 rows for testing

        # saw playEvents is a nested list, so json_normalize it
        play_events_df = json_normalize(row['playEvents'])

        #     # look at runners
        #     runners_df = json_normalize(row['runners'])

        # Loop through THIS NESTED dataframe and NOW build the row for the new df    
        for plev_ind, plev_row in play_events_df.iterrows():

            # Instantiate new dict, which will be a single row in target df
            curr_dict = {}
            curr_dict['game_pk'] = game

            # Loop through each list, adding their respective values to curr_dict
            for col_ap in ap_sel_cols:
                if col_ap in curr_plays_df.columns:
                    curr_dict[col_ap] = row[col_ap]
                else:
                    curr_dict[col_ap] = np.nan
                #print(row['about.atBatIndex'])

            for col_plev in plev_sel_cols:
                if col_plev in play_events_df.columns:
                    curr_dict[col_plev] = plev_row[col_plev]
                else:
                    curr_dict[col_plev] = np.nan

            # collect row dictionary into list
            list_for_new_df.append(curr_dict)

In [24]:
len(list_for_new_df)

9067

In [25]:
pitches_df = pd.DataFrame(list_for_new_df)

In [27]:
pitches_df.head(10)

Unnamed: 0,about.atBatIndex,count.balls,count.outs,count.strikes,details.call.code,details.call.description,details.isBall,details.isStrike,details.type.code,details.type.description,...,pitchData.coordinates.vZ0,pitchData.coordinates.x,pitchData.coordinates.x0,pitchData.coordinates.y,pitchData.coordinates.y0,pitchData.coordinates.z0,pitchData.endSpeed,pitchData.startSpeed,pitchData.zone,pitchNumber
0,0,1,1,0,B,Ball - Called,True,False,FF,Four-Seam Fastball,...,-9.54,146.72,-2.46,216.58,50.0,5.64,82.3,89.6,13.0,1.0
1,0,1,1,0,X,Hit Into Play - Out(s),False,False,SI,Sinker,...,-7.1,117.07,-2.07,200.69,50.0,5.8,83.1,90.6,13.0,2.0
2,1,1,2,0,B,Ball - Called,True,False,SI,Sinker,...,-4.91,156.29,-2.2,174.75,50.0,5.9,83.6,91.4,13.0,1.0
3,1,1,2,0,X,Hit Into Play - Out(s),False,False,SI,Sinker,...,-6.28,132.08,-2.14,198.39,50.0,5.72,82.8,90.9,7.0,2.0
4,2,3,3,2,S,Strike - Swinging,False,True,SI,Sinker,...,-5.99,113.29,-2.07,189.93,50.0,5.63,83.1,90.9,8.0,1.0
5,2,3,3,2,S,Strike - Swinging,False,True,CU,Curveball,...,-1.63,146.76,-2.29,200.34,50.0,5.95,73.4,80.1,13.0,2.0
6,2,3,3,2,B,Ball - Called,True,False,FF,Four-Seam Fastball,...,-5.02,188.63,-2.37,158.59,50.0,5.92,84.3,92.4,11.0,3.0
7,2,3,3,2,B,Ball - Called,True,False,SI,Sinker,...,-7.05,133.81,-2.1,229.01,50.0,5.64,80.3,87.2,13.0,4.0
8,2,3,3,2,B,Ball - Called,True,False,CU,Curveball,...,-3.54,119.28,-2.34,221.9,50.0,5.81,74.4,80.9,13.0,5.0
9,2,3,3,2,S,Strike - Swinging,False,True,CU,Curveball,...,-1.29,100.55,-2.2,197.26,50.0,5.83,74.6,81.1,9.0,6.0
