In [1]:
import json
import requests
import pandas as pd

In [2]:
# NOTE: %pylab pulls numpy and matplotlib names into the interactive namespace
# (see the message printed below); the `pylab` name used by the next cell
# comes from this magic, so it must run first.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
# make every figure in this notebook default to a 12 x 6 (width, height) size
pylab.rcParams.update({'figure.figsize': (12, 6)})

# Query API
It's a good idea to write out the raw source data received from querying the API before doing anything else. This is so that we can experiment with the data received without having to continually query the API. I will typically load this raw source data into a database as a JSON object, but in this situation, I'll just write it out to a file in JSON format.

In [4]:
# Download the information for all player prospects from the MLB feed.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces an HTTP error here instead of letting it show up
# later as a confusing JSON-decoding failure.
response = requests.get(
    "http://m.mlb.com/gen/players/prospects/2018/playerProspects.json",
    timeout=30,
)
response.raise_for_status()
data = response.json()

In [5]:
# persist the raw API response to disk, keeping non-ASCII characters unescaped
# NOTE(review): no explicit encoding is passed, so the platform default is
# used for the file -- confirm that is acceptable for names with accents
with open('2018_mlb_prospect_teams.json', mode='w') as raw_file:
    json.dump(data, raw_file, ensure_ascii=False)

# Extract, Transform, Load
After querying the API, the next step is to extract relevant information from the JSON data and load it into a format that we can use for analysis.

In [6]:
# load the saved API response back from disk; the context manager closes the
# file handle as soon as parsing finishes instead of leaving it open
with open('2018_mlb_prospect_teams.json') as f:
    teams = json.load(f)

In [7]:
# list the top-level keys of the loaded JSON object
teams.keys()

dict_keys(['year', 'last_updated', 'prospect_players'])

In [8]:
# show the feed metadata: when it was last updated and which year it covers
for template, key in (("Last Updated: {}", 'last_updated'), ("Year: {}", 'year')):
    print(template.format(teams[key]))

Last Updated: 2018-03-12T20:26:10.958Z
Year: 2018


In [9]:
# list the groupings stored under 'prospect_players'
# (team codes, plus what look like position codes and a few other lists)
teams['prospect_players'].keys()

dict_keys(['sd', 'kc', 'cin', 'phi', 'min', 'rhp', 'ss', 'ari', 'draft', '2b', '3b', 'lhp', 'nym', 'tex', 'la', 'nyy', 'cle', 'col', 'hou', 'stl', 'prospects', 'tor', 'was', 'bos', 'tb', 'bal', 'pit', 'c', 'mil', 'chc', 'ana', '1b', 'atl', 'cws', 'oak', 'det', 'of', 'sea', 'mia', 'sf'])

In [10]:
# show the full record (every field) of the first entry in the 'bos' list
teams['prospect_players']['bos'][0]

{'photo180x218': '/assets/images/7/5/8/264702758/cuts/180x218/cut.jpg',
 'photo360x436': '/assets/images/7/5/8/264702758/cuts/360x436/cut.jpg',
 'player_first_name': 'Michael',
 'player_id': 656308,
 'player_last_name': 'Chavis',
 'position': '3B',
 'preseason100': '',
 'preseason20': 1,
 'prospect_year': '2018',
 'rank': 1,
 'team_file_code': 'BOS',
 'thumb': '/assets/images/7/5/8/264702758/cuts/74x74/cut.jpg',
 'thumb124x150': '/assets/images/7/5/8/264702758/cuts/124x150/cut.jpg',
 'thumb62x75': '/assets/images/7/5/8/264702758/cuts/62x75/cut.jpg'}

In [11]:
# flatten every list under 'prospect_players' into one list of player records
# NOTE(review): the keys shown earlier include non-team groupings such as
# 'rhp', 'draft' and 'prospects', so the same player may be collected more
# than once -- confirm this is intended
players = [
    record
    for group in teams['prospect_players'].values()
    for record in group
]

In [12]:
# total number of player records collected across all lists
len(players)

1130

In [13]:
# show the first collected record to check the fields each player carries
players[0]

{'photo180x218': '/assets/images/3/0/4/264409304/cuts/180x218/cut.jpg',
 'photo360x436': '/assets/images/3/0/4/264409304/cuts/360x436/cut.jpg',
 'player_first_name': 'Fernando',
 'player_id': 665487,
 'player_last_name': 'Tatis Jr.',
 'position': 'SS',
 'preseason100': 8,
 'preseason20': 1,
 'prospect_year': '2018',
 'rank': 1,
 'team_file_code': 'SD',
 'thumb': '/assets/images/3/0/4/264409304/cuts/74x74/cut.jpg',
 'thumb124x150': '/assets/images/3/0/4/264409304/cuts/124x150/cut.jpg',
 'thumb62x75': '/assets/images/3/0/4/264409304/cuts/62x75/cut.jpg'}

In [14]:
# build a DataFrame with one row per collected player record
df = pd.DataFrame(data=players)

In [15]:
# preview the first rows of the DataFrame, including the image-path columns
df.head()

Unnamed: 0,photo180x218,photo360x436,player_first_name,player_id,player_last_name,position,preseason100,preseason20,prospect_year,rank,team_file_code,thumb,thumb124x150,thumb62x75
0,/assets/images/3/0/4/264409304/cuts/180x218/cu...,/assets/images/3/0/4/264409304/cuts/360x436/cu...,Fernando,665487,Tatis Jr.,SS,8,1,2018,1,SD,/assets/images/3/0/4/264409304/cuts/74x74/cut.jpg,/assets/images/3/0/4/264409304/cuts/124x150/cu...,/assets/images/3/0/4/264409304/cuts/62x75/cut.jpg
1,/assets/images/8/5/6/264406856/cuts/180x218/cu...,/assets/images/8/5/6/264406856/cuts/360x436/cu...,MacKenzie,669022,Gore,LHP,19,2,2018,2,SD,/assets/images/8/5/6/264406856/cuts/74x74/cut.jpg,/assets/images/8/5/6/264406856/cuts/124x150/cu...,/assets/images/8/5/6/264406856/cuts/62x75/cut.jpg
2,/assets/images/7/4/8/264414748/cuts/180x218/cu...,/assets/images/7/4/8/264414748/cuts/360x436/cu...,Luis,649966,Urias,2B/SS,36,3,2018,3,SD,/assets/images/7/4/8/264414748/cuts/74x74/cut.jpg,/assets/images/7/4/8/264414748/cuts/124x150/cu...,/assets/images/7/4/8/264414748/cuts/62x75/cut.jpg
3,/assets/images/0/8/0/264413080/cuts/180x218/cu...,/assets/images/0/8/0/264413080/cuts/360x436/cu...,Cal,615698,Quantrill,RHP,40,4,2018,4,SD,/assets/images/0/8/0/264413080/cuts/74x74/cut.jpg,/assets/images/0/8/0/264413080/cuts/124x150/cu...,/assets/images/0/8/0/264413080/cuts/62x75/cut.jpg
4,/assets/images/3/0/6/264411306/cuts/180x218/cu...,/assets/images/3/0/6/264411306/cuts/360x436/cu...,Michel,673258,Baez,RHP,42,5,2018,5,SD,/assets/images/3/0/6/264411306/cuts/74x74/cut.jpg,/assets/images/3/0/6/264411306/cuts/124x150/cu...,/assets/images/3/0/6/264411306/cuts/62x75/cut.jpg


In [16]:
# image-path columns that are not needed for the analysis
cols = ['photo180x218', 'photo360x436', 'thumb', 'thumb124x150', 'thumb62x75']

# Rebind `df` to a trimmed copy instead of dropping in place.
# errors='ignore' makes the cell safe to re-run: with the in-place version a
# second run raised KeyError because the columns were already gone.
df = df.drop(columns=cols, errors='ignore')

In [17]:
# preview the first rows again to confirm the image-path columns are gone
df.head()

Unnamed: 0,player_first_name,player_id,player_last_name,position,preseason100,preseason20,prospect_year,rank,team_file_code
0,Fernando,665487,Tatis Jr.,SS,8,1,2018,1,SD
1,MacKenzie,669022,Gore,LHP,19,2,2018,2,SD
2,Luis,649966,Urias,2B/SS,36,3,2018,3,SD
3,Cal,615698,Quantrill,RHP,40,4,2018,4,SD
4,Michel,673258,Baez,RHP,42,5,2018,5,SD


# Export Data

In [18]:
# export the trimmed player table to CSV, leaving out the row index
df.to_csv(path_or_buf='2018_mlb_prospect_teams.csv', index=False)