In [1]:
import pandas as pd
import json
import requests
from collections import Counter
from pprint import pprint

In [2]:
# NOTE(review): this magic pulls numpy and matplotlib names into the interactive
# namespace (it prints "Populating the interactive namespace ..." when run); the
# plot-size configuration later in the notebook relies on the `pylab` name it provides.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
# make every figure 12 x 6 inches unless a plot overrides it
pylab.rcParams.update({'figure.figsize': (12, 6)})

# Get Players to Query
We will get the unique player ids to query from 2018_mlb_prospect_teams.csv.

In [4]:
# read the team prospect rankings from CSV into a DataFrame
teams = pd.read_csv(filepath_or_buffer='2018_mlb_prospect_teams.csv')

In [5]:
# preview the first five rows of the team data
teams.head(n=5)

Unnamed: 0,player_first_name,player_id,player_last_name,position,preseason100,preseason20,prospect_year,rank,team_file_code
0,Fernando,665487,Tatis Jr.,SS,8,1.0,2018,1,SD
1,MacKenzie,669022,Gore,LHP,19,2.0,2018,2,SD
2,Luis,649966,Urias,2B/SS,36,3.0,2018,3,SD
3,Cal,615698,Quantrill,RHP,40,4.0,2018,4,SD
4,Michel,673258,Baez,RHP,42,5.0,2018,5,SD


In [6]:
# do any players appear more than once? show the five most frequent ids
teams['player_id'].value_counts().head(5)

660271    4
656713    4
656605    3
641857    3
642715    3
Name: player_id, dtype: int64

In [7]:
# pull every row for one of the players that is listed multiple times
# NOTE(review): the id is compared as a string, so this only matches if
# 'player_id' holds strings rather than integers — confirm the column dtype
teams.loc[teams['player_id'].eq('656713')]

Unnamed: 0,player_first_name,player_id,player_last_name,position,preseason100,preseason20,prospect_year,rank,team_file_code
274,Brendan,656713,McKay,1B/LHP,25.0,,2018,5,TB
544,Brendan,656713,McKay,1B/LHP,25.0,3.0,2018,25,TB
712,Brendan,656713,McKay,1B/LHP,25.0,3.0,2018,3,TB
900,Brendan,656713,McKay,1B/LHP,,,2018,1,TB


In [8]:
# collect each distinct player_id once, in order of first appearance
player_ids = teams['player_id'].drop_duplicates().tolist()

In [9]:
# how many unique players are there? (length of the de-duplicated id list)
len(player_ids)

950

# Query API
It's a good idea to write out the raw source data received from the API before doing anything else, so that we can experiment with the data without having to query the API again each time. I will typically load this raw source data into a database as a JSON object, but in this situation I'll just write it out to a file in JSON format.

In [10]:
# get data for each player
# One Session is shared across all requests so connections are reused.
# The timeout keeps a stalled request from hanging the cell forever, and
# raise_for_status() surfaces HTTP errors (404, 5xx, ...) immediately instead
# of attempting to JSON-decode an error page.
responses = []
with requests.Session() as session:
    for i in player_ids:
        url = "http://m.mlb.com/gen/players/prospects/2018/{}.json".format(i)
        reply = session.get(url, timeout=30)
        reply.raise_for_status()
        data = reply.json()
        responses.append(data)

In [11]:
# write JSON data
# ensure_ascii=False writes non-ASCII characters verbatim, so the file
# encoding is pinned to UTF-8 instead of being left to the platform default
# (which may be unable to encode some characters).
with open('2018_mlb_prospect_players.json', 'w', encoding='utf-8') as f:
    json.dump(responses, f, ensure_ascii=False)

# Extract, Transform, Load
After querying the API, the next step is to extract relevant information from the JSON data and load it into a format that we can use for analysis.

In [12]:
# read in JSON data
# The context manager closes the file handle deterministically (the bare
# open() call left it to the garbage collector); JSON text is decoded as UTF-8.
with open('2018_mlb_prospect_players.json', encoding='utf-8') as f:
    responses = json.load(f)

In [13]:
# keep only the 'prospect_player' payload of every API response
player = [response['prospect_player'] for response in responses]

In [14]:
# look at the dictionary keys in the first record to see which fields are available
player[0].keys()

dict_keys(['drafted', 'preseason20', 'year', 'school', 'bats', 'preseason100', 'birthdate', 'player_last_name', 'height', 'positions', 'thrw', 'weight', 'player_first_name', 'twitter', 'content', 'eta', 'signed', 'team_file_code', 'player_id'])

In [15]:
# inspect the first record: pretty-print the whole dictionary
pprint(player[0])

{'bats': '',
 'birthdate': '',
 'content': {'default': '<b>Scouting grades:</b> Hit: 55 | Power: 60 | Run: 50 '
                        '| Arm: 60 | Field: 55 | Overall: 65\n'
                        '<p>The son of former 11-year Major Leaguer Fernando '
                        "Tatis ranked 30th on MLBPipeline.com's Top 30 "
                        'prospects list for the 2015-16 international period '
                        'before signing for $700,000 with the White Sox, who '
                        'shipped him to the Padres the following June as part '
                        'of the James Shields trade. He made an immediate '
                        'impact in his pro debut, finishing the season as a '
                        '17-year-old in the Class A Short-Season Northwest '
                        'League, and then emerged as a top-flight prospect in '
                        '2017 as he became the first 18-year-old in Midwest '
                        'League history to po

In [16]:
# iterate over data and clean up the content
#
# For every player record the raw HTML in content['default'] is turned into a
# list of paragraphs stored under 'bio', and the 'content' key is removed.

# markup variants that introduce the scouting-grades line; defined once
# outside the loop (it never changes) and without the duplicated entry
variations = ['<b>Scouting grades:</b>',
              '<b>Scouting grades</b>:',
              '<b>Scouting Grades:</b>',
              '<p>Scouting Grades:</b>',
              '<p><b>Scouting Grades:</b> Scouting Grades:']

for line in player:

    # a record without 'content' has already been cleaned; skipping it lets
    # the cell be re-run without raising KeyError
    if 'content' not in line:
        continue

    # remove newline delimiter
    bio = line['content']['default'].replace('\n', '').strip()

    # conform scouting grades: swap only the leading marker that was matched,
    # leaving any later occurrence of the same markup in the text untouched
    for i in variations:
        if bio.startswith(i):
            bio = 'Scouting Grades' + bio[len(i):]
            break

    # split the bio on paragraph tag and store it under a new key
    line['bio'] = bio.split('<p>')

    # delete content
    del line['content']

In [17]:
# inspect the new bio: pretty-print the 'bio' entry of the first record
pprint(player[0]['bio'])

['Scouting Grades Hit: 55 | Power: 60 | Run: 50 | Arm: 60 | Field: 55 | '
 'Overall: 65',
 'The son of former 11-year Major Leaguer Fernando Tatis ranked 30th on '
 "MLBPipeline.com's Top 30 prospects list for the 2015-16 international period "
 'before signing for $700,000 with the White Sox, who shipped him to the '
 'Padres the following June as part of the James Shields trade. He made an '
 'immediate impact in his pro debut, finishing the season as a 17-year-old in '
 'the Class A Short-Season Northwest League, and then emerged as a top-flight '
 'prospect in 2017 as he became the first 18-year-old in Midwest League '
 'history to post at least 20 home runs and 20 stolen bases. Perhaps even more '
 'impressive was how Tatis Jr. handled himself after making the jump directly '
 'to Double-A San Antonio in August, as the performance further certified his '
 "status as one of baseball's premier prospects. ",
 'Tatis Jr. has all the ingredients to become an offensive force in the '
 '

In [18]:
# iterate over data and extract scouting grades into new keys
#
# Each paragraph in line['bio'] that starts with 'Scouting Grades' is parsed
# as '|'-separated "name: value" pairs, which are written onto the record as
# new keys. All other paragraphs are concatenated back into a single 'bio' string.
#
# NOTE(review): grade names are stored exactly as spelled in the source text;
# the resulting DataFrame shows variants such as 'curve', 'curveball' and
# 'cuverball' as separate columns — consider normalising them downstream.
for line in player:

    # 'bio' is already a plain string once this cell has run; skip such
    # records so a re-run does not walk the text character by character
    if isinstance(line['bio'], str):
        continue

    # create an empty container to hold non-scouting grades
    bio = []

    # iterate over the bio key specifically
    for item in line['bio']:

        # anything that is not the scouting-grades line stays in the bio
        if not item.startswith('Scouting Grades'):
            bio.append(item)
            continue

        # remove the beginning of the sentence
        grades = item.replace('Scouting Grades', '')

        # address edge case with missing colon
        grades = grades.replace('Arm ', 'Arm:')
        grades = grades.replace('Control ', 'Control:')

        # split into separate metrics
        for metric in grades.split('|'):

            # a trailing or doubled '|' leaves an empty fragment: nothing to record
            if not metric.strip():
                continue

            # separate the metric from the value
            value = metric.split(':')
            if len(value) < 2:
                # fail with context instead of a bare IndexError
                raise ValueError(
                    "cannot parse scouting grade {!r} for player {}".format(
                        metric, line.get('player_id')))

            # add metrics and values as new keys/values
            line[value[0].strip()] = value[1].strip()

    # join the bio text into one string
    line['bio'] = ''.join(bio)

In [19]:
# inspect one record: pretty-print the first player dictionary again
pprint(player[0])

{'Arm': '60',
 'Field': '55',
 'Hit': '55',
 'Overall': '65',
 'Power': '60',
 'Run': '50',
 'bats': '',
 'bio': 'The son of former 11-year Major Leaguer Fernando Tatis ranked 30th on '
        "MLBPipeline.com's Top 30 prospects list for the 2015-16 international "
        'period before signing for $700,000 with the White Sox, who shipped '
        'him to the Padres the following June as part of the James Shields '
        'trade. He made an immediate impact in his pro debut, finishing the '
        'season as a 17-year-old in the Class A Short-Season Northwest League, '
        'and then emerged as a top-flight prospect in 2017 as he became the '
        'first 18-year-old in Midwest League history to post at least 20 home '
        'runs and 20 stolen bases. Perhaps even more impressive was how Tatis '
        'Jr. handled himself after making the jump directly to Double-A San '
        'Antonio in August, as the performance further certified his status as '
        "one of baseba

In [20]:
# iterate over data convert values to integers where possible
# Only existing keys are reassigned, so the dict is safe to modify while it
# is being iterated.
for line in player:
    for key in line:
        try:
            line[key] = int(line[key])
        except (TypeError, ValueError, OverflowError):
            # not integer-like (free text, empty string, None, containers,
            # NaN/inf): keep the original value. Catching only conversion
            # errors means KeyboardInterrupt and genuine bugs still surface.
            pass

In [21]:
# build a DataFrame with one row per player dictionary
df = pd.DataFrame(data=player)

In [22]:
# normalise every column label to lowercase
df.columns = df.columns.map(str.lower)

In [23]:
# preview the first five rows of the player table
df.head(n=5)

Unnamed: 0,arm,changeup,control,curve,curveball,cutter,cuverball,defense,fastball,field,...,positions,preseason100,preseason20,school,signed,team_file_code,thrw,twitter,weight,year
0,60.0,,,,,,,,,55.0,...,SS,8,1,,"July 2, 2015 - CWS",SD,,tatis_jr,,
1,,55.0,55.0,,60.0,,,,65.0,,...,LHP,19,2,,,SD,,mgore181,,
2,55.0,,,,,,,,,60.0,...,2B/SS,36,3,,"Dec. 27, 2013 - SD",SD,,luisurias03,,
3,,65.0,55.0,,50.0,,,,60.0,,...,RHP,40,4,,,SD,,cal_quantrill47,,
4,,55.0,50.0,,50.0,,,,70.0,,...,RHP,42,5,,"Dec. 19, 2016 - SD",SD,,BaezCruz1,,


# Export Data

In [24]:
# persist the player table as CSV, leaving out the row index
df.to_csv(path_or_buf="2018_mlb_prospect_players.csv", index=False)