# Introduction
This notebook creates and saves a dataframe for the Minor League teams. 

# Imports

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

import json

# Notebook Setup
Important things to note: I am only collecting data on qualified players from Triple-A teams. To qualify, a player must have 2.7 plate appearances per team game played. Qualified players are the default, so I do not have to do any special filtering for this.

## Work around 403 error

In [2]:
# Send a browser-like User-Agent with every request to avoid a 403 error.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/108.0.0.0 Safari/537.36'
    ),
}

## Load pickled MLB df to get column names
New MiLB data will need same format as MLB data. 

In [3]:
# Load the previously pickled MLB table; only its column labels are needed
# here, as the target layout for the MiLB data.
mlb_reference_df = pd.read_pickle("../pickled_tables/MLB_df.pkl")
MLB_columns = mlb_reference_df.columns
MLB_columns

Index(['Team', 'Games Played', 'At Bats', 'Runs', 'Hits', 'Doubles', 'Triples',
       'Home Runs', 'Runs Batted In', 'Walks', 'Strikeouts', 'Stolen Bases',
       'Caught Stealing', 'Batting Average', 'On-Base Percentage',
       'Slugging Percentage', 'On-Base Plus Slugging', 'First Name',
       'Last Name', 'Position'],
      dtype='object')

## URL's
URL's found from inspect > Network. Tabbing through Name. Checking: Header > General > Request URL

1. Get team id numbers from Triple-A division:
 - Name: en-US?&contextTeamId=&contextLeagueId=117 
    - Notice the end includes the league id: 'LeagueId=117'; also see this is tacked on to the end of the url listed in Request URL
    - Request URL: https://bdfed.stitch.mlbinfra.com/bdfed/milb-stats/default/default/en-US?&contextTeamId=&contextLeagueId=117
      - This is a json dict with website and data information: including all the team names, team id numbers, league id numbers, and league id numbers within each division. 
      - I need this to filter out only teams in Triple-A then get their team id numbers


2. Get information on qualified players from each team in Triple-A
  - Name: player?stitch_env=prod&season=2022&sportId=11&stat…ePlusSlugging&order=desc&leagueIds=117&teamId=445
    - Notice the end includes league AND team ids: 'leagueIds=117&teamId=445'; also see this is tacked on to the end of the url listed in Request URL
    - Request URL: https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?stitch_env=prod&season=2022&sportId=11&stats=season&group=hitting&gameType=R&limit=25&offset=0&sortStat=onBasePlusSlugging&order=desc&leagueIds=117&teamId=445
      - This is a json dict for just the team associated with team id 445
      - This url without the team number can act as a base-url to loop through team id numbers to get urls for each team. 

In [4]:
# 1. Site configuration payload: used to look up the Triple-A team id numbers.
triple_a_team_ids_url = (
    'https://bdfed.stitch.mlbinfra.com/bdfed/milb-stats/default/default/en-US'
    '?&contextTeamId=&contextLeagueId=117'
)

# 2. Qualified-player hitting stats for one team. The query string ends at
#    'teamId=' so a team id number can be appended to form a complete URL.
base_url = (
    'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player'
    '?stitch_env=prod&season=2022&sportId=11&stats=season&group=hitting&gameType=R'
    '&limit=25&offset=0&sortStat=onBasePlusSlugging&order=desc'
    '&leagueIds=117&teamId='
)


# Get team id numbers

## Convert data to json

In [5]:
# Request the site configuration payload. The endpoint returns JSON, so the
# body is decoded directly instead of being passed through an HTML parser and
# recovered from the text of a bound-method repr.
response = requests.get(triple_a_team_ids_url, headers=headers, timeout=30)

# Stop here with a clear HTTP error (e.g. 403) rather than failing later with
# a confusing JSON decode error on an error page.
response.raise_for_status()

# decode the JSON body into a dictionary
triple_a_json = response.json()

# view for visual inspections
triple_a_json.keys()

dict_keys(['view', 'sportIds', 'season', 'seasonType', 'filmroomLinkMin', 'status', 'views', 'tabs', 'filters', 'tables', 'columns', 'playerPool', 'positions', 'timeframe', 'splits', 'seasons', 'gameTypes', 'teams', 'sports', 'leagues', 'sportsAndLeagues', 'seasonTypes', 'defaults', 'slugs', 'ids', 'dictionary', 'pageSize', 'flipData'])

## Inspect json keys to find needed data (Triple-A teams' id's)

- `triple_a_json['sports']` is a list of dictionaries. Each dictionary is a different division. Each division has a label (the name of the division) and an id (this is what is needed for the next step), as well as the list of league ids in that division (this could be an alternative way of extracting the needed info, but it is less efficient). 
 - From here I see the Triple-A division has an id `s11`
 
 
- `triple_a_json['teams']` is a dictionary. The keys are groupings of teams: the entire MiLB, each division, and each league. Each key's value is a single list of dictionaries. Each dictionary is one team, with the team name stored under `'label'` and the team id number stored under `'value'`.
 - I only want Triple-A teams: `triple_a_json['teams']['s11']`


In [6]:
# show the 'sports' entry: one dictionary per division, each with a label and an id
triple_a_json['sports']

[{'label': 'Triple-A', 'value': 's11', 'id': 's11', 'leagueList': '117,112'},
 {'label': 'Double-A',
  'value': 's12',
  'id': 's12',
  'leagueList': '113,111,109'},
 {'label': 'High-A', 'value': 's13', 'id': 's13', 'leagueList': '116,118,126'},
 {'label': 'Single-A',
  'value': 's14',
  'id': 's14',
  'leagueList': '122,123,110'},
 {'label': 'Rookie', 'value': 's16', 'id': 's16', 'leagueList': '121,124,130'}]

In [8]:
# list the available team groupings, then display the Triple-A ('s11') one
teams_by_group = triple_a_json['teams']
print(teams_by_group.keys())

teams_by_group['s11']

dict_keys(['milb', 's16', 'l121', 'l124', 'l130', 's14', 'l122', 'l123', 'l110', 's13', 'l116', 'l118', 'l126', 's12', 'l113', 'l111', 'l109', 's11', 'l117', 'l112'])


[{'label': 'Albuquerque Isotopes', 'value': 't342'},
 {'label': 'Buffalo Bisons', 'value': 't422'},
 {'label': 'Charlotte Knights', 'value': 't494'},
 {'label': 'Columbus Clippers', 'value': 't445'},
 {'label': 'Durham Bulls', 'value': 't234'},
 {'label': 'El Paso Chihuahuas', 'value': 't4904'},
 {'label': 'Gwinnett Stripers', 'value': 't431'},
 {'label': 'Indianapolis Indians', 'value': 't484'},
 {'label': 'Iowa Cubs', 'value': 't451'},
 {'label': 'Jacksonville Jumbo Shrimp', 'value': 't564'},
 {'label': 'Las Vegas Aviators', 'value': 't400'},
 {'label': 'Lehigh Valley IronPigs', 'value': 't1410'},
 {'label': 'Louisville Bats', 'value': 't416'},
 {'label': 'Memphis Redbirds', 'value': 't235'},
 {'label': 'Nashville Sounds', 'value': 't556'},
 {'label': 'Norfolk Tides', 'value': 't568'},
 {'label': 'Oklahoma City Dodgers', 'value': 't238'},
 {'label': 'Omaha Storm Chasers', 'value': 't541'},
 {'label': 'Reno Aces', 'value': 't2310'},
 {'label': 'Rochester Red Wings', 'value': 't534'},


## Extract out just the team id's and remove 't'

Team id's will be looped through, with each being tacked onto the end of `base_url` to get dictionaries of each team's qualified players' stats. 


In [9]:
# Each Triple-A team entry stores its id as 't<number>' (e.g. 't342');
# drop the 't' so only the number remains.
team_id_numbers = [
    team_entry['value'].replace('t', '')
    for team_entry in triple_a_json['teams']['s11']
]

# check
team_id_numbers

['342',
 '422',
 '494',
 '445',
 '234',
 '4904',
 '431',
 '484',
 '451',
 '564',
 '400',
 '1410',
 '416',
 '235',
 '556',
 '568',
 '238',
 '541',
 '2310',
 '534',
 '102',
 '105',
 '561',
 '531',
 '1960',
 '5434',
 '552',
 '529',
 '512',
 '533']

# Create URL's

In [10]:
# one stats URL per team: the shared base URL followed by that team's id number
team_urls = [
    base_url + team_id
    for team_id in team_id_numbers
]

# check
team_urls

['https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?stitch_env=prod&season=2022&sportId=11&stats=season&group=hitting&gameType=R&limit=25&offset=0&sortStat=onBasePlusSlugging&order=desc&leagueIds=117&teamId=342',
 'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?stitch_env=prod&season=2022&sportId=11&stats=season&group=hitting&gameType=R&limit=25&offset=0&sortStat=onBasePlusSlugging&order=desc&leagueIds=117&teamId=422',
 'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?stitch_env=prod&season=2022&sportId=11&stats=season&group=hitting&gameType=R&limit=25&offset=0&sortStat=onBasePlusSlugging&order=desc&leagueIds=117&teamId=494',
 'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?stitch_env=prod&season=2022&sportId=11&stats=season&group=hitting&gameType=R&limit=25&offset=0&sortStat=onBasePlusSlugging&order=desc&leagueIds=117&teamId=445',
 'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?stitch_env=prod&season=2022&sportId=11&stats=season&group=hitting&gameType=R&limi

# Create DF's for every Triple-A teams' qualified players
Loop through URL's and convert data into pandas df's

In [11]:
# Per-team intermediates are kept in parallel lists (same order as team_urls)
# so that a problematic team can be inspected after the loop.
raw_strings = []  # response body text for each team
jsons = []        # decoded JSON dictionary for each team
dfs = []          # DataFrame built from each team's 'stats' list

for team_url in team_urls:

    response = requests.get(team_url, headers=headers, timeout=30)
    # Fail on the offending URL with an HTTP error instead of a later,
    # less informative JSON decode error.
    response.raise_for_status()

    raw_strings.append(response.text)  # temporary save to troubleshoot if needed

    # The endpoint returns JSON, so decode it directly; no HTML parsing or
    # string trimming is required.
    json_object = response.json()
    jsons.append(json_object)  # temporary save to troubleshoot if needed

    # convert 'stats' (it's a list) to a pandas df
    df = pd.DataFrame(json_object['stats'])
    dfs.append(df)

# The body text no longer needs trimming; this name is kept so anything that
# still refers to it continues to work.
raw_strings_trimmed = list(raw_strings)

In [12]:
# sanity check: one DataFrame was built for every team id number
len(dfs) == len(team_id_numbers)

True

In [13]:
# view the first team's raw DataFrame to inspect the available columns and values
dfs[0]

Unnamed: 0,year,playerId,playerName,type,rank,playerFullName,playerFirstName,playerLastName,playerUseName,playerInitLastName,...,slg,ops,caughtStealing,stolenBases,stolenBasePercentage,groundIntoDoublePlay,rbi,groundOutsToAirouts,catchersInterference,atBatsPerHomeRun
0,2022,592144,Wynton Bernard,player,1,Wynton Bernard,Wynton,Bernard,Wynton,W Bernard,...,0.59,0.977,5,30,.857,7,92,1.55,0,20.43
1,2022,542208,Carlos Perez,player,2,Carlos Perez,Carlos,Perez,Carlos,C PÃ©rez,...,0.524,0.865,0,0,.---,8,87,0.57,0,14.71
2,2022,663796,Coco Montes,player,3,Coco Montes,Robert,Montes,Coco,C Montes,...,0.5,0.859,1,13,.929,11,77,0.68,0,21.0
3,2022,668723,Ryan Vilade,player,4,Ryan Vilade,Ryan,Vilade,Ryan,R Vilade,...,0.352,0.697,6,10,.625,9,38,1.22,0,73.8


# Edit team df's to match data in MLB df

## For easy renaming, put the columns in the same order as MLB columns 

In [14]:
# Columns to keep from each team DataFrame. The order matters: it follows the
# order of the MLB columns so the kept columns can be renamed by position.
keep = [
    # team and playing time
    'teamAbbrev', 'gamesPlayed', 'atBats',
    # counting stats
    'runs', 'hits', 'doubles', 'triples', 'homeRuns', 'rbi',
    'baseOnBalls', 'strikeOuts', 'stolenBases', 'caughtStealing',
    # rate stats
    'avg', 'obp', 'slg', 'ops',
    # player name and position
    'playerUseName', 'playerLastName', 'positionAbbrev',
]

## Trim df's and rename columns

In [15]:
# For every team: keep only the wanted columns (in `keep` order) and relabel
# them, position by position, with the MLB column names.
dfs_standard = [
    team_df[keep].set_axis(MLB_columns, axis=1)
    for team_df in dfs
]

# check the first df
dfs_standard[0]

Unnamed: 0,Team,Games Played,At Bats,Runs,Hits,Doubles,Triples,Home Runs,Runs Batted In,Walks,Strikeouts,Stolen Bases,Caught Stealing,Batting Average,On-Base Percentage,Slugging Percentage,On-Base Plus Slugging,First Name,Last Name,Position
0,ABQ,108,429,95,143,31,8,21,92,39,67,30,5,0.333,0.387,0.59,0.977,Wynton,Bernard,CF
1,ABQ,117,456,75,116,28,1,31,87,54,100,0,0,0.254,0.341,0.524,0.865,Carlos,Perez,DH
2,ABQ,111,420,82,115,27,4,20,77,50,131,13,1,0.274,0.359,0.5,0.859,Coco,Montes,2B
3,ABQ,99,369,64,92,15,4,5,38,52,69,10,6,0.249,0.345,0.352,0.697,Ryan,Vilade,RF


# Concatenate the list of df's into 1 df

In [16]:
# stack all team DataFrames into one; ignore_index=True renumbers the rows 0..n-1
MiLB = pd.concat(dfs_standard, ignore_index=True)

# Pickle MiLB df

In [17]:
# save the combined MiLB DataFrame next to the other pickled tables
MiLB.to_pickle("../pickled_tables/MiLB_df.pkl")

# Save team name abbreviations
This may not be needed but I would prefer to have if needed.

In [18]:
# list the raw column names of the first team DataFrame to find the ones
# holding the team abbreviation and the team name
dfs[0].columns

Index(['year', 'playerId', 'playerName', 'type', 'rank', 'playerFullName',
       'playerFirstName', 'playerLastName', 'playerUseName',
       'playerInitLastName', 'teamId', 'teamAbbrev', 'teamName',
       'teamShortName', 'leagueName', 'leagueId', 'positionAbbrev', 'position',
       'primaryPositionAbbrev', 'plateAppearances', 'totalBases', 'leftOnBase',
       'sacBunts', 'sacFlies', 'babip', 'extraBaseHits', 'hitByPitch', 'gidp',
       'gidpOpp', 'numberOfPitches', 'pitchesPerPlateAppearance',
       'walksPerPlateAppearance', 'strikeoutsPerPlateAppearance',
       'homeRunsPerPlateAppearance', 'walksPerStrikeout', 'iso',
       'reachedOnError', 'walkOffs', 'flyOuts', 'totalSwings',
       'swingAndMisses', 'ballsInPlay', 'popOuts', 'lineOuts', 'groundOuts',
       'flyHits', 'popHits', 'lineHits', 'groundHits', 'gamesPlayed',
       'airOuts', 'runs', 'doubles', 'triples', 'homeRuns', 'strikeOuts',
       'baseOnBalls', 'intentionalWalks', 'hits', 'avg', 'atBats', 'obp',
     

In [19]:
# confirm on the first row that 'teamAbbrev' and 'teamName' are the columns wanted
dfs[0][['teamAbbrev', 'teamName']].iloc[0]

teamAbbrev                     ABQ
teamName      Albuquerque Isotopes
Name: 0, dtype: object

In [20]:
# Map each team's abbreviation to its full name, reading both values from the
# first row of that team's DataFrame.
acronym_dic = {
    team_df.iloc[0]['teamAbbrev']: team_df.iloc[0]['teamName']
    for team_df in dfs
}

acronym_dic

{'ABQ': 'Albuquerque Isotopes',
 'BUF': 'Buffalo Bisons',
 'CLT': 'Charlotte Knights',
 'COL': 'Columbus Clippers',
 'DUR': 'Durham Bulls',
 'ELP': 'El Paso Chihuahuas',
 'GWN': 'Gwinnett Stripers',
 'IND': 'Indianapolis Indians',
 'IOW': 'Iowa Cubs',
 'JAX': 'Jacksonville Jumbo Shrimp',
 'LV': 'Las Vegas Aviators',
 'LHV': 'Lehigh Valley IronPigs',
 'LOU': 'Louisville Bats',
 'MEM': 'Memphis Redbirds',
 'NAS': 'Nashville Sounds',
 'NOR': 'Norfolk Tides',
 'OKC': 'Oklahoma City Dodgers',
 'OMA': 'Omaha Storm Chasers',
 'RNO': 'Reno Aces',
 'ROC': 'Rochester Red Wings',
 'RR': 'Round Rock Express',
 'SAC': 'Sacramento River Cats',
 'SL': 'Salt Lake Bees',
 'SWB': 'Scranton/Wilkes-Barre RailRiders',
 'STP': 'St. Paul Saints',
 'SUG': 'Sugar Land Space Cowboys',
 'SYR': 'Syracuse Mets',
 'TAC': 'Tacoma Rainiers',
 'TOL': 'Toledo Mud Hens',
 'WOR': 'Worcester Red Sox'}