# Introduction
This notebook creates and saves a dataframe for the Division I college teams.

# Imports

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

import json

# Notebook Setup

## Work around 403 error

In [2]:
# a small little ruse to aviod 403 error
headers = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}

## URL's

In [None]:
div_1_college_names_URL = 'https://www.thebaseballcube.com/content/schools/NCAA-1/'

json_URL = 'https://d1baseball.com/wp-content/themes/d1-wp/data/2022-players.json'

# Data Frame of Division I Colleges
Scraping website: [The Baseball Cube](https://www.thebaseballcube.com/content/schools/NCAA-1/)

## Create `div_1_colleges` DF

In [4]:
div_1_colleges = pd.read_html(div_1_college_names_URL, header=0)[0].dropna(axis=1)
div_1_colleges

Unnamed: 0,school name,college long,nickname,conference,place,reg
0,Abilene Christian,Abilene Christian University,Wildcats,Western Athletic Conference,"Abilene,Texas",TX
1,Air Force,United States Air Force Academy,Falcons,Mountain West Conference,"Colorado Springs,Colorado",CO
2,Akron,University of Akron,Zips,Mid-American Conference,"Akron,Ohio",OH
3,Alabama,University of Alabama,Crimson Tide,Southeastern Conference,"Tuscaloosa,Alabama",AL
4,Alabama A&M,Alabama A&M University,Bulldogs,Southwest Athletic Conference,"Huntsville,Alabama",AL
...,...,...,...,...,...,...
309,Xavier,Xavier University,Musketeers,Big East Conference,"Cincinnati,Ohio",OH
310,Yale,Yale University,Bulldogs,Ivy League,"New Haven,Connecticut",CT
311,school name,college long,nickname,conference,place,reg
312,Youngstown State,Youngstown State University,Penguins,Horizon League,"Youngstown,Ohio",OH


In [5]:
div_1_colleges.shape

(314, 6)

<div class="alert alert-block alert-info">
<b>NOTES:</b> It says there are 301 records but there are 314 rows. It looks like there is at least one extra row that repeats the header (see 311 above), it is likely the extra rows are from these repeat headers.
</div>

## Clean DF

In [6]:
# drop last row
div_1_colleges_pop = div_1_colleges[:-1]
div_1_colleges_pop.shape

(313, 6)

In [7]:
# select only `school name` and `college long`
div_1_names = div_1_colleges_pop[['school name', 'college long']]
div_1_names.shape

(313, 2)

In [8]:
# filter for extra rows
div_1_names[div_1_names['school name']=='school name']

Unnamed: 0,school name,college long
25,school name,college long
51,school name,college long
77,school name,college long
103,school name,college long
129,school name,college long
155,school name,college long
181,school name,college long
207,school name,college long
233,school name,college long
259,school name,college long


In [9]:
# remove repeat header rows and reset index
div_1_schools = div_1_names[div_1_names['school name'] != 'school name']

# check
div_1_schools.shape

(301, 2)

In [10]:
div_1_schools

Unnamed: 0,school name,college long
0,Abilene Christian,Abilene Christian University
1,Air Force,United States Air Force Academy
2,Akron,University of Akron
3,Alabama,University of Alabama
4,Alabama A&M,Alabama A&M University
...,...,...
307,Wofford,Wofford College
308,Wright State,Wright State University
309,Xavier,Xavier University
310,Yale,Yale University


In [11]:
div_1_schools_reset = div_1_schools.reset_index(drop=True)
div_1_schools_reset.shape

(301, 2)

In [12]:
div_1_schools_reset

Unnamed: 0,school name,college long
0,Abilene Christian,Abilene Christian University
1,Air Force,United States Air Force Academy
2,Akron,University of Akron
3,Alabama,University of Alabama
4,Alabama A&M,Alabama A&M University
...,...,...
296,Wofford,Wofford College
297,Wright State,Wright State University
298,Xavier,Xavier University
299,Yale,Yale University


In [13]:
len(div_1_schools_reset)

301

In [14]:
div_1_schools_reset.shape

(301, 2)

# Create List of Division I College Names

## Extract names from DF

In [15]:
# get just the school names
# extract out the division 1 school names form div_1_schools_reset df
just_names = [row['school name'] for row in div_1_schools_reset[['school name']].iloc()]

## Clean list

In [16]:
names_cleaned = [name.replace('-', ' ') for name in just_names]

In [17]:
# confirm still 301 records
len(names_cleaned)

301

In [18]:
# confirm there are no repeats
len(set(names_cleaned))

301

# Create JSON object
Scraping website: [D1 Baseball](https://d1baseball.com/statistics/)

JSON link found from:
- INSPECT > NETWORK 
 - NAME: 2022-players.json
   - HEADERS > GENERAL > REQUEST URL: https://d1baseball.com/wp-content/themes/d1-wp/data/2022-players.json
   

`json_object` is a list of dictionaries. Each dictionary has 4 keys:
- player id
- player name
- team name
- team slug 

In [21]:
response = requests.get(json_URL, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

# save entire soup as a string
soup_string = str(soup.prettify)

# above step adds some text and tags, these need to be removed
soup_string = soup_string.replace('<bound method Tag.prettify of ', '')[:-1]

#json_object
json_object = json.loads(soup_string)

json_object[:20]

[{'d1bb_player_id': '00P9U056',
  'player_name': 'Justin Butz',
  'team_name': 'UT Rio Grande Valley',
  'team_slug': 'utrio'},
 {'d1bb_player_id': '00qhQnpd',
  'player_name': 'Matthew Polk',
  'team_name': 'Vanderbilt',
  'team_slug': 'vandy'},
 {'d1bb_player_id': '00t95RVa',
  'player_name': 'Luke Montenery',
  'team_name': 'The Citadel',
  'team_slug': 'citadel'},
 {'d1bb_player_id': '01liF6Ra',
  'player_name': 'Christopher Kahler',
  'team_name': 'George Washington',
  'team_slug': 'georgewash'},
 {'d1bb_player_id': '01OGoQFu',
  'player_name': 'Emmett Bice',
  'team_name': 'College of Charleston',
  'team_slug': 'charleston'},
 {'d1bb_player_id': '02gq6qBk',
  'player_name': 'Andrew Terrell',
  'team_name': 'Appalachian State',
  'team_slug': 'appalst'},
 {'d1bb_player_id': '02IV158V',
  'player_name': 'Brock Tibbitts',
  'team_name': 'Indiana',
  'team_slug': 'indiana'},
 {'d1bb_player_id': '02u27WTd',
  'player_name': 'Aerik Joe',
  'team_name': 'Air Force',
  'team_slug': 'ai

# Create a list of school names in `json_object`

In [31]:
# get a set of just the schools from json_object
json_school_names = set([player_dict['team_name'] for player_dict in json_object])

json_school_names[:10]

{'Abilene Christian',
 'Air Force',
 'Akron',
 'Alabama',
 'Alabama A&amp;M',
 'Alabama State',
 'Albany',
 'Alcorn State',
 'Appalachian State',
 'Arizona',
 'Arizona State',
 'Arkansas',
 'Arkansas State',
 'Arkansas-Pine Bluff',
 'Army',
 'Auburn',
 'Austin Peay',
 'BYU',
 'Ball State',
 'Baylor',
 'Bellarmine',
 'Belmont',
 'Bethune-Cookman',
 'Binghamton',
 'Boston College',
 'Bowling Green',
 'Bradley',
 'Brown',
 'Bryant',
 'Bucknell',
 'Butler',
 'CSU Bakersfield',
 'Cal Poly',
 'Cal State Fullerton',
 'Cal State Northridge',
 'California',
 'California Baptist',
 'Campbell',
 'Canisius',
 'Central Arkansas',
 'Central Connecticut',
 'Central Michigan',
 'Charleston Southern',
 'Charlotte',
 'Cincinnati',
 'Clemson',
 'Coastal Carolina',
 'College of Charleston',
 'Columbia',
 'Connecticut',
 'Coppin State',
 'Cornell',
 'Cornell (IA)',
 'Creighton',
 'Dallas Baptist',
 'Dartmouth',
 'Davidson',
 'Dayton',
 'Delaware',
 'Delaware State',
 'Dixie State',
 'Duke',
 'East Carolina

## Clean and sort list

In [37]:
# remove 'amp;'
step_1 = [name.replace('amp;', '') for name in json_school_names]

#replace '-' with a space
json_school_names_cleaned = [name.replace('-', ' ') for name in step_1]

# put in alphebetical order
json_school_names_cleaned.sort()

json_school_names_cleaned

['Abilene Christian',
 'Air Force',
 'Akron',
 'Alabama',
 'Alabama A&M',
 'Alabama State',
 'Albany',
 'Alcorn State',
 'Appalachian State',
 'Arizona',
 'Arizona State',
 'Arkansas',
 'Arkansas Pine Bluff',
 'Arkansas State',
 'Army',
 'Auburn',
 'Austin Peay',
 'BYU',
 'Ball State',
 'Baylor',
 'Bellarmine',
 'Belmont',
 'Bethune Cookman',
 'Binghamton',
 'Boston College',
 'Bowling Green',
 'Bradley',
 'Brown',
 'Bryant',
 'Bucknell',
 'Butler',
 'CSU Bakersfield',
 'Cal Poly',
 'Cal State Fullerton',
 'Cal State Northridge',
 'California',
 'California Baptist',
 'Campbell',
 'Canisius',
 'Central Arkansas',
 'Central Connecticut',
 'Central Michigan',
 'Charleston Southern',
 'Charlotte',
 'Cincinnati',
 'Clemson',
 'Coastal Carolina',
 'College of Charleston',
 'Columbia',
 'Connecticut',
 'Coppin State',
 'Cornell',
 'Cornell (IA)',
 'Creighton',
 'Dallas Baptist',
 'Dartmouth',
 'Davidson',
 'Dayton',
 'Delaware',
 'Delaware State',
 'Dixie State',
 'Duke',
 'East Carolina',
 

# Match json_object school names to list of division 1 school names

In order to extract out slugs, college names from json_object need to match those from list of division 1 colleges.

## Print mismatched names

In [38]:
# check what schools from list of division 1 schools (`names_cleaned`) are 
# not hitting in `json_school_names_cleaned`
for name in names_cleaned:
    if name not in json_school_names_cleaned:
        print(name) # these are division 1 schools not matching up with json_object school names

Alabama Birmingham
Cal Baptist
Cal State Bakersfield
Central Connecticut State
Central Florida
Citadel
Col of Charleston
East Tennessee state
Grambling State
Louisiana Monroe
Mass Lowell
McNeese State
Miami Ohio
Nebraska Omaha
Nicholls State
North Carolina State
Penn
Prairie View A&M
Sam Houston State
St. Mary's CA
St. Peter's
St. Thomas MN
Tarleton State
Texas Christian
UNC Charlotte
USC
UT San Antonio
Virginia Commonwealth
William and Mary


# Create `update` dictionary 

Update list of Division I colleges to match college names from json_object

In [51]:
# Had to manually create by comparing json_object & div_1_colleges (and googling) 

#       names_cleaned            : json_school_names_cleaned
update = {
    'Alabama Birmingham'         : 'UAB',
    'Cal Baptist'                : 'California Baptist',
    'Cal State Bakersfield'      : 'CSU Bakersfield',
    'Central Connecticut State'  : 'Central Connecticut',
    'Central Florida'            : 'UCF',
    'Citadel'                    : 'The Citadel',
    'Col of Charleston'          : 'College of Charleston',
    'East Tennessee state'       : 'East Tennessee State',
    'Grambling State'            : 'Grambling',
    'Louisiana Monroe'           : 'UL Monroe',
    'Mass Lowell'                : 'UMass Lowell',
    'McNeese State'              : 'McNeese',
    'Miami Ohio'                 : 'Miami (OH)',
    'Nebraska Omaha'             : 'Omaha',
    'Nicholls State'             : 'Nicholls',
    'North Carolina State'       : 'NC State',
    'Penn'                       : 'Pennsylvania',
    'Prairie View A&M'           : 'Prairie View',
    'Sam Houston State'          : 'Sam Houston',
    "St. Mary's CA"              : "Saint Mary's",
    "St. Peter's"                : "Saint Peter's",
    'St. Thomas MN'              : 'St. Thomas',
    'Tarleton State'             : 'Tarleton',
    'Texas Christian'            : 'TCU',
    'UNC Charlotte'              : 'Charlotte',
    'USC'                        : 'Southern California',
    'UT San Antonio'             : 'UTSA',
    'Virginia Commonwealth'      : 'VCU',
    'William and Mary'           : 'William & Mary'
}

In [53]:
# check if any of the values already appear in `names_cleaned`
for (k,v) in update.items():
    if v in names_cleaned:
        print(v)

# Update list of Division I school names

In [57]:
# update `names_cleaned` with names as they appear in json_object
div_1_cleaned = []
for name in names_cleaned:
    if name in update.keys():
        div_1_cleaned.append(update[name])
    else:
        div_1_cleaned.append(name)

<div class="alert alert-block alert-warning">
<b>Recheck:</b> If nothing prints then ALL of the division 1 schools appear in the json_object.
</div>


In [59]:
# see what schools from list of division 1 schools (now `div_1_cleaned`) are not found in json object
for name in div_1_cleaned:
    if name not in json_school_names_cleaned:
        print(name) # these are division 1 schools not matching up with json_object school names

In [65]:
# one more check, there should still be 301
len(set(div_1_cleaned))

301

<div class="alert alert-block alert-success">
<b>Success:</b> The list of Division I teams now match the json object school names. I can now use this updated list to extract slugs from json dictionaries.  
</div>

# Extract slugs

## Clean `json_object` values

In [117]:
# fix `team_name` in json_object dictionaries
for player_dict in json_object:
    player_dict['team_name'] = player_dict['team_name'].replace('amp;', '')
    player_dict['team_name'] = player_dict['team_name'].replace('-', ' ')   
    
json_object[:25]

[{'d1bb_player_id': '00P9U056',
  'player_name': 'Justin Butz',
  'team_name': 'UT Rio Grande Valley',
  'team_slug': 'utrio'},
 {'d1bb_player_id': '00qhQnpd',
  'player_name': 'Matthew Polk',
  'team_name': 'Vanderbilt',
  'team_slug': 'vandy'},
 {'d1bb_player_id': '00t95RVa',
  'player_name': 'Luke Montenery',
  'team_name': 'The Citadel',
  'team_slug': 'citadel'},
 {'d1bb_player_id': '01liF6Ra',
  'player_name': 'Christopher Kahler',
  'team_name': 'George Washington',
  'team_slug': 'georgewash'},
 {'d1bb_player_id': '01OGoQFu',
  'player_name': 'Emmett Bice',
  'team_name': 'College of Charleston',
  'team_slug': 'charleston'},
 {'d1bb_player_id': '02gq6qBk',
  'player_name': 'Andrew Terrell',
  'team_name': 'Appalachian State',
  'team_slug': 'appalst'},
 {'d1bb_player_id': '02IV158V',
  'player_name': 'Brock Tibbitts',
  'team_name': 'Indiana',
  'team_slug': 'indiana'},
 {'d1bb_player_id': '02u27WTd',
  'player_name': 'Aerik Joe',
  'team_name': 'Air Force',
  'team_slug': 'ai

In [95]:
"""
# check other way around
not_there = []
for player_dict in json_object:
    if (player_dict['team_name'] not in div_1_cleaned) & (player_dict['team_name'] not in not_there):
        not_there.append(player_dict['team_name'])
        print(player_dict['team_name'])

# check
not_there[:5]
"""
print()

Cornell (IA)


['Cornell (IA)']

In [96]:
"""
# check `div_1_colleges` df for schools long name and location
div_1_colleges.loc[div_1_colleges['school name'].str.contains("Cornell", case=False)]
"""
print()

Unnamed: 0,school name,college long,nickname,conference,place,reg
55,Cornell,Cornell University,Big Red,Ivy League,"Ithaca,New York",NY


In [98]:
"""
# check json_school_names_cleaned for 'Cornell'
for name in json_school_names_cleaned:
    if 'Cornell' in name:
        print(name)
        
"""
print()

Cornell
Cornell (IA)


In [None]:
# Cornell (IA) - Iowa - is division III, this is why it is not in `div_1_colleges` list

## Loop through dictionaries and extract slugs

In [105]:
# loop through dicts in json_object, aif `team_name` in div_1_cleaned: append slugs to a new list
team_slugs = []
for player_dict in json_object:
    if player_dict['team_name'] in div_1_cleaned:
        team_slugs.append(player_dict['team_slug'])

In [106]:
# should be large list
len(url_slugs)

10033

In [107]:
# just the unique
url_slugs = set(url_slugs)

len(url_slugs)

301

In [108]:
len(url_slugs) == len(div_1_cleaned)

True

# Make list of URL's

In [109]:
# make list of URL's 

team_urls = []

for college_slug in url_slugs:
    team_url = f'https://d1baseball.com/team/{college_slug}/stats/'
    team_urls.append(team_url)

# check
team_urls[:5]

['https://d1baseball.com/team/coppinst/stats/',
 'https://d1baseball.com/team/princeton/stats/',
 'https://d1baseball.com/team/nwstate/stats/',
 'https://d1baseball.com/team/georgiast/stats/',
 'https://d1baseball.com/team/uncwilm/stats/']

In [110]:
len(team_urls)

301

# Create batting tables from Division I colleges

In [111]:
# loop through urls and get batting stats tables
dfs = []

for URL in team_urls:
    
    response = requests.get(URL, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # save entire soup as a string
    soup_string = str(soup.prettify)

    # above step adds some text and tags, these need to be removed
    soup_string = soup_string.replace('<bound method Tag.prettify of ', '')[:-2]

    # batting table
    batting_table = pd.read_html(soup_string)[6]

    dfs.append(batting_table)

In [112]:
len(dfs)

301

In [116]:
dfs[0]

Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
0,Jordan Hamberg,Coppin State,DH,0.347,0.463,0.611,1.074,45,144,34,50,10,2,8,34,8,24,22,11,2
1,Bruce Hilton,Coppin State,2B,0.333,0.5,0.667,1.167,9,9,2,3,3,0,0,3,0,3,4,1,0
2,Sebastien Sarabia,Coppin State,1B,0.32,0.406,0.426,0.832,48,169,30,54,8,2,2,37,1,27,24,3,3
3,Corey Miley,Coppin State,2B,0.319,0.478,0.428,0.906,50,138,28,44,11,2,0,27,6,37,22,3,1
4,Marcos Castillo,Coppin State,RF,0.316,0.409,0.49,0.899,54,196,39,62,13,3,5,40,11,23,33,18,4
5,Tyler Lloyd,Coppin State,2B,0.31,0.412,0.414,0.826,27,58,10,18,4,1,0,5,2,8,11,5,1
6,Wellington Balsley,Coppin State,LF,0.291,0.409,0.364,0.773,54,206,49,60,5,5,0,32,7,36,22,27,8
7,Brian Nicolas,Coppin State,3B,0.286,0.398,0.476,0.874,51,168,38,48,6,4,6,39,6,26,45,17,0
8,Mike Dorcean,Coppin State,C,0.269,0.458,0.385,0.843,44,104,30,28,7,1,1,18,14,23,29,4,2
9,Josh Hankins,Coppin State,SS,0.261,0.349,0.332,0.681,51,184,43,48,10,0,1,27,6,20,30,13,4


In [None]:


###### this WORKS ######
# this is proof of concept using airforce college

# this is proof of concept using airforce college
URL = 'https://d1baseball.com/team/airforce/stats/'

response = requests.get(URL, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

# save entire soup as a string
soup_string = str(soup.prettify)

# above step adds some text and tags, these need to be removed
soup_string = soup_string.replace('<bound method Tag.prettify of ', '')[:-2]

# batting table
batting_table = pd.read_html(soup_string)[6]

batting_table




###### this WORKS ######

In [None]:
len(url_slugs)

In [None]:
len(set(url_slugs))

In [None]:
########## find the school not included in `url_slugs` ##########
school_names_from_json = []
for player_dict in json_object:
    if player_dict['team_name'] in div_1_schools:
        school_names_from_json.append(player_dict['team_name'])

# check
school_names_from_json[:5]

In [None]:
len(set(school_names_from_json))

In [None]:
for school in div_1_schools:
    if school not in set(school_names_from_json):
        print(school)

In [None]:
## explore json object to see why not included
for player_dict in json_object:
    player_dict['team_name'] = player_dict['team_name'].replace('amp;', '')
    player_dict['team_name'] = player_dict['team_name'].replace('-', ' ')
    if 'Birmingham' in player_dict['team_name']:
        print(player_dict['team_name'])


In [None]:
# remove 'amp;' , '-'

In [None]:
########## find the school not included in `url_slugs` ##########

In [None]:
len(url_slugs) == len(names_cleaned)