# Introduction
This notebook creates and saves a dataframe for the Division I college teams.

# Imports

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

import json

# Notebook Setup

## Work around 403 error

In [2]:
# a small little ruse to aviod 403 error
headers = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}

## URL's
`div_1_college_names_URL` is for creating list of all Division 1 college names.

`json_URL` uses the above list to extract out slugs for scraping stats tables from [D1 Baseball](https://d1baseball.com/statistics/)

In [3]:
div_1_college_names_URL = 'https://www.thebaseballcube.com/content/schools/NCAA-1/'

json_URL = 'https://d1baseball.com/wp-content/themes/d1-wp/data/2022-players.json'

# Data Frame of Division I Colleges
Scraping website: [The Baseball Cube](https://www.thebaseballcube.com/content/schools/NCAA-1/)

## Create `div_1_colleges` DF

In [4]:
div_1_colleges = pd.read_html(div_1_college_names_URL, header=0)[0].dropna(axis=1)
div_1_colleges

Unnamed: 0,school name,college long,nickname,conference,place,reg
0,Abilene Christian,Abilene Christian University,Wildcats,Western Athletic Conference,"Abilene,Texas",TX
1,Air Force,United States Air Force Academy,Falcons,Mountain West Conference,"Colorado Springs,Colorado",CO
2,Akron,University of Akron,Zips,Mid-American Conference,"Akron,Ohio",OH
3,Alabama,University of Alabama,Crimson Tide,Southeastern Conference,"Tuscaloosa,Alabama",AL
4,Alabama A&M,Alabama A&M University,Bulldogs,Southwest Athletic Conference,"Huntsville,Alabama",AL
...,...,...,...,...,...,...
309,Xavier,Xavier University,Musketeers,Big East Conference,"Cincinnati,Ohio",OH
310,Yale,Yale University,Bulldogs,Ivy League,"New Haven,Connecticut",CT
311,school name,college long,nickname,conference,place,reg
312,Youngstown State,Youngstown State University,Penguins,Horizon League,"Youngstown,Ohio",OH


In [5]:
div_1_colleges.shape

(314, 6)

<div class="alert alert-block alert-warning">
<b>Remove Extra Rows:</b> It says there are 301 records but there are 314 rows. It looks like there is at least one extra row that repeats the header (see 311 above), it is likely the extra rows are from these repeat headers. Need to remove these rows to confirm there are 301 records.
</div>

## Clean DF

In [6]:
# drop last row
div_1_colleges_pop = div_1_colleges[:-1]
div_1_colleges_pop.shape

(313, 6)

In [7]:
# remove repeat header rows
div_1_schools = div_1_colleges_pop[div_1_colleges_pop['school name'] != 'school name']

# check
div_1_schools.shape

(301, 6)

<div class="alert alert-block alert-success">
<b>Success:</b> Down to 301 records. Now I can loop through cleaned df to get a list of Division I Colleges.
</div>

# Create List of Division I Colleges

## Extract names from DF

In [8]:
# extract out the school names form div_1_schools df
just_names = [row['school name'] for row in div_1_schools[['school name']].iloc()]

## Clean list

In [9]:
names_cleaned = [name.replace('-', ' ') for name in just_names]

In [10]:
# confirm still 301 records
len(names_cleaned)

301

In [11]:
# confirm there are no repeats
len(set(names_cleaned))

301

<div class="alert alert-block alert-success">
<b>Success:</b> There are 301 unique school names. I will use this list to know which slugs to extract from below JSON object.
</div>

# Create JSON object
Scraping website: [D1 Baseball](https://d1baseball.com/statistics/)

`json_URL` found from:
- INSPECT > NETWORK 
 - NAME: 2022-players.json 
   - HEADERS > GENERAL > REQUEST URL: https://d1baseball.com/wp-content/themes/d1-wp/data/2022-players.json
   

`json_object` is a list of dictionaries. Each dictionary has 4 keys:
- player id
- player name
- team name
- team slug 

In [12]:
response = requests.get(json_URL, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

# save entire soup as a string
soup_string = str(soup.prettify)

# above step adds some text and tags, these need to be removed
soup_string = soup_string.replace('<bound method Tag.prettify of ', '')[:-1]

#json_object
json_object = json.loads(soup_string)

json_object[:20]

[{'d1bb_player_id': '00P9U056',
  'player_name': 'Justin Butz',
  'team_name': 'UT Rio Grande Valley',
  'team_slug': 'utrio'},
 {'d1bb_player_id': '00qhQnpd',
  'player_name': 'Matthew Polk',
  'team_name': 'Vanderbilt',
  'team_slug': 'vandy'},
 {'d1bb_player_id': '00t95RVa',
  'player_name': 'Luke Montenery',
  'team_name': 'The Citadel',
  'team_slug': 'citadel'},
 {'d1bb_player_id': '01liF6Ra',
  'player_name': 'Christopher Kahler',
  'team_name': 'George Washington',
  'team_slug': 'georgewash'},
 {'d1bb_player_id': '01OGoQFu',
  'player_name': 'Emmett Bice',
  'team_name': 'College of Charleston',
  'team_slug': 'charleston'},
 {'d1bb_player_id': '02gq6qBk',
  'player_name': 'Andrew Terrell',
  'team_name': 'Appalachian State',
  'team_slug': 'appalst'},
 {'d1bb_player_id': '02IV158V',
  'player_name': 'Brock Tibbitts',
  'team_name': 'Indiana',
  'team_slug': 'indiana'},
 {'d1bb_player_id': '02u27WTd',
  'player_name': 'Aerik Joe',
  'team_name': 'Air Force',
  'team_slug': 'ai

<div class="alert alert-block alert-warning">
<b>Cleaning Required:</b> I can see there is some cleaning that needs to happen, the ampersand (&) has an extra string 'amp;' after it that needs to be removed. Just for the sake of being thorough, I will also remove hyphens (-) even though I do not see any in the sample above.  
</div>

# Create a list of school names in `json_object`

In [13]:
# get a set of just the schools from json_object
json_school_names = list(set([player_dict['team_name'] for player_dict in json_object]))

# filter for examples of needed cleaning
filtered = filter(lambda name: 'Texas A&' in name, json_school_names)

print(list(filtered))

['Texas A&amp;M-Corpus Christi', 'Texas A&amp;M']


<div class="alert alert-block alert-info">
<b>Notes:</b> Above 2 are perfect examples of aforementioned required cleaning.  
</div>

## Clean and sort list

In [14]:
# remove 'amp;'
step_1 = [name.replace('amp;', '') for name in json_school_names]

#replace '-' with a space
json_school_names_cleaned = [name.replace('-', ' ') for name in step_1]

# put in alphebetical order - to make comparison easier
json_school_names_cleaned.sort()

# view ENTIRE list, needed for troubleshooting/comparing mismatched 
# names between this list and `names_cleaned`
json_school_names_cleaned

['Abilene Christian',
 'Air Force',
 'Akron',
 'Alabama',
 'Alabama A&M',
 'Alabama State',
 'Albany',
 'Alcorn State',
 'Appalachian State',
 'Arizona',
 'Arizona State',
 'Arkansas',
 'Arkansas Pine Bluff',
 'Arkansas State',
 'Army',
 'Auburn',
 'Austin Peay',
 'BYU',
 'Ball State',
 'Baylor',
 'Bellarmine',
 'Belmont',
 'Bethune Cookman',
 'Binghamton',
 'Boston College',
 'Bowling Green',
 'Bradley',
 'Brown',
 'Bryant',
 'Bucknell',
 'Butler',
 'CSU Bakersfield',
 'Cal Poly',
 'Cal State Fullerton',
 'Cal State Northridge',
 'California',
 'California Baptist',
 'Campbell',
 'Canisius',
 'Central Arkansas',
 'Central Connecticut',
 'Central Michigan',
 'Charleston Southern',
 'Charlotte',
 'Cincinnati',
 'Clemson',
 'Coastal Carolina',
 'College of Charleston',
 'Columbia',
 'Connecticut',
 'Coppin State',
 'Cornell',
 'Cornell (IA)',
 'Creighton',
 'Dallas Baptist',
 'Dartmouth',
 'Davidson',
 'Dayton',
 'Delaware',
 'Delaware State',
 'Dixie State',
 'Duke',
 'East Carolina',
 

<div class="alert alert-block alert-info">
<b>NOTES:</b> This list contains names of all colleges, all divisions; this is why it was import to create `names_cleaned` - a list on only Division I Colleges - to have a key of which slugs to extract. 
</div>

# Match json_object school names to list of division 1 school names

In order to extract out slugs, the college names from `json_object` - the object that contains slugs associated with each college - need to match those from `names_cleaned` - the list of Division I Colleges. I will use `json_school_names_cleaned` - the list of colleges as they appear in `json_object` - to figure out how to clean the names and to filter through which names need updating.  

## Print mismatched names

In [15]:
# check what schools from list of division 1 schools (`names_cleaned`) are 
# not hitting in `json_school_names_cleaned`
for name in names_cleaned:
    if name not in json_school_names_cleaned:
        print(name) # these are division 1 schools not matching up with json_object school names

Alabama Birmingham
Cal Baptist
Cal State Bakersfield
Central Connecticut State
Central Florida
Citadel
Col of Charleston
East Tennessee state
Grambling State
Louisiana Monroe
Mass Lowell
McNeese State
Miami Ohio
Nebraska Omaha
Nicholls State
North Carolina State
Penn
Prairie View A&M
Sam Houston State
St. Mary's CA
St. Peter's
St. Thomas MN
Tarleton State
Texas Christian
UNC Charlotte
USC
UT San Antonio
Virginia Commonwealth
William and Mary


## Create `update` dictionary 

Update list of Division I colleges to match college names from json_object

In [16]:
# Had to manually create by comparing json_school_names_cleaned & names_cleaned (and googling) 

#       names_cleaned            : json_school_names_cleaned
update = {
    'Alabama Birmingham'         : 'UAB',
    'Cal Baptist'                : 'California Baptist',
    'Cal State Bakersfield'      : 'CSU Bakersfield',
    'Central Connecticut State'  : 'Central Connecticut',
    'Central Florida'            : 'UCF',
    'Citadel'                    : 'The Citadel',
    'Col of Charleston'          : 'College of Charleston',
    'East Tennessee state'       : 'East Tennessee State',
    'Grambling State'            : 'Grambling',
    'Louisiana Monroe'           : 'UL Monroe',
    'Mass Lowell'                : 'UMass Lowell',
    'McNeese State'              : 'McNeese',
    'Miami Ohio'                 : 'Miami (OH)',
    'Nebraska Omaha'             : 'Omaha',
    'Nicholls State'             : 'Nicholls',
    'North Carolina State'       : 'NC State',
    'Penn'                       : 'Pennsylvania',
    'Prairie View A&M'           : 'Prairie View',
    'Sam Houston State'          : 'Sam Houston',
    "St. Mary's CA"              : "Saint Mary's",
    "St. Peter's"                : "Saint Peter's",
    'St. Thomas MN'              : 'St. Thomas',
    'Tarleton State'             : 'Tarleton',
    'Texas Christian'            : 'TCU',
    'UNC Charlotte'              : 'Charlotte',
    'USC'                        : 'Southern California',
    'UT San Antonio'             : 'UTSA',
    'Virginia Commonwealth'      : 'VCU',
    'William and Mary'           : 'William & Mary'
}

In [17]:
# check if any of the values already appear in `names_cleaned`
for (k,v) in update.items():
    if v in names_cleaned:
        print(v)

<div class="alert alert-block alert-success">
<b>Success:</b> Nothing printed. This means the new values do not already exist in `names-cleaned`, which would indicate incorrect labeling. I can now use this dictionary to update the names in `names_cleaned` to match those in `json_object`
</div>

## Update list of Division I school names

In [18]:
# update `names_cleaned` with names as they appear in json_object
div_1_cleaned = []
for name in names_cleaned:
    if name in update.keys():
        div_1_cleaned.append(update[name])
    else:
        div_1_cleaned.append(name)

<div class="alert alert-block alert-warning">
<b>Recheck:</b> If nothing prints then ALL of the division 1 schools appear in the json_object.
</div>


In [19]:
# see what schools from list of division 1 schools (now `div_1_cleaned`) are not found in json object
for name in div_1_cleaned:
    if name not in json_school_names_cleaned:
        print(name) # these are division 1 schools not matching up with json_object school names

In [20]:
# one more check, there should still be 301 unique names
len(set(div_1_cleaned))

301

<div class="alert alert-block alert-success">
<b>Success:</b> Nothing printed. `names_cleaned` - the list of Division I schools - now matches the json objects school names. I will now use this updated list to extract appropiate slugs from json dictionaries.  
</div>

# Extract slugs

## Clean `json_object` values

In [21]:
# clean `team_name` in json_object dictionaries using same steps from "Clean and sort list"
for player_dict in json_object:
    player_dict['team_name'] = player_dict['team_name'].replace('amp;', '')
    player_dict['team_name'] = player_dict['team_name'].replace('-', ' ')   
    
json_object[10:15]

[{'d1bb_player_id': '04ZlwxGj',
  'player_name': 'Alec Carr',
  'team_name': 'Incarnate Word',
  'team_slug': 'incarnword'},
 {'d1bb_player_id': '05CWsb8R',
  'player_name': 'Dane Hoggard',
  'team_name': 'Seton Hall',
  'team_slug': 'setonhall'},
 {'d1bb_player_id': '05gCZUXN',
  'player_name': 'Drue Hackenberg',
  'team_name': 'Virginia Tech',
  'team_slug': 'vatech'},
 {'d1bb_player_id': '05rudsvS',
  'player_name': 'Michael Bohlen',
  'team_name': 'Northern Kentucky',
  'team_slug': 'nkentucky'},
 {'d1bb_player_id': '05ZZV0RB',
  'player_name': 'Kelyn Fox',
  'team_name': 'Florida A&M',
  'team_slug': 'floridaam'}]

<div class="alert alert-block alert-info">
<b>NOTES:</b> Notice last dictionary in above sample has a cleaned 'Florida A&amp;M'. 
</div>

In [None]:
"""
# check other way around
not_there = []
for player_dict in json_object:
    if (player_dict['team_name'] not in div_1_cleaned) & (player_dict['team_name'] not in not_there):
        not_there.append(player_dict['team_name'])
        print(player_dict['team_name'])

# check
not_there[:5]
"""
print()

In [None]:
"""
# check `div_1_colleges` df for schools long name and location
div_1_colleges.loc[div_1_colleges['school name'].str.contains("Cornell", case=False)]
"""
print()

In [None]:
"""
# check json_school_names_cleaned for 'Cornell'
for name in json_school_names_cleaned:
    if 'Cornell' in name:
        print(name)
        
"""
print()

In [None]:
# Cornell (IA) - Iowa - is division III, this is why it is not in `div_1_colleges` list

## Loop through dictionaries and extract slugs

In [22]:
team_slugs = []

# loop through dicts in json_object
for player_dict in json_object:
    # if schools name is in list of Division 1 Colleges
    if player_dict['team_name'] in div_1_cleaned:  
        # then append that school's slug to `team_slugs` list
        team_slugs.append(player_dict['team_slug'])  

In [23]:
# should be large list
len(team_slugs)

10033

In [24]:
# just the unique slugs
url_slugs = set(team_slugs)

len(url_slugs)

301

<div class="alert alert-block alert-success">
<b>Success:</b> There are 301 unique slugs, the same number of Division I colleges.  
</div>

# Create list of URL's

In [25]:
# make list of URL's 

team_urls = []

for college_slug in url_slugs:
    team_url = f'https://d1baseball.com/team/{college_slug}/stats/'
    team_urls.append(team_url)

# check
team_urls[:5]

['https://d1baseball.com/team/yale/stats/',
 'https://d1baseball.com/team/unc/stats/',
 'https://d1baseball.com/team/ulamo/stats/',
 'https://d1baseball.com/team/xavier/stats/',
 'https://d1baseball.com/team/illinois/stats/']

In [26]:
len(team_urls)

301

# Create batting tables from Division I colleges
<div class="alert alert-block alert-danger">
<b>Warning:</b> This cell takes about 5 mins to run. To bypass, when re running notebook, skip ahead to "Load `all_college` DF".
</div>

In [27]:
# loop through urls and get batting stats tables
dfs = []

for URL in team_urls:
    
    response = requests.get(URL, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # save entire soup as a string
    soup_string = str(soup.prettify)

    # above step adds some text and tags, these need to be removed
    soup_string = soup_string.replace('<bound method Tag.prettify of ', '')[:-2]

    # batting table
    batting_table = pd.read_html(soup_string)[6]

    dfs.append(batting_table)

In [28]:
len(dfs)

301

# Concatenate list of DF's into 1 `all_college` DF

In [29]:
all_college = pd.concat(dfs, ignore_index=True)

In [30]:
all_college

Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
0,Max Imhoff,Yale,C,0.320,0.393,0.520,0.913,13,25,2,8,2,0,1,4,0,3,10,0,1
1,Jimmy Chatfield,Yale,RF,0.303,0.472,0.523,0.995,38,132,30,40,5,0,8,30,10,34,27,10,4
2,Teddy Hague,Yale,1B,0.292,0.377,0.462,0.839,18,65,10,19,2,0,3,11,0,10,10,1,0
3,Mason LaPlante,Yale,SS,0.281,0.379,0.379,0.758,38,153,34,43,9,0,2,16,5,21,24,19,2
4,Jake Gehri,Yale,C,0.279,0.371,0.519,0.890,38,129,23,36,4,0,9,26,7,13,29,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5425,Peter Serruto,Indiana,C,0.139,0.200,0.190,0.390,34,79,7,11,1,0,1,6,1,5,24,0,0
5426,Kip Fougerousse,Indiana,1B,0.111,0.172,0.222,0.394,13,27,3,3,0,0,1,6,1,1,13,0,0
5427,Ethan Vecrumba,Indiana,DH,0.000,0.571,0.000,0.571,16,3,7,0,0,0,0,1,1,3,3,0,0
5428,Jake Skrine,Indiana,PH,0.000,0.000,0.000,0.000,2,2,0,0,0,0,0,0,0,0,1,0,0


# Pickle `all_college` DF

In [31]:
pd.to_pickle(all_college, "./all_college_df.pkl")

# Load `all_college` DF

In [32]:
all_college = pd.read_pickle("./all_college_df.pkl")

In [33]:
all_college

Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
0,Max Imhoff,Yale,C,0.320,0.393,0.520,0.913,13,25,2,8,2,0,1,4,0,3,10,0,1
1,Jimmy Chatfield,Yale,RF,0.303,0.472,0.523,0.995,38,132,30,40,5,0,8,30,10,34,27,10,4
2,Teddy Hague,Yale,1B,0.292,0.377,0.462,0.839,18,65,10,19,2,0,3,11,0,10,10,1,0
3,Mason LaPlante,Yale,SS,0.281,0.379,0.379,0.758,38,153,34,43,9,0,2,16,5,21,24,19,2
4,Jake Gehri,Yale,C,0.279,0.371,0.519,0.890,38,129,23,36,4,0,9,26,7,13,29,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5425,Peter Serruto,Indiana,C,0.139,0.200,0.190,0.390,34,79,7,11,1,0,1,6,1,5,24,0,0
5426,Kip Fougerousse,Indiana,1B,0.111,0.172,0.222,0.394,13,27,3,3,0,0,1,6,1,1,13,0,0
5427,Ethan Vecrumba,Indiana,DH,0.000,0.571,0.000,0.571,16,3,7,0,0,0,0,1,1,3,3,0,0
5428,Jake Skrine,Indiana,PH,0.000,0.000,0.000,0.000,2,2,0,0,0,0,0,0,0,0,1,0,0


# Create `first_names` & `last_names` lists

## create a list `full_names` to extract first and last names from

In [34]:
full_names = [row['Player'] for row in all_college.iloc()]

In [35]:
# check the length
len(full_names)

5430

In [36]:
# check for duplicates
len(set(full_names))

5395

## check duplicates

In [37]:
# extract out duplicate names
first_pass = []
duplicate_names = []
for row in all_college.iloc():
    if row['Player'] not in first_pass:
        first_pass.append(row['Player'])
    else:
        duplicate_names.append(row['Player'])
        
duplicate_names

['Carson Jones',
 'Justin Taylor',
 'Dylan Johnson',
 'Brendan Jones',
 'Sam Thompson',
 'Matt Thomas',
 'Jared Johnson',
 'Peyton Williams',
 'Alex Aguila',
 'Tyler Cox',
 'Trevor Austin',
 'Tanner Smith',
 'Ryan McCarthy',
 'Tyler Davis',
 'Payton Allen',
 'Jordan Thompson',
 'Jared Miller',
 'Jimmy Sullivan',
 'Connor OBrien',
 'Ryan Callahan',
 'Carson Yates',
 'Tyler Williams',
 'Jackson Green',
 'Ben Newton',
 'Hayden Harris',
 'Dylan Stewart',
 'Ryan Howe',
 'Marcus Brown',
 'Kyle Hess',
 'Miguel Rivera',
 'Michael Pollard',
 'Will Long',
 'Jordan Thompson',
 'Luis Rodriguez',
 'Jack Anderson']

In [38]:
# check that the duplicate names are not duplicate records
for name in duplicate_names:
    display(all_college[all_college['Player'] == name])

Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
49,Carson Jones,UL Monroe,2B,0.212,0.307,0.364,0.671,50,132,18,28,6,1,4,31,2,17,39,6,4
1036,Carson Jones,Virginia Tech,RF,0.327,0.439,0.8,1.239,21,55,19,18,2,0,8,23,2,9,13,0,3


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
896,Justin Taylor,Texas A&M-Corpus Christi,C,0.215,0.299,0.319,0.618,45,144,23,31,9,0,2,16,6,12,55,0,3
1237,Justin Taylor,UMBC,CF,0.309,0.438,0.463,0.901,54,188,38,58,11,0,6,34,18,29,43,8,1


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
900,Dylan Johnson,Texas A&M-Corpus Christi,,0.0,0.143,0.0,0.143,5,6,0,0,0,0,0,0,0,1,1,1,0
1539,Dylan Johnson,Gonzaga,LF,0.339,0.406,0.458,0.864,23,59,7,20,4,0,1,10,1,7,18,0,0


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
555,Brendan Jones,Kansas State,PH,0.5,0.583,0.5,1.083,29,8,7,4,0,0,0,4,0,3,2,2,0
1599,Brendan Jones,Holy Cross,3B,0.253,0.372,0.319,0.691,48,166,26,42,5,3,0,11,1,31,39,6,2


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
188,Sam Thompson,Kent State,C,0.125,0.206,0.161,0.367,24,56,3,7,2,0,0,8,1,5,15,1,0
2070,Sam Thompson,TCU,LF,0.154,0.333,0.423,0.756,19,26,9,4,1,0,2,7,0,7,12,2,0


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
295,Matt Thomas,Cal State Northridge,PH,0.0,0.429,0.0,0.429,7,4,1,0,0,0,0,0,2,1,3,0,0
2075,Matt Thomas,William & Mary,RF,0.318,0.416,0.502,0.918,49,201,41,64,12,2,7,34,9,26,30,10,4


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
594,Jared Johnson,Brown,LF,0.292,0.413,0.538,0.951,22,65,11,19,2,1,4,11,2,12,21,2,3
2257,Jared Johnson,Bowling Green,RF,0.22,0.256,0.512,0.768,22,41,8,9,3,0,3,7,0,2,16,0,0


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
971,Peyton Williams,Iowa,1B,0.335,0.464,0.622,1.086,54,209,55,70,17,2,13,41,16,35,35,4,0
2368,Peyton Williams,Purdue Fort Wayne,C,0.0,0.176,0.0,0.176,12,14,3,0,0,0,0,0,2,1,4,0,0


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
2582,Alex Aguila,Appalachian State,SS,0.281,0.347,0.327,0.674,45,153,18,43,2,1,1,22,2,15,36,2,0
2590,Alex Aguila,Appalachian State,,0.2,0.25,0.2,0.45,4,10,0,2,0,0,0,3,0,1,4,1,0


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
225,Tyler Cox,Dartmouth,SS,0.402,0.452,0.484,0.936,43,184,41,74,10,1,1,40,3,17,12,11,3
2642,Tyler Cox,West Virginia,LF,0.086,0.195,0.114,0.309,31,35,5,3,1,0,0,1,0,5,12,4,2


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
262,Trevor Austin,Missouri,LF,0.297,0.417,0.476,0.893,42,145,35,43,8,0,6,22,13,19,32,3,1
2654,Trevor Austin,Mercer,3B,0.272,0.366,0.353,0.719,47,173,29,47,8,0,2,27,2,25,33,6,0


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
107,Tanner Smith,Oregon State,C,0.232,0.327,0.411,0.738,31,95,11,22,8,0,3,13,1,13,32,0,0
2933,Tanner Smith,Oregon,LF,0.317,0.37,0.489,0.859,61,268,58,85,13,0,11,38,5,18,37,12,2


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
135,Ryan McCarthy,The Citadel,RF,0.305,0.412,0.529,0.941,51,187,46,57,10,4,8,38,2,35,51,20,1
3258,Ryan McCarthy,East Tennessee State,C,0.245,0.367,0.33,0.697,36,106,22,26,6,0,1,13,6,15,22,0,0


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
2478,Tyler Davis,Sam Houston,DH,0.2,0.36,0.3,0.66,7,20,1,4,2,0,0,4,2,3,7,0,0
3402,Tyler Davis,Fresno State,C,0.556,0.556,0.833,1.389,13,18,8,10,2,0,1,6,0,0,3,0,0


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
2490,Payton Allen,Kansas,3B,0.235,0.316,0.294,0.61,27,51,9,12,3,0,0,6,0,6,11,0,1
3408,Payton Allen,Fresno State,2B,0.303,0.368,0.432,0.8,54,185,31,56,13,1,3,28,5,14,30,2,0


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
1140,Jordan Thompson,Texas A&M,CF,0.258,0.4,0.475,0.875,47,120,35,31,8,0,6,31,7,24,33,8,1
3424,Jordan Thompson,LSU,SS,0.286,0.395,0.432,0.827,62,220,54,63,14,0,6,36,6,35,44,3,1
5134,Jordan Thompson,Nicholls,2B,0.0,0.0,0.0,0.0,7,4,2,0,0,0,0,0,0,0,1,0,0


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
447,Jared Miller,Notre Dame,2B,0.283,0.393,0.428,0.821,45,152,32,43,7,0,5,24,5,24,37,9,3
3478,Jared Miller,Oakland,2B,0.299,0.435,0.486,0.921,52,144,32,43,7,1,6,30,11,27,27,4,3


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
1886,Jimmy Sullivan,Virginia,PH,0.222,0.333,0.222,0.555,19,18,2,4,0,0,0,1,1,2,3,0,0
3539,Jimmy Sullivan,UMass Lowell,DH,0.253,0.367,0.308,0.675,31,91,14,23,2,0,1,9,0,17,25,0,2


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
2683,Connor OBrien,Bradley,1B,0.392,0.456,0.624,1.08,48,189,44,74,14,0,10,44,3,22,48,3,5
3594,Connor OBrien,Seattle,1B,0.298,0.328,0.492,0.82,35,124,18,37,8,2,4,24,1,5,36,2,2


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
2494,Ryan Callahan,Kansas,RF,0.2,0.338,0.267,0.605,30,60,7,12,1,0,1,7,5,8,15,1,0
3806,Ryan Callahan,Wright State,LF,0.0,0.167,0.0,0.167,29,15,4,0,0,0,0,1,3,0,7,0,1


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
1775,Carson Yates,UCLA,CF,0.288,0.346,0.512,0.858,49,170,34,49,10,2,8,33,3,13,41,4,0
4092,Carson Yates,Gardner-Webb,PH,0.346,0.346,0.385,0.731,17,26,2,9,1,0,0,2,0,0,7,0,0


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
3183,Tyler Williams,Little Rock,CF,0.302,0.369,0.472,0.841,51,199,45,60,14,4,4,26,5,18,70,21,3
4141,Tyler Williams,Texas Southern,DH,0.222,0.362,0.296,0.658,32,54,11,12,4,0,0,12,6,7,8,4,2


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
3255,Jackson Green,East Tennessee State,DH,0.269,0.38,0.448,0.828,29,67,13,18,0,0,4,19,0,12,16,0,0
4261,Jackson Green,Tennessee Tech,CF,0.161,0.278,0.258,0.536,39,31,8,5,3,0,0,3,2,3,12,2,0


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
3406,Ben Newton,Fresno State,2B,0.307,0.402,0.356,0.758,51,163,26,50,6,1,0,17,5,21,31,3,1
4429,Ben Newton,East Carolina,C,0.276,0.378,0.353,0.731,54,156,28,43,9,0,1,19,12,15,22,2,1


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
3520,Hayden Harris,Longwood,2B,0.266,0.371,0.352,0.723,52,199,44,53,11,0,2,23,9,26,42,10,3
4496,Hayden Harris,Charleston Southern,RF,0.315,0.421,0.491,0.912,50,165,32,52,11,0,6,31,10,21,25,5,3


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
2365,Dylan Stewart,Purdue Fort Wayne,C,0.161,0.275,0.184,0.459,40,87,9,14,2,0,0,8,2,12,37,4,0
4509,Dylan Stewart,Charleston Southern,2B,0.136,0.24,0.136,0.376,35,22,5,3,0,0,0,3,2,1,8,3,1


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
4203,Ryan Howe,UMES,DH,0.297,0.35,0.427,0.777,53,185,31,55,9,0,5,40,2,15,34,2,3
4518,Ryan Howe,Purdue,1B,0.321,0.397,0.482,0.879,21,56,9,18,3,0,2,12,2,5,12,2,0


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
4218,Marcus Brown,UMES,1B,0.192,0.252,0.292,0.544,40,120,12,23,6,0,2,19,1,9,24,1,0
4538,Marcus Brown,Oklahoma State,SS,0.316,0.378,0.441,0.819,64,247,44,78,15,2,4,31,5,21,38,5,0


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
1737,Kyle Hess,Pittsburgh,CF,0.33,0.432,0.66,1.092,29,100,22,33,7,1,8,24,7,11,15,3,1
4676,Kyle Hess,Creighton,3B,0.238,0.377,0.345,0.722,48,168,31,40,10,1,2,24,15,25,57,6,2


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
3632,Miguel Rivera,Delaware State,2B,0.293,0.403,0.419,0.822,45,167,41,49,9,3,2,29,12,20,34,11,1
4725,Miguel Rivera,Indiana State,1B,0.222,0.364,0.389,0.753,6,18,3,4,0,0,1,6,1,3,5,0,0


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
1290,Michael Pollard,Saint Peter's,CF,0.111,0.245,0.133,0.378,24,45,8,5,1,0,0,6,2,6,13,1,0
4858,Michael Pollard,CSU Bakersfield,CF,0.13,0.31,0.13,0.44,23,23,3,3,0,0,0,2,1,5,11,1,0


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
4259,Will Long,Tennessee Tech,C,0.225,0.384,0.333,0.717,40,129,22,29,2,0,4,10,6,28,35,0,0
4971,Will Long,Stephen F. Austin,SS,0.0,0.0,0.0,0.0,2,2,0,0,0,0,0,0,0,0,2,0,0


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
1140,Jordan Thompson,Texas A&M,CF,0.258,0.4,0.475,0.875,47,120,35,31,8,0,6,31,7,24,33,8,1
3424,Jordan Thompson,LSU,SS,0.286,0.395,0.432,0.827,62,220,54,63,14,0,6,36,6,35,44,3,1
5134,Jordan Thompson,Nicholls,2B,0.0,0.0,0.0,0.0,7,4,2,0,0,0,0,0,0,0,1,0,0


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
1973,Luis Rodriguez,Florida Gulf Coast,3B,0.262,0.401,0.462,0.863,43,130,23,34,12,1,4,24,18,13,27,0,0
5337,Luis Rodriguez,Alabama State,RF,0.222,0.417,0.444,0.861,30,18,5,4,1,0,1,6,0,6,3,0,1


Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,H,2B,3b,HR,RBI,HP,BB,SO,SB,CS
1742,Jack Anderson,Pittsburgh,DH,0.28,0.393,0.453,0.846,50,161,26,45,8,1,6,26,3,29,59,0,0
5411,Jack Anderson,Northern Illinois,PH,0.0,0.143,0.0,0.143,7,6,1,0,0,0,0,0,0,1,3,0,0


<div class="alert alert-block alert-info">
<b>NOTES:</b> Above is visual confirmation all the duplicated names are not duplicated records. 
</div>

# Fix names that are not 'First Last' format

## Make list of special names

In [39]:
# loop through `full_names` to see what names are not 'First Last' format
names_to_print = []

for name in full_names:
    if len(name.split()) > 2:
        names_to_print.append(name)

names_to_print[:10] # print sample to have and idea of what needs to be corrected

['Paul Myro IV',
 'Nander De Sedas',
 'Eddie Micheletti Jr.',
 'Efrain Correa Jr.',
 'Christopher Rowan Jr.',
 'Harris Williams III',
 'Brooks Coetzee III',
 'Enrique Bradfield Jr.',
 'Kenny Mallory Jr.',
 'Orlando Salinas Jr.']

## Remove suffixes

In [40]:
# Jr. suffixes and roman numerals need to be removed 
remove = ['I', 'II', 'III', 'IV', 'Jr.', 'Jr']

full_names_cleaned = []

for full_name in full_names:
    if len(full_name.split()) > 2:
        if full_name.split()[-1] in remove:
            for suffix in remove:
                if full_name.split()[-1] == suffix:
                    full_name_cleaned = full_name.replace(' '+suffix, '') # remove suffix from name
                    full_names_cleaned.append(full_name_cleaned) # append
                
                
            #else:
               # full_names_cleaned.append(full_name)     
        else:
            full_names_cleaned.append(full_name)
    else:
        full_names_cleaned.append(full_name)

In [41]:
# reminder of length of names
len(full_names)

5430

In [42]:
# check that new list is same length
len(full_names_cleaned)

5430

## Split names, assign first and last

In [43]:
first_names = []
last_names = []

for full_name in full_names_cleaned:
    if len(full_name.split()) == 2:
        first = full_name.split()[0]
        last = full_name.split()[1]
        
    elif full_name.split()[1] == 'Junior':
        first = full_name.split()[0]
        last = full_name.split()[-1]
        
    elif full_name.split()[0] == 'Naighel':
        first = 'Naighel Alii'
        last = 'Calderon'
        
    elif full_name.split()[1] == 'Criquet':
        first = 'Dylan'
        last = 'Criquet-Danielson'
        
    elif full_name.split()[0] == 'Juston':
        first = 'Nicho'
        last = 'Jordan'
        
    elif full_name.split()[1] == 'Callan':
        first = 'Michael'
        last = 'Callan Moss'
        
    elif full_name.split()[1] == 'Marc':
        first = 'John Marc'
        last = 'Mullins'
    
    elif full_name.split()[1] == 'Kurahashi-Choy':
        first = 'Dylan'
        last = 'Kurahashi-Choy Foo'
        
    elif full_name.split()[0] == 'Gustavo':
        first = 'Gustavo'
        last = 'Nava-Sanchez'
        
    else:
        first_3_of_second = full_name.split(' ', 1)[1][:3]
        if first_3_of_second in ["De ", "Di ", "Van", "St.", "San"]:
            for prefix in ["De ", "Di ", "Van", "St."]:
                if prefix == first_3_of_second:
                    first = full_name.split(' ', 1)[0]
                    last = full_name.split(' ', 1)[1]
                break
    
        else:
            print(full_name)
    first_names.append(first)
    last_names.append(last)

In [44]:
# reminder of lengths
print(len(full_names))
print(len(full_names_cleaned))

5430
5430


In [45]:
# check first_name length
len(first_names)

5430

In [46]:
# check last_name length
len(last_names)

5430

In [47]:
# check the names with more than 1 name
for name in first_names:
    if len(name.split()) >1:
        print(name)

Naighel Alii
John Marc


In [48]:
# check the names with more than one last name
for name in last_names:
    if len(name.split()) >1:
        print(name)

De Sedas
De Goti
De la Cruz
Callan Moss
De Leon
De La Cruz
Kurahashi-Choy Foo
De La Cruz


# Add `First Name` and `Last Name` columns to `all_college` df

In [49]:
all_college['First Name'] = first_names
all_college['Last Name'] = last_names

all_college

Unnamed: 0,Player,Team,POS,AVG,OBP,SLG,OPS,GP,AB,R,...,3b,HR,RBI,HP,BB,SO,SB,CS,First Name,Last Name
0,Max Imhoff,Yale,C,0.320,0.393,0.520,0.913,13,25,2,...,0,1,4,0,3,10,0,1,Max,Imhoff
1,Jimmy Chatfield,Yale,RF,0.303,0.472,0.523,0.995,38,132,30,...,0,8,30,10,34,27,10,4,Jimmy,Chatfield
2,Teddy Hague,Yale,1B,0.292,0.377,0.462,0.839,18,65,10,...,0,3,11,0,10,10,1,0,Teddy,Hague
3,Mason LaPlante,Yale,SS,0.281,0.379,0.379,0.758,38,153,34,...,0,2,16,5,21,24,19,2,Mason,LaPlante
4,Jake Gehri,Yale,C,0.279,0.371,0.519,0.890,38,129,23,...,0,9,26,7,13,29,2,2,Jake,Gehri
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5425,Peter Serruto,Indiana,C,0.139,0.200,0.190,0.390,34,79,7,...,0,1,6,1,5,24,0,0,Peter,Serruto
5426,Kip Fougerousse,Indiana,1B,0.111,0.172,0.222,0.394,13,27,3,...,0,1,6,1,1,13,0,0,Kip,Fougerousse
5427,Ethan Vecrumba,Indiana,DH,0.000,0.571,0.000,0.571,16,3,7,...,0,0,1,1,3,3,0,0,Ethan,Vecrumba
5428,Jake Skrine,Indiana,PH,0.000,0.000,0.000,0.000,2,2,0,...,0,0,0,0,0,1,0,0,Jake,Skrine


# Order columns to match `MLB` and `MiLB` DF's

## Loab MLB.columns

In [50]:
MLB_columns = pd.read_pickle("./MLB_df.pkl").columns
MLB_columns

Index(['Team', 'Games Played', 'At Bats', 'Runs', 'Hits', 'Doubles', 'Triples',
       'Home Runs', 'Runs Batted In', 'Walks', 'Strikeouts', 'Stolen Bases',
       'Caught Stealing', 'Batting Average', 'On-Base Percentage',
       'Slugging Percentage', 'On-Base Plus Slugging', 'First Name',
       'Last Name', 'Position'],
      dtype='object')

## Check existing column names

In [51]:
all_college.columns

Index(['Player', 'Team', 'POS', 'AVG', 'OBP', 'SLG', 'OPS', 'GP', 'AB', 'R',
       'H', '2B', '3b', 'HR', 'RBI', 'HP', 'BB', 'SO', 'SB', 'CS',
       'First Name', 'Last Name'],
      dtype='object')

## Put columns in same order as MLB and MiLB tables

In [52]:
# columns to keep
keep = ['Team',  
        'GP',  
        'AB',   
        'R',   
        'H',    
        '2B',    
        '3b',   
        'HR',   
        'RBI',   
        'BB',  
        'SO',   
        'SB',    
        'CS',   
        'AVG',  
        'OBP', 
        'SLG', 
        'OPS', 
        'First Name',
        'Last Name',
        'POS']

In [53]:
college = all_college[keep]
college

Unnamed: 0,Team,GP,AB,R,H,2B,3b,HR,RBI,BB,SO,SB,CS,AVG,OBP,SLG,OPS,First Name,Last Name,POS
0,Yale,13,25,2,8,2,0,1,4,3,10,0,1,0.320,0.393,0.520,0.913,Max,Imhoff,C
1,Yale,38,132,30,40,5,0,8,30,34,27,10,4,0.303,0.472,0.523,0.995,Jimmy,Chatfield,RF
2,Yale,18,65,10,19,2,0,3,11,10,10,1,0,0.292,0.377,0.462,0.839,Teddy,Hague,1B
3,Yale,38,153,34,43,9,0,2,16,21,24,19,2,0.281,0.379,0.379,0.758,Mason,LaPlante,SS
4,Yale,38,129,23,36,4,0,9,26,13,29,2,2,0.279,0.371,0.519,0.890,Jake,Gehri,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5425,Indiana,34,79,7,11,1,0,1,6,5,24,0,0,0.139,0.200,0.190,0.390,Peter,Serruto,C
5426,Indiana,13,27,3,3,0,0,1,6,1,13,0,0,0.111,0.172,0.222,0.394,Kip,Fougerousse,1B
5427,Indiana,16,3,7,0,0,0,0,1,3,3,0,0,0.000,0.571,0.000,0.571,Ethan,Vecrumba,DH
5428,Indiana,2,2,0,0,0,0,0,0,0,1,0,0,0.000,0.000,0.000,0.000,Jake,Skrine,PH


# Create new `College` df with renamed columns

In [54]:
college.columns = MLB_columns
College = college

In [55]:
# check
College

Unnamed: 0,Team,Games Played,At Bats,Runs,Hits,Doubles,Triples,Home Runs,Runs Batted In,Walks,Strikeouts,Stolen Bases,Caught Stealing,Batting Average,On-Base Percentage,Slugging Percentage,On-Base Plus Slugging,First Name,Last Name,Position
0,Yale,13,25,2,8,2,0,1,4,3,10,0,1,0.320,0.393,0.520,0.913,Max,Imhoff,C
1,Yale,38,132,30,40,5,0,8,30,34,27,10,4,0.303,0.472,0.523,0.995,Jimmy,Chatfield,RF
2,Yale,18,65,10,19,2,0,3,11,10,10,1,0,0.292,0.377,0.462,0.839,Teddy,Hague,1B
3,Yale,38,153,34,43,9,0,2,16,21,24,19,2,0.281,0.379,0.379,0.758,Mason,LaPlante,SS
4,Yale,38,129,23,36,4,0,9,26,13,29,2,2,0.279,0.371,0.519,0.890,Jake,Gehri,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5425,Indiana,34,79,7,11,1,0,1,6,5,24,0,0,0.139,0.200,0.190,0.390,Peter,Serruto,C
5426,Indiana,13,27,3,3,0,0,1,6,1,13,0,0,0.111,0.172,0.222,0.394,Kip,Fougerousse,1B
5427,Indiana,16,3,7,0,0,0,0,1,3,3,0,0,0.000,0.571,0.000,0.571,Ethan,Vecrumba,DH
5428,Indiana,2,2,0,0,0,0,0,0,0,1,0,0,0.000,0.000,0.000,0.000,Jake,Skrine,PH


# Pickle `College` DF

In [56]:
pd.to_pickle(College, "./College_df.pkl")