In [1]:
import os
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from splinter import Browser
import time

roster_str = 'https://www.sports-reference.com/cbb/schools/{team}/{year}.html#all_advanced'

In [2]:
def init_browser():
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    return Browser("chrome", **executable_path, headless=False)

In [3]:
def get_RSCI_rank(x):
    if x is not np.nan:
        rank = int(str(x).split(' ')[0])
        return int(rank)
    else:
        return np.nan


def get_RSCI_year(x):
    if x is not np.nan:
        year = int(str(x).split(' ')[1].replace('(', '').replace(')', ''))
        return int(year)
    else:
        return np.nan
        

def save_roster_data(team, year, data):
    path = os.path.join('data', 'NCAA', 'rosters', 'roster_data', f'{year}', 
                        f'{year}_{team.abbreviation.lower()}.csv')
    roster = data[2]
    roster['Inches'] = roster['Height'].apply(lambda x: int(x.split('-')[0]) * 12 + int(x.split('-')[1]))
    roster['PPG'] = roster['Summary'].apply(lambda x: float(x.split(',')[0].split(' ')[0]))
    roster['RBG'] = roster['Summary'].apply(lambda x: float(x.split(',')[1].split(' ')[1]))
    roster['APG'] = roster['Summary'].apply(lambda x: float(x.split(',')[2].split(' ')[1]))
    roster = roster.drop('Summary', axis=1)
    if 'RSCI Top 100' in roster.columns:
        roster['RSCI Rank'] = roster['RSCI Top 100'].apply(lambda x: get_RSCI_rank(x))
        roster['RSCI Year'] = roster['RSCI Top 100'].apply(lambda x: get_RSCI_year(x))
        roster = roster.drop('RSCI Top 100', axis=1)
    else:
        roster['RSCI Rank'] = np.nan
        roster['RSCI Year'] = np.nan
    roster.to_csv(path, index=False)

In [4]:
def save_team_opponent_stats(team, year, data):
    path = os.path.join('data', 'NCAA', 'rosters', 'team_opponent_stats', f'{year}', 
                        f'{year}_{team.abbreviation.lower()}.csv')
    stats = data[3]
    for col in stats.columns[3:]:
        stats[f'{col} Rank'] = np.nan
        stats.loc[0, f'{col} Rank'] = stats.loc[1, f'{col}']
        stats.loc[2, f'{col} Rank'] = stats.loc[3, f'{col}']
    stats = stats.drop([1, 3])
    stats.to_csv(path, index=False)

In [5]:
def save_team_opponent_stats_conf(team, year, data):
    path = os.path.join('data', 'NCAA', 'rosters', 'team_opponent_stats_conf', f'{year}', 
                        f'{year}_{team.abbreviation.lower()}.csv')
    stats = data[4]
    for col in stats.columns[3:]:
        stats[f'{col} Rank'] = np.nan
        stats.loc[0, f'{col} Rank'] = stats.loc[1, f'{col}']
        stats.loc[2, f'{col} Rank'] = stats.loc[3, f'{col}']
    stats = stats.drop([1, 3])
    stats.to_csv(path, index=False)

In [6]:
def save_per_game(team, year, data):
    path = os.path.join('data', 'NCAA', 'rosters', 'per_game', f'{year}', 
                        f'{year}_{team.abbreviation.lower()}.csv')
    stats = data[5]
    stats.to_csv(path, index=False)

In [7]:
def save_totals(team, year, data):
    path = os.path.join('data', 'NCAA', 'rosters', 'totals', f'{year}', 
                        f'{year}_{team.abbreviation.lower()}.csv')
    stats = data[6]
    stats.to_csv(path, index=False)

In [8]:
def save_per_40_min(team, year, data):
    path = os.path.join('data', 'NCAA', 'rosters', 'per_40_min', f'{year}', 
                        f'{year}_{team.abbreviation.lower()}.csv')
    stats = data[7]
    stats.to_csv(path, index=False)

In [9]:
def save_per_100_poss(team, year, data):
    path = os.path.join('data', 'NCAA', 'rosters', 'per_100_poss', f'{year}', 
                        f'{year}_{team.abbreviation.lower()}.csv')
    stats = data[8]
    stats.to_csv(path, index=False)

In [10]:
def save_advanced(team, year, data):
    path = os.path.join('data', 'NCAA', 'rosters', 'advanced', f'{year}', 
                        f'{year}_{team.abbreviation.lower()}.csv')
    stats = data[9]
    stats.to_csv(path, index=False)

In [11]:
def save_conf_per_game(team, year, data):
    path = os.path.join('data', 'NCAA', 'rosters', 'conf_per_game', f'{year}',
                        f'{year}_{team.abbreviation.lower()}.csv')
    stats = data[10]
    stats.to_csv(path, index=False)

In [12]:
def save_conf_totals(team, year, data):
    path = os.path.join('data', 'NCAA', 'rosters', 'conf_totals', f'{year}', 
                        f'{year}_{team.abbreviation.lower()}.csv')
    stats = data[11]
    stats.to_csv(path, index=False)

In [13]:
def save_conf_per_40_min(team, year, data):
    path = os.path.join('data', 'NCAA', 'rosters', 'conf_per_40_min', f'{year}', 
                        f'{year}_{team.abbreviation.lower()}.csv')
    stats = data[12]
    stats.to_csv(path, index=False)

In [14]:
def save_conf_per_100_poss(team, year, data):
    path = os.path.join('data', 'NCAA', 'rosters', 'conf_per_100_poss', f'{year}', 
                        f'{year}_{team.abbreviation.lower()}.csv')
    stats = data[13]
    stats.to_csv(path, index=False)

In [15]:
def save_conf_adv(team, year, data):
    path = os.path.join('data', 'NCAA', 'rosters', 'conf_advanced', f'{year}', 
                        f'{year}_{team.abbreviation.lower()}.csv')
    stats = data[14]
    stats.to_csv(path, index=False)

In [16]:
from sportsreference.ncaab.teams import Teams

browser = init_browser()

for year in range(2019, 2020):
    year = str(year)
    print(year)
    for team in Teams(year=year):
        try:
            url = roster_str.format(team=team.abbreviation.lower(), year=year)
            browser.visit(url)
            time.sleep(5)
            html = browser.html
            soup = BeautifulSoup(html, 'html.parser')
            data = pd.read_html(str(soup))
            save_roster_data(team, year, data)
            save_team_opponent_stats(team, year, data)
            save_team_opponent_stats_conf(team, year, data)
            save_per_game(team, year, data)
            save_totals(team, year, data)
            save_per_40_min(team, year, data)
            save_per_100_poss(team, year, data)
            save_advanced(team, year, data)
            save_conf_per_game(team, year, data)
            save_conf_totals(team, year, data)
            save_conf_per_40_min(team, year, data)
            save_conf_per_100_poss(team, year, data)
            save_conf_adv(team, year, data)
        except Exception as e:
            print(year, team.abbreviation.lower(), e)
            
browser.quit()

2011
2011 <sportsreference.ncaab.teams.Team object at 0x10efa5da0> 'float' object has no attribute 'split'
2011 <sportsreference.ncaab.teams.Team object at 0x11ac2f160> invalid literal for int() with base 10: '2008,'
2011 <sportsreference.ncaab.teams.Team object at 0x10efb89e8> 'float' object has no attribute 'split'
2011 <sportsreference.ncaab.teams.Team object at 0x10efc3f28> Message: timeout
  (Session info: chrome=74.0.3729.169)
  (Driver info: chromedriver=2.46.628411 (3324f4c8be9ff2f70a05a30ebc72ffb013e1a71e),platform=Mac OS X 10.13.6 x86_64)

2011 <sportsreference.ncaab.teams.Team object at 0x10efc5e10> Message: timeout
  (Session info: chrome=74.0.3729.169)
  (Driver info: chromedriver=2.46.628411 (3324f4c8be9ff2f70a05a30ebc72ffb013e1a71e),platform=Mac OS X 10.13.6 x86_64)

2011 <sportsreference.ncaab.teams.Team object at 0x10efc9e10> Message: timeout
  (Session info: chrome=74.0.3729.169)
  (Driver info: chromedriver=2.46.628411 (3324f4c8be9ff2f70a05a30ebc72ffb013e1a71e),platf

KeyboardInterrupt: 