# NCAA tournament data - webscraping

In [3]:
#imports for webscraping
from bs4 import BeautifulSoup
from urllib.request import urlopen
#for saving and load data files
import pickle
import urllib3
import re
import requests

In [4]:
### loading bar
def log_progress(sequence, every=None, size=None, name='Items'):
    """Iterate over *sequence* while rendering a progress widget in the notebook.

    This is a generator: it yields every record of ``sequence`` unchanged and
    refreshes an ipywidgets progress bar and label as it goes.

    Args:
        sequence: iterable to walk through.
        every: refresh the widget every ``every`` records. When omitted and
            the size is known, it becomes 1 for up to 200 records and
            ``int(size / 200)`` otherwise. Must be given when the size is
            unknown (an AssertionError is raised otherwise).
        size: number of records; defaults to ``len(sequence)`` when that works.
        name: label shown in front of the counter.

    Yields:
        The records of ``sequence`` in order.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    # The length is "unknown" only when no size was given and len() fails.
    unknown_length = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            unknown_length = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        every = 1 if size <= 200 else int(size / 200)     # every 0.5%

    if unknown_length:
        # No total to count towards: show a full bar in the 'info' style.
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    display(VBox(children=[label, progress]))

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if unknown_length:
                    label.value = '{name}: {index} / ?'.format(name=name, index=index)
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name, index=index, size=size)
            yield record
    except BaseException:
        # Anything that interrupts the iteration turns the bar red, then propagates.
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(name=name, index=str(index or '?'))
        
#MIT License
#Copyright (c) 2016 bureaucratic-labs

### Data Scraping Functions

In [None]:
# Site root; the scraping functions below build every URL as baseURL + path.
baseURL = 'https://www.sports-reference.com'
# (team, year) pairs appended by getTeamYearData when a team page has no 'team_stats' table.
notAdded = []

In [None]:
#get tournament teams by year
#tourneyTeamYear -- dictionary with year (as a string) as key; the value is a dictionary mapping each team in the tourney that year to the link (href) of its team page
def getTourneyTeamYear(yearRange):
    """Scrape the NCAA tournament bracket pages and collect each year's teams.

    Args:
        yearRange: two-item sequence ``[firstYear, lastYear]``; both ends are
            included.

    Returns:
        dict keyed by year (as a string). Each value is a dict mapping a
        school name to the ``href`` of that school's link on the bracket page.
    """
    print("getting tournament teams by year")
    tourneyTeamYear = {}

    for i in range(yearRange[0], yearRange[1] + 1):
        year = str(i)
        print(year, end=" ")

        teams = tourneyTeamYear[year] = {}

        url = baseURL + '/cbb/postseason/' + year + '-ncaa.html'
        # BeautifulSoup consumes the response while constructing the tree, so
        # the connection can be released as soon as parsing is done.
        with urlopen(url) as html:
            soup = BeautifulSoup(html, 'html.parser')

        for tround in soup.find_all('div', attrs={'class': 'round'}):
            for game in tround.find_all('div', recursive=False):
                gameDetails = game.find_all('a')
                # Only entries with more than three links are used; links 0
                # and 2 are read as the two schools.
                if len(gameDetails) > 3:
                    for anchor in (gameDetails[0], gameDetails[2]):
                        # setdefault keeps the first link seen for a school.
                        teams.setdefault(anchor.text.strip(), anchor["href"])

        print("done")

    print("done")
    return tourneyTeamYear

In [None]:
#get teams played by tournament teams by year
#allTeamYear
def getAllTeamYear(yearRange,tourneyTeamYear):
    """Collect every tournament team plus every opponent on its schedule page.

    Args:
        yearRange: two-item sequence ``[firstYear, lastYear]``; both ends are
            included.
        tourneyTeamYear: dict keyed by year string whose values map a team
            name to its team-page path (as built by getTourneyTeamYear).

    Returns:
        dict keyed by year string; each value maps a team name to the path of
        its page. The first link seen for a name is the one kept.
    """
    print("getting teams played by tournament teams by year")
    allTeamYear = {}

    for i in range(yearRange[0], yearRange[1] + 1):
        year = str(i)
        print(year, end=" ")

        teams = allTeamYear[year] = {}
        # The opponent link is read from <td> 5 for 2015-2017 and <td> 3 for
        # every other year; that is the only difference between the years.
        opponentColumn = 5 if year in ('2015', '2016', '2017') else 3

        for key, value in log_progress(tourneyTeamYear[year].items(), every=1):
            teams.setdefault(key, value)

            # Team page -> third entry of the 'inner_nav' list links to the schedule.
            with urlopen(baseURL + value) as teamhtml:
                teamsoup = BeautifulSoup(teamhtml, 'html.parser')
            navItems = teamsoup.find('div', attrs={'id': 'inner_nav'}).find_all('li')
            gameScheduleLink = navItems[2].find('a')["href"]

            with urlopen(baseURL + gameScheduleLink) as gameSchedulehtml:
                gameSchedulesoup = BeautifulSoup(gameSchedulehtml, 'html.parser')
            tableRows = gameSchedulesoup.find('table', attrs={'id': 'schedule'}).find('tbody').find_all('tr')

            for row in tableRows:
                cells = row.find_all('td')
                if len(cells) <= opponentColumn:
                    continue
                opponent = cells[opponentColumn].find('a')
                if opponent is None:
                    # Opponent cell without a link: nothing to record.
                    continue
                teams.setdefault(opponent.text.strip(), opponent["href"])

    print('done')
    return allTeamYear

In [None]:
#get team stats for all teams (tournament team + the teams they played)
#teamYearData --  dictionary with year as key and value is a dictionary with team as key and value in the form of [[team stats],[opponent stats]] 
#team/opponent stats (index:column): [0:G, 1:MP, 2:FG, 3:FGA, 4:FG%, 5:2P, 6:2PA, 7:2P%, 8:3P, 9:3PA, 10:3P%, 11:FT, 12:FTA, 13:FT%, 14:ORB, 15:DRB, 16:TRB, 17:AST, 18:STL, 19:BLK, 20:TOV, 21:PF, 22:PTS, 23:PTS/G]
#example: '2000': {'Duke': [[34, '', 1045, 2172, 0.481, 761, 1430, 0.532, 284, 742, 0.383, 618, 833, 0.742, 453, 860, 1313, 584, 333, 191, 480, 552, 2992, 88.0], [34, '', 934, 2238, 0.417, 737, 1686, 0.437, 197, 552, 0.357, 360, 537, 0.67, 526, 757, 1283, 472, 240, 123, 607, 690, 2425, 71.3]]}

def getTeamYearData(yearRange,allTeamYear):
    """Scrape the 'team_stats' table of every team page and convert it to numbers.

    Args:
        yearRange: two-item sequence ``[firstYear, lastYear]``; both ends are
            included.
        allTeamYear: dict keyed by year string whose values map a team name
            to its team-page path.

    Returns:
        dict keyed by year string; each value maps a team name to a list of
        stat rows (one per table row without a ``class`` attribute). Cells in
        columns 4, 7, 10, 13 and 23 become floats, other non-empty cells
        become ints, and empty cells stay ``''``.

    Side effects:
        Appends ``(team, year)`` to the module-level ``notAdded`` list for
        every page that has no 'team_stats' table.

    Raises:
        ValueError: if a non-empty cell cannot be converted to a number.
    """
    print("getting team stats for all teams (tournament team and the teams they played)")
    teamYearData = {}

    for i in range(yearRange[0], yearRange[1] + 1):
        year = str(i)
        print(year, end=" ")

        yearStats = teamYearData[year] = {}

        for key, value in log_progress(allTeamYear[year].items(), every=1):
            if key in yearStats:
                continue
            with urlopen(baseURL + value) as teamhtml:
                teamsoup = BeautifulSoup(teamhtml, 'html.parser')

            # Look the table up once and reuse the result.
            statsTable = teamsoup.find('table', attrs={'id': 'team_stats'})
            if statsTable is None:
                notAdded.append((key, year))
                continue

            # Rows carrying a class attribute are skipped.
            yearStats[key] = [
                [stat.text.strip() for stat in row.find_all('td')]
                for row in statsTable.find('tbody').find_all('tr')
                if not row.has_attr("class")
            ]

    print('done')

    #reformat teamYearData
    #empty value represented with ''
    print('reformatting data...')

    for i in range(yearRange[0], yearRange[1] + 1):
        yearStats = teamYearData[str(i)]
        for team in yearStats:
            yearStats[team] = [_convertStatRow(stats) for stats in yearStats[team]]

    print('done')
    return teamYearData


def _convertStatRow(rawRow):
    """Convert one scraped row of stat strings to numbers ('' stays '')."""
    floatColumns = (4, 7, 10, 13, 23)
    converted = []
    for column, stat in enumerate(rawRow):
        if stat == '':
            converted.append('')
        elif column in floatColumns:
            converted.append(float(stat))
        else:
            converted.append(int(stat))
    return converted

In [None]:
#get game data for all games played by tournament teams
#dictionary with year as key and value is a list of lists containing [team1,team2,overall score]
#example: '2000': [['Duke', 'Lamar', 137], ['Kansas', 'DePaul', 158]]

def getGameYearData(yearRange,tourneyTeamYear):
    """Collect every game on the schedule pages of the tournament teams.

    Args:
        yearRange: two-item sequence ``[firstYear, lastYear]``; both ends are
            included.
        tourneyTeamYear: dict keyed by year string whose values map a team
            name to its team-page path.

    Returns:
        dict keyed by year string; each value is a list of
        ``[team, opponent, combinedScore]`` lists. A game is kept only once:
        a later entry with the same combined score and the same two teams
        (in either order) is dropped.
    """
    print("getting game data for all games played by tournament teams")
    gameYearDataWithDuplicates = {}

    for i in range(yearRange[0], yearRange[1] + 1):
        year = str(i)
        print(year, end=" ")

        games = gameYearDataWithDuplicates[year] = []
        # The opponent link is read from <td> 5 for 2015-2017 and <td> 3
        # otherwise; the two score cells start three columns after it.
        opponentColumn = 5 if year in ('2015', '2016', '2017') else 3
        scoreColumn = opponentColumn + 3

        for key, value in log_progress(tourneyTeamYear[year].items(), every=1):
            # Team page -> third entry of the 'inner_nav' list links to the schedule.
            with urlopen(baseURL + value) as teamhtml:
                teamsoup = BeautifulSoup(teamhtml, 'html.parser')
            navItems = teamsoup.find('div', attrs={'id': 'inner_nav'}).find_all('li')
            gameScheduleLink = navItems[2].find('a')["href"]

            with urlopen(baseURL + gameScheduleLink) as gameSchedulehtml:
                gameSchedulesoup = BeautifulSoup(gameSchedulehtml, 'html.parser')
            tableRows = gameSchedulesoup.find('table', attrs={'id': 'schedule'}).find('tbody').find_all('tr')

            for row in tableRows:
                cells = row.find_all('td')
                if len(cells) < scoreColumn + 2:
                    continue
                opponent = cells[opponentColumn].find('a')
                if opponent is None:
                    continue
                try:
                    combinedScore = (int(cells[scoreColumn].text.strip())
                                     + int(cells[scoreColumn + 1].text.strip()))
                except ValueError:
                    # NOTE(review): a non-numeric score cell (presumably a game
                    # without a final score) has no total to record; skip the
                    # row instead of aborting the whole scrape.
                    continue
                games.append([key, opponent.text.strip(), combinedScore])

    print('done')

    print('deleting duplicate games...')
    gameYearData = {}
    for i in range(yearRange[0], yearRange[1] + 1):
        year = str(i)

        uniqueGames = gameYearData[year] = []
        seen = set()
        for game in gameYearDataWithDuplicates[year]:
            # Same unordered pair of teams + same combined score == same game.
            fingerprint = (frozenset(game[:2]), game[2])
            if fingerprint in seen:
                continue
            seen.add(fingerprint)
            uniqueGames.append(game)

    print('done')
    return gameYearData

In [None]:
#get game data for all games played in the tournament
#dictionary with year as key and value is a list of lists containing [team1,team2,overall score] for every game in the tournament
#example: '2000': [['Duke', 'Lamar', 137], ['Kansas', 'DePaul', 158]]

def getTournamentGameData(yearRange):
    """Scrape the bracket pages and record every tournament game per year.

    Args:
        yearRange: two-item sequence ``[firstYear, lastYear]``; both ends are
            included.

    Returns:
        dict keyed by year string; each value is a list of
        ``[school1, school2, combinedScore]`` lists, where the combined score
        is the sum of the two numbers read from the game's 2nd and 4th links.

    Raises:
        ValueError: if one of those link texts is not an integer.
    """
    tournamentGameData = {}

    for i in range(yearRange[0], yearRange[1] + 1):
        year = str(i)
        print(year, end =" ")

        games = tournamentGameData[year] = []

        url = baseURL + '/cbb/postseason/' + year + '-ncaa.html'
        # The tree is fully built inside the with-block, so the response can
        # be closed right after parsing.
        with urlopen(url) as html:
            soup = BeautifulSoup(html, 'html.parser')

        for tround in soup.find_all('div', attrs={'class': 'round'}):
            for game in tround.find_all('div', recursive=False):
                gameDetails = game.find_all('a')
                # Links are read as: 0 = school, 1 = its score, 2 = school, 3 = its score.
                if len(gameDetails) > 3:
                    schools = [gameDetails[0].text.strip(), gameDetails[2].text.strip()]
                    combinedScore = int(gameDetails[1].text.strip()) + int(gameDetails[3].text.strip())
                    games.append([schools[0], schools[1], combinedScore])
        print("done")

    print("done")
    return tournamentGameData

In [5]:
# gather all team ID's from ESPN.com

def gather_teamIds():
    """Return a dict mapping team name to ESPN team id (as a string).

    Downloads the ESPN men's college basketball teams page and inspects every
    <h5> element. An element contributes an entry only when both the
    name pattern and the ``/_/id/<digits>/`` pattern are found in its markup.
    """
    pool = urllib3.PoolManager()
    response = pool.request("GET", "http://www.espn.com/mens-college-basketball/teams")
    document = BeautifulSoup(response.data, "html.parser")

    team_ids = {}
    for heading in document.find_all("h5"):
        name_hit = re.search(r'>[A-Za-z| |\'|-]+</a>', str(heading))
        if name_hit is None:
            continue
        id_hit = re.search(r'/_/id/[0-9]+/', str(heading))
        if id_hit is None:
            continue
        # Strip the leading '>' and trailing '</a>' from the name match, and
        # the leading '/_/id/' and trailing '/' from the id match.
        team_name = name_hit.group(0)[1:-4]
        team_ids[team_name] = id_hit.group(0)[6:-1]
    return team_ids

In [6]:
def gather_all_Reg_stats():
    """Scrape ESPN schedule pages for every known team, seasons 2002-2017.

    Returns:
        dict keyed by ``str(teamId) + str(year)``. Each value is a list of
        ``(result, opponentId)`` tuples, where ``result`` is the text matched
        after the game link (e.g. a ``"<n>-<n>"`` score, optionally followed
        by an OT marker) and ``opponentId`` is the numeric id taken from every
        second ``/_/id/<digits>/`` occurrence in the same table.
    """
    team_ids = gather_teamIds()
    team_games = {}
    # One connection pool is enough for the whole crawl.
    http = urllib3.PoolManager()

    for year in range(2002, 2018):
        print(year)
        for team in team_ids.values():
            games = []
            page = http.request("GET", "http://www.espn.com/mens-college-basketball/team/schedule/_/id/{}/year/{}".format(team,year))
            soup = BeautifulSoup(page.data, "html.parser")
            for tb in soup.find_all("table"):
                markup = str(tb)
                match = re.findall(r'gameId/[0-9]+">[0-9]+-[0-9]+[ OT]*<', markup)
                opponent = re.findall(r'/_/id/[0-9]+/', markup)
                for game, opp in zip(match, opponent[::2]):
                    # Keep the text between '>' and the trailing '<', and the
                    # digits between '/_/id/' and the closing '/'.
                    games.append((game.split('>')[1][:-1], opp[6:-1]))
            team_games[str(team)+str(year)] = games

    # Hand the result back to the caller; previously it was built and discarded.
    return team_games

    

In [7]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
import time

def game_key(team1,team2):
    """Build the lookup key for a matchup by concatenating the two team names."""
    key = team1 + team2
    return key

def get_over_under(team1, team2):
    """Query the OddsShark head-to-head database for two teams via Firefox.

    Fills in the head-to-head form (30 games, game type 'PST', any location,
    any odds), submits it and parses the second table of the resulting page.

    Args:
        team1: text typed into the team search box.
        team2: text typed into the opponent search box.

    Returns:
        list of ``(team1, team2, date, total, flag)`` tuples, one per group of
        ten table cells whose last cell is longer than four characters.
        ``flag`` is 1 when that last cell ends in 'U' and 0 when it ends in 'O'.

    Raises:
        KeyError: if the last cell of a row ends in anything other than 'U' or 'O'.
        IndexError: if the page contains fewer than two tables.
    """
    ou_dict = {'U':1,'O':0}
    browser = webdriver.Firefox()
    try:
        browser.get('http://www.oddsshark.com/ncaab/database')
        team = browser.find_element_by_id("team-search-h2h")
        team.send_keys(team1)
        opponent = browser.find_element_by_id("opponent-search-h2h")
        opponent.send_keys(team2)

        browser.find_element_by_id("games-30-h2h").click()
        Select(browser.find_element_by_id('chalk-select-game-type-h2h')).select_by_value('PST')
        browser.find_element_by_id("location-any-h2h").click()
        Select(browser.find_element_by_id('chalk-select-odds-h2h')).select_by_value('ANY')
        browser.find_element_by_id("submit-h2h").click()
        page_source = browser.page_source
    finally:
        # Always shut the browser down, even when a lookup above fails;
        # otherwise each failed call leaves a Firefox instance running.
        browser.quit()

    soup = BeautifulSoup(page_source, "html.parser")
    page = soup.find_all("table")
    page = str(page[1]).split("</td>")
    # Cells are consumed ten at a time; each group is treated as one game row.
    tables = [page[x:x+10] for x in range(0,len(page)-1,10)]
    games = []
    for table in tables:
        # The date is taken from the last 12 characters of the first cell; a
        # leading '>' is left over when the date text is one character shorter.
        date = table[0][-12:]
        if date[0] == '>':
            date = date[1:]
        if len(table[-1]) > 4:
            games.append((team1, team2, date,table[-2][4:],ou_dict[table[-1][-1]]))

    return games

### Data Scraping

In [None]:
# [first year, last year]; the scraping functions iterate over both ends inclusively.
yearRange = [1995,2017]

In [None]:
# dictionary with year (string) as key; value is a dictionary mapping each team in the tourney that year to its team-page link
tourneyTeamYear = getTourneyTeamYear(yearRange)
#done for 1995-2017

In [None]:
# dictionary with year (string) as key; value maps every tournament team and every opponent on its schedule page to a team-page link
allTeamYear = getAllTeamYear(yearRange,tourneyTeamYear)
#done for 1995-2017

In [None]:
# dictionary with year as key and value is a list of lists containing [team1,team2,combined score of both teams]
gameYearData = getGameYearData(yearRange,tourneyTeamYear)
#example: '2000': [['Duke', 'Lamar', 137], ['Kansas', 'DePaul', 158]]

In [None]:
#teamYearData --  dictionary with year as key and value is a dictionary with team as key and value in the form of [[team stats],[opponent stats]]
teamYearData = getTeamYearData(yearRange,allTeamYear)
#team/opponent stats (index:column): [0:G, 1:MP, 2:FG, 3:FGA, 4:FG%, 5:2P, 6:2PA, 7:2P%, 8:3P, 9:3PA, 10:3P%, 11:FT, 12:FTA, 13:FT%, 14:ORB, 15:DRB, 16:TRB, 17:AST, 18:STL, 19:BLK, 20:TOV, 21:PF, 22:PTS, 23:PTS/G]


In [None]:
# dictionary with year as key and value is a list of lists containing [team1,team2,combined score] for every game in the tournament
tournamentGameData = getTournamentGameData(yearRange)

In [None]:
# Smoke test of the over/under lookup for a single matchup (opens a Firefox window).
d = {}
d[game_key("Colorado State","Colorado")] = get_over_under("Colorado State","Colorado")
d

### Testing Scraped Data

In [None]:
# Year (as the string key used by the scraped dictionaries) inspected by the cells below.
testYear = '2015'

In [None]:
# (team, year) pairs whose team page had no 'team_stats' table.
notAdded

In [None]:
# Number of tournament teams recorded for testYear.
len(tourneyTeamYear[testYear])

In [None]:
# Number of teams (tournament teams plus their opponents) recorded for testYear.
len(allTeamYear[testYear])

In [None]:
# Number of tournament games recorded for testYear.
len(tournamentGameData[testYear])

In [None]:
# Number of teams with scraped stats for testYear (uncomment the loop to print them).
#for team, value in teamYearData[testYear].items():
    #print(team,value)
len(teamYearData[testYear])

In [None]:
# Number of de-duplicated games for testYear (uncomment the loop to print them).
#for game in gameYearData[testYear]:
#    print(game)
len(gameYearData[testYear])

In [None]:
# Display the team name -> ESPN id mapping scraped from espn.com.
gather_teamIds()

### Saving Scraped Data

In [None]:
#save dicts to file
import pickle

In [None]:
# Persist tourneyTeamYear; the with-block closes the file even if dump() raises.
with open("tourneyTeamYear.pickle", "wb") as pickle_out:
    pickle.dump(tourneyTeamYear, pickle_out)

In [None]:
# Persist allTeamYear; the with-block closes the file even if dump() raises.
with open("allTeamYear.pickle", "wb") as pickle_out:
    pickle.dump(allTeamYear, pickle_out)

In [None]:
# Persist gameYearData; the with-block closes the file even if dump() raises.
with open("gameYearData.pickle", "wb") as pickle_out:
    pickle.dump(gameYearData, pickle_out)

In [None]:
# Persist teamYearData; the with-block closes the file even if dump() raises.
with open("teamYearData.pickle", "wb") as pickle_out:
    pickle.dump(teamYearData, pickle_out)

In [None]:
# Persist tournamentGameData; the with-block closes the file even if dump() raises.
with open("tournamentGameData.pickle", "wb") as pickle_out:
    pickle.dump(tournamentGameData, pickle_out)

In [None]:
# Persist the ESPN regular-season game data.
# NOTE(review): `team_games` is not assigned at module level anywhere in the
# visible notebook (only as a local inside gather_all_Reg_stats), so it must
# be bound before this cell is run.
with open("regular_season_games_espn.pkl","wb") as f:
    pickle.dump(team_games,f)

In [20]:
# Look up the over/under records for every 2011 tournament game.
with open("tournamentGameData.pickle",'rb') as f:
    tournament_data = pickle.load(f)
with open("teamYearData.pickle","rb") as f:
    team_stats = pickle.load(f)
year = "2011"
stats = team_stats[year]
all_over_under = []
for game in tournament_data[year]:
    try:
        ou = get_over_under(game[0],game[1])
    except Exception as err:
        # Best effort: one failed lookup must not stop the crawl, but report
        # it instead of dropping it silently. Catching Exception (not a bare
        # except) keeps the loop interruptible with Ctrl-C / kernel interrupt.
        print("skipping {} vs {}: {!r}".format(game[0], game[1], err))
    else:
        print(ou)
        all_over_under.append(ou)
    # [team stats] and [opponent stats] rows of the first-listed team.
    team = stats[game[0]][0]
    opponent = stats[game[0]][1]

[('Ohio State', 'UTSA', 'Mar 18, 2011', '140.5', 1)]
[('George Mason', 'Villanova', 'Mar 18, 2011', '135.0', 1)]
[('West Virginia', 'Clemson', 'Mar 17, 2011', '123.5', 0), ('West Virginia', 'Clemson', 'Mar 29, 2007', '138.0', 0)]
[('Kentucky', 'Princeton', 'Mar 17, 2011', '132.5', 1)]
[('Xavier', 'Marquette', 'Mar 18, 2011', '141.0', 1)]
[('Syracuse', 'Indiana State', 'Mar 18, 2011', '129.0', 0)]
[('Washington', 'Georgia', 'Mar 18, 2011', '146.0', 1)]
[('Ohio State', 'George Mason', 'Mar 20, 2011', '136.5', 0)]
[('Marquette', 'Syracuse', 'Mar 30, 2013', '126.5', 1), ('Marquette', 'Syracuse', 'Mar 20, 2011', '138.0', 1)]
[('Washington', 'UNC', 'Mar 21, 2016', '149.5', 0), ('Washington', 'UNC', 'Mar 15, 2016', '167.5', 0), ('Washington', 'UNC', 'Mar 19, 2013', '148.0', 0), ('Washington', 'UNC', 'Mar 27, 2012', '144.5', 1), ('Washington', 'UNC', 'Mar 20, 2012', '153.5', 0), ('Washington', 'UNC', 'Mar 16, 2012', '148.0', 1), ('Washington', 'UNC', 'Mar 13, 2012', '157.0', 1), ('Washington',

In [34]:
# Persist the over/under lookups gathered for the 2011 tournament.
with open("all_over_under_2011.pkl",'wb') as f:
    pickle.dump(all_over_under, f)

In [30]:
# Number of matchups for which an over/under lookup was stored.
print(len(all_over_under))

39


In [50]:
# Merge the per-year over/under files and keep the matchups that can be tied
# to a single game within the covered years.
years = ["2011","2012","2013","2014","2015","2016"]
all_over_under = []
valid_games = []
for year in years:
    with open("all_over_under_{}.pkl".format(year),'rb') as f:
        all_over_under += pickle.load(f)
# NOTE(review): the first loaded entry is dropped; the reason is not evident
# from this notebook - confirm it is intentional.
all_over_under = all_over_under[1:]
for team in all_over_under:
    # Was `len(item)`: `item` is not defined anywhere in this notebook, so the
    # check now uses the loop variable holding the matchup's game records.
    if len(team) > 1:
        tournament_game = []
        for game in team:
            # game[2] is the date string; its last four characters are the year.
            if game[2][-4:] in years:
                tournament_game.append(game)
        if len(tournament_game) < 2:
            valid_games.append(tournament_game)

In [54]:
# Persist the filtered matchups built in the previous cell.
with open("valid_classification_games.pkl",'wb') as f:
    pickle.dump(valid_games,f)

In [None]:
#how to load
#pickle_in = open("dict.pickle","rb")
#example_dict = pickle.load(pickle_in)