# NCAA tournament data - webscraping

In [1]:
#imports for webscraping
from bs4 import BeautifulSoup
from urllib.request import urlopen
#for saving and loading data files
import pickle

In [2]:
### loading bar
def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the items of ``sequence`` while showing a progress widget.

    This is a generator. Every element of ``sequence`` is passed through
    unchanged; as a side effect an ipywidgets progress bar and a text label
    are displayed in the notebook and refreshed while iterating.

    Parameters
    ----------
    sequence : iterable
        Items to pass through. When it has no ``len()`` and ``size`` is not
        given, the total is shown as ``?``.
    every : int, optional
        The widget is refreshed on the first item and then on every
        ``every``-th item. When the total is known this defaults to 1 for up
        to 200 items and to ``int(size / 200)`` otherwise.
    size : int, optional
        Total number of items; defaults to ``len(sequence)`` when available.
    name : str
        Prefix used in the label text.

    Yields
    ------
    The elements of ``sequence``, in order.

    Raises
    ------
    AssertionError
        If the total is unknown and ``every`` was not supplied.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    # Work out the total; an object without len() is treated as open-ended.
    unknown_length = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            unknown_length = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        # Refresh on every item for short inputs, otherwise roughly every 0.5%.
        every = 1 if size <= 200 else int(size / 200)

    if unknown_length:
        # No total available: show a full bar in the "info" style.
        bar = IntProgress(min=0, max=1, value=1)
        bar.bar_style = 'info'
    else:
        bar = IntProgress(min=0, max=size, value=0)
    caption = HTML()
    display(VBox(children=[caption, bar]))

    position = 0
    try:
        for position, item in enumerate(sequence, 1):
            if position == 1 or position % every == 0:
                if unknown_length:
                    caption.value = '{name}: {index} / ?'.format(
                        index=position,
                        name=name
                    )
                else:
                    bar.value = position
                    caption.value = u'{name}: {index} / {size}'.format(
                        index=position,
                        name=name,
                        size=size
                    )
            yield item
    except:
        # Anything raised while iterating (by the source or thrown into the
        # generator) turns the bar red before being re-raised unchanged.
        bar.bar_style = 'danger'
        raise
    else:
        bar.bar_style = 'success'
        bar.value = position
        caption.value = "{name}: {index}".format(
            index=str(position or '?'),
            name=name
        )

### Data Scraping Functions

In [3]:
# Site root; the scraping functions append site-relative paths and scraped hrefs to it.
baseURL = 'https://www.sports-reference.com'
# (team, year) pairs that getTeamYearData skipped because the team page had no 'team_stats' table.
notAdded = []

In [4]:
#get tournament teams by year
#tourneyTeamYear -- dictionary with year as key and value is a dictionary mapping each team in the tourney that year to its team page link
def getTourneyTeamYear(yearRange):
    """Scrape the tournament bracket pages and collect each year's teams.

    One page (``<baseURL>/cbb/postseason/<year>-ncaa.html``) is fetched per
    year, and progress is printed as each year completes.

    Parameters
    ----------
    yearRange : sequence of two ints
        ``[firstYear, lastYear]``; both ends are included.

    Returns
    -------
    dict
        ``{year (str): {school name: href of the school's link}}``. When a
        school appears in several games, the href from its first appearance
        is kept.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when a page cannot be fetched.
    """
    print("getting tournament teams by year")
    tourneyTeamYear = {}

    for i in range(yearRange[0], yearRange[1] + 1):
        year = str(i)
        print(year, end=" ")

        teams = tourneyTeamYear[year] = {}

        url = baseURL + '/cbb/postseason/' + year + '-ncaa.html'
        # BeautifulSoup reads the whole response while parsing, so the
        # connection can be released as soon as the soup exists.
        with urlopen(url) as html:
            soup = BeautifulSoup(html, 'html.parser')

        for tround in soup.find_all('div', attrs={'class': 'round'}):
            for game in tround.find_all('div', recursive=False):
                gameDetails = game.find_all('a')
                # Only blocks with more than three links are treated as games;
                # link 0 and link 2 are taken as the two schools.
                if len(gameDetails) > 3:
                    for school in (gameDetails[0], gameDetails[2]):
                        teams.setdefault(school.text.strip(), school["href"])

        print("done")

    print("done")
    return tourneyTeamYear

In [5]:
#get teams played by tournament teams by year
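#allTeamYear -- dictionary with year as key and value is a dictionary mapping team name to team page link, for the tournament teams and every opponent found on their schedules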
def getAllTeamYear(yearRange, tourneyTeamYear):
    """Collect every tournament team plus every opponent on its schedule.

    For each tournament team the team page is fetched, the link inside the
    third ``<li>`` of the ``inner_nav`` block is followed, and the opponent
    link of every row in the ``schedule`` table is recorded.

    Parameters
    ----------
    yearRange : sequence of two ints
        ``[firstYear, lastYear]``; both ends are included.
    tourneyTeamYear : dict
        ``{year (str): {team name: team page href}}`` as returned by
        ``getTourneyTeamYear``; it must contain every year in the range.

    Returns
    -------
    dict
        ``{year (str): {team name: team page href}}``. The first href seen
        for a name is kept.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when a page cannot be fetched.
    AttributeError
        If a page lacks the ``inner_nav`` block or the ``schedule`` table.
    """
    print("getting teams played by tournament teams by year")
    allTeamYear = {}
    # For these seasons the opponent link is read from cell 5 of a schedule
    # row instead of cell 3 (presumably extra columns on those pages - confirm
    # before extending the year range).
    shiftedYears = ('2015', '2016', '2017')

    for i in range(yearRange[0], yearRange[1] + 1):
        year = str(i)
        print(year, end=" ")

        teams = allTeamYear[year] = {}
        opponentCol = 5 if year in shiftedYears else 3

        for key, value in log_progress(tourneyTeamYear[year].items(), every=1):
            teams.setdefault(key, value)

            # Each response is closed as soon as it has been parsed.
            with urlopen(baseURL + value) as teamhtml:
                teamsoup = BeautifulSoup(teamhtml, 'html.parser')
            gameSchedule = teamsoup.find('div', attrs={'id': 'inner_nav'}).find_all('li')
            gameScheduleLink = gameSchedule[2].find('a')["href"]

            with urlopen(baseURL + gameScheduleLink) as gameSchedulehtml:
                gameSchedulesoup = BeautifulSoup(gameSchedulehtml, 'html.parser')
            tableRows = gameSchedulesoup.find('table', attrs={'id': 'schedule'}).find('tbody').find_all('tr')

            for row in tableRows:
                cells = row.find_all('td')
                # Rows too short to hold the opponent cell are skipped.
                if len(cells) <= opponentCol:
                    continue
                opponent = cells[opponentCol].find('a')
                # Opponents without a link cannot be scraped later, so skip them.
                if opponent is None:
                    continue
                teams.setdefault(opponent.text.strip(), opponent["href"])

    print('done')
    return allTeamYear

In [6]:
#get team stats for all teams (tournament team + the teams they played)
#teamYearData --  dictionary with year as key and value is a dictionary with team as key and value in the form of [[team stats],[opponent stats]] 
#team/opponent stats: [0G, 1MP, 2FG, 3FGA, 4FG%, 52P, 62PA, 72P%, 83P, 93PA, 103P%, 11FT, 12FTA, 13FT%, 14ORB, 15DRB, 16TRB, 17AST, 18STL, 19BLK, 20TOV, 21PF, 22PTS, 23PTS/G]
#example: '2000': {'Duke': [[34, '', 1045, 2172, 0.481, 761, 1430, 0.532, 284, 742, 0.383, 618, 833, 0.742, 453, 860, 1313, 584, 333, 191, 480, 552, 2992, 88.0], [34, '', 934, 2238, 0.417, 737, 1686, 0.437, 197, 552, 0.357, 360, 537, 0.67, 526, 757, 1283, 472, 240, 123, 607, 690, 2425, 71.3]]}

def _convertStatRow(rawRow):
    """Convert one scraped stat row from strings to numbers.

    Blank cells stay as ``''``. Cells at positions 4, 7, 10, 13 and 23 become
    floats; every other non-blank cell becomes an int.
    """
    floatColumns = (4, 7, 10, 13, 23)
    converted = []
    for index, stat in enumerate(rawRow):
        if stat == '':
            converted.append('')
        elif index in floatColumns:
            converted.append(float(stat))
        else:
            converted.append(int(stat))
    return converted


def getTeamYearData(yearRange, allTeamYear):
    """Scrape the ``team_stats`` table of every team page and convert it to numbers.

    Parameters
    ----------
    yearRange : sequence of two ints
        ``[firstYear, lastYear]``; both ends are included.
    allTeamYear : dict
        ``{year (str): {team name: team page href}}``; it must contain every
        year in the range.

    Returns
    -------
    dict
        ``{year (str): {team name: [row, ...]}}`` with one list per table row
        that has no ``class`` attribute (per the notes in this notebook, the
        team's row followed by its opponents' row). Within a row, blank cells
        are ``''``, positions 4, 7, 10, 13 and 23 are floats and the rest are
        ints.

    Side effects
    ------------
    Teams whose page has no ``team_stats`` table are left out of the result,
    and ``(team, year)`` is appended to the module-level ``notAdded`` list.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when a page cannot be fetched.
    ValueError
        If a non-blank cell cannot be parsed as a number.
    """
    print("getting team stats for all teams (tournament team and the teams they played)")
    teamYearData = {}

    for i in range(yearRange[0], yearRange[1] + 1):
        year = str(i)
        print(year, end=" ")

        yearStats = teamYearData[year] = {}

        for key, value in log_progress(allTeamYear[year].items(), every=1):
            if key in yearStats:
                continue

            # The response is closed as soon as the page has been parsed.
            with urlopen(baseURL + value) as teamhtml:
                teamsoup = BeautifulSoup(teamhtml, 'html.parser')

            # Look the table up once and reuse the result.
            statsTable = teamsoup.find('table', attrs={'id': 'team_stats'})
            if statsTable is None:
                notAdded.append((key, year))
                continue

            teamStats = []
            for row in statsTable.find('tbody').find_all('tr'):
                # Rows carrying a class attribute are not stat rows.
                if row.has_attr("class"):
                    continue
                teamStats.append([stat.text.strip() for stat in row.find_all('td')])
            yearStats[key] = teamStats

    print('done')

    # Second pass: turn the scraped strings into numbers ('' marks an empty value).
    print('reformatting data...')

    for yearStats in teamYearData.values():
        for team in yearStats:
            yearStats[team] = [_convertStatRow(rawRow) for rawRow in yearStats[team]]

    print('done')
    return teamYearData

In [7]:
#get game data for all games played by tournament teams
#dictionary with year as key and value is a list of lists containing [team1,team2,overall score]
#example: '2000': [['Duke', 'Lamar', 137], ['Kansas', 'DePaul', 158]]

def getGameYearData(yearRange, tourneyTeamYear):
    """Scrape the combined score of every game played by the tournament teams.

    For each tournament team the team page is fetched, the link inside the
    third ``<li>`` of the ``inner_nav`` block is followed, and every row of
    the ``schedule`` table that has an opponent link and two scores becomes
    one game. Games listed on both teams' schedules are then collapsed.

    Parameters
    ----------
    yearRange : sequence of two ints
        ``[firstYear, lastYear]``; both ends are included.
    tourneyTeamYear : dict
        ``{year (str): {team name: team page href}}``; it must contain every
        year in the range.

    Returns
    -------
    dict
        ``{year (str): [[team1, team2, combined score], ...]}`` in scrape
        order, keeping the first occurrence of each game. Two entries count
        as the same game when they involve the same pair of teams (in either
        order) and the same combined score, so a rematch that ends with an
        identical combined score is also collapsed.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when a page cannot be fetched.
    AttributeError
        If a page lacks the ``inner_nav`` block or the ``schedule`` table.
    ValueError
        If a non-blank score cell is not an integer.
    """
    print("getting game data for all games played by tournament teams")
    gameYearDataWithDuplicates = {}
    gameYearData = {}
    # For these seasons the opponent link is read from cell 5 of a schedule
    # row instead of cell 3 (presumably extra columns on those pages - confirm
    # before extending the year range).
    shiftedYears = ('2015', '2016', '2017')

    for i in range(yearRange[0], yearRange[1] + 1):
        year = str(i)
        print(year, end=" ")

        games = gameYearDataWithDuplicates[year] = []
        opponentCol = 5 if year in shiftedYears else 3
        # The two score cells are read 3 and 4 cells after the opponent cell.
        scoreCol = opponentCol + 3

        for key, value in log_progress(tourneyTeamYear[year].items(), every=1):
            # Each response is closed as soon as it has been parsed.
            with urlopen(baseURL + value) as teamhtml:
                teamsoup = BeautifulSoup(teamhtml, 'html.parser')
            gameSchedule = teamsoup.find('div', attrs={'id': 'inner_nav'}).find_all('li')
            gameScheduleLink = gameSchedule[2].find('a')["href"]

            with urlopen(baseURL + gameScheduleLink) as gameSchedulehtml:
                gameSchedulesoup = BeautifulSoup(gameSchedulehtml, 'html.parser')
            tableRows = gameSchedulesoup.find('table', attrs={'id': 'schedule'}).find('tbody').find_all('tr')

            for row in tableRows:
                cells = row.find_all('td')
                # Rows too short to hold both score cells are skipped.
                if len(cells) < scoreCol + 2:
                    continue
                opponent = cells[opponentCol].find('a')
                if opponent is None:
                    continue

                score1 = cells[scoreCol].text.strip()
                score2 = cells[scoreCol + 1].text.strip()
                # A row without scores has no total to record; skip it instead
                # of failing on int('').
                if not score1 or not score2:
                    continue

                games.append([key, opponent.text.strip(), int(score1) + int(score2)])

    print('done')

    print('deleting duplicate games...')
    for year, games in gameYearDataWithDuplicates.items():
        # An unordered team pair plus the total identifies a game; the set
        # makes the duplicate check linear while keeping first-seen order.
        seen = set()
        uniqueGames = gameYearData[year] = []
        for game in games:
            gameKey = (frozenset(game[:2]), game[2])
            if gameKey not in seen:
                seen.add(gameKey)
                uniqueGames.append(game)

    print('done')
    return gameYearData

In [13]:
#get game data for all games played in the tournament
#dictionary with year as key and value is a list of lists containing [team1,team2,overall score] for every game in the tournament
#example: '2000': [['Duke', 'Lamar', 137], ['Kansas', 'DePaul', 158]]

def getTournamentGameData(yearRange):
    """Scrape the combined score of every game on the tournament bracket pages.

    One page (``<baseURL>/cbb/postseason/<year>-ncaa.html``) is fetched per
    year, and progress is printed as each year completes.

    Parameters
    ----------
    yearRange : sequence of two ints
        ``[firstYear, lastYear]``; both ends are included.

    Returns
    -------
    dict
        ``{year (str): [[team1, team2, combined score], ...]}`` in page order.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when a page cannot be fetched.
    ValueError
        If a score link does not contain an integer.
    """
    tournamentGameData = {}

    for i in range(yearRange[0], yearRange[1] + 1):
        year = str(i)
        print(year, end=" ")

        games = tournamentGameData[year] = []

        url = baseURL + '/cbb/postseason/' + year + '-ncaa.html'
        # BeautifulSoup reads the whole response while parsing, so the
        # connection can be released as soon as the soup exists.
        with urlopen(url) as html:
            soup = BeautifulSoup(html, 'html.parser')

        for tround in soup.find_all('div', attrs={'class': 'round'}):
            for game in tround.find_all('div', recursive=False):
                gameDetails = game.find_all('a')
                # Only blocks with more than three links are treated as games:
                # links 0 and 2 are read as the schools, links 1 and 3 as
                # their scores.
                if len(gameDetails) > 3:
                    total = int(gameDetails[1].text.strip()) + int(gameDetails[3].text.strip())
                    games.append([gameDetails[0].text.strip(), gameDetails[2].text.strip(), total])
        print("done")

    print("done")
    return tournamentGameData

### Data Scraping

In [8]:
# First and last year to scrape; the scraping functions include both ends.
yearRange = [1995,2017]

In [9]:
tourneyTeamYear = getTourneyTeamYear(yearRange) #dictionary with year as key and value is a dictionary mapping each team in the tourney that year to its team page link
#done for 1995-2017

getting tournament teams by year
1995 done
1996 done
1997 done
1998 done
1999 done
2000 done
2001 done
2002 done
2003 done
2004 done
2005 done
2006 done
2007 done
2008 done
2009 done
2010 done
2011 done
2012 done
2013 done
2014 done
2015 done
2016 done
2017 done
done


In [10]:
allTeamYear = getAllTeamYear(yearRange,tourneyTeamYear) #dictionary -- year as key; value maps team name to team page link for tournament teams and every team on their schedules
#done for 1995-2017

getting teams played by tournament teams by year
1995 

1996 

1997 

1998 

1999 

2000 

2001 

2002 

2003 

2004 

2005 

2006 

2007 

2008 

2009 

2010 

2011 

2012 

2013 

2014 

2015 

2016 

2017 

done


In [11]:
gameYearData = getGameYearData(yearRange,tourneyTeamYear)#dictionary with year as key and value is a list of lists containing [team1,team2,combined score of both teams]
#example: '2000': [['Duke', 'Lamar', 137], ['Kansas', 'DePaul', 158]]

getting game data for all games played by tournament teams
1995 

1996 

1997 

1998 

1999 

2000 

2001 

2002 

2003 

2004 

2005 

2006 

2007 

2008 

2009 

2010 

2011 

2012 

2013 

2014 

2015 

2016 

2017 

done
deleting duplicate games...
done


In [12]:
teamYearData = getTeamYearData(yearRange,allTeamYear)#teamYearData --  dictionary with year as key and value is a dictionary with team as key and value in the form of [[team stats],[opponent stats]] 
#team/opponent stats: [0G, 1MP, 2FG, 3FGA, 4FG%, 52P, 62PA, 72P%, 83P, 93PA, 103P%, 11FT, 12FTA, 13FT%, 14ORB, 15DRB, 16TRB, 17AST, 18STL, 19BLK, 20TOV, 21PF, 22PTS, 23PTS/G]


getting team stats for all teams (tournament team and the teams they played)
1995 

1996 

1997 

1998 

1999 

2000 

2001 

2002 

2003 

2004 

2005 

2006 

2007 

2008 

2009 

2010 

2011 

2012 

2013 

2014 

2015 

2016 

2017 

done
reformatting data...
done


In [14]:
tournamentGameData = getTournamentGameData(yearRange)#dictionary with year as key and value is a list of lists containing [team1,team2,overall score] for every game in the tournament

1995 done
1996 done
1997 done
1998 done
1999 done
2000 done
2001 done
2002 done
2003 done
2004 done
2005 done
2006 done
2007 done
2008 done
2009 done
2010 done
2011 done
2012 done
2013 done
2014 done
2015 done
2016 done
2017 done
done


### Testing Scraped Data

In [15]:
# Year used for the spot checks; a string because the scraped dictionaries use string year keys.
testYear = '2015'

In [16]:
# (team, year) pairs skipped by getTeamYearData because no 'team_stats' table was found.
notAdded

[]

In [17]:
# Number of tournament teams scraped for the test year.
len(tourneyTeamYear[testYear])

64

In [18]:
# Tournament teams plus every opponent found on their schedules for the test year.
len(allTeamYear[testYear])

354

In [19]:
# Number of tournament games scraped for the test year.
len(tournamentGameData[testYear])

63

In [20]:
# Number of teams with scraped stats; equals len(allTeamYear[testYear]) when no team was skipped.
#for team, value in teamYearData[testYear].items():
    #print(team,value)
len(teamYearData[testYear])    

354

In [21]:
# Number of games left for the test year after duplicate removal.
#for game in gameYearData[testYear]:
#    print(game)
len(gameYearData[testYear])

1894

### Saving Scraped Data

In [22]:
#save dicts to file
import pickle

In [23]:
# Save tourneyTeamYear; the with-block closes the file even if pickling fails.
with open("tourneyTeamYear.pickle","wb") as pickle_out:
    pickle.dump(tourneyTeamYear, pickle_out)

In [24]:
# Save allTeamYear; the with-block closes the file even if pickling fails.
with open("allTeamYear.pickle","wb") as pickle_out:
    pickle.dump(allTeamYear, pickle_out)

In [25]:
# Save gameYearData; the with-block closes the file even if pickling fails.
with open("gameYearData.pickle","wb") as pickle_out:
    pickle.dump(gameYearData, pickle_out)

In [26]:
# Save teamYearData; the with-block closes the file even if pickling fails.
with open("teamYearData.pickle","wb") as pickle_out:
    pickle.dump(teamYearData, pickle_out)

In [27]:
# Save tournamentGameData; the with-block closes the file even if pickling fails.
with open("tournamentGameData.pickle","wb") as pickle_out:
    pickle.dump(tournamentGameData, pickle_out)

In [None]:
#how to load
#pickle_in = open("dict.pickle","rb")
#example_dict = pickle.load(pickle_in)