In [1]:
import requests
from bs4 import BeautifulSoup
from datetime import date, timedelta as td
import re
import time
import os

# Use Basketball Reference to Scrape Box Scores

In [2]:
# Site root, and the box-score index page derived from it.
base_url = 'http://www.basketball-reference.com'
box_url = base_url + '/boxscores/'

# Download the index page once and parse it so later cells can search it for links.
response = requests.get(box_url)
soup = BeautifulSoup(response.content, 'lxml')

## Write a function that gets the dates, as strings, between a start date and today

In [795]:
def get_string_dates(d1):
    """Return compact date strings for each day from ``d1`` up to yesterday.

    Every entry is the ISO date with the dashes removed and a trailing ``'0'``
    appended, e.g. ``'201603150'`` for 15 March 2016.

    Parameters
    ----------
    d1 : tuple of int
        ``(year, month, day)`` of the first date to include.

    Returns
    -------
    list of str
        One string per day, oldest first.  Today is not included, and the list
        is empty when ``d1`` is today or lies in the future.
    """
    start = date(*d1)
    span = (date.today() - start).days
    # ``range`` exists on both Python 2 and 3; ``xrange`` is Python 2 only.
    return [(start + td(days=offset)).isoformat().replace('-', '') + '0'
            for offset in range(span)]

In [823]:
# Preview only the first ten entries of the generated list.
get_string_dates((2016, 3, 15))[:10]

['201603150',
 '201603160',
 '201603170',
 '201603180',
 '201603190',
 '201603200',
 '201603210',
 '201603220',
 '201603230',
 '201603240']

## Improve upon the formatting

In [820]:
def get_nice_string_dates(d1):
    """Return ISO ``YYYY-MM-DD`` strings for each day from ``d1`` up to yesterday.

    Parameters
    ----------
    d1 : tuple of int
        ``(year, month, day)`` of the first date to include.

    Returns
    -------
    list of str
        One string per day, oldest first.  Today is not included, and the list
        is empty when ``d1`` is today or lies in the future.
    """
    start = date(*d1)
    span = (date.today() - start).days
    # ``range`` exists on both Python 2 and 3; ``xrange`` is Python 2 only.
    return [(start + td(days=offset)).isoformat() for offset in range(span)]

In [824]:
# Preview only the first ten entries of the generated list.
get_nice_string_dates((2016, 3, 15))[:10]

['2016-03-15',
 '2016-03-16',
 '2016-03-17',
 '2016-03-18',
 '2016-03-19',
 '2016-03-20',
 '2016-03-21',
 '2016-03-22',
 '2016-03-23',
 '2016-03-24']

## Get our function to match Basketball Reference's URL format

#### We want the links in ascending (oldest-first) order so we can always append the newest data to the bottom

In [2]:
def get_url_string_dates(d1, d2=None, url_prefix=None):
    """Build the daily box-score index URLs strictly between two dates.

    Parameters
    ----------
    d1 : tuple of int
        ``(year, month, day)`` of the start date.  The start date itself is
        NOT included in the result; the first entry is the following day.
    d2 : tuple of int, optional
        ``(year, month, day)`` of the end date, which is also excluded.
        Defaults to today.
    url_prefix : str, optional
        Text placed in front of ``index.cgi?...``.  Defaults to the
        notebook-level ``box_url``.

    Returns
    -------
    tuple
        ``(urls, dates)``: two parallel lists, oldest first, holding the index
        URL for each day and the matching ``YYYY-MM-DD`` string.
    """
    if url_prefix is None:
        url_prefix = box_url
    start = date(*d1)
    end = date.today() if d2 is None else date(*d2)
    span = (end - start).days
    # Offsets begin at 1, so the start date is skipped; the end date is never
    # reached because ``range`` stops before ``span``.
    days = [start + td(days=offset) for offset in range(1, span)]
    urls = ['%sindex.cgi?month=%d&day=%d&year=%d'
            % (url_prefix, day.month, day.day, day.year) for day in days]
    labels = [day.isoformat() for day in days]
    return urls, labels

In [835]:
# Display the (urls, dates) pair generated for a 2016-03-15 start date.
get_url_string_dates((2016, 3, 15))

(['http://www.basketball-reference.com/boxscores/index.cgi?month=3&day=16&year=2016',
  'http://www.basketball-reference.com/boxscores/index.cgi?month=3&day=17&year=2016',
  'http://www.basketball-reference.com/boxscores/index.cgi?month=3&day=18&year=2016',
  'http://www.basketball-reference.com/boxscores/index.cgi?month=3&day=19&year=2016',
  'http://www.basketball-reference.com/boxscores/index.cgi?month=3&day=20&year=2016',
  'http://www.basketball-reference.com/boxscores/index.cgi?month=3&day=21&year=2016',
  'http://www.basketball-reference.com/boxscores/index.cgi?month=3&day=22&year=2016',
  'http://www.basketball-reference.com/boxscores/index.cgi?month=3&day=23&year=2016',
  'http://www.basketball-reference.com/boxscores/index.cgi?month=3&day=24&year=2016',
  'http://www.basketball-reference.com/boxscores/index.cgi?month=3&day=25&year=2016',
  'http://www.basketball-reference.com/boxscores/index.cgi?month=3&day=26&year=2016',
  'http://www.basketball-reference.com/boxscores/index

#### Code for getting the URLs for future scraping (if we want to scrape more years)

In [3]:
def get_url_string_dates(d1, d2=None, url_prefix=None):
    """Build the daily box-score index URLs strictly between two dates.

    Parameters
    ----------
    d1 : tuple of int
        ``(year, month, day)`` of the start date.  The start date itself is
        NOT included in the result; the first entry is the following day.
    d2 : tuple of int, optional
        ``(year, month, day)`` of the end date, which is also excluded.
        Defaults to today, so calls that pass only a start date keep working
        after this definition replaces the earlier one-argument version.
    url_prefix : str, optional
        Text placed in front of ``index.cgi?...``.  Defaults to the
        notebook-level ``box_url``.

    Returns
    -------
    tuple
        ``(urls, dates)``: two parallel lists, oldest first, holding the index
        URL for each day and the matching ``YYYY-MM-DD`` string.
    """
    if url_prefix is None:
        url_prefix = box_url
    start = date(*d1)
    end = date.today() if d2 is None else date(*d2)
    span = (end - start).days
    # Offsets begin at 1, so the start date is skipped; the end date is never
    # reached because ``range`` stops before ``span``.
    days = [start + td(days=offset) for offset in range(1, span)]
    urls = ['%sindex.cgi?month=%d&day=%d&year=%d'
            % (url_prefix, day.month, day.day, day.year) for day in days]
    labels = [day.isoformat() for day in days]
    return urls, labels

In [4]:
# Earlier experiments with other date ranges, kept for reference:
#get_url_string_dates((2015, 10, 26))
#get_url_string_dates((2013, 10, 26),(2014, 4, 17))
#get_url_string_dates((2014, 10, 27),(2015, 4, 16))[:5]
# NOTE: the function returns a 2-tuple (urls, dates), so [:5] slices that tuple
# rather than the URL list, and the complete result is still displayed.
get_url_string_dates((2013, 10, 29),(2014, 4, 18))[:5]

(['http://www.basketball-reference.com/boxscores/index.cgi?month=10&day=30&year=2013',
  'http://www.basketball-reference.com/boxscores/index.cgi?month=10&day=31&year=2013',
  'http://www.basketball-reference.com/boxscores/index.cgi?month=11&day=1&year=2013',
  'http://www.basketball-reference.com/boxscores/index.cgi?month=11&day=2&year=2013',
  'http://www.basketball-reference.com/boxscores/index.cgi?month=11&day=3&year=2013',
  'http://www.basketball-reference.com/boxscores/index.cgi?month=11&day=4&year=2013',
  'http://www.basketball-reference.com/boxscores/index.cgi?month=11&day=5&year=2013',
  'http://www.basketball-reference.com/boxscores/index.cgi?month=11&day=6&year=2013',
  'http://www.basketball-reference.com/boxscores/index.cgi?month=11&day=7&year=2013',
  'http://www.basketball-reference.com/boxscores/index.cgi?month=11&day=8&year=2013',
  'http://www.basketball-reference.com/boxscores/index.cgi?month=11&day=9&year=2013',
  'http://www.basketball-reference.com/boxscores/ind

#### Find the latter half (the relative path) of each box-score URL for today's games

In [5]:
# Collect every tag on the parsed page whose href contains '/boxscores/2'.
urls = soup.find_all(href=re.compile('/boxscores/2'))
urls

[<a href="/boxscores/201604090ATL.html">Final</a>,
 <a href="/boxscores/201604090CHI.html">Final</a>,
 <a href="/boxscores/201604090MEM.html">Final</a>,
 <a href="/boxscores/201604090NOP.html">Final</a>,
 <a href="/boxscores/201604090POR.html">Final</a>,
 <a href="/boxscores/201604090SAC.html">Final</a>]

#### Combine the base URL with each relative path to form the full box-score URLs for today

In [6]:
# Each href is site-relative, so prefix it with the site root to get an absolute URL.
game_urls = [base_url + url['href'] for url in urls]
game_urls

['http://www.basketball-reference.com/boxscores/201604090ATL.html',
 'http://www.basketball-reference.com/boxscores/201604090CHI.html',
 'http://www.basketball-reference.com/boxscores/201604090MEM.html',
 'http://www.basketball-reference.com/boxscores/201604090NOP.html',
 'http://www.basketball-reference.com/boxscores/201604090POR.html',
 'http://www.basketball-reference.com/boxscores/201604090SAC.html']

#### Make a list of all the responses

In [501]:
# Download every game page; the requests are issued back to back with no pause in between.
responses = [requests.get(game_url) for game_url in game_urls]

## Maybe this class will come in handy later

#### Write a function that outputs the name of the player so we can use it as the filename

In [5]:
def get_name(stats, i):
    """Return the first text cell of row ``i`` with spaces turned into underscores.

    ``stats`` is a sequence of parsed rows exposing ``get_text()``.  The row
    text is split on newlines, the first and last pieces are dropped, and the
    first remaining piece is used as the name.
    """
    cells = stats[i].get_text().split('\n')[1:-1]
    player = str(cells[0])
    # Underscores keep the name usable as a file name.
    return player.replace(' ', '_')

In [13]:
# get_name(stats, 27)

#### Create a function that returns a list with all the values in stats, advanced stats, which two teams are playing, and the date

In [4]:
def get_line(stats, advanced, i, cut, away_team, home_team, dateapp):
    """Assemble one output record for row ``i`` of a game.

    The record is the row's basic-table cells, followed by its advanced-table
    cells (minus the first two, which the basic cells already supply), an
    optional filler for players without stats, then ``away_team``,
    ``home_team``, a ``'0'``/``'1'`` flag and ``dateapp``.

    The flag is ``'0'`` when ``i`` is below ``cut`` and ``'1'`` otherwise.
    """
    basic_cells = stats[i].get_text().split('\n')[1:-1]
    advanced_cells = advanced[i].get_text().split('\n')[3:-1]
    record = basic_cells + advanced_cells
    if record[1] in ('Did Not Play', 'Player Suspended'):
        # No stats exist for this player: pad with delimiters for the missing columns.
        record.append(',' * 32)
    side_flag = '0' if i < cut else '1'
    record.extend([away_team, home_team, side_flag, dateapp])
    return record

#### Create a function that writes a player's stats to a file

In [6]:
def write_to_csv(player_game_stat, out_file):
    """Append one comma-joined record, terminated by a newline, to ``out_file``.

    The file is opened in ``'a+'`` mode, so it is created when missing and
    existing content is kept.  Fields are joined verbatim; no quoting or
    escaping is applied.
    """
    with open(out_file, 'a+') as handle:
        record = ','.join(player_game_stat)
        handle.write(record + '\n')

In [7]:
def not_box_score(href):
    """Tell whether ``href`` is present and does not point at a box score.

    A falsy ``href`` (``None`` or ``''``) is returned unchanged; otherwise the
    result is ``True`` when ``'/boxscores/2'`` does not occur in ``href`` and
    ``False`` when it does.
    """
    if not href:
        return href
    return re.search('/boxscores/2', href) is None

#### We need a way to identify which team someone is on. We look at where the second instance of Reserves starts and subtract 5 to get the index of the start of the second team

In [8]:
def cutoff(stats):
    """Return the row index five places before the second 'Reserves' row.

    Rows are scanned in order; a row counts when the second newline-separated
    piece of its text equals ``'Reserves'``.  ``None`` is returned when fewer
    than two such rows are found.
    """
    reserves_seen = 0
    for position, row in enumerate(stats):
        if row.get_text().split('\n')[1] != 'Reserves':
            continue
        reserves_seen += 1
        if reserves_seen >= 2:
            # Step back five rows from the second 'Reserves' marker.
            return position - 5
    return None

In [3]:
# Same URL values as the setup cell near the top of the notebook.
base_url = 'http://www.basketball-reference.com'
box_url = 'http://www.basketball-reference.com/boxscores/'
# Exactly one date range is active at a time; the commented-out calls are
# alternative ranges (presumably one per season -- confirm before reusing).
# The active call passes only a start date.
#day_urls = get_url_string_dates((2013, 10, 28),(2014, 4, 17))
#day_urls = get_url_string_dates((2014, 10, 27),(2015, 4, 16))
#day_urls = get_url_string_dates((2015, 10, 26))
day_urls = get_url_string_dates((2016, 4, 4))
#day_urls = get_url_string_dates((2012, 10, 29),(2013, 4, 18))
#day_urls = get_url_string_dates((2011, 12, 24),(2012, 4, 27))
#day_urls = get_url_string_dates((2010, 10, 25),(2011, 4, 14))
#day_urls = get_url_string_dates((2009, 10, 26),(2010, 4, 15))
#day_urls = get_url_string_dates((2008, 10, 27),(2009, 4, 17))
#day_urls = get_url_string_dates((2007, 10, 29),(2008, 4, 17))
#day_urls = get_url_string_dates((2006, 10, 30),(2007, 4, 19))
#day_urls = get_url_string_dates((2005, 10, 31),(2006, 4, 20))
#,(2016, 3, 15))

#### Write a single game from a player into a CSV

In [9]:
# filename = 'data1516/' + get_name(stats) + '.csv'
# create_csv(get_line(stats), filename)

NameError: name 'stats' is not defined

### We need a function that combines our previous functions and starts scraping

In [11]:
def scrappy(day_urls):
    """Scrape each listed day's finished games and append rows to per-player CSV files.

    ``day_urls`` is a pair of parallel lists ``(urls, dates)`` that are walked
    in lockstep: each URL is a daily index page and the matching date string
    is stored with every row scraped from it.  Output goes under the
    hard-coded ``'../../alldata/data1516/'`` prefix, one file per name
    returned by ``get_name``.

    NOTE: a later cell in this notebook defines ``scrappy`` again with
    ``folder_name`` and ``sleep_time`` parameters; once that cell has run it
    replaces this version.
    """
    for url, dateapp in zip(day_urls[0], day_urls[1]):
        response = requests.get(url)
        time.sleep(0.6)
        soup = BeautifulSoup(response.content, 'lxml')
        urls = soup.find_all(href=re.compile('/boxscores/2'), string='Final')
        # Make sure we only grab Final and not Box Score as well for duplicate entries
        game_urls = [base_url + url['href'] for url in urls]
        # Every game page of the day is requested back to back; the pause
        # below runs once per day, not once per request.
        game_stats = [requests.get(game_url) for game_url in game_urls]
        time.sleep(0.6)
        for game_stat in game_stats:
            soup = BeautifulSoup(game_stat.content, 'lxml')
            # The first two matching links are taken as the away and home team names.
            away_team = soup.select('table.stats_table tbody td a[href]')[0].contents[0].encode('ascii')
            home_team = soup.select('table.stats_table tbody td a[href]')[1].contents[0].encode('ascii')
            #print away_team, home_team
            stats = soup.select('table[id$="_basic"] tbody tr')
            advanced = soup.select('table[id$="_advanced"] tbody tr')
            # Rows below this index are flagged '0' by get_line, the rest '1'.
            cut = cutoff(stats)
            for index, stuff in enumerate(stats):
                filename = '../../alldata/data1516/' + get_name(stats, index) + '.csv'
                write_to_csv(get_line(stats, advanced, index, cut, away_team, home_team, dateapp), filename)


In [12]:
# Run the scraper over the configured date range.
scrappy(day_urls)

In [41]:
# NOTE: write_to_csv opens files in append mode, so running this again for the
# same date range adds the same rows a second time.
scrappy(day_urls)

In [None]:
# def initiate_filenames(stats, folder_name):
#     for index, stuff in enumerate(stats):
#         filename = folder_name + get_name(stats, index) + '.csv'
#         write_to_csv(get_line(stats, advanced, index, cut, away_team, home_team, dateapp), filename)

In [None]:
# def create(game_stats):
#     soup = BeautifulSoup(game_stat.content, 'lxml')
#     away_team = soup.select('table.stats_table tbody td a[href]')[0].contents[0].encode('ascii')
#     home_team = soup.select('table.stats_table tbody td a[href]')[1].contents[0].encode('ascii')
#     #print away_team, home_team
#     stats = soup.select('table[id$="_basic"] tbody tr')
#     advanced = soup.select('table[id$="_advanced"] tbody tr')
#     cut = cutoff(stats)
#     for index, stuff in enumerate(stats):
#         filename = folder_name + get_name(stats, index) + '.csv'
#         write_to_csv(get_line(stats, advanced, index, cut, away_team, home_team, dateapp), filename)


In [10]:
def scrappy(day_urls, folder_name, sleep_time = 0):
    """Scrape each listed day's finished games and append rows to per-player CSV files.

    Parameters
    ----------
    day_urls : tuple of two lists
        ``(urls, dates)`` walked in lockstep: ``urls[k]`` is a daily
        box-score index page and ``dates[k]`` is the date string stored with
        every row scraped from that page.
    folder_name : str
        Prefix placed in front of ``<name>.csv``; include the trailing slash.
        The folder is not created here, so it must already exist.
    sleep_time : float, optional
        Seconds to pause after every HTTP request (the daily index page and
        each individual game page), so requests are never sent in a burst.

    Notes
    -----
    Games are downloaded, parsed and written one at a time.  Rows are appended
    through ``write_to_csv``, so re-running the same dates adds the same rows
    again.
    """
    box_score_link = re.compile('/boxscores/2')
    for day_url, dateapp in zip(day_urls[0], day_urls[1]):
        day_page = requests.get(day_url)
        time.sleep(sleep_time)
        day_soup = BeautifulSoup(day_page.content, 'lxml')
        # Make sure we only grab Final and not Box Score as well for duplicate entries
        final_links = day_soup.find_all(href=box_score_link, string='Final')
        for final_link in final_links:
            game_page = requests.get(base_url + final_link['href'])
            # Throttle after each game request, not once per day.
            time.sleep(sleep_time)
            game_soup = BeautifulSoup(game_page.content, 'lxml')
            # One query serves both teams: first link is away, second is home.
            team_links = game_soup.select('table.stats_table tbody td a[href]')
            # NOTE(review): .encode('ascii') yields bytes on Python 3, which
            # would not join with the text cells in write_to_csv -- confirm
            # the target interpreter before porting.
            away_team = team_links[0].contents[0].encode('ascii')
            home_team = team_links[1].contents[0].encode('ascii')
            stats = game_soup.select('table[id$="_basic"] tbody tr')
            advanced = game_soup.select('table[id$="_advanced"] tbody tr')
            # Rows below this index are flagged '0' by get_line, the rest '1'.
            cut = cutoff(stats)
            for index in range(len(stats)):
                filename = folder_name + get_name(stats, index) + '.csv'
                write_to_csv(get_line(stats, advanced, index, cut, away_team, home_team, dateapp), filename)


In [None]:
# Scrape the configured date range into data1516/, using a 0.6 second sleep_time.
scrappy(day_urls, folder_name = 'data1516/', sleep_time = 0.6)

In [744]:
#response = requests.get(day_urls[2])
#time.sleep(0.3)
#soup = BeautifulSoup(response.content, 'lxml')
#urls = soup.find_all(href=re.compile('/boxscores/2'))
#game_urls = [base_url + url['href'] for url in urls]
#responses = [requests.get(game_url) for game_url in game_urls]
#soup = BeautifulSoup(responses[0].content, 'lxml')
#stats = soup.select('table[id$="_basic"] tbody tr')

In [745]:
#away_team = soup.select('table.stats_table tbody td a[href]')[0].contents[0].encode('ascii')
#home_team = soup.select('table.stats_table tbody td a[href]')[1].contents[0].encode('ascii')
#am_i_home = 
#print type(away_team), home_team