## Packages

In [23]:
import re
import time
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import pixiedust
import requests
from bs4 import BeautifulSoup

## Scraping

In [61]:
class boxScore:
    # Date 
    def __init__(self, year = None, month = None, day = None):
        '''
        Input year, month, day as strings
        '''
        Someday = datetime.today() - timedelta(days = 200)
        if year is None or month is None or day is None:
            self.year = str(Someday.year)
            self.month = "0" + str(Someday.month) if Someday.month < 10 else str(Someday.month)
            self.sday = str(Someday.day)
        else:
            self.year = str(year)
            self.month = "0" + str(month) if month < 10 else str(month)
            self.day = "0" + str(day) if day < 10 else str(day)
        self.my_date = self.year + "-" + self.month + "-" + self.day
        
        # 避免阻擋爬蟲
        self.header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
        }
    
    def PlayorNot(self):
        '''
        Report whether any NBA game was played on the configured date.

        Downloads the box-score index page for self.month / self.day /
        self.year and looks for the "No games played on this date." notice.

        Returns 0 when the notice is present, 1 otherwise.
        '''
        url_games_played = "https://www.basketball-reference.com/boxscores/?month=" + self.month + "&day=" + self.day + "&year=" + self.year
        page_games_played = requests.get(url_games_played, headers = self.header)
        soup_games_played = BeautifulSoup(page_games_played.text, "lxml")

        # Scan every <strong> tag instead of a fixed position, so a page with
        # fewer tags than expected cannot raise IndexError.
        no_games_notice = "No games played on this date."
        for tag in soup_games_played.select("strong"):
            if tag.text == no_games_notice:
                return 0
        return 1
            
    def teamNameDictLong(self):
        '''
        Build a mapping from full team name to the site's team abbreviation.

        Abbreviations are read from the monthly schedule page for self.year /
        self.month (east teams under "#teams > div.data_grid.section_wrapper",
        west teams under "#confs_standings_W"). Full names are read from the
        "title" attribute of the team links on the season page. The two lists
        are paired by position.

        Returns a dict {full team name: abbreviation}.
        Raises KeyError if self.month is not one of "10"-"12" or "01"-"06".

        NOTE(review): the CSS selectors were not verified against the live
        pages - confirm they match the layout of the pages fetched here.
        '''

        # Month number -> month name as it appears in the schedule URL
        month_dict = {"10": "october",
                    "11": "november",
                    "12": "december",
                    "01": "january",
                    "02": "february",
                    "03": "march",
                    "04": "april",
                    "05": "may",
                    "06": "june"}

        # Extract the abbreviation of each team name from the website
        url_first = "https://www.basketball-reference.com/leagues/NBA_" + self.year + "_games-" + month_dict[self.month] +".html"
        first_page = requests.get(url_first, headers = self.header)
        soup_first = BeautifulSoup(first_page.text, 'lxml')
        year_on_the_first_page = soup_first.select("#teams > h3")[0].text
        # Keep characters 0, 1, 5 and 6 of the heading text
        # (presumably turns "2018-19 ..." into "2019" - TODO confirm)
        year_on_the_first_page = ''.join(year_on_the_first_page[pos] for pos in (0, 1, 5, 6))

        # Leading run of capital letters in each header cell
        abbr_pattern = re.compile(r"[A-Z]*")

        east_region = soup_first.select("#teams > div.data_grid.section_wrapper")[0].find("tbody").find_all("th")
        east_team = [abbr_pattern.search(team.text).group(0) for team in east_region]

        west_region = soup_first.select("#confs_standings_W")[0].find("tbody").find_all("th")
        west_team = [abbr_pattern.search(team.text).group(0) for team in west_region]
        abbr_team = east_team + west_team

        # Extract the full team name from the season page (url_full, not url_first)
        url_full = "https://www.basketball-reference.com/leagues/NBA_" + year_on_the_first_page + ".html"
        page_full = requests.get(url_full, headers = self.header)
        soup_full = BeautifulSoup(page_full.text, 'lxml')

        east_team_full = [i.find_all('a')[0].get("title") for i in
                                 soup_full.select("#confs_standings_E > tbody")[0].find_all("th")]

        west_team_full = [i.find_all('a')[0].get("title") for i in
                                 soup_full.select("#confs_standings_W > tbody")[0].find_all("th")]

        full_team = east_team_full + west_team_full

        return dict(zip(full_team, abbr_team))

    def teamNameDictShort(self):
        '''
        Build a mapping from short team name to the site's team abbreviation.

        Abbreviations are read from the site's home page. Full names are read
        from the "title" attribute of the team links on the season page and
        shortened by dropping their last word (e.g. "New York Knicks" ->
        "New York"). Three west teams get fixed short names instead:
        "Portland", "LA Clippers" and "LA Lakers". The two lists are paired by
        position.

        Returns a dict {short team name: abbreviation}.

        NOTE(review): the CSS selectors were not verified against the live
        pages - confirm they still match the site layout.
        '''

        # Extract the abbreviation of each team name from the home page
        url_first = "https://www.basketball-reference.com"
        first_page = requests.get(url_first, headers = self.header)
        soup_first = BeautifulSoup(first_page.text, 'lxml')
        year_on_the_first_page = soup_first.select("#teams > h3")[0].text
        # Keep characters 0, 1, 5 and 6 of the heading text
        # (presumably turns "2018-19 ..." into "2019" - TODO confirm)
        year_on_the_first_page = ''.join(year_on_the_first_page[pos] for pos in (0, 1, 5, 6))

        # Leading run of capital letters in each header cell
        abbr_pattern = re.compile(r"[A-Z]*")

        east_region = soup_first.select("#teams > div.data_grid.section_wrapper")[0].find("tbody").find_all("th")
        east_team = [abbr_pattern.search(team.text).group(0) for team in east_region]

        west_region = soup_first.select("#confs_standings_W")[0].find("tbody").find_all("th")
        west_team = [abbr_pattern.search(team.text).group(0) for team in west_region]
        abbr_team = east_team + west_team

        # Extract the full team name from the season page (url_full, not url_first)
        url_full = "https://www.basketball-reference.com/leagues/NBA_" + year_on_the_first_page + ".html"
        page_full = requests.get(url_full, headers = self.header)
        soup_full = BeautifulSoup(page_full.text, 'lxml')

        east_titles = [i.find_all('a')[0].get("title") for i in
                                 soup_full.select("#confs_standings_E > tbody")[0].find_all("th")]
        east_team_full = [' '.join(title.split(' ')[:-1]) for title in east_titles]

        west_titles = [i.find_all('a')[0].get("title") for i in
                                 soup_full.select("#confs_standings_W > tbody")[0].find_all("th")]

        # West teams whose short name is not simply "full name minus last word"
        special_names = {"Portland Trail Blazers": "Portland",
                         "Los Angeles Clippers": "LA Clippers",
                         "Los Angeles Lakers": "LA Lakers"}
        west_team_full = [special_names[title] if title in special_names
                          else ' '.join(title.split(' ')[:-1])
                          for title in west_titles]

        full_team = east_team_full + west_team_full

        return dict(zip(full_team, abbr_team))
    
    def teamNameDictEasy(self):
        return {'Philadelphia': 'PHI',
         'Toronto': 'TOR',
         'Miami': 'MIA',
         'Boston': 'BOS',
         'Cleveland': 'CLE',
         'Orlando': 'ORL',
         'Milwaukee': 'MIL',
         'Charlotte': 'CHO',
         #'Charlotte': 'CHA', #注意年份
         'Atlanta': 'ATL',
         'Detroit': 'DET',
         'Washington': 'WAS',
         'Brooklyn': 'BRK',
         'Indiana': 'IND',
         'Chicago': 'CHI',
         'New York': 'NYK',
         'Utah': 'UTA',
         'Houston': 'HOU',
         'Dallas': 'DAL',
         'San Antonio': 'SAS',
         'Minnesota': 'MIN',
         'LA Lakers': 'LAL',
         'LA Clippers': 'LAC',
         'Phoenix': 'PHO',
         'Portland': 'POR',
         'Denver': 'DEN',
         'Golden State': 'GSW',
         'Memphis': 'MEM',
         'New Orleans': 'NOP',
         #'New Orleans': 'NOH', # 注意年份
         'Oklahoma City': 'OKC',
         'Sacramento': 'SAC',
         'Seattle': 'SEA',
         'New Jersey': 'NJN',
         }
    
    # How many games were played on the day? (the result is used as a game count)
    def numberofTeamPlayedPerDay(self):
        '''
        Return the leading number of the heading on the day's box-score page.

        Downloads the box-score index page for self.month / self.day /
        self.year, takes the first "#content > div.section_heading > h2"
        element and converts the first space-separated word of its text to
        an int.
        '''
        address = "".join(["https://www.basketball-reference.com/boxscores/?month=", str(self.month),
                           "&day=", str(self.day),
                           "&year=", str(self.year)])
        response = requests.get(address, headers = self.header)
        parsed_page = BeautifulSoup(response.text, "lxml")

        heading = parsed_page.select("#content > div.section_heading > h2")[0]
        leading_word = heading.text.split(" ")[0]
        return int(leading_word)
    
    # Teams which played today
    def whichTeamsPlayedPerDay(self):
        '''
        List the teams that played on the configured date, as abbreviations.

        Downloads the day's box-score index page and, for every "teams" table
        inside the "game_summaries" div, reads the team name from the first and
        the second row and translates both with teamNameDictEasy().

        Returns a tuple (team_one, team_two) of equally long lists, one entry
        per game: team_one holds the first-row teams and team_two the
        second-row teams (extractTablesPerGame treats them as away and home).
        Both lists are empty when the page has no "game_summaries" div.
        Raises KeyError for a team name missing from teamNameDictEasy().
        '''
        url_games_played = "https://www.basketball-reference.com/boxscores/?month=" + str(self.month) + "&day=" + str(self.day) + "&year=" + str(self.year)

        page_games_played = requests.get(url_games_played, headers = self.header)
        soup_games_played = BeautifulSoup(page_games_played.text, "lxml")

        team_one = []
        team_two = []

        summaries = soup_games_played.find("div", {"class": "game_summaries"})
        if summaries is None:
            return team_one, team_two

        # Build the lookup once instead of twice per game
        name_to_abbr = self.teamNameDictEasy()
        for table in summaries.find_all("table", {"class": "teams"}):
            rows = table.find_all("tr")
            team_one.append(name_to_abbr[rows[0].find("a").text])
            team_two.append(name_to_abbr[rows[1].find("a").text])

        return team_one, team_two
        
    def extractTablesPerGame(self, index_of_game):
        '''
        Scrape the basic and advanced box scores of one game into one table.

        index_of_game: 0-based position of the game in the lists returned by
        whichTeamsPlayedPerDay().

        The box-score page is addressed by date plus the second-row ("Home")
        team. For each team the basic table is joined column-wise with the
        advanced table minus its first two columns, then "Team" and
        "Home_Away" columns are prepended.

        Returns a DataFrame with the Home rows followed by the Away rows, the
        leading columns Date, Game_Index (1-based), Home_Away and Team, empty
        strings replaced by NaN, columns 6-39 converted to float, and a fresh
        0..n-1 index.
        Raises IndexError if index_of_game is out of range or an expected
        table is missing from the page.
        '''
        # One call only: every call to whichTeamsPlayedPerDay is an HTTP request
        first_row_teams, second_row_teams = self.whichTeamsPlayedPerDay()
        team_one = first_row_teams[index_of_game]
        team_two = second_row_teams[index_of_game]
        url_box = "https://www.basketball-reference.com/boxscores/" + str(self.year) + str(self.month) + str(self.day) + "0" + team_two + ".html"

        # Preparing to scrape
        web = requests.get(url_box, headers = self.header)
        soup = BeautifulSoup(web.text, "lxml")

        box_score = {}

        for side, tab in (("Away", team_one), ("Home", team_two)):

            table_basic = soup.select("#box-" + tab + "-game-basic")[0]
            table_adv = soup.select("#box-" + tab + "-game-advanced")[0]

            # Headers: second header row, with the first cell renamed
            headers = [h.text for h in table_basic.find_all("tr")[1].find_all("th")]
            headers[0] = "Players"
            headers_advanced = [h.text for h in table_adv.find_all("tr")[1].find_all("th")]
            headers_advanced[0] = "Players"

            # Number of players in the table: "mp" cells minus one
            # (presumably the extra cell is the team-totals row - TODO confirm)
            num_of_players_played_basic = len(table_basic.find_all("td", {"data-stat": "mp"})) - 1
            num_of_players_played_adv = len(table_adv.find_all("td", {"data-stat": "mp"})) - 1

            rows_basic = table_basic.find("tbody").find_all("tr")
            rows_adv = table_adv.find("tbody").find_all("tr")

            # Getting all the statistics. Row 5 is skipped on purpose
            # (presumably the separator row after the five starters - verify).
            stats_basic = [[ele.text for ele in rows_basic[player]]
                           for player in list(range(0, 5)) + list(range(6, num_of_players_played_basic + 1))]
            stats_adv = [[ele.text for ele in rows_adv[player]]
                         for player in list(range(0, 5)) + list(range(6, num_of_players_played_adv + 1))]

            team_table = pd.concat([pd.DataFrame(stats_basic, columns = headers),
                                    pd.DataFrame(stats_adv, columns = headers_advanced).iloc[:, 2:]],
                                   axis = 1)
            team_table.insert(0, "Team", [tab] * num_of_players_played_basic) # Add team name
            team_table.insert(0, "Home_Away", [side] * num_of_players_played_basic) # Add Home or Away
            box_score[side] = team_table

        # Merge the box scores of the home team and the away team
        total_box_score = pd.concat([box_score["Home"], box_score["Away"]], sort = False)
        total_box_score.insert(0, "Date", [self.my_date] * len(total_box_score)) # Add Date
        total_box_score.insert(1, "Game_Index", [index_of_game + 1] * len(total_box_score)) # Add game index
        total_box_score = total_box_score.replace("", np.nan) # missing value
        total_box_score.iloc[:, 6:40] = total_box_score.iloc[:, 6:40].astype(float) # data types
        total_box_score.reset_index(inplace = True, drop = True)

        return total_box_score
    
    def extractAllTablesPerDay(self):
        '''
        If there is no index of the game, I will scrape all the game played on that day.
        If there is a specific index of the game, I will only scrape that game.
        
        The function returns two organized table as one basic and one advanced box score table
        '''
        # Information of the game

        # I will save all the games played on that day in the dictionay
        all_box_score = []
        
        number_of_games = self.numberofTeamPlayedPerDay()
        for ind in range(number_of_games):
            all_box_score.append(self.extractTablesPerGame(ind))
            
        
        day_box_score = pd.concat(all_box_score, sort = False)
        day_box_score.reset_index(drop = True, inplace = True)
        
        return day_box_score

## Implementing

In [3]:
# Dates whose scrape raised an exception; appended to by the main loop below
miss_date = []

In [67]:
if __name__ == "__main__":
    # Scrape every day between `start` and `end` (inclusive), skipping July,
    # August and September, and stack the daily tables into `final_table`.

    # Input the dates you want to search for
    #start = input("Enter the first day (yyyy mm dd): ").replace(" ", "-")
    #end = input("Enter the last day (yyyy mm dd): ").replace(" ", "-")

    # Transform the format
    start = datetime.strptime("2014-10-1", "%Y-%m-%d")
    end = datetime.strptime("2015-6-30", "%Y-%m-%d")
    every_day = (start + timedelta(days=x) for x in range((end - start).days + 1))
    date_generated = [d for d in every_day if d.month not in (7, 8, 9)]

    whole_table = []
    for game_date in date_generated:
        try:
            initialBoxScore = boxScore(game_date.year, game_date.month, game_date.day)
            # Random 3-6 second pause between days to go easy on the site
            time.sleep(np.random.randint(3, 7))

            if initialBoxScore.PlayorNot() == 0:
                continue

            whole_table.append(initialBoxScore.extractAllTablesPerDay())

        # Exception (not a bare except) so that Ctrl-C still stops the run;
        # the failed date is recorded and the loop moves on.
        except Exception as exc:
            print(game_date, repr(exc))
            miss_date.append(game_date)

    if not whole_table:
        print("No NBA games")
    else:
        final_table = pd.concat(whole_table, sort = False)
        final_table.reset_index(drop = True, inplace = True)
                    


In [70]:
#final_table.to_csv("table_14.csv")