In [1]:
import re
from io import StringIO
from os import makedirs
from os import mkdir
from os.path import exists
from time import strftime,localtime

import pandas as pd
from bs4 import BeautifulSoup

In [2]:
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.by import By

In [3]:
class Web_scraper:
    """Bundle of helpers that build nba.com stats URLs and extract ids from the pages.

    Attributes:
        teams: a ``Teams`` helper for the team "traditional" stats page.
        players: a ``Players`` helper for the player "traditional" stats page.
    """

    def __init__(self):
        self.teams = self.Teams()
        self.players = self.Players()

    class Teams:
        """URL builder and team-id extractor for the team stats page."""

        # Matches a link fragment such as "/team/123" and captures the digits.
        _TEAM_HREF = re.compile(r'/team/(\d+)')

        def __init__(self):
            self.root = "https://www.nba.com/stats/teams/traditional/?"
            self.REG = "Regular%20Season"
            self.POST = "Playoffs"
            self.mid = "Season="  # followed by the season label, e.g. "1997-98"
            self.tail = "&SeasonType="  # followed by self.REG or self.POST

        def build_url(self, year, reg_season=False):
            """Return the team stats URL for one season.

            Args:
                year: season label string, e.g. ``"1997-98"``.
                reg_season: True for the regular season, False (default) for playoffs.
            """
            season_type = self.REG if reg_season else self.POST
            return self.root + self.mid + year + self.tail + season_type

        def get_team_ids(self, html):
            """Return the team ids linked from the first ``<table>`` in ``html``.

            Ids are returned as ints, in document order, one per matching link.

            Raises:
                ValueError: if ``html`` contains no ``<table>`` element.
            """
            soup = BeautifulSoup(html, 'html.parser')
            table = soup.find("table")
            if table is None:
                raise ValueError("no <table> element found in page source")

            tids = []
            # href=True skips anchors without an href, which would otherwise
            # hand None to the regex and raise TypeError.
            for link in table.find_all('a', href=True):
                team_match = self._TEAM_HREF.search(link['href'])
                if team_match:
                    tids.append(int(team_match.group(1)))

            return tids

        def get_source_and_teams(self, url, driver, max_retries=None):
            """Load ``url`` and return ``(page_source, team_ids)``, retrying on failure.

            Args:
                url: page to load.
                driver: object exposing ``get(url)`` and ``page_source``
                    (a selenium WebDriver in this notebook).
                max_retries: maximum number of attempts; ``None`` (default)
                    keeps retrying until an attempt succeeds.

            Raises:
                RuntimeError: if ``max_retries`` attempts all failed.
            """
            attempt = 0
            while True:
                attempt += 1
                try:
                    driver.get(url)
                    html = driver.page_source
                    return html, self.get_team_ids(html)
                # Exception (not a bare except) so Ctrl-C / kernel interrupt
                # still stops the loop.
                except Exception as err:
                    if max_retries is not None and attempt >= max_retries:
                        raise RuntimeError(
                            "giving up on {} after {} attempts".format(url, attempt)
                        ) from err
                    print("Error. Trying again.", repr(err))

    class Players:
        """URL builder and player/team-id extractor for the player stats page."""

        # Match link fragments such as "/player/123" or "/team/123" and capture the digits.
        _PLAYER_HREF = re.compile(r'/player/(\d+)')
        _TEAM_HREF = re.compile(r'/team/(\d+)')

        def __init__(self):
            self.REG = "Regular%20Season"
            self.POST = "Playoffs"
            self.root = "https://www.nba.com/stats/players/traditional/?"
            self.mid = "Season="  # followed by the season label, e.g. "1997-98"
            self.tail = "&SeasonType="  # followed by self.REG or self.POST
            # Absolute XPath of the <select> whose "All" option click_all() picks.
            self.xpath = "/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[1]/div/div/select"

        def build_url(self, year, reg_season=False):
            """Return the player stats URL for one season.

            Args:
                year: season label string, e.g. ``"1997-98"``.
                reg_season: True for the regular season, False (default) for playoffs.
            """
            season_type = self.REG if reg_season else self.POST
            return self.root + self.mid + year + self.tail + season_type

        def click_all(self, url, driver):
            """Load ``url``, choose "All" in the drop-down at ``self.xpath``, return the page source.

            Any exception from the driver (e.g. element not found) propagates.
            """
            driver.get(url)
            dropdown = Select(driver.find_element(By.XPATH, self.xpath))
            dropdown.select_by_visible_text("All")

            # NOTE(review): the source is read right after the selection;
            # presumably the table has re-rendered by then - confirm.
            return driver.page_source

        def get_player_and_team_ids(self, html):
            """Return ``(player_ids, team_ids)`` linked from the first ``<table>`` in ``html``.

            Each link is classified as a player link first and as a team link
            otherwise; ids are ints in document order.
            NOTE(review): the two lists only line up row-by-row if every table
            row carries exactly one link of each kind - confirm against the page.

            Raises:
                ValueError: if ``html`` contains no ``<table>`` element.
            """
            soup = BeautifulSoup(html, 'html.parser')
            table = soup.find("table")
            if table is None:
                raise ValueError("no <table> element found in page source")

            pids = []
            tids = []

            # href=True skips anchors without an href (would raise TypeError in the regex).
            for link in table.find_all('a', href=True):
                href = link['href']

                player_match = self._PLAYER_HREF.search(href)
                if player_match:
                    pids.append(int(player_match.group(1)))
                    continue

                team_match = self._TEAM_HREF.search(href)
                if team_match:
                    tids.append(int(team_match.group(1)))

            return pids, tids

In [4]:
class Team_standings:
    """Downloads per-season team tables and saves them as CSV files.

    Each season gets a directory ``<root><season label>`` holding
    ``regular_season.csv`` and ``playoffs.csv``.

    Attributes:
        teams: helper providing ``build_url(year, reg_season)`` and
            ``get_source_and_teams(url, driver)`` (``Web_scraper.Teams`` in this notebook).
        root: output directory prefix.
        labels: file names, indexed by ``REG`` / ``POST``.
    """

    def __init__(self, teams):
        self.REG = 0
        self.POST = 1
        self.labels = ("/regular_season.csv", "/playoffs.csv")
        self.teams = teams
        self.root = "../team_standings/"

    def get_last_year(self, year):
        """Return the two-digit suffix of ``year`` (1998 -> 98, 2000 -> 0, 2021 -> 21)."""
        return year % 100

    def write(self, html, tids, fp):
        """Parse the first table in ``html``, append ``TEAM_ID`` and save it to ``fp``.

        Args:
            html: page source containing the table.
            tids: one team id per table row; a length mismatch makes pandas raise.
            fp: destination CSV path.
        """
        # read_html on a literal string is deprecated in recent pandas; a
        # file-like wrapper works on every version.
        source = StringIO(html) if isinstance(html, str) else html
        df = pd.read_html(source)[0]
        df["TEAM_ID"] = tids

        # Drop every column that contains a missing value. `axis` must be a
        # keyword: pandas >= 2.0 rejects the positional form.
        df = df.dropna(axis='columns')
        df.to_csv(fp, index=False)

    def get_season(self, driver, year, fp, reg_season=True):
        """Download one season's team table and write it to ``fp``.

        Args:
            driver: selenium WebDriver used to load the page.
            year: season label, e.g. ``"1997-98"``.
            fp: destination CSV path.
            reg_season: True (default) for the regular season, False for playoffs.
        """
        url = self.teams.build_url(year, reg_season)
        print(url)

        html, tids = self.teams.get_source_and_teams(url, driver)
        self.write(html, tids, fp)

    def get_team_standings(self, last_n_years):
        """Fetch every missing season file, newest first.

        Starts with the season ending in the previous calendar year and goes
        back ``last_n_years`` further (the range is inclusive, so
        ``last_n_years + 1`` seasons are visited). A file that already exists
        is left untouched.
        """
        year = int(strftime("%Y", localtime())) - 1

        with webdriver.Chrome() as driver:
            for y in range(year, year - last_n_years - 1, -1):
                year_range = str(y - 1) + "-{:0>2d}".format(self.get_last_year(y))
                file_dir = self.root + year_range

                # makedirs also creates a missing root directory, and
                # exist_ok lets an interrupted earlier run be resumed.
                makedirs(file_dir, exist_ok=True)

                # Check each output file rather than the directory, so a
                # season whose download failed half-way is retried next run.
                for kind, is_regular in ((self.REG, True), (self.POST, False)):
                    fp = file_dir + self.labels[kind]
                    if not exists(fp):
                        self.get_season(driver, year_range, fp, reg_season=is_regular)

In [5]:
class Player_stats:
    """Downloads per-season player tables and saves them as CSV files.

    Each season gets a directory ``<root>/<season label>`` holding
    ``regular_season.csv`` and ``playoffs.csv``.

    Attributes:
        players: helper providing ``build_url(year, reg_season)``,
            ``click_all(url, driver)`` and ``get_player_and_team_ids(html)``
            (``Web_scraper.Players`` in this notebook).
        root: output directory.
        labels: file names, indexed by ``REG`` / ``POST``.
    """

    def __init__(self, players):
        self.players = players
        self.root = "../player_stats"
        self.REG = 0
        self.POST = 1
        self.labels = ("/regular_season.csv", "/playoffs.csv")

    def write(self, html, pids, tids, fp):
        """Parse the first table in ``html``, append id columns and save it to ``fp``.

        Args:
            html: page source containing the table.
            pids: one player id per table row.
            tids: one team id per table row.
            fp: destination CSV path.

        A length mismatch between the table and either id list makes pandas raise.
        """
        # read_html on a literal string is deprecated in recent pandas; a
        # file-like wrapper works on every version.
        source = StringIO(html) if isinstance(html, str) else html
        df = pd.read_html(source)[0]
        df["PLAYER_ID"] = pids
        df["TEAM_ID"] = tids

        # Drop every column that contains a missing value. `axis` must be a
        # keyword: pandas >= 2.0 rejects the positional form.
        df = df.dropna(axis='columns')
        df.to_csv(fp, index=False)

    def get_last_year(self, year):
        """Return the two-digit suffix of ``year`` (1998 -> 98, 2000 -> 0, 2021 -> 21)."""
        return year % 100

    def get_season(self, driver, year, fp, reg_season=True, max_retries=None):
        """Download one season's player table and write it to ``fp``.

        The page load and id extraction are retried on failure; writing is not.

        Args:
            driver: selenium WebDriver used to load the page.
            year: season label, e.g. ``"1997-98"``.
            fp: destination CSV path.
            reg_season: True (default) for the regular season, False for playoffs.
            max_retries: maximum number of attempts; ``None`` (default) keeps
                retrying until an attempt succeeds.

        Raises:
            RuntimeError: if ``max_retries`` attempts all failed.
        """
        url = self.players.build_url(year, reg_season)
        print(url)

        attempt = 0
        while True:
            attempt += 1
            try:
                html = self.players.click_all(url, driver)
                pids, tids = self.players.get_player_and_team_ids(html)
                break
            # Exception (not a bare except) so Ctrl-C / kernel interrupt
            # still stops the loop.
            except Exception as err:
                if max_retries is not None and attempt >= max_retries:
                    raise RuntimeError(
                        "giving up on {} after {} attempts".format(url, attempt)
                    ) from err
                print("Error. Trying again.", repr(err))

        self.write(html, pids, tids, fp)

    def get_player_stats(self, last_n_years):
        """Fetch every missing season file, newest first.

        Starts with the season ending in the previous calendar year and goes
        back ``last_n_years`` further (the range is inclusive, so
        ``last_n_years + 1`` seasons are visited). A file that already exists
        is left untouched.
        """
        year = int(strftime("%Y", localtime())) - 1

        with webdriver.Chrome() as driver:
            for y in range(year, year - last_n_years - 1, -1):
                year_range = str(y - 1) + "-{:0>2d}".format(self.get_last_year(y))
                file_dir = self.root + "/" + year_range

                # makedirs also creates a missing root directory, and
                # exist_ok lets an interrupted earlier run be resumed.
                makedirs(file_dir, exist_ok=True)

                # Check each output file rather than the directory, so a
                # season whose download failed half-way is retried next run.
                for kind, is_regular in ((self.REG, True), (self.POST, False)):
                    fp = file_dir + self.labels[kind]
                    if not exists(fp):
                        self.get_season(driver, year_range, fp, reg_season=is_regular)

In [6]:
# Build the scraper facade; its .teams and .players helpers are passed to the exporters in later cells.
wbs = Web_scraper()

In [7]:
# Team-table exporter; its output root is ../team_standings/.
ts = Team_standings(wbs.teams)

In [8]:
# Opens a Chrome WebDriver and saves regular-season and playoff team tables,
# starting with the season that ended last calendar year and going back 22 more
# (the range is inclusive, so 23 seasons). Seasons already on disk are skipped.
ts.get_team_standings(22)

In [9]:
# Player-table exporter; its output root is ../player_stats.
ps = Player_stats(wbs.players)

In [11]:
# Opens a Chrome WebDriver and saves regular-season and playoff player tables,
# starting with the season that ended last calendar year and going back 23 more
# (the range is inclusive, so 24 seasons). Seasons already on disk are skipped.
ps.get_player_stats(23)

https://www.nba.com/stats/players/traditional/?Season=1997-98&SeasonType=Regular%20Season
https://www.nba.com/stats/players/traditional/?Season=1997-98&SeasonType=Playoffs
