In [1]:
from time import strftime,localtime
import pandas as pd
from os.path import exists
from os import mkdir
from bs4 import BeautifulSoup
import re

In [2]:
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.by import By

In [3]:
class Web_scraper:
    """Container for the two scrapers: ``teams`` and ``players``."""

    def __init__(self):
        # Each helper reads team_names.csv while it is being constructed.
        self.teams = self.Teams()
        self.players = self.Players()

    class Teams:
        """Build ESPN NBA standings URLs and turn the pages into tables."""

        def __init__(self):
            self.root = "https://www.espn.com/nba/standings/_"
            self.tail = "/group/league"
            # Upper-cased abbreviations joined into one regex alternation.
            abbreviations = pd.read_csv("team_names.csv", usecols=["prefix_1"])
            self.team_names = "|".join(
                abbreviations["prefix_1"].str.upper().tolist()
            )

        def build_url(self, year_n, current=True, preseason=False):
            """Return the standings URL for the requested season.

            ``year_n`` (a string) is only used when ``current`` is false;
            ``preseason`` selects the pre-season variant of the page.
            """
            pieces = [self.root]
            if preseason:
                pieces.append("/seasontype/pre")
            if not current:
                pieces.append("/season/" + year_n)
            pieces.append(self.tail)
            return "".join(pieces)

        def preprocess(self, df, complete_league=True):
            """Split ``df["Team"]`` into an ``ID`` column and the team name.

            With ``complete_league`` set, the leading text up to and including
            ``--`` is removed first. ``df`` is modified in place and returned.
            """
            if complete_league:
                without_seed = df["Team"].str.extract(r'(.+--)(.+)', expand=True)
                df["Team"] = without_seed[1]

            # Group 0 is the abbreviation, group 1 whatever follows it.
            abbrv_and_name = df["Team"].str.extract(
                fr'({self.team_names})(.+)', expand=True
            )
            df.insert(0, "ID", abbrv_and_name[0])
            df["Team"] = abbrv_and_name[1]
            return df

        def build_table(self, url, complete_league=True):
            """Read the first two HTML tables at ``url`` and merge them.

            Table 0 supplies the team column (renamed from ``0`` to ``Team``)
            and table 1 the statistics; the joined frame is run through
            ``preprocess``.
            """
            tables = pd.read_html(url)
            names, stats = tables[0], tables[1]
            combined = names.rename(columns={0: "Team"}).join(stats)
            return self.preprocess(combined, complete_league)

    class Players:
        """Build nba.com player-stats URLs and assemble player tables."""

        def __init__(self):
            self.REG = "Regular%20Season"
            self.POST = "Playoffs"
            self.root = "https://www.nba.com/stats/players/traditional/?"
            self.mid = "Season="  # followed by the season, e.g. YYYY-YY
            self.tail = "&SeasonType="  # followed by self.REG or self.POST
            abbreviations = pd.read_csv("team_names.csv", usecols=["prefix_1"])
            self.team_names = "|".join(
                abbreviations["prefix_1"].str.upper().tolist()
            )

        def build_url(self, year, reg_season=False):
            """Return the stats URL for ``year``; playoffs unless ``reg_season``."""
            season_type = self.REG if reg_season else self.POST
            return self.root + self.mid + year + self.tail + season_type

        def preprocess(self, df, shooting=True):
            """Clean the ``Name`` column and drop the final row.

            ``Name`` is split into a name part and a trailing upper-case code;
            the code is inserted as column ``POS`` when ``shooting`` is true.
            The row labelled ``len(df) - 1`` is removed. ``df`` itself receives
            the column changes; the returned frame is the one without that row.
            """
            parts = df["Name"].str.extract(r'(.+ | Total)([A-Z]+\**)', expand=True)
            parts = parts.drop([len(parts) - 1])

            if shooting:
                df.insert(1, "POS", parts[1])
            df["Name"] = parts[0].str.strip()

            return df.drop([len(df) - 1])

        def build_table(self, url, team):
            """Combine the tables at ``url`` into one frame tagged with ``team``.

            Table 0 is paired with table 1 and, separately, with table 2; both
            pairs are cleaned with ``preprocess`` and joined on ``Name``.
            """
            tables = pd.read_html(url)

            with_stats = pd.concat([tables[0], tables[1]], axis=1)
            with_shooting = pd.concat([tables[0], tables[2]], axis=1)

            with_stats = self.preprocess(with_stats, False)
            with_shooting = self.preprocess(with_shooting)

            merged = with_shooting.join(with_stats.set_index("Name"), on="Name")
            merged["Team"] = team.upper()
            return merged

In [4]:
class Player_stats:
    """Download per-season player tables through Selenium and save them as CSV.

    ``players`` must provide ``build_url(year, reg_season)``. Files are written
    to ``<root>/<season>/regular_season.csv`` and ``<root>/<season>/post_season.csv``.
    """

    # The numeric ids are embedded in the anchors' href attributes.
    _PLAYER_HREF = re.compile(r"/player/(\d+)")
    _TEAM_HREF = re.compile(r"/team/(\d+)")

    def __init__(self, players):
        self.players = players
        self.root = "./player_stats"
        # Indexes into self.labels.
        self.REG = 0
        self.POST = 1
        self.labels = ("/regular_season.csv", "/post_season.csv")
        # Locators for the pagination control and its <select> element.
        self.xpath = "/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[1]/div/div/select"
        self.class_name = "stats-table-pagination"

    @staticmethod
    def _season_label(end_year):
        """Return the ``YYYY-YY`` label for the season ending in ``end_year``."""
        return "{}-{:02d}".format(end_year - 1, end_year % 100)

    @classmethod
    def _extract_ids(cls, html):
        """Return ``(player_ids, team_ids)`` found in the first table's links."""
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(html, "html.parser")
        table = soup.find("table")
        if table is None:
            raise ValueError("no <table> element found in the page source")

        pids = []
        tids = []
        # href=True skips anchors without an href (which would otherwise hand
        # None to the regex search).
        for anchor in table.find_all("a", href=True):
            href = anchor["href"]
            player = cls._PLAYER_HREF.search(href)
            if player:
                pids.append(int(player.group(1)))
                continue
            team = cls._TEAM_HREF.search(href)
            if team:
                tids.append(int(team.group(1)))
        return pids, tids

    def get_season(self, driver, year, fp, reg_season=True, max_attempts=None):
        """Fetch one season's player table and write it to ``fp`` as CSV.

        Args:
            driver: Selenium WebDriver used to load the page.
            year: season label passed to ``self.players.build_url``.
            fp: destination CSV path.
            reg_season: regular season when true, playoffs otherwise.
            max_attempts: give up (re-raising the last WebDriver error) after
                this many page loads; ``None`` retries indefinitely.

        Raises:
            ValueError: if the page has no table, or the number of extracted
                ids does not match the number of table rows.
        """
        from io import StringIO

        from selenium.common.exceptions import WebDriverException

        url = self.players.build_url(year, reg_season)
        print(url)

        attempt = 0
        while True:
            attempt += 1
            try:
                driver.get(url)
                # Presence check only: raises until the pagination control exists.
                driver.find_element(By.CLASS_NAME, self.class_name)
                dropdown = Select(driver.find_element(By.XPATH, self.xpath))
                # Show every row on a single page.
                dropdown.select_by_visible_text("All")
                break
            except WebDriverException:
                # Retry on driver errors only, so Ctrl-C and real bugs still
                # propagate instead of being swallowed.
                if max_attempts is not None and attempt >= max_attempts:
                    raise

        html = driver.page_source
        pids, tids = self._extract_ids(html)

        # read_html expects a file-like object; literal HTML strings are deprecated.
        df = pd.read_html(StringIO(html))[0]
        df["PLAYER_ID"] = pids
        df["TEAM_ID"] = tids

        # Drop every column that contains a missing value.
        df = df.dropna(axis="columns")
        df.to_csv(fp)

    def get_player_stats(self, last_n_years):
        """Download regular- and post-season tables for recent seasons.

        Starts with the season ending last calendar year and walks backwards.
        A file that already exists on disk is not downloaded again.
        """
        from os import makedirs

        latest = int(strftime("%Y", localtime())) - 1
        with webdriver.Chrome() as driver:
            # NOTE(review): this range visits last_n_years + 1 seasons; kept
            # as in the original, confirm whether that is intended.
            for end_year in range(latest, latest - last_n_years - 1, -1):
                season = self._season_label(end_year)
                season_dir = self.root + "/" + season
                # Also creates self.root when it is missing.
                makedirs(season_dir, exist_ok=True)

                for label_idx, is_regular in ((self.REG, True), (self.POST, False)):
                    fp = season_dir + self.labels[label_idx]
                    # Check the file, not the directory, so an interrupted run
                    # is completed the next time.
                    if exists(fp):
                        continue
                    self.get_season(driver, season, fp, reg_season=is_regular)

In [5]:
# Build the scraper; constructing its Teams and Players helpers reads team_names.csv.
wbs = Web_scraper()

In [6]:
# Hand the scraper's Players helper (used for URL building) to the downloader.
ps = Player_stats(wbs.players)

In [7]:
# Download regular-season and post-season player tables (last_n_years=22); starts a Chrome WebDriver.
ps.get_player_stats(22)

https://www.nba.com/stats/players/traditional/?Season=2015-16&SeasonType=Regular%20Season
https://www.nba.com/stats/players/traditional/?Season=2015-16&SeasonType=Playoffs
https://www.nba.com/stats/players/traditional/?Season=2014-15&SeasonType=Regular%20Season
https://www.nba.com/stats/players/traditional/?Season=2014-15&SeasonType=Playoffs
https://www.nba.com/stats/players/traditional/?Season=2013-14&SeasonType=Regular%20Season
https://www.nba.com/stats/players/traditional/?Season=2013-14&SeasonType=Playoffs
https://www.nba.com/stats/players/traditional/?Season=2012-13&SeasonType=Regular%20Season
https://www.nba.com/stats/players/traditional/?Season=2012-13&SeasonType=Playoffs
https://www.nba.com/stats/players/traditional/?Season=2011-12&SeasonType=Regular%20Season
https://www.nba.com/stats/players/traditional/?Season=2011-12&SeasonType=Playoffs
https://www.nba.com/stats/players/traditional/?Season=2010-11&SeasonType=Regular%20Season
https://www.nba.com/stats/players/traditional/?Sea

In [None]:
class Team_standings:
    """Save standings tables to CSV, one file per year and season type.

    ``teams`` must provide ``build_url`` and ``build_table`` (as
    ``Web_scraper.Teams`` does).
    """

    def __init__(self, teams):
        self.teams = teams
        # REG / PRE index into labels.
        self.REG, self.PRE = 0, 1
        self.labels = ("regular_season", "pre_season")
        self.reg_fp = "./team_standings/reg_season/"
        self.pre_fp = "./team_standings/pre_season/"

    def gen_key(self, year, season_type):
        """Return ``"<year>_<season_type>"``; both arguments must be strings."""
        return "_".join((year, season_type))

    def save_standings(self, filepath, year, preseason=False):
        """Write the standings of a past ``year`` to ``filepath`` unless it exists."""
        if not exists(filepath):
            url = self.teams.build_url(year, False, preseason)
            # Pre-season tables are built with complete_league=False; the
            # regular-season call relies on build_table's default.
            table_args = (url, False) if preseason else (url,)
            standings = self.teams.build_table(*table_args)
            standings.to_csv(filepath, index=False)

    def update_team_standings(self):
        """Rewrite this year's regular-season and pre-season standings files."""
        year_now = strftime("%Y", localtime())
        targets = (
            (self.REG, self.reg_fp, {}),
            (self.PRE, self.pre_fp, {"preseason": True}),
        )
        for label_idx, folder, url_options in targets:
            destination = folder + self.gen_key(year_now, self.labels[label_idx]) + ".csv"
            url = self.teams.build_url(year_now, **url_options)
            # Unlike save_standings, existing files are always overwritten here.
            self.teams.build_table(url, False).to_csv(destination, index=False)

    def get_team_standings(self, last_n_years):
        """Save standings for the ``last_n_years`` years before the current one."""
        current_year = int(strftime("%Y", localtime()))
        for offset in range(last_n_years):
            season_year = str(current_year - 1 - offset)
            targets = (
                (self.REG, self.reg_fp, False),
                (self.PRE, self.pre_fp, True),
            )
            for label_idx, folder, is_preseason in targets:
                destination = folder + self.gen_key(season_year, self.labels[label_idx]) + ".csv"
                self.save_standings(destination, season_year, is_preseason)