In [1]:
from io import StringIO
from os import mkdir
from os.path import exists
from time import strftime,localtime

import pandas as pd

In [2]:
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

In [3]:
from sqlalchemy import create_engine
from os import environ
from re import search

In [4]:
class Web_scraper:
    """Groups the URL builders and page parsers for three nba.com stats pages.

    Attributes:
        teams: ``Teams`` helper (team traditional stats page).
        players: ``Players`` helper (player traditional stats page).
        boxes: ``Box_scores`` helper (player box-score page).
    """

    def __init__(self):
        self.teams = self.Teams()
        self.players = self.Players()
        self.boxes = self.Box_scores()

    class _Stats_page:
        """Private base: URL pieces and link parsing shared by the page helpers."""

        def __init__(self, root):
            self.root = root
            self.REG = "Regular%20Season"
            self.POST = "Playoffs"
            self.mid = "Season="  # + [year] YYYY-YY +
            self.tail = "&SeasonType="  # + [Regular%20Season|Playoffs]

        def build_url(self, year, reg_season=False):
            """Return the page URL for a season.

            Args:
                year: Season string that is concatenated into the URL as-is
                    (callers in this notebook pass e.g. ``"2021-22"``).
                reg_season: True for the regular season, False (default) for
                    the playoffs.
            """
            season_type = self.REG if reg_season else self.POST
            return self.root + self.mid + year + self.tail + season_type

        @staticmethod
        def _stat_table_links(html):
            """Return the href of every anchor inside the first <table> of ``html``.

            Raises:
                ValueError: if the page source contains no <table>.
            """
            soup = BeautifulSoup(html, 'html.parser')
            table = soup.find("table")
            if table is None:
                raise ValueError("no <table> element found in the page source")
            # href=True skips anchors without an href, which would otherwise
            # hand None to re.search and raise TypeError.
            return [anchor.get('href') for anchor in table.find_all('a', href=True)]

        @staticmethod
        def _id_in_href(kind, href):
            """Return the digits following ``/<kind>/`` in ``href`` as a string, or None."""
            match = search(r'/' + kind + r'/(\d+)', href)
            return match.group(1) if match else None

    class Teams(_Stats_page):
        """Helper for the team traditional-stats page."""

        def __init__(self):
            super().__init__("https://www.nba.com/stats/teams/traditional/?")

        def get_team_ids(self, html):
            """Return the integer team ids linked from the first table, in page order."""
            tids = []
            for href in self._stat_table_links(html):
                team_id = self._id_in_href("team", href)
                if team_id is not None:
                    tids.append(int(team_id))
            return tids

        def get_source_and_teams(self, url, driver, max_retries=10):
            """Load ``url`` and return ``(page_source, team_ids)``.

            The load-and-parse step is retried when it raises (for example
            when the table is not in the page source yet).

            Args:
                url: Page to load.
                driver: Object exposing ``get(url)`` and ``page_source``
                    (a selenium WebDriver in this notebook).
                max_retries: Maximum number of attempts before giving up.

            Raises:
                RuntimeError: if every attempt failed; the last error is chained.
            """
            last_error = None
            for _ in range(max_retries):
                try:
                    driver.get(url)
                    html = driver.page_source
                    return html, self.get_team_ids(html)
                # Exception (not a bare except) so Ctrl-C / kernel interrupt
                # still stops the loop.
                except Exception as error:
                    last_error = error
            raise RuntimeError(
                "could not load team stats from " + url
                + " after " + str(max_retries) + " attempts"
            ) from last_error

    class Players(_Stats_page):
        """Helper for the player traditional-stats page."""

        def __init__(self):
            super().__init__("https://www.nba.com/stats/players/traditional/?")
            # Absolute XPath of the page-selection <select>.
            self.xpath = "/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[1]/div/div/select"

        def click_all(self, url, driver):
            """Load ``url``, choose the "All" option of the page select and return the page source."""
            driver.get(url)
            page_select = Select(driver.find_element(By.XPATH, self.xpath))
            page_select.select_by_visible_text("All")

            return driver.page_source

        def get_player_and_team_ids(self, html):
            """Return ``(player_ids, team_ids)`` as lists of ints, in page order.

            A link that matches both patterns is counted as a player link only.
            """
            pids = []
            tids = []

            for href in self._stat_table_links(html):
                player_id = self._id_in_href("player", href)
                team_id = self._id_in_href("team", href)

                if player_id is not None:
                    pids.append(int(player_id))
                elif team_id is not None:
                    tids.append(int(team_id))

            return pids, tids

    class Box_scores(_Stats_page):
        """Helper for the player box-score page."""

        def __init__(self):
            super().__init__("https://www.nba.com/stats/players/boxscores/?")
            # Absolute XPaths of the stats table and of the page-selection <select>.
            self.xpath = "/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[2]/div[1]/table"
            self.select_xpath = "/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[1]/div/div/select"

        def iter_all(self, url, driver):
            """Yield the page source of each box-score page not processed yet.

            The number of pages handled so far is read from the ``Pages``
            column (row 0) of ``../box_score.csv``. Once the generator is
            exhausted that cell is overwritten with the last option label of
            the page select and the file is saved.
            """
            wait = 10  # seconds to wait for the stats table to appear
            driver.get(url)
            WebDriverWait(driver, wait).until(
                EC.presence_of_element_located((By.XPATH, self.xpath)))
            options = driver.find_element(By.XPATH, self.select_xpath).text.split("\n")

            progress = pd.read_csv("../box_score.csv")
            difference = len(options) - progress.loc[0, "Pages"] + 1
            print("Difference ", difference)

            # options[0] is skipped; presumably it is the "All" entry -- confirm.
            for label in options[1:difference]:
                # Re-locate the element on every iteration before selecting.
                page_select = Select(driver.find_element(By.XPATH, self.select_xpath))
                page_select.select_by_visible_text(label)
                yield driver.page_source

            # .loc writes into the frame itself; the chained form
            # progress["Pages"][0] = ... may assign to a temporary copy.
            progress.loc[0, "Pages"] = int(options[-1])
            progress.to_csv("../box_score.csv", index=False)
            return

        def get_player_and_team_ids(self, html):
            """Return ``(player_ids, team_ids, game_ids)`` in page order.

            Player and team ids are ints; game ids are left as strings
            (presumably to keep leading zeros -- confirm). A link is counted
            once, with player taking precedence over team, and team over game.
            """
            pids = []
            tids = []
            gids = []

            for href in self._stat_table_links(html):
                player_id = self._id_in_href("player", href)
                team_id = self._id_in_href("team", href)
                game_id = self._id_in_href("game", href)

                if player_id is not None:
                    pids.append(int(player_id))
                elif team_id is not None:
                    tids.append(int(team_id))
                elif game_id is not None:
                    gids.append(game_id)

            return pids, tids, gids

In [5]:
def add_new_players(engine, names, ids):
    """Append to the ``Players`` table every (ID, Name) pair whose ID is not stored yet.

    Args:
        engine: Connection/engine accepted by ``pd.read_sql`` and
            ``DataFrame.to_sql``.
        names: Player names, aligned with ``ids``.
        ids: Player ids; each must be convertible with ``int()``.

    Returns:
        None. Nothing is queried or written when ``ids`` is empty.
    """
    candidates = pd.DataFrame({"ID": ids, "Name": names}).drop_duplicates()
    if candidates.empty:
        return

    # Build the IN list by hand: str(tuple(ids)) renders a single id as "(5,)"
    # and no ids as "()", both of which the database rejects. int() coercion
    # also guarantees only digits reach the SQL text.
    id_list = ", ".join(str(int(player_id)) for player_id in candidates["ID"].unique())
    existing_players = pd.read_sql(
        "select ID from Players where ID IN (" + id_list + ")", engine)

    is_new = ~candidates["ID"].isin(existing_players["ID"])
    new_players = candidates.loc[is_new, ["ID", "Name"]]
    if new_players.empty:
        return

    new_players.to_sql("Players", engine, index=False, if_exists="append")

    return

In [6]:
def add_new_box_scores(engine,df):
    """Append to ``Box_scores`` the rows of ``df`` whose key is not stored yet.

    A row is identified by the (Player_ID, Team_ID, Game_ID) triple.

    Args:
        engine: Connection/engine accepted by ``pd.read_sql`` and
            ``DataFrame.to_sql``.
        df: Frame containing at least the three key columns; all of its
            columns are written.

    Returns:
        None. Nothing is queried or written when ``df`` is empty.
    """
    key_columns = ["Player_ID", "Team_ID", "Game_ID"]
    if df.empty:
        return

    # Only the key columns are needed for the anti-join, so the stat columns
    # of the (potentially large) table are not loaded.
    existing_keys = pd.read_sql(
        "select Player_ID, Team_ID, Game_ID from Box_scores", engine
    ).drop_duplicates()

    merged = df.merge(existing_keys, on=key_columns, how='left', indicator=True)
    new_rows = merged.loc[merged["_merge"] == "left_only", df.columns]
    if new_rows.empty:
        return

    new_rows.to_sql("Box_scores", engine, index=False, if_exists="append")

    return

In [7]:
class Box:
    """Scrapes player box scores and stores them in the ``Box_scores`` table.

    Attributes:
        boxes: Page helper providing ``build_url``, ``iter_all`` and
            ``get_player_and_team_ids`` (``Web_scraper().boxes`` in this notebook).
        table: Name of the destination table.
        scores: The raw table parsed from the most recent page (empty list
            until ``write`` has run).
        page_count: Contents of ``../box_score.csv`` at construction time.
        engine: SQLAlchemy engine for the ``nba`` database on 127.0.0.1:3306.
        db_columns: Column names, in order, of the rows written to the database.
    """

    def __init__(self,boxes):
        self.boxes = boxes
        self.table = "Box_scores"
        self.scores =[]
        self.page_count = pd.read_csv("../box_score.csv")
        self.engine = create_engine(self._database_url())

        self.db_columns = ['Player_ID', 'Team_ID', 'Game_ID', 'Matchup', 'Game_day', 'Result',
                        'MINS', 'PTS', 'FGM', 'FGA', 'FGP', 'PM3', 'PA3', 'P3P', 'FTM', 'FTA',
                        'FTP', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV',  'PF']

    @staticmethod
    def _database_url():
        """Build the MariaDB URL from the USER and PSWD environment variables.

        Raises:
            RuntimeError: if either variable is unset (instead of the opaque
                TypeError that concatenating None would produce).
        """
        missing = [name for name in ("USER", "PSWD") if name not in environ]
        if missing:
            raise RuntimeError(
                "missing environment variable(s): " + ", ".join(missing))
        return ("mariadb+mariadbconnector://"
                + environ["USER"] + ":"
                + environ["PSWD"] + "@127.0.0.1:3306/nba")

    def write(self,html,pids,tids,gids):
        """Parse one box-score page and store its new players and new rows.

        Args:
            html: Page source containing the box-score table.
            pids, tids, gids: Player, team and game ids, one per table row.
        """
        # read_html expects a file-like object; literal HTML strings are deprecated.
        df = pd.read_html(StringIO(html),na_values=['-'])[0]
        self.scores = df
        df = df.drop(columns = ['Season','+/-','FP'])

        # The first remaining column is passed as the player names.
        add_new_players(self.engine, df[df.columns[0]],pids)
        # Drop the first two columns; they are replaced by the id columns below.
        df = df[df.columns[2:]]

        # Rename the scraped headers positionally to the database column names.
        d = dict(zip(df.columns,self.db_columns[3:]))
        df = df.rename(columns=d)

        df.insert(0, "Player_ID",pids)
        df.insert(1, "Team_ID",tids)
        df.insert(2, "Game_ID",gids)

        df['Game_day'] = pd.to_datetime(df['Game_day'])

        df = df[self.db_columns]
        df = df.drop_duplicates()

        add_new_box_scores(self.engine,df)

        return

    def get_last_year(self,year):
        """Return the last two digits of ``year`` as an int (2022 -> 22, 2000 -> 0)."""
        return year % 100

    def get_season(self, driver, year, reg_season=True):
        """Scrape and store every pending box-score page of one season.

        Args:
            driver: WebDriver handed to the page helper.
            year: Season string passed to ``build_url`` (e.g. ``"2021-22"``).
            reg_season: True for the regular season, False for the playoffs.
        """
        url = self.boxes.build_url(year,reg_season)

        for html in self.boxes.iter_all(url, driver):
            pids, tids, gids = self.boxes.get_player_and_team_ids(html)
            self.write(html,pids,tids,gids)

        return

    def get_player_stats(self,year,reg_season = True):
        """Scrape the season ending in ``year`` (2022 -> "2021-22") with a fresh Chrome driver."""
        with webdriver.Chrome() as driver:
            year_range = str(year-1) + "-{:0>2d}".format(self.get_last_year(year))

            self.get_season(driver,year_range,reg_season)

        return

In [8]:
# Scraper whose nested helpers (teams, players, boxes) are passed to the writer classes.
wbs = Web_scraper()

In [9]:
# Box-score writer; constructing it reads ../box_score.csv and creates the database engine.
bs = Box(wbs.boxes)

In [10]:
# Sanity check: show the query-string prefix used when building box-score URLs.
bs.boxes.mid

'Season='

In [11]:
# Scrape the 2021-22 regular-season box scores (reg_season defaults to True) into the database.
bs.get_player_stats(2022)

Difference  4


In [16]:
# Reload the page-progress file that Box_scores.iter_all rewrites after a full run.
d= pd.read_csv("../box_score.csv")

In [5]:
class Team_standings:
    """Downloads per-season team tables and writes them as CSV files.

    Files are written to ``<root><season>/regular_season.csv`` and
    ``<root><season>/playoffs.csv``.

    Attributes:
        teams: Page helper providing ``build_url`` and ``get_source_and_teams``
            (``Web_scraper().teams`` in this notebook).
        root: Output directory prefix.
        labels: File names indexed by ``REG`` / ``POST``.
    """

    def __init__(self, teams):
        self.REG = 0
        self.POST = 1
        self.labels =("/regular_season.csv", "/playoffs.csv")
        self.teams = teams
        self.root = "../team_standings/"

    def get_last_year(self,year):
        """Return the last two digits of ``year`` as an int (2022 -> 22, 2000 -> 0)."""
        return year % 100

    def write(self,html,tids,fp):
        """Parse the first table of ``html``, add a TEAM_ID column and save it to ``fp``.

        Columns containing any missing value are dropped before saving.
        """
        # read_html expects a file-like object; literal HTML strings are deprecated.
        df = pd.read_html(StringIO(html))[0]
        df["TEAM_ID"] = tids

        # axis must be passed by keyword; the positional form fails on pandas >= 2.0.
        df = df.dropna(axis='columns')
        df.to_csv(fp,index = False)
        return

    def get_season(self, driver, year,fp, reg_season=True):
        """Fetch one season page and write it to ``fp``.

        Args:
            driver: WebDriver handed to the page helper.
            year: Season string passed to ``build_url`` (e.g. ``"2021-22"``).
            fp: Destination CSV path.
            reg_season: True for the regular season, False for the playoffs.
        """
        url = self.teams.build_url(year,reg_season)

        print(url)

        html,tids = self.teams.get_source_and_teams(url, driver)

        self.write(html,tids,fp)

        return

    def get_team_standings(self,last_n_years):
        """Fetch the season ending last calendar year and the ``last_n_years`` before it.

        A CSV that already exists is not downloaded again. The check is made
        per file rather than per directory, so a season whose download failed
        part-way is retried on the next call instead of being skipped.
        """
        this_year = strftime("%Y",localtime())
        year = int(this_year) -  1

        with webdriver.Chrome() as driver:
            for y in range(year,year-last_n_years-1,-1):
                year_range = str(y-1) + "-{:0>2d}".format(self.get_last_year(y))

                file_dir = self.root + year_range
                targets = (
                    (file_dir + self.labels[self.REG], True),
                    (file_dir + self.labels[self.POST], False),
                )
                missing = [(fp, reg) for fp, reg in targets if not exists(fp)]
                if not missing:
                    continue  # both files are already on disk

                if(not exists(file_dir)):
                    mkdir(file_dir)

                for fp, reg in missing:
                    self.get_season(driver,year_range,fp,reg_season=reg)

        return

In [6]:
class Player_stats:
    """Downloads per-season player tables and writes them as CSV files.

    Files are written to ``<root>/<season>/regular_season.csv`` and
    ``<root>/<season>/playoffs.csv``.

    Attributes:
        players: Page helper providing ``build_url``, ``click_all`` and
            ``get_player_and_team_ids`` (``Web_scraper().players`` in this notebook).
        root: Output directory.
        labels: File names indexed by ``REG`` / ``POST``.
    """

    def __init__(self,players):
        self.players = players
        self.root = "../player_stats"
        self.REG = 0
        self.POST = 1
        self.labels =("/regular_season.csv", "/playoffs.csv")

    def write(self,html,pids,tids,fp):
        """Parse the first table of ``html``, add PLAYER_ID / TEAM_ID columns and save to ``fp``.

        Columns containing any missing value are dropped before saving.
        """
        # read_html expects a file-like object; literal HTML strings are deprecated.
        df = pd.read_html(StringIO(html))[0]
        df["PLAYER_ID"] = pids
        df["TEAM_ID"] = tids

        # axis must be passed by keyword; the positional form fails on pandas >= 2.0.
        df = df.dropna(axis='columns')
        df.to_csv(fp,index = False)
        return

    def get_last_year(self,year):
        """Return the last two digits of ``year`` as an int (2022 -> 22, 2000 -> 0)."""
        return year % 100

    def get_season(self, driver, year,fp, reg_season=True, max_retries=10):
        """Fetch one season page (retrying on failure) and write it to ``fp``.

        Args:
            driver: WebDriver handed to the page helper.
            year: Season string passed to ``build_url`` (e.g. ``"1997-98"``).
            fp: Destination CSV path.
            reg_season: True for the regular season, False for the playoffs.
            max_retries: Maximum number of load-and-parse attempts.

        Raises:
            RuntimeError: if every attempt failed; the last error is chained.
        """
        url = self.players.build_url(year,reg_season)

        print(url)
        last_error = None
        for _ in range(max_retries):
            try:
                html = self.players.click_all(url, driver)
                pids, tids = self.players.get_player_and_team_ids(html)
                break
            # Exception (not a bare except) so Ctrl-C / kernel interrupt
            # still stops the loop.
            except Exception as error:
                last_error = error
                print("Error. Trying again.")
        else:
            raise RuntimeError(
                "could not load player stats from " + url
                + " after " + str(max_retries) + " attempts"
            ) from last_error

        self.write(html,pids,tids,fp)

        return

    def get_player_stats(self,last_n_years):
        """Fetch the season ending last calendar year and the ``last_n_years`` before it.

        A CSV that already exists is not downloaded again. The check is made
        per file rather than per directory, so a season whose download failed
        part-way is retried on the next call instead of being skipped.
        """
        year = int(strftime("%Y",localtime()))-1

        with webdriver.Chrome() as driver:
            for y in range(year,year-last_n_years-1,-1):
                year_range = str(y-1) + "-{:0>2d}".format(self.get_last_year(y))

                file_dir = self.root + "/" + year_range
                targets = (
                    (file_dir + self.labels[self.REG], True),
                    (file_dir + self.labels[self.POST], False),
                )
                missing = [(fp, reg) for fp, reg in targets if not exists(fp)]
                if not missing:
                    continue  # both files are already on disk

                if(not exists(file_dir)):
                    mkdir(file_dir)

                for fp, reg in missing:
                    self.get_season(driver,year_range,fp,reg_season=reg)

        return

In [19]:
# Writer for the per-season team CSV files, backed by the scraper's team-page helper.
ts = Team_standings(wbs.teams)

In [8]:
# Fetch team tables for the season ending last calendar year and the 22 seasons before it,
# skipping seasons already on disk.
ts.get_team_standings(22)

In [9]:
# Writer for the per-season player CSV files, backed by the scraper's player-page helper.
ps = Player_stats(wbs.players)

In [11]:
# Fetch player tables for the season ending last calendar year and the 23 seasons before it,
# skipping seasons already on disk.
ps.get_player_stats(23)

https://www.nba.com/stats/players/traditional/?Season=1997-98&SeasonType=Regular%20Season
https://www.nba.com/stats/players/traditional/?Season=1997-98&SeasonType=Playoffs
