In [11]:
import pandas as pd
from time import strftime,localtime
from sqlalchemy import create_engine
from os import environ
from os.path import exists
from os import mkdir

In [35]:
from sys import path

In [36]:
# Put ./code/ at the front of the module search path; presumably this is where
# the web_scraper module imported in the next cell lives (verify).
path.insert(0,'./code/')

In [37]:
from web_scraper import *

In [38]:
from selenium import webdriver

In [16]:
def add_new_players(engine, names, ids):
    """Append to the ``Players`` table every (ID, Name) pair whose ID is not stored yet.

    Parameters
    ----------
    engine
        Connection/engine accepted by ``pd.read_sql`` and ``DataFrame.to_sql``.
    names
        Player names, aligned element-wise with ``ids``.
    ids
        Player IDs used both for the lookup and as the ``ID`` column.

    Returns
    -------
    None
        Called for its side effect on the ``Players`` table. Does nothing when
        ``ids`` is empty.
    """
    id_list = list(ids)
    if not id_list:
        # "IN ()" is a SQL syntax error, and there is nothing to insert anyway.
        return

    def _literal(value):
        # Unwrap numpy scalars so repr() gives "5" rather than "np.int64(5)".
        return repr(value.item() if hasattr(value, "item") else value)

    # str(tuple(ids)) renders a single ID as "(x,)", whose trailing comma is
    # invalid SQL. Joining the element reprs by hand gives the same literals
    # without that trailing comma.
    condition = "(" + ", ".join(_literal(i) for i in id_list) + ")"

    # Only the ID column is needed for the anti-join below.
    existing_players = pd.read_sql("select ID from Players where ID IN " + condition, engine)

    candidates = pd.DataFrame({"ID": ids, "Name": names}).drop_duplicates()
    merged = candidates.merge(existing_players, on="ID", how="left", indicator=True)
    # "left_only" marks candidates with no matching ID in the table.
    new_players = merged.loc[merged["_merge"] == "left_only", ["ID", "Name"]]

    if not new_players.empty:
        new_players.to_sql("Players", engine, index=False, if_exists="append")

    return

In [17]:
class Box:
    """Scrape box scores page by page and append rows not yet stored to ``Box_scores``.

    Uses a ``Box_scores`` helper (not defined in this notebook; presumably
    star-imported from ``web_scraper``) to build URLs, iterate result pages and
    extract player/team/game IDs, and the module-level ``add_new_players``
    function to register players.
    """

    def __init__(self):
        """Create the scraper helper and the database engine.

        Raises ``TypeError`` if the ``USER`` or ``PSWD`` environment variable is
        unset, because ``environ.get`` then returns ``None`` and cannot be
        concatenated into the connection URL.
        """
        self.boxes = Box_scores()
        self.table = "Box_scores"
        self.scores = []  # replaced by the most recently parsed page in write()
        # NOTE(review): page_count is not read anywhere in this class, yet the
        # constructor fails when the CSV is missing — confirm it is still needed.
        self.page_count = pd.read_csv("../box_score.csv")
        self.engine = create_engine("mariadb+mariadbconnector://"\
                                  +environ.get("USER")+":"\
                                  +environ.get("PSWD")+"@127.0.0.1:3306/nba")

        # Column order of the Box_scores table. The first three are IDs added by
        # write(); the rest are mapped positionally onto the scraped columns.
        self.db_columns = ['Player_ID', 'Team_ID', 'Game_ID', 'Matchup', 'Game_day', 'Result',
                        'MINS', 'PTS', 'FGM', 'FGA', 'FGP', 'PM3', 'PA3', 'P3P', 'FTM', 'FTA',
                        'FTP', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV',  'PF']

    def add_new_box_scores(self, df):
        """Append the rows of ``df`` whose (Player_ID, Team_ID, Game_ID) key is not stored yet.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain the three key columns; all of its columns are written.
        """
        keys = ["Player_ID", "Team_ID", "Game_ID"]

        # Fetch only the key columns. This runs once per scraped page, so
        # pulling every stat column of the whole table each time is wasted work.
        existing = pd.read_sql(
            "select Player_ID, Team_ID, Game_ID from Box_scores", self.engine
        ).drop_duplicates()

        merged = df.merge(existing, on=keys, how="left", indicator=True)
        # "left_only" marks rows of df with no matching key in the table.
        new_rows = merged.loc[merged["_merge"] == "left_only", df.columns]

        if not new_rows.empty:
            new_rows.to_sql("Box_scores", self.engine, index=False, if_exists="append")

        return

    def write(self, html, pids, tids, gids):
        """Parse one page of box scores and store its new players and rows.

        Parameters
        ----------
        html
            Page source handed to ``pd.read_html``; the first table found is used.
        pids, tids, gids
            Player, team and game IDs, one per table row.
        """
        page = pd.read_html(html, na_values=['-'])[0]
        self.scores = page  # keep the raw page around for inspection
        page = page.drop(columns=['Season', '+/-', 'FP'])

        # The first remaining column is passed as the player names.
        add_new_players(self.engine, page[page.columns[0]], pids)

        # Drop the first two columns; the ID columns inserted below take their place.
        stats = page[page.columns[2:]]

        # Rename positionally: scraped column i becomes db_columns[3 + i].
        stats = stats.rename(columns=dict(zip(stats.columns, self.db_columns[3:])))

        stats.insert(0, "Player_ID", pids)
        stats.insert(1, "Team_ID", tids)
        stats.insert(2, "Game_ID", gids)

        stats['Game_day'] = pd.to_datetime(stats['Game_day'])

        stats = stats[self.db_columns].drop_duplicates()

        self.add_new_box_scores(stats)

        return

    def get_last_year(self, year):
        """Return the two-digit suffix of ``year`` (2022 -> 22, 1996 -> 96, 2000 -> 0)."""
        # The previous "year % 2000" branch returned 100 or more from 2100 on.
        return year % 100

    def get_season(self, driver, year, reg_season=True):
        """Scrape every page of one season and store it.

        Parameters
        ----------
        driver
            Selenium driver passed through to the scraper helper.
        year : str
            Season string as built by ``get_player_stats`` (e.g. ``"2021-22"``).
        reg_season : bool
            Forwarded to ``build_url``; callers pass ``False`` for the playoffs.
        """
        url = self.boxes.build_url(year, reg_season)

        for html in self.boxes.iter_all(url, driver):
            pids, tids, gids = self.boxes.get_player_and_team_ids(html)
            self.write(html, pids, tids, gids)

        return

    def get_player_stats(self, year, reg_season=True):
        """Scrape the season ending in ``year`` using a fresh Chrome driver.

        ``year=2022`` yields the season string ``"2021-22"``.
        """
        with webdriver.Chrome() as driver:
            year_range = str(year - 1) + "-{:0>2d}".format(self.get_last_year(year))

            self.get_season(driver, year_range, reg_season)

        return

In [61]:
class Team_standings:
    """Scrape per-season team statistics and append them to ``Team_standings``.

    Uses a ``Teams`` helper (not defined in this notebook; presumably
    star-imported from ``web_scraper``) to build URLs and fetch the page source
    together with the team IDs.
    """

    def __init__(self):
        """Set the season-ID prefixes, the column layout and the database engine.

        Raises ``TypeError`` if the ``USER`` or ``PSWD`` environment variable is
        unset, because ``environ.get`` then returns ``None`` and cannot be
        concatenated into the connection URL.
        """
        # SEASON_ID is built as prefix + two-digit season start year.
        self.REG = "002"
        self.POST = "004"
        self.teams = Teams()
        # Column order of the table. The first two are added by write(); the
        # rest are mapped positionally onto the scraped columns.
        self.db_columns = ['SEASON_ID', 'TEAM_ID', 'GP', 'W', 'L', 'WINP', 'MINS', 'PTS', 'FGM',
                        'FGA', 'FGP', 'PM3', 'PA3', 'P3P', 'FTM', 'FTA', 'FTP', 'OREB', 'DREB',
                        'REB', 'AST', 'TOV', 'STL', 'BLK', 'BLKA', 'PF', 'PFD']
        self.engine = create_engine("mariadb+mariadbconnector://"\
                                  +environ.get("USER")+":"\
                                  +environ.get("PSWD")+"@127.0.0.1:3306/nba")

    def get_last_year(self, year):
        """Return the two-digit suffix of ``year`` (2022 -> 22, 1996 -> 96, 2000 -> 0)."""
        # The previous "year % 2000" branch returned 100 or more from 2100 on.
        return year % 100

    def write(self, html, tids, season_id):
        """Parse one standings page and append it to the ``Team_standings`` table.

        Parameters
        ----------
        html
            Page source handed to ``pd.read_html``; the first table found is used.
        tids
            Team IDs, one per table row.
        season_id : str
            Value stored in ``SEASON_ID`` for every row of this page.
        """
        standings = pd.read_html(html)[0]
        standings = standings.drop(columns=['TEAM', '+/-'])

        # Rename positionally: scraped column i becomes db_columns[2 + i].
        standings = standings.rename(columns=dict(zip(standings.columns, self.db_columns[2:])))

        standings["TEAM_ID"] = tids
        standings["SEASON_ID"] = season_id

        # Drop every column that contains a missing value. The axis must be a
        # keyword: pandas 2.x no longer accepts it positionally.
        standings = standings.dropna(axis='columns')

        standings = standings[self.db_columns].drop_duplicates()
        standings.to_sql("Team_standings", self.engine, index=False, if_exists="append")

        return

    def get_season(self, driver, year, season_id, reg_season=True):
        """Fetch one season's page and store it.

        Parameters
        ----------
        driver
            Selenium driver passed through to the scraper helper.
        year : str
            Season string such as ``"1995-96"``.
        season_id : str
            SEASON_ID to store with the rows.
        reg_season : bool
            Forwarded to ``build_url``; ``False`` is passed for the playoffs.
        """
        url = self.teams.build_url(year, reg_season)

        print(url)

        html, tids = self.teams.get_source_and_teams(url, driver)

        self.write(html, tids, season_id)

        return

    def get_team_standings(self, last_n_years):
        """Scrape regular-season and playoff standings for recent seasons not yet stored.

        Starts at the season ending last calendar year and walks backwards. A
        season is skipped when any ``SEASON_ID`` ending in its two-digit start
        year already exists in the table.
        """
        this_year = strftime("%Y", localtime())
        year = int(this_year) - 1

        with webdriver.Chrome() as driver:
            # NOTE(review): this range covers last_n_years + 1 seasons — confirm
            # the extra season is intended.
            for y in range(year, year - last_n_years - 1, -1):
                id_year = "{:0>2d}".format(self.get_last_year(y - 1))

                select = "select SEASON_ID from Team_standings "
                condition = "where SEASON_ID LIKE '%" + id_year + "'"
                limit = " limit 10"

                query = select + condition + limit
                # Use this instance's engine. The bare name `engine` only exists
                # if an unrelated notebook cell has been run first.
                stored = pd.read_sql(query, self.engine)
                if len(stored) == 0:  # there is no data on this year, get it
                    year_range = str(y - 1) + "-{:0>2d}".format(self.get_last_year(y))
                    season_id = self.REG + id_year
                    self.get_season(driver, year_range, season_id)

                    season_id = self.POST + id_year
                    self.get_season(driver, year_range, season_id, reg_season=False)

        return

In [19]:
class Player_stats:
    """Scrape per-season player statistics and save them as CSV files.

    Uses a ``Players`` helper (not defined in this notebook; presumably
    star-imported from ``web_scraper``) to build URLs, load the full table and
    extract player/team IDs.
    """

    def __init__(self):
        """Set output locations and create the database engine.

        Raises ``TypeError`` if the ``USER`` or ``PSWD`` environment variable is
        unset, because ``environ.get`` then returns ``None`` and cannot be
        concatenated into the connection URL.
        """
        self.players = Players()
        self.root = "../player_stats"
        # Indices into self.labels.
        self.REG = 0
        self.POST = 1
        self.labels = ("/regular_season.csv", "/playoffs.csv")
        self.engine = create_engine("mariadb+mariadbconnector://"\
                                  +environ.get("USER")+":"\
                                  +environ.get("PSWD")+"@127.0.0.1:3306/nba")

    def write(self, html, pids, tids, fp):
        """Parse the first table in ``html``, attach the IDs and save it as CSV at ``fp``."""
        stats = pd.read_html(html)[0]
        stats["PLAYER_ID"] = pids
        stats["TEAM_ID"] = tids

        # Drop every column that contains a missing value. The axis must be a
        # keyword: pandas 2.x no longer accepts it positionally.
        stats = stats.dropna(axis='columns')
        stats.to_csv(fp, index=False)
        return

    def get_last_year(self, year):
        """Return the two-digit suffix of ``year`` (2022 -> 22, 1996 -> 96, 2000 -> 0)."""
        # The previous "year % 2000" branch returned 100 or more from 2100 on.
        return year % 100

    def get_season(self, driver, year, fp, reg_season=True):
        """Fetch one season's player table, retrying until it succeeds, and save it to ``fp``.

        Parameters
        ----------
        driver
            Selenium driver passed through to the scraper helper.
        year : str
            Season string such as ``"2021-22"``.
        fp : str
            Destination CSV path.
        reg_season : bool
            Forwarded to ``build_url``; ``False`` is passed for the playoffs.
        """
        url = self.players.build_url(year, reg_season)

        print(url)
        while True:
            try:
                html = self.players.click_all(url, driver)
                pids, tids = self.players.get_player_and_team_ids(html)
                break
            except Exception as err:
                # Catch Exception rather than everything, so that Ctrl-C
                # (KeyboardInterrupt) can still stop the endless retry loop.
                print("Error. Trying again.", repr(err))

        self.write(html, pids, tids, fp)

        return

    def get_player_stats(self, last_n_years):
        """Scrape regular-season and playoff player stats for recent seasons not yet on disk.

        Starts at the season ending last calendar year and walks backwards. A
        season is skipped when its output directory already exists.
        """
        from os import makedirs  # only needed here

        year = int(strftime("%Y", localtime())) - 1

        with webdriver.Chrome() as driver:
            for y in range(year, year - last_n_years - 1, -1):
                year_range = str(y - 1) + "-{:0>2d}".format(self.get_last_year(y))

                # NOTE(review): the original line here was an unfinished
                # assignment ("d = "), and file_dir was never defined. One
                # directory per season under self.root is an assumption —
                # confirm it matches the layout of any data already on disk.
                file_dir = self.root + "/" + year_range

                if not exists(file_dir):  # there is no data on this year, get it
                    # to_csv does not create missing directories.
                    makedirs(file_dir, exist_ok=True)

                    fp = file_dir + self.labels[self.REG]
                    self.get_season(driver, year_range, fp)

                    fp = file_dir + self.labels[self.POST]
                    self.get_season(driver, year_range, fp, reg_season=False)

        return

In [20]:
# Build the box-score scraper. The constructor reads ../box_score.csv and
# creates a database engine from the USER / PSWD environment variables.
bs = Box()

In [21]:
# Scrape and store box scores for the season ending in 2022 (season string
# "2021-22"); reg_season keeps its default of True.
bs.get_player_stats(2022)

In [22]:
# Notebook-level engine for the ad-hoc queries in the cells below. The user and
# password come from the USER and PSWD environment variables.
engine = create_engine("mariadb+mariadbconnector://"\
                                  +environ.get("USER")+":"\
                                  +environ.get("PSWD")+"@127.0.0.1:3306/nba")

In [29]:
# Probe for stored rows with SEASON_ID 00495 ("004" playoff prefix + "95", the
# format Team_standings builds).
# NOTE(review): 00495 is unquoted, so SQL reads it as the number 495 rather than
# the string '00495' — this presumably relies on implicit conversion; confirm
# against the column type of SEASON_ID.
d = pd.read_sql("select SEASON_ID from Team_standings where SEASON_ID = 00495 limit 10",engine)

In [30]:
# NOTE(review): the recorded output lists every Team_standings column although
# the query above selects only SEASON_ID — the output looks stale; re-run.
d.columns

Index(['SEASON_ID', 'TEAM_ID', 'GP', 'W', 'L', 'WINP', 'MINS', 'PTS', 'FGM',
       'FGA', 'FGP', 'PM3', 'PA3', 'P3P', 'FTM', 'FTA', 'FTP', 'OREB', 'DREB',
       'REB', 'AST', 'TOV', 'STL', 'BLK', 'BLKA', 'PF', 'PFD'],
      dtype='object')

In [32]:
# Number of rows the probe returned; 0 means nothing matched.
len(d)

0

In [62]:
# Build the team-standings scraper; the constructor creates its own database
# engine from the USER / PSWD environment variables.
ts = Team_standings()

In [None]:
# Scrape regular-season and playoff standings going back 25 years from last
# year's season; each URL fetched is printed below.
ts.get_team_standings(25)

https://www.nba.com/stats/teams/traditional/?Season=1995-96&SeasonType=Regular%20Season




https://www.nba.com/stats/teams/traditional/?Season=1995-96&SeasonType=Playoffs
