In [1]:
import sqlalchemy
from sqlalchemy.ext.declarative import declarative_base
import pandas as pd
from os import environ
from os import walk
from os.path import join

In [2]:
from sqlalchemy.sql import exists
from sqlalchemy import insert

In [3]:
import re

In [2]:
# Database credentials come from the environment so they never live in the
# notebook.  Indexing `environ` (instead of `.get`) fails right away with a
# KeyError naming the missing variable, rather than a TypeError raised while
# concatenating None into the URL string.
# URL.create keeps the password as a separate component, so characters such
# as "@", ":" or "/" in it cannot corrupt the connection URL.
engine = sqlalchemy.create_engine(
    sqlalchemy.engine.URL.create(
        "mariadb+mariadbconnector",
        username=environ["USER"],
        password=environ["PSWD"],
        host="127.0.0.1",
        port=3306,
        database="nba",
    )
)

In [5]:
# Shared declarative base class; the ORM models below inherit from it and
# `Base.metadata` is what `Connector.__init__` uses to create the tables.
Base = declarative_base()

In [6]:
class Players(Base):
    """Declarative mapping of the ``Players`` table: integer primary key and a name."""

    __tablename__ = "Players"

    # Integer primary key.
    ID = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True)
    # Player name, stored as a string of at most 255 characters.
    Name = sqlalchemy.Column(sqlalchemy.String(255))

In [7]:
class Teams(Base):
    """Declarative mapping of the ``Teams`` table: integer primary key and a name."""

    __tablename__ = "Teams"

    # Integer primary key.
    ID = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True)
    # Team name, stored as a string of at most 255 characters.
    Name = sqlalchemy.Column(sqlalchemy.String(255))

In [16]:
class Connector:
    """Write player, team and per-season statistics into the database.

    On construction the tables registered on ``Base`` are created if missing
    and one SQLAlchemy session bound to the module-level ``engine`` is opened.
    Player and team rows go through that session; the per-season frames are
    appended with ``DataFrame.to_sql``.
    """

    # Season-type prefix; concatenated with the year code to form a season ID.
    SEASON_CODES = {"preseason": "001",
                    "regular_season": "002",
                    "all_star": "003",
                    "playoffs": "004",
                    "play_in": "005"}

    # Columns that are divided by 100 before being stored.
    _PERCENT_COLS = ["FG%", "3P%", "FT%"]

    # CSV header -> database column, for every header that differs.
    # Headers not listed here keep their name.
    _DB_NAMES = {"MIN": "MINS", "FG%": "FGP", "3PM": "PM3", "3PA": "PA3",
                 "3P%": "P3P", "FT%": "FTP", "WIN%": "WINP"}

    # CSV columns kept for the ``Seasonal_performance`` table.
    _PLAYER_COLS = ['PLAYER_ID', 'TEAM_ID', 'AGE', 'GP', 'W', 'L', 'MIN', 'PTS',
                    'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA',
                    'FT%', 'OREB', 'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK',
                    'PF', 'DD2', 'TD3']

    # CSV columns kept for the ``Team_standings`` table.
    _TEAM_COLS = ['TEAM_ID', 'GP', 'W', 'L', 'WIN%', 'MIN', 'PTS', 'FGM', 'FGA',
                  'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB',
                  'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'BLKA', 'PF',
                  'PFD']

    def __init__(self):
        Base.metadata.create_all(engine)
        Session = sqlalchemy.orm.sessionmaker(bind=engine)
        self.session = Session()

    def _add_missing(self, model, pairs):
        """Insert a ``model`` row for each (id, name) pair whose ID is not stored yet."""
        added = False
        for row_id, name in pairs:
            # Plain int: values coming out of pandas/numpy may be numpy integers.
            row_id = int(row_id)
            # The session autoflushes before querying (sessionmaker default),
            # so rows added earlier in this loop are seen by the check.
            stored = self.session.query(
                exists().where(model.ID == row_id)
            ).scalar()
            if not stored:
                self.session.add(model(ID=row_id, Name=name))
                added = True
        if added:
            self.session.commit()

    def _season_frame(self, df, cols):
        """Return a copy of ``df[cols]`` with percentages scaled and DB column names."""
        # Work on a copy: the caller's frame must not be rescaled in place,
        # otherwise passing the same frame twice would divide by 100 twice.
        frame = df[cols].copy()
        frame[self._PERCENT_COLS] = frame[self._PERCENT_COLS] / 100
        return frame.rename(columns=self._DB_NAMES)

    def add_players(self, df):
        """Store every player of ``df`` that is not in ``Players`` yet.

        ``df`` must provide the columns ``PLAYER_ID`` and ``PLAYER``.
        """
        players = df[["PLAYER_ID", "PLAYER"]].drop_duplicates(subset="PLAYER_ID")
        self._add_missing(Players, zip(players["PLAYER_ID"], players["PLAYER"]))

    def add_teams(self, df):
        """Store every team of ``df`` that is not in ``Teams`` yet.

        ``df`` must provide the columns ``TEAM_ID`` and ``TEAM``.
        """
        # De-duplicate the (id, name) rows together.  Taking ``unique()`` of
        # each column separately and zipping the results pairs an ID with the
        # wrong name as soon as one ID appears with two names (or vice versa).
        teams = df[["TEAM_ID", "TEAM"]].drop_duplicates(subset="TEAM_ID")
        self._add_missing(Teams, zip(teams["TEAM_ID"], teams["TEAM"]))

    def add_seasonal_performances(self, df, mode, year):
        """Append the player season rows of ``df`` to ``Seasonal_performance``.

        Parameters
        ----------
        df : DataFrame containing the player CSV columns.
        mode : key of ``SEASON_CODES`` (``KeyError`` otherwise).
        year : str appended to the season-type prefix to build ``Season_ID``.
        """
        frame = self._season_frame(df, self._PLAYER_COLS)
        frame["Season_ID"] = self.SEASON_CODES[mode] + year
        frame.to_sql("Seasonal_performance", engine, if_exists="append", index=False)

    def add_team_standings(self, df, mode, year):
        """Append the team season rows of ``df`` to ``Team_standings``.

        Parameters
        ----------
        df : DataFrame containing the team CSV columns.
        mode : key of ``SEASON_CODES`` (``KeyError`` otherwise).
        year : str appended to the season-type prefix to build ``SEASON_ID``.
        """
        frame = self._season_frame(df, self._TEAM_COLS)
        frame.insert(0, "SEASON_ID", self.SEASON_CODES[mode] + year)
        frame.to_sql("Team_standings", engine, if_exists="append", index=False)

In [17]:
def players_csv_to_db(c):
    """Feed every CSV file below ``../player_stats/`` to ``c.add_players``.

    The directory tree is walked recursively; each file whose name ends in
    ``.csv`` is read with ``pd.read_csv`` and the resulting frame is passed on.

    Parameters
    ----------
    c : object exposing ``add_players(df)``.
    """
    for root, _, files in walk("../player_stats/"):
        for f in files:
            # Test the file-name suffix.  A substring test on the whole path
            # would also accept e.g. ``stats.csv.bak`` or any file below a
            # directory whose name contains ``.csv``.
            if f.endswith(".csv"):
                c.add_players(pd.read_csv(join(root, f)))

In [18]:
def teams_csv_to_db(c):
    """Feed every regular-season CSV below ``../player_stats/`` to ``c.add_teams``.

    The directory tree is walked recursively; each file whose name ends in
    ``regular_season.csv`` is read with ``pd.read_csv`` and passed on.

    Parameters
    ----------
    c : object exposing ``add_teams(df)``.
    """
    for root, _, files in walk("../player_stats/"):
        for f in files:
            # Suffix test on the file name; a substring test on the whole path
            # would also accept backups such as ``regular_season.csv.bak``.
            if f.endswith("regular_season.csv"):
                c.add_teams(pd.read_csv(join(root, f)))

In [19]:
def get_season_year_code(year):
    """Return the integer code of the year preceding ``year``.

    The string ``"00"`` wraps around to 99; every other value is converted
    with ``int`` and decremented by one.
    """
    return 99 if year == "00" else int(year) - 1

In [20]:
def seasonals_csv_to_db(c):
    """Load the per-season player CSVs below ``../player_stats/``.

    For every file whose path contains a ``<digits>-<digits>`` span, the
    digits after the hyphen are turned into a year code with
    ``get_season_year_code`` and zero-padded to at least two characters.
    Files ending in ``regular_season.csv`` or ``playoffs.csv`` are then read
    and passed to ``c.add_seasonal_performances`` with the matching mode;
    all other files are ignored.

    Parameters
    ----------
    c : object exposing ``add_seasonal_performances(df, mode, year)``.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    season_span = re.compile(r"(\d+)-(\d+)")
    for root, _, files in walk("../player_stats/"):
        for f in files:
            path = join(root, f)
            found = season_span.search(path)
            if not found:
                continue
            year = "{:0>2d}".format(get_season_year_code(found.group(2)))
            # Suffix test on the file name, so backups such as
            # ``playoffs.csv.bak`` are not loaded.
            if f.endswith("regular_season.csv"):
                mode = "regular_season"
            elif f.endswith("playoffs.csv"):
                mode = "playoffs"
            else:
                continue
            c.add_seasonal_performances(pd.read_csv(path), mode, year)

In [21]:
def standings_csv_to_db(c):
    """Load the per-season team CSVs below ``../team_standings/``.

    For every file whose path contains a ``<digits>-<digits>`` span, the
    digits after the hyphen are turned into a year code with
    ``get_season_year_code`` and zero-padded to at least two characters.
    Files ending in ``regular_season.csv`` or ``playoffs.csv`` are then read
    and passed to ``c.add_team_standings`` with the matching mode.

    Parameters
    ----------
    c : object exposing ``add_team_standings(df, mode, year)``.

    Returns
    -------
    Whatever the last ``c.add_team_standings`` call returned, or an empty
    dict when no file was loaded.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    season_span = re.compile(r"(\d+)-(\d+)")
    last_result = {}
    for root, _, files in walk("../team_standings/"):
        for f in files:
            path = join(root, f)
            found = season_span.search(path)
            if not found:
                continue
            year = "{:0>2d}".format(get_season_year_code(found.group(2)))
            # Suffix test on the file name, so backups such as
            # ``playoffs.csv.bak`` are not loaded.
            if f.endswith("regular_season.csv"):
                mode = "regular_season"
            elif f.endswith("playoffs.csv"):
                mode = "playoffs"
            else:
                continue
            last_result = c.add_team_standings(pd.read_csv(path), mode, year)
    return last_result

In [22]:
# Build the connector; its constructor creates the tables registered on Base
# (if missing) and opens a session bound to `engine`.
c = Connector()

In [23]:
# Load the team-standings CSVs found under ../team_standings/ through the connector.
standings_csv_to_db(c)

In [None]:
-----------+---------------------+------+-----+---------+-------+
| Player_ID | int(11)             | NO   | MUL | NULL    |       |
| Team_ID   | int(11)             | NO   | MUL | NULL    |       |
| Game_ID   | char(10)            | NO   |     | NULL    |       |
| Matchup   | varchar(12)         | NO   |     | NULL    |       |
| Game_day  | date                | NO   |     | NULL    |       |
| Result    | char(1)             | NO   |     | NULL    |       |
| MINS      | tinyint(3) unsigned | YES  |     | NULL    |       |
| PTS       | tinyint(3) unsigned | YES  |     | NULL    |       |
| FGM       | tinyint(3) unsigned | YES  |     | NULL    |       |
| FGA       | tinyint(3) unsigned | YES  |     | NULL    |       |
| FGP       | float unsigned      | YES  |     | NULL    |       |
| PM3       | tinyint(3) unsigned | YES  |     | NULL    |       |
| PA3       | tinyint(3) unsigned | YES  |     | NULL    |       |
| P3P       | float unsigned      | YES  |     | NULL    |       |
| FTM       | tinyint(3) unsigned | YES  |     | NULL    |       |
| FTA       | tinyint(3) unsigned | YES  |     | NULL    |       |
| FTP       | float unsigned      | YES  |     | NULL    |       |
| OREB      | tinyint(3) unsigned | YES  |     | NULL    |       |
| DREB      | tinyint(3) unsigned | YES  |     | NULL    |       |
| REB       | tinyint(3) unsigned | YES  |     | NULL    |       |
| AST       | tinyint(3) unsigned | YES  |     | NULL    |       |
| TOV       | tinyint(3) unsigned | YES  |     | NULL    |       |
| STL       | tinyint(3) unsigned | YES  |     | NULL    |       |
| BLK       | tinyint(3) unsigned | YES  |     | NULL    |       |
| PF      

In [3]:
# Read the `Box_scores` table into a DataFrame through the shared engine.
df = pd.read_sql("Box_scores",engine)

In [4]:
# Show the column labels of the frame read from `Box_scores`.
df.columns

Index(['Player_ID', 'Team_ID', 'Game_ID', 'Matchup', 'Game_day', 'Result',
       'MINS', 'PTS', 'FGM', 'FGA', 'FGP', 'PM3', 'PA3', 'P3P', 'FTM', 'FTA',
       'FTP', 'OREB', 'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'PF'],
      dtype='object')

In [40]:
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

In [66]:
# Open the 2021-22 regular-season player box-score page in Chrome, wait up to
# 20 seconds for the stats table to be present, then keep the text of the
# <select> element and the full page source for the following cells.
with webdriver.Chrome() as driver:
    driver.get("https://www.nba.com/stats/players/boxscores/?Season=2021-22&SeasonType=Regular%20Season")
    # Blocks until the table exists in the DOM (raises on timeout); the
    # returned element itself is not used below.
    element = WebDriverWait(driver,20).until(EC.presence_of_element_located(
        (By.XPATH,"/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[2]/div[1]/table")))
    # NOTE: despite its name, `table` is the <select> element at this XPath
    # (presumably the page selector of the stats table -- confirm on the page).
    table = driver.find_element(By.XPATH,"/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[1]/div/div/select")
# Unused experiment: wrapping the <select> in Selenium's Select helper.
#     s = Select(table)
    # Text of the <select>, i.e. its option labels separated by newlines.
    t = table.text
    # Full HTML of the rendered page, parsed later outside the browser session.
    html = driver.page_source

In [15]:
from bs4 import BeautifulSoup

In [31]:
from json import loads

In [44]:
# Parse the page source captured by the Selenium cell.
soup = BeautifulSoup(html,'html.parser')
# Stored under its own name: `t` already holds the <select> text from the
# Selenium cell and the later `t.split('\n')` cell needs it to stay a string.
# Rebinding `t` to the result of `soup.find` broke a top-to-bottom run.
stat_table_div = soup.find("div",attrs={"class":"nba-stat-table"})

In [68]:
# Expects `t` to be the newline-separated text of the <select> element
# captured in the Selenium cell; shows one list entry per option label.
t.split('\n')

['All',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '69',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '80',
 '81',
 '82',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97',
 '98',
 '99',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '120',
 '121',
 '122',
 '123',
 '124',
 '125',
 '126',
 '127',
 '128',
 '129',
 '130',
 '131',
 '132',
 '133',
 '134',
 '135',
 '136',
 '137',
 '13

In [51]:
# Parse the HTML tables found in the captured page source into a list of DataFrames.
fd = pd.read_html(html)

In [53]:
# Inspect the dtype of the "Game Date" column of the first parsed table.
fd[0]["Game Date"].dtypes

Player        object
Team          object
Match Up      object
Game Date     object
Season       float64
W/L           object
MIN            int64
PTS            int64
FGM            int64
FGA            int64
FG%           object
3PM            int64
3PA            int64
3P%           object
FTM            int64
FTA            int64
FT%           object
OREB           int64
DREB           int64
REB            int64
AST            int64
STL            int64
BLK            int64
TOV            int64
PF             int64
+/-            int64
FP           float64
dtype: object

0    2022-02-01
1    2022-02-01
2    2022-02-01
3    2022-02-01
4    2022-02-01
5    2022-02-01
6    2022-02-01
7    2022-02-01
8    2022-02-01
9    2022-02-01
10   2022-02-01
11   2022-02-01
12   2022-02-01
13   2022-02-01
14   2022-02-01
15   2022-02-01
16   2022-02-01
17   2022-02-01
18   2022-02-01
19   2022-02-01
20   2022-02-01
21   2022-02-01
22   2022-02-01
23   2022-02-01
24   2022-02-01
25   2022-02-01
26   2022-02-01
27   2022-02-01
28   2022-02-01
29   2022-02-01
30   2022-02-01
31   2022-02-01
32   2022-02-01
33   2022-02-01
34   2022-02-01
35   2022-02-01
36   2022-02-01
37   2022-02-01
38   2022-02-01
39   2022-02-01
40   2022-02-01
41   2022-02-01
42   2022-02-01
43   2022-02-01
44   2022-02-01
45   2022-02-01
46   2022-02-01
47   2022-02-01
48   2022-02-01
49   2022-02-01
Name: Game Date, dtype: datetime64[ns]

Index(['Player', 'Team', 'Match Up', 'Game Date', 'Season', 'W/L', 'MIN',
       'PTS', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%',
       'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', '+/-', 'FP'],
      dtype='object')