In [41]:
import numpy as np
import pandas as pd
import psycopg
from sqlalchemy import create_engine
import dotenv
import os
#import sqlite3

In [27]:
nba = pd.read_csv('ASA All NBA Raw Data.csv', low_memory=False)

In [28]:
pd.set_option('display.max_rows', 81)
nba.head(3).T

Unnamed: 0,0,1,2
game_id,202202170BRK,202202170BRK,202202170BRK
game_date,2022-02-17,2022-02-17,2022-02-17
OT,0,0,0
H_A,A,A,A
Team_Abbrev,WAS,WAS,WAS
Team_Score,117,117,117
Team_pace,94.5,94.5,94.5
Team_efg_pct,0.627,0.627,0.627
Team_tov_pct,13.5,13.5,13.5
Team_orb_pct,22.9,22.9,22.9


## Database Normalization
### First normal form:

1. **All tables must have a primary key**: In this table, `game_id` and `player_id` together are unique on every row, and so they form primary key.

2. **All the data must be atomic**: Inactives is non-atomic.

3. **No repeating groups problem**: We can't solve the non-atomicity problem by creating separate columns if this leads to arbitrary ordering language in the column names (for example, `Inactive1`, `Inactive2`, etc.) and if it leads to a lot of missing data (there would be an `Inactive7` which would be missing any time a team has less than 7 inactive players).

In [29]:
nba = nba.drop(['Inactives'], axis=1)

### Functional Dependence
Let X and Y be columns in a data table. Y is functionally dependent on X if each value of X has exactly one value of Y.

That's pretty abstract. So here are some guidelines that help me:

1. This use of "function" is the exact same as the concept of a function from algebra and pre-calculus. A correspondence f(x)=y is a function if each value of x has only one associated value of y.

2. X is either a primary key, or something that should be a primary key in another table.

For example, `game_date` (Y) is functionally dependent on `game_id` (X) because one `game_id` takes place on exactly one date.

### Second normal form:
In this table the primary key is a superkey consisting of two columns: `game_id` and `player_id`. 

2NF is violated if any columns are functionally dependent on part of the primary key but not the entire primary key. This can only happen if the primary key is a superkey.

In [30]:
# you remove these tables and put them on a seprate table
# now this is our second table in our database
games = nba[['game_id', 'game_date', 'OT', 'season']].drop_duplicates()
games

Unnamed: 0,game_id,game_date,OT,season
0,202202170BRK,2022-02-17,0,2022
26,202202170CHO,2022-02-17,2,2022
48,202202170LAC,2022-02-17,0,2022
71,202202170MIL,2022-02-17,0,2022
95,202202170NOP,2022-02-17,0,2022
...,...,...,...,...
108259,202001080GSW,2020-01-08,0,2020
108887,202008020HOU,2020-08-02,0,2020
109683,201911060HOU,2019-11-06,0,2020
110125,201912250GSW,2019-12-25,0,2020


In [31]:
players = nba[['player_id', 'player']].drop_duplicates()
players

Unnamed: 0,player_id,player
0,kispeco01,Corey Kispert
1,kuzmaky01,Kyle Kuzma
2,caldwke01,Kentavious Caldwell-Pope
3,netora01,Raul Neto
4,bryanth01,Thomas Bryant
...,...,...
109702,frazimi01,Michael Frazier
110441,howarwi01,William Howard
110913,mbahalu01,Luc Mbah a Moute
111399,bowmaky01,Ky Bowman


In [32]:
nba = nba.drop(['game_date', 'OT', 'season', 'player'], axis=1)

### Third normal form:
3NF is violated if there are "transitive dependencies", that is, functional dependence between columns when neither column is part of the primary key.

In [33]:
nba.columns

Index(['game_id', 'H_A', 'Team_Abbrev', 'Team_Score', 'Team_pace',
       'Team_efg_pct', 'Team_tov_pct', 'Team_orb_pct', 'Team_ft_rate',
       'Team_off_rtg', 'Opponent_Abbrev', 'Opponent_Score', 'Opponent_pace',
       'Opponent_efg_pct', 'Opponent_tov_pct', 'Opponent_orb_pct',
       'Opponent_ft_rate', 'Opponent_off_rtg', 'player_id', 'starter', 'mp',
       'fg', 'fga', 'fg_pct', 'fg3', 'fg3a', 'fg3_pct', 'ft', 'fta', 'ft_pct',
       'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts',
       'plus_minus', 'did_not_play', 'is_inactive', 'ts_pct', 'efg_pct',
       'fg3a_per_fga_pct', 'fta_per_fga_pct', 'orb_pct', 'drb_pct', 'trb_pct',
       'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct', 'usg_pct', 'off_rtg',
       'def_rtg', 'bpm', 'minutes', 'double_double', 'triple_double', 'DKP',
       'FDP', 'SDP', 'DKP_per_minute', 'FDP_per_minute', 'SDP_per_minute',
       'pf_per_minute', 'ts', 'last_60_minutes_per_game_starting',
       'last_60_minutes_per_game_bench', 'PG%', '

In [35]:
team_game = nba[['game_id', 'Team_Abbrev', 'Team_Score',
    'Team_pace', 'Team_efg_pct', 'Team_tov_pct', 'Team_orb_pct',
       'Team_ft_rate', 'Team_off_rtg', 'Opponent_Abbrev']].drop_duplicates()
nba.drop(['Team_Score','Team_pace', 'Team_efg_pct', 'Team_tov_pct', 'Team_orb_pct',
       'Team_ft_rate', 'Team_off_rtg', 'Opponent_Abbrev', 'Opponent_Score',
       'Opponent_pace', 'Opponent_efg_pct', 'Opponent_tov_pct',
       'Opponent_orb_pct', 'Opponent_ft_rate', 'Opponent_off_rtg'], axis=1)
player_game = nba

In [36]:
# make all cols lower case for postgres
player_game.columns = [x.lower().replace('%', '_pct') for x in player_game.columns]
team_game.columns = [x.lower().replace('%', '_pct') for x in team_game.columns]
players.columns = [x.lower().replace('%', '_pct') for x in players.columns]
games.columns = [x.lower().replace('%', '_pct') for x in games.columns]

In [37]:
player_game.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112123 entries, 0 to 112122
Data columns (total 76 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   game_id                            112123 non-null  object 
 1   h_a                                112123 non-null  object 
 2   team_abbrev                        112123 non-null  object 
 3   team_score                         112123 non-null  int64  
 4   team_pace                          112123 non-null  float64
 5   team_efg_pct                       112123 non-null  float64
 6   team_tov_pct                       112123 non-null  float64
 7   team_orb_pct                       112123 non-null  float64
 8   team_ft_rate                       112123 non-null  float64
 9   team_off_rtg                       112123 non-null  float64
 10  opponent_abbrev                    112123 non-null  object 
 11  opponent_score                     1121

In [38]:
team_game.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6394 entries, 0 to 112112
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   game_id          6394 non-null   object 
 1   team_abbrev      6394 non-null   object 
 2   team_score       6394 non-null   int64  
 3   team_pace        6394 non-null   float64
 4   team_efg_pct     6394 non-null   float64
 5   team_tov_pct     6394 non-null   float64
 6   team_orb_pct     6394 non-null   float64
 7   team_ft_rate     6394 non-null   float64
 8   team_off_rtg     6394 non-null   float64
 9   opponent_abbrev  6394 non-null   object 
dtypes: float64(6), int64(1), object(3)
memory usage: 549.5+ KB


In [39]:
players.info()

<class 'pandas.core.frame.DataFrame'>
Index: 812 entries, 0 to 112021
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   player_id  812 non-null    object
 1   player     812 non-null    object
dtypes: object(2)
memory usage: 19.0+ KB


In [40]:
games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3197 entries, 0 to 110643
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   game_id    3197 non-null   object
 1   game_date  3197 non-null   object
 2   ot         3197 non-null   int64 
 3   season     3197 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 124.9+ KB


```bash
(db_env) afnan@Afnans-Air M06 % dbdocs build nba_db.dbml
✔ Parsing file content
⚠ Project 'NBA' is public, consider setting password or restricting access to it
✔ Done. Visit: https://dbdocs.io/AfnanAbdul/NBA

ℹ Thanks for using dbdocs! We'd love to hear your feedback: https://form.jotform.com/200962053361448
```

In [42]:
dotenv.load_dotenv()

True

In [46]:
POSTGRES_PASSWORD = os.getenv('POSTGRES_PASSWORD')

In [50]:
# connect to postgres server
dbserver = psycopg.connect(
    user='postgres', 
    password=POSTGRES_PASSWORD, 
    host='localhost',
    port = '5432'
)
dbserver.autocommit = True

In [52]:
# create NBA database
cursor = dbserver.cursor()
try:
    cursor.execute('CREATE DATABASE nba')
except:
    cursor.execute('DROP DATABASE nba')
    cursor.execute('CREATE DATABASE nba')

In [64]:
# upload NBA dfs to NBA database
dbms = 'postgresql'
connector = 'psycopg'
user = 'postgres'
pw = POSTGRES_PASSWORD
host = 'localhost'
port = '5432'
database = 'nba'
engine_string = f'{dbms}+{connector}://{user}:{pw}@{host}:{port}/{database}'
engine_string

'postgresql+psycopg://postgres:bikrys-1nyvma-qykdUv@localhost:5432/nba'

In [65]:
engine = create_engine(engine_string)

In [77]:
player_game.to_sql('player_games', con=engine, chunksize=1000, if_exists='replace', index=False)

-113

In [78]:
player_game.shape

(112123, 76)

In [79]:
pd.read_sql_query('SELECT * FROM player_games', con=engine).shape

(112123, 76)

In [83]:
team_game.to_sql('team_game', con=engine, chunksize=1000, if_exists='replace', index=False)
players.to_sql('players', con=engine, chunksize=1000, if_exists='replace', index=False)
games.to_sql('games', con=engine, chunksize=1000, if_exists='replace', index=False)

-4

In [84]:
team_game.shape
players.shape
games.shape

(3197, 4)

In [85]:
pd.read_sql_query('SELECT * FROM team_game', con=engine).shape
pd.read_sql_query('SELECT * FROM players', con=engine).shape
pd.read_sql_query('SELECT * FROM games', con=engine).shape

(3197, 4)

In [86]:
myquery = '''
SELECT *
FROM games
'''
pd.read_sql_query(myquery, con=engine)

Unnamed: 0,game_id,game_date,ot,season
0,202202170BRK,2022-02-17,0,2022
1,202202170CHO,2022-02-17,2,2022
2,202202170LAC,2022-02-17,0,2022
3,202202170MIL,2022-02-17,0,2022
4,202202170NOP,2022-02-17,0,2022
...,...,...,...,...
3192,202001080GSW,2020-01-08,0,2020
3193,202008020HOU,2020-08-02,0,2020
3194,201911060HOU,2019-11-06,0,2020
3195,201912250GSW,2019-12-25,0,2020


In [88]:
myquery = '''
SELECT *
FROM player_games
WHERE pts > 60
'''
pd.read_sql_query(myquery, con=engine)

Unnamed: 0,game_id,h_a,team_abbrev,team_score,team_pace,team_efg_pct,team_tov_pct,team_orb_pct,team_ft_rate,team_off_rtg,...,pf_per_minute,ts,last_60_minutes_per_game_starting,last_60_minutes_per_game_bench,pg_pct,sg_pct,sf_pct,pf_pct,c_pct,active_position_minutes
0,202101030GSW,H,GSW,137,107.2,0.631,12.7,28.2,0.295,127.8,...,0.0,39.36,32.836667,16.089368,82.0,18.0,0.0,0.0,0.0,51.151069
1,202001200POR,H,POR,129,100.3,0.51,9.6,22.6,0.27,116.4,...,0.022157,44.04,35.911905,20.859681,100.0,0.0,0.0,0.0,0.0,221.003792
2,202001200POR,H,POR,129,100.3,0.51,9.6,22.6,0.27,116.4,...,0.022157,44.04,35.911905,20.859681,100.0,0.0,0.0,0.0,0.0,221.003792
3,202008110DAL,A,POR,134,104.3,0.546,6.7,27.3,0.276,128.5,...,0.048682,39.92,41.463889,24.08459,100.0,0.0,0.0,0.0,0.0,205.0125
4,202008110DAL,A,POR,134,104.3,0.546,6.7,27.3,0.276,128.5,...,0.048682,39.92,41.463889,24.08459,100.0,0.0,0.0,0.0,0.0,205.0125
