### Running some sanity checks

In [1]:
import pandas as pd
from pathlib import Path

# The notebook is run from Main_Project/notebooks, so the project root is the
# directory one level above the current working directory.
PROJECT_DIR = Path.cwd().parents[0]
PROCESSED_DIR = PROJECT_DIR.joinpath("data", "processed")

# Parquet tables read by the cells below.
PLAYERS_PATH = PROCESSED_DIR.joinpath("dim_players.parquet")
FACT_PATH = PROCESSED_DIR.joinpath("fact_player_game.parquet")
TRENDS_PATH = PROCESSED_DIR.joinpath("mart_player_trends.parquet")

# Echo the resolved locations so a wrong working directory is obvious at a glance.
for label, location in (("PROJECT_DIR:", PROJECT_DIR), ("PROCESSED_DIR:", PROCESSED_DIR)):
    print(label, location)
print("Exists:", PROCESSED_DIR.exists())
file_names = [entry.name for entry in PROCESSED_DIR.glob("*")]
print("Files:", file_names)


PROJECT_DIR: /Users/andrescastellanos/Desktop/Data_Eng_1/DS6600Project/Main_Project
PROCESSED_DIR: /Users/andrescastellanos/Desktop/Data_Eng_1/DS6600Project/Main_Project/data/processed
Exists: True
Files: ['mart_player_trends.parquet', 'fact_player_game.parquet', 'dim_players.parquet']


## Loading in our data tables

- These tables will be included in my report later.

In [2]:
# Read the three processed parquet tables in one pass.
players, games, trends = (
    pd.read_parquet(table_path)
    for table_path in (PLAYERS_PATH, FACT_PATH, TRENDS_PATH)
)

# Report (rows, columns) for each table as a quick size check.
for label, frame in (("players:", players), ("games:", games), ("trends:", trends)):
    print(label, frame.shape)


players: (10, 4)
games: (1344, 21)
trends: (1344, 24)


## Visualizing the schema to see what it looks like

In [3]:
def schema_table(df: pd.DataFrame) -> pd.DataFrame:
    """Summarize the columns of ``df`` as a small schema table.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are described.

    Returns
    -------
    pd.DataFrame
        One row per column of ``df`` with:
        ``column`` (the column label), ``dtype`` (dtype rendered as a string),
        ``non_null`` (count of non-missing values) and ``null_pct``
        (percentage of missing values, rounded to 2 decimals).
    """
    dtype_names = list(map(str, df.dtypes))
    present_counts = df.notna().sum().values
    # Mean of the boolean null mask is the null fraction; scale it to a percentage.
    null_percentages = (df.isna().mean().values * 100).round(2)

    summary = {
        "column": df.columns,
        "dtype": dtype_names,
        "non_null": present_counts,
        "null_pct": null_percentages,
    }
    return pd.DataFrame(summary)

# Build one schema summary per loaded table.
schema_players, schema_games, schema_trends = map(
    schema_table, (players, games, trends)
)

# Display the players schema as the cell output.
schema_players


Unnamed: 0,column,dtype,non_null,null_pct
0,player_id,int64,10,0.0
1,player_name,object,10,0.0
2,full_name_registry,object,10,0.0
3,is_active,bool,10,0.0


## Checking the coverage of the data (dates + seasons)

In [4]:
# Parse game_date as datetime; values that cannot be parsed become NaT.
games["game_date"] = pd.to_datetime(games["game_date"], errors="coerce")

game_dates = games["game_date"]
season_labels = sorted(games["season"].dropna().unique().tolist())

# Date range, distinct seasons, and total row count of the fact table.
coverage = dict(
    min_date=game_dates.min(),
    max_date=game_dates.max(),
    seasons=season_labels,
    num_games_rows=len(games),
)
coverage


{'min_date': Timestamp('2023-10-24 00:00:00'),
 'max_date': Timestamp('2025-04-13 00:00:00'),
 'seasons': ['2023-24', '2024-25'],
 'num_games_rows': 1344}

## Row count by player and season

In [5]:
# Number of fact rows for every (player, season) pair.
rows_per_player_season = games.groupby(["player_name", "season"]).size()

# Order by season, then by most games first within each season.
counts = rows_per_player_season.reset_index(name="n_games").sort_values(
    by=["season", "n_games"],
    ascending=[True, False],
)

counts


Unnamed: 0,player_name,season,n_games
0,Anthony Edwards,2023-24,79
14,Nikola Jokic,2023-24,79
8,Kevin Durant,2023-24,75
16,Shai Gilgeous-Alexander,2023-24,75
4,Jayson Tatum,2023-24,74
18,Stephen Curry,2023-24,74
2,Giannis Antetokounmpo,2023-24,73
10,LeBron James,2023-24,71
12,Luka Doncic,2023-24,70
6,Joel Embiid,2023-24,39


## Checking for duplicates in the data

In [6]:
# Use whichever of the expected key columns are actually present in the table.
candidate_keys = ("player_id", "game_id", "season")
key_cols = [name for name in candidate_keys if name in games.columns]

# With no key columns available, subset=None compares entire rows instead.
dupe_rows = games.duplicated(subset=key_cols or None).sum()

{"key_cols": key_cols, "duplicate_rows": int(dupe_rows)}


{'key_cols': ['player_id', 'game_id', 'season'], 'duplicate_rows': 0}

## Checking for missing data

In [7]:
# Fraction of missing values per column, expressed as a percentage.
null_fraction = games.isna().mean()
missing = (null_fraction * 100).round(2).sort_values(ascending=False)

# Show the 20 columns with the highest share of missing values.
missing.head(20)


player_id     0.0
fta           0.0
season        0.0
plus_minus    0.0
pts           0.0
tov           0.0
blk           0.0
stl           0.0
ast           0.0
reb           0.0
ftm           0.0
game_id       0.0
fg3a          0.0
fg3m          0.0
fga           0.0
fgm           0.0
min           0.0
wl            0.0
matchup       0.0
game_date     0.0
dtype: float64

## Looking at the rolling averages
- `roll10` = average over the player's last 10 games (including the current game)

In [8]:
# Parse game_date as datetime; values that cannot be parsed become NaT.
trends["game_date"] = pd.to_datetime(trends["game_date"], errors="coerce")

# Preview the first player listed in the players table.
p = players["player_name"].iloc[0]

# Keep only the preview columns that exist in the trends table.
preview_columns = [
    "game_date", "season",
    "pts", "pts_roll10",
    "reb", "reb_roll10",
    "ast", "ast_roll10",
]
cols = [c for c in preview_columns if c in trends.columns]

# Most recent 15 rows for that player, in chronological order.
player_rows = trends[trends["player_name"] == p]
player_rows.sort_values("game_date")[cols].tail(15)


Unnamed: 0,game_date,season,pts,pts_roll10,reb,reb_roll10,ast,ast_roll10
754,2025-03-05,2024-25,22,24.8,15,13.2,6,10.5
755,2025-03-07,2024-25,31,25.3,21,14.2,22,11.8
756,2025-03-09,2024-25,24,23.7,13,14.8,9,11.9
757,2025-03-10,2024-25,35,24.6,18,15.1,8,11.7
758,2025-03-12,2024-25,34,25.1,8,14.2,4,11.2
759,2025-03-14,2024-25,28,26.7,7,13.6,5,10.7
760,2025-03-15,2024-25,40,28.9,13,14.0,9,9.7
761,2025-03-26,2024-25,39,29.6,10,13.6,10,9.7
762,2025-03-28,2024-25,27,30.0,14,13.3,6,8.8
763,2025-04-01,2024-25,61,34.1,10,12.9,10,8.9
