In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import os
import yaml

In [2]:
# load yaml
with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)
    DATA_PATH = config.get("data_path")
    if DATA_PATH is None:
        print("ERROR: No data path provided")
    USE_DRIVE = bool(config.get("use_drive", False))

In [3]:
# load from drive if requested
if USE_DRIVE:
    from google.colab import drive
    drive.mount('/content/drive')

In [4]:
NBA_BOX_SCORE_ZIP_PATH = os.path.join(DATA_PATH, "NBA_Boxscores.zip")

!unzip "{NBA_BOX_SCORE_ZIP_PATH}"

'unzip' is not recognized as an internal or external command,
operable program or batch file.


In [19]:
team_statistics_df = pd.read_csv(os.path.join(DATA_PATH, "TeamStatistics.csv"))
player_statistics_df = pd.read_csv(os.path.join(DATA_PATH, "PlayerStatistics.csv"))

  player_statistics_df = pd.read_csv(os.path.join(DATA_PATH, "PlayerStatistics.csv"))


In [11]:
# Add gametype to team statistics
game_type_df = player_statistics_df[['gameId', 'gameType', 'gameSubLabel']].drop_duplicates()
team_statistics_df = team_statistics_df.merge(game_type_df, on='gameId', how='left')

# Include NBA Emirates Cup games, excluding the Championship Final (Dec 17, 2024) since they are part of the Regular Season
emirates_cup_non_final_df = team_statistics_df[
    (team_statistics_df['gameType'] == 'NBA Emirates Cup') &
    (team_statistics_df['gameSubLabel'] != 'Championship')
]

In [12]:
# Only have regular season games
team_statistics_df = team_statistics_df[team_statistics_df['gameType'] == 'Regular Season']
team_statistics_df = pd.concat([team_statistics_df, emirates_cup_non_final_df])
# remove any remaining Championship-labeled games (e.g. misclassified 2024 IST Final)
team_statistics_df = team_statistics_df[
    team_statistics_df['gameSubLabel'] != 'Championship'
].reset_index(drop=True)

In [13]:
# Drop unnecessary columns
team_statistics_df = team_statistics_df.drop(columns=['coachId', 'gameType', 'gameSubLabel'])

# Drop Advanced stats columns --> Too little sample size
team_statistics_df = team_statistics_df.drop(columns=['q1Points', 'q2Points', 'q3Points', 'q4Points', 'benchPoints', 'biggestLead', 'biggestScoringRun', 'leadChanges', 'pointsFastBreak', 'pointsFromTurnovers', 'pointsInThePaint', 'pointsSecondChance', 'timesTied', 'timeoutsRemaining', 'seasonWins', 'seasonLosses'])

In [14]:
team_statistics_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130918 entries, 0 to 130917
Data columns (total 31 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   gameId                   130918 non-null  int64  
 1   gameDate                 130918 non-null  object 
 2   teamCity                 130918 non-null  object 
 3   teamName                 130918 non-null  object 
 4   teamId                   130918 non-null  int64  
 5   opponentTeamCity         130918 non-null  object 
 6   opponentTeamName         130918 non-null  object 
 7   opponentTeamId           130918 non-null  int64  
 8   home                     130918 non-null  int64  
 9   win                      130918 non-null  int64  
 10  teamScore                130918 non-null  int64  
 11  opponentScore            130918 non-null  int64  
 12  assists                  97810 non-null   float64
 13  blocks                   92477 non-null   float64
 14  stea

In [20]:
# Ensure gameDate is a datetime and extract the year
game_type_df = player_statistics_df[['gameId', 'gameType', 'gameSubLabel']].drop_duplicates()
team_statistics_df = team_statistics_df.merge(game_type_df, on='gameId', how='left')
team_statistics_df = team_statistics_df[team_statistics_df['gameType'] == 'Regular Season']
team_statistics_df['gameDate'] = pd.to_datetime(team_statistics_df['gameDate'])
team_statistics_df['year'] = team_statistics_df['gameDate'].dt.year

# Group by team (using teamId) and year, and count total games
games_per_team_year = team_statistics_df.groupby(['teamName', 'year']).size().reset_index(name='total_games')

print(games_per_team_year.head())

  teamName  year  total_games
0    76ers  1963           33
1    76ers  1964           84
2    76ers  1965           77
3    76ers  1966           84
4    76ers  1967           82


In [None]:
team_statistics_df.info()
team_statistics_df = team_statistics_df.dropna()
team_statistics_df.info()

# Remove entries where freeThrowPercentage > 1.0
team_statistics_df = team_statistics_df[team_statistics_df['freeThrowsPercentage'] <= 1.0]
team_statistics_df = team_statistics_df[team_statistics_df['fieldGoalsPercentage'] <= 1.0]
team_statistics_df = team_statistics_df[team_statistics_df['threePointersPercentage'] <= 1.0]
team_statistics_df.info()

# Save as csv
REGULAR_SEASON_CSV_PATH = "/content/drive/MyDrive/Intro_Data_Science_Project/data/regular_season_team_df.csv"
#REGULAR_SEASON_CSV_PATH = "./../data/regular_season_team_df.csv"
team_statistics_df.to_csv(REGULAR_SEASON_CSV_PATH, index=False)
team_statistics_df.info