In [None]:
# NBA Game Data Analysis

This notebook conducts a comprehensive analysis of NBA game data from the 2017-18 through 2023-24 seasons to identify patterns that will inform our prediction model. The data includes both regular season and playoff games for all 30 NBA teams.

## Objectives
- Understand the key factors that influence NBA game outcomes
- Identify statistical relationships between various metrics and winning
- Analyze contextual factors such as home court advantage, rest days, and streaks
- Develop insights that will guide our feature engineering for the prediction model

In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
from datetime import datetime

# Set visualization styles for better-looking plots
plt.style.use('fivethirtyeight')
%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

# Locate every cleaned game-data export matching the expected file pattern.
clean_files = glob.glob('data/processed/nba_games_clean_*.csv')
if not clean_files:
    raise FileNotFoundError("No cleaned NBA game data files found. Please run the data collection script first.")

# Pick the export whose contents were written most recently.
# Modification time is used instead of os.path.getctime: on Unix, ctime is the
# inode metadata-change time (bumped by chmod, rename, etc.), while on Windows
# it is the creation time, so ctime does not consistently identify the newest
# data file across platforms.
latest_file = max(clean_files, key=os.path.getmtime)
print(f"Loading clean NBA data from: {latest_file}")

# Load the data
games_df = pd.read_csv(latest_file)

# Convert date column to datetime for time-based analysis
# (pd.to_datetime raises on unparseable values by default, so bad dates surface here).
games_df['GAME_DATE'] = pd.to_datetime(games_df['GAME_DATE'])

# Compute the date bounds once so each column scan is not repeated inside the f-string.
first_game_date = games_df['GAME_DATE'].min()
last_game_date = games_df['GAME_DATE'].max()

# Display basic information about the dataset
print(f"Dataset shape: {games_df.shape} (rows, columns)")
print(f"Seasons covered: {sorted(games_df['SEASON'].unique())}")
print(f"Date range: {first_game_date.strftime('%Y-%m-%d')} to {last_game_date.strftime('%Y-%m-%d')}")
print(f"Unique games: {games_df['GAME_ID'].nunique()}")
print(f"Unique teams: {games_df['TEAM_NAME'].nunique()}")

# Preview the first few rows (last expression, so the notebook renders it as a table)
games_df.head()

Loading clean NBA data from: data/processed/nba_games_clean_20250318_170257.csv
Dataset shape: (17748, 31) (rows, columns)
Seasons covered: ['2017-18', '2018-19', '2019-20', '2020-21', '2021-22', '2022-23', '2023-24']
Date range: 2017-10-17 to 2024-06-17
Unique games: 8874
Unique teams: 30


Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,SEASON,GAME_TYPE,SEASON_TYPE
0,22017,1610612751,BKN,Brooklyn Nets,21701219,2018-04-11,BKN @ BOS,L,240,97,...,44,27,10,6,15,12,-13.0,2017-18,2,Regular Season
1,22017,1610612753,ORL,Orlando Magic,21701222,2018-04-11,ORL vs. WAS,W,239,101,...,42,20,6,7,16,27,9.0,2017-18,2,Regular Season
2,22017,1610612765,DET,Detroit Pistons,21701224,2018-04-11,DET @ CHI,W,240,119,...,47,27,12,4,11,21,32.0,2017-18,2,Regular Season
3,22017,1610612761,TOR,Toronto Raptors,21701221,2018-04-11,TOR @ MIA,L,265,109,...,47,24,5,10,13,24,-7.0,2017-18,2,Regular Season
4,22017,1610612739,CLE,Cleveland Cavaliers,21701220,2018-04-11,CLE vs. NYK,L,241,98,...,48,15,9,3,14,15,-12.0,2017-18,2,Regular Season
