In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import yaml
from IPython.display import Markdown, display
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from scipy.stats import linregress

In [2]:
# Load notebook settings from config.yaml (path is relative to the kernel's
# working directory).
#   data_path : base path that data file names are joined onto (required)
#   use_drive : truthy to mount Google Drive before loading data (default False)
with open("config.yaml", "r") as f:
    # safe_load returns None for an empty document; fall back to an empty
    # mapping so the lookups below report the missing key explicitly instead
    # of failing with an AttributeError on None.
    config = yaml.safe_load(f) or {}

DATA_PATH = config.get("data_path")
if DATA_PATH is None:
    # Fail fast: continuing with DATA_PATH = None only postpones the failure
    # to the first os.path.join(DATA_PATH, ...) call, with a less helpful
    # TypeError.
    raise ValueError("No data path provided: set 'data_path' in config.yaml")
USE_DRIVE = bool(config.get("use_drive", False))

In [3]:
# Mount Google Drive at /content/drive when the configuration requests it
# (USE_DRIVE is derived from the "use_drive" key of config.yaml).
if USE_DRIVE:
    # Imported inside the branch, so the import only runs when USE_DRIVE is truthy.
    # NOTE(review): presumably because google.colab is only installed on Colab
    # runtimes — confirm.
    from google.colab import drive
    drive.mount('/content/drive')

Mounted at /content/drive


# Load data

In [8]:
# Build the team statistics frame in a single chain:
#   1. read the CSV found under DATA_PATH,
#   2. parse the gameDate column into datetimes,
#   3. sort by the current index in descending order,
#   4. move gameDate into the index,
#   5. keep a copy of the index as the regular column gameDateSave.
# NOTE(review): step 3 runs before gameDate becomes the index, so it orders rows
# by whatever index read_csv produced (presumably the default integer index,
# i.e. it reverses the file order) rather than by game date — confirm that this
# is the intended ordering.
team_stat_df = (
    pd.read_csv(os.path.join(DATA_PATH, "team_statistics_important_features.csv"))
    .assign(gameDate=lambda frame: pd.to_datetime(frame["gameDate"]))
    .sort_index(ascending=False)
    .set_index("gameDate")
    .assign(gameDateSave=lambda frame: frame.index)
)
team_stat_df.head()

Unnamed: 0_level_0,gameId,teamCity,teamName,opponentTeamCity,opponentTeamName,home,win,teamScore,opponentScore,threePointersPercentage,...,stealsPerPossession,threePointersAttemptedPerPossession,freeThrowsAttemptedPerPossession,reboundsDefensivePerPossession,reboundsOffensivePerPossession,foulsPersonalPerPossession,turnoversPerPossession,effectiveFieldGoalPercentage,trueShootingPercentage,gameDateSave
gameDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-11-03 19:00:00,22300001,Indiana,Pacers,Cleveland,Cavaliers,1,1,121,116,0.484,...,0.06448,0.285556,0.221076,0.303979,0.06448,0.165807,0.175018,45.180233,0.626553,2023-11-03 19:00:00
2023-11-03 19:00:00,22300001,Cleveland,Cavaliers,Indiana,Pacers,0,0,116,121,0.286,...,0.097087,0.271845,0.242718,0.291262,0.048544,0.194175,0.126214,44.166667,0.610526,2023-11-03 19:00:00
2023-11-03 19:30:00,22300002,New York,Knicks,Milwaukee,Bucks,0,0,105,110,0.256,...,0.058824,0.382353,0.245098,0.392157,0.156863,0.215686,0.107843,38.203125,0.490654,2023-11-03 19:30:00
2023-11-03 19:30:00,22300002,Milwaukee,Bucks,New York,Knicks,1,1,110,105,0.513,...,0.059218,0.384919,0.276352,0.33557,0.069088,0.157916,0.138176,35.237805,0.583121,2023-11-03 19:30:00
2023-11-03 20:00:00,22300005,Oklahoma City,Thunder,Golden State,Warriors,1,0,139,141,0.517,...,0.027726,0.268022,0.277264,0.231054,0.064695,0.184843,0.175601,50.174699,0.722453,2023-11-03 20:00:00


In [9]:
# Print a structural summary of the loaded frame: index range, column names,
# non-null counts and dtypes.
team_stat_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 90626 entries, 2023-11-03 19:00:00 to 2025-03-26 22:00:00
Data columns (total 23 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   gameId                               90626 non-null  int64         
 1   teamCity                             90626 non-null  object        
 2   teamName                             90626 non-null  object        
 3   opponentTeamCity                     90626 non-null  object        
 4   opponentTeamName                     90626 non-null  object        
 5   home                                 90626 non-null  int64         
 6   win                                  90626 non-null  int64         
 7   teamScore                            90626 non-null  int64         
 8   opponentScore                        90626 non-null  int64         
 9   threePointersPercentage              90626 non-n