# EDA of Games

In [11]:
import pandas as pd

# Lift pandas' display limits (column count, console width, cell text width)
# so that wide frames are rendered without truncation.
for option_name in ("display.max_columns", "display.width", "display.max_colwidth"):
    pd.set_option(option_name, None)

In [12]:
# Read the raw games table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="../data/raw/games.csv")

df.head(n=5)

Unnamed: 0,seasonStartYear,awayTeam,pointsAway,homeTeam,pointsHome,attendance,notes,startET,datetime,isRegular,game_id
0,1996,Sacramento Kings,85,Houston Rockets,96,16285.0,,,1996-11-01,1,1
1,1996,Los Angeles Clippers,97,Golden State Warriors,85,15593.0,,,1996-11-01,1,2
2,1996,Portland Trail Blazers,114,Vancouver Grizzlies,85,19193.0,,,1996-11-01,1,3
3,1996,Seattle SuperSonics,91,Utah Jazz,99,19911.0,,,1996-11-01,1,4
4,1996,New York Knicks,107,Toronto Raptors,99,28457.0,,,1996-11-01,1,5


In [13]:
# Keep only the rows flagged with isRegular == 1
# (presumably regular-season games — confirm against the data dictionary).
df = df.loc[df["isRegular"].eq(1)]

df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 28316 entries, 0 to 30165
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   seasonStartYear  28316 non-null  int64  
 1   awayTeam         28316 non-null  object 
 2   pointsAway       28316 non-null  int64  
 3   homeTeam         28316 non-null  object 
 4   pointsHome       28316 non-null  int64  
 5   attendance       28229 non-null  float64
 6   notes            103 non-null    object 
 7   startET          24024 non-null  object 
 8   datetime         28316 non-null  object 
 9   isRegular        28316 non-null  int64  
 10  game_id          28316 non-null  int64  
dtypes: float64(1), int64(5), object(5)
memory usage: 2.6+ MB


In [15]:
# Columns that are discarded before the rest of the cleaning steps.
columns_to_drop = [
    "attendance",
    "notes",
    "startET",
    "isRegular",
    "seasonStartYear",
]

# `columns=` already targets the column axis, so no separate `axis` is needed.
df = df.drop(columns=columns_to_drop)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28316 entries, 0 to 30165
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   awayTeam    28316 non-null  object
 1   pointsAway  28316 non-null  int64 
 2   homeTeam    28316 non-null  object
 3   pointsHome  28316 non-null  int64 
 4   datetime    28316 non-null  object
 5   game_id     28316 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 1.5+ MB


> Removed unnecessary columns.

In [16]:
# Mapping from the original camelCase headers to snake_case headers.
# NOTE: three of the new names end with a trailing space on purpose; the
# whitespace is removed again by the column-name strip later in the notebook.
dict_for_renaming_columns = {
    "awayTeam": "away_team ",      # trailing space is deliberate
    "pointsAway": "points_away ",  # trailing space is deliberate
    "homeTeam": "home_team",
    "pointsHome": "points_home ",  # trailing space is deliberate
    "datetime": "date_time",
}

df = df.rename(dict_for_renaming_columns, axis="columns")

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28316 entries, 0 to 30165
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   away_team     28316 non-null  object
 1   points_away   28316 non-null  int64 
 2   home_team     28316 non-null  object
 3   points_home   28316 non-null  int64 
 4   date_time     28316 non-null  object
 5   game_id       28316 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 1.5+ MB


> Columns have been renamed to snake_case for consistency. Some of the new column names were intentionally given trailing whitespace so that there is an extra cleaning step to perform.

In [17]:
# Count rows that are exact repeats of an earlier row (all columns compared).
df.duplicated(subset=None, keep="first").sum()

np.int64(0)

> No duplicates in the dataset.

In [18]:
# Normalise the headers first: remove surrounding whitespace from every column name.
df.columns = df.columns.str.strip()

# Then remove surrounding whitespace from the values of every object-dtype column.
text_columns = df.select_dtypes(include="object").columns
df[text_columns] = df[text_columns].apply(lambda column: column.str.strip())

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28316 entries, 0 to 30165
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   away_team    28316 non-null  object
 1   points_away  28316 non-null  int64 
 2   home_team    28316 non-null  object
 3   points_home  28316 non-null  int64 
 4   date_time    28316 non-null  object
 5   game_id      28316 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 1.5+ MB


> All leading and trailing whitespace has been removed from the column names and from the values of the string columns.

In [32]:
# Parse the ISO-formatted date strings (YYYY-MM-DD) into datetime64 values.
df["date_time"] = pd.to_datetime(df["date_time"], format="%Y-%m-%d")

# Keep only games dated from 2016 through 2020; `between` is inclusive on both ends.
# `.assign` returns a new frame, so the `year` column is added without writing into
# a slice of `df` (which triggers pandas' SettingWithCopyWarning), and the year is
# read directly from the already-parsed datetimes instead of re-parsing them.
df_filtered = (
    df.loc[df["date_time"].dt.year.between(2016, 2020)]
    .assign(year=lambda games: games["date_time"].dt.year)
)

df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5490 entries, 24347 to 30165
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   away_team    5490 non-null   object        
 1   points_away  5490 non-null   int64         
 2   home_team    5490 non-null   object        
 3   points_home  5490 non-null   int64         
 4   date_time    5490 non-null   datetime64[ns]
 5   game_id      5490 non-null   int64         
 6   year         5490 non-null   int32         
dtypes: datetime64[ns](1), int32(1), int64(3), object(2)
memory usage: 321.7+ KB


> Kept only the games dated between 2016 and 2020 (inclusive) and added a `year` column.

In [22]:
# Display the filtered frame; pandas renders only the first and last rows.
# NOTE(review): the saved output of this cell has no `year` column, so it looks
# like it was executed before the cell that adds it — re-run top to bottom to confirm.
df_filtered

Unnamed: 0,away_team,points_away,home_team,points_home,date_time,game_id
24347,Dallas Mavericks,82,Miami Heat,106,2016-01-01,24348
24348,Charlotte Hornets,94,Toronto Raptors,104,2016-01-01,24349
24349,Philadelphia 76ers,84,Los Angeles Lakers,93,2016-01-01,24350
24350,New York Knicks,81,Chicago Bulls,108,2016-01-01,24351
24351,Orlando Magic,91,Washington Wizards,103,2016-01-01,24352
...,...,...,...,...,...,...
30161,Dallas Mavericks,102,Phoenix Suns,128,2020-08-13,30162
30162,Miami Heat,92,Indiana Pacers,109,2020-08-14,30163
30163,Oklahoma City Thunder,103,Los Angeles Clippers,107,2020-08-14,30164
30164,Philadelphia 76ers,134,Houston Rockets,96,2020-08-14,30165


In [21]:
# Write the filtered games to a CSV file in the tests data directory.
# The DataFrame index is not written to the file.
FILE_PATH = "../tests/test_data/test_cleaned_games.csv"
df_filtered.to_csv(path_or_buf=FILE_PATH, index=False)