# EDA of Games

In [117]:
import pandas as pd

# Lift pandas' display limits (each option set to None) so wide frames and
# long cell values are rendered without truncation.
for option_name in ("display.max_columns", "display.width", "display.max_colwidth"):
    pd.set_option(option_name, None)

In [118]:
# Load the raw games table (path is relative to this notebook's directory)
df = pd.read_csv(filepath_or_buffer="../data/raw/games.csv")

# Preview the first five rows
df.head(n=5)

Unnamed: 0,seasonStartYear,awayTeam,pointsAway,homeTeam,pointsHome,attendance,notes,startET,datetime,isRegular,game_id
0,1996,Sacramento Kings,85,Houston Rockets,96,16285.0,,,1996-11-01,1,1
1,1996,Los Angeles Clippers,97,Golden State Warriors,85,15593.0,,,1996-11-01,1,2
2,1996,Portland Trail Blazers,114,Vancouver Grizzlies,85,19193.0,,,1996-11-01,1,3
3,1996,Seattle SuperSonics,91,Utah Jazz,99,19911.0,,,1996-11-01,1,4
4,1996,New York Knicks,107,Toronto Raptors,99,28457.0,,,1996-11-01,1,5


In [119]:
# Columns that are not used in the rest of this notebook
columns_to_drop = ["attendance", "notes", "startET", "isRegular"]

# `columns=` already says which axis the labels belong to, so no separate
# `axis` argument is passed alongside it.
df = df.drop(columns=columns_to_drop)

# Confirm the remaining columns, their non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30250 entries, 0 to 30249
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   seasonStartYear  30250 non-null  int64 
 1   awayTeam         30250 non-null  object
 2   pointsAway       30250 non-null  int64 
 3   homeTeam         30250 non-null  object
 4   pointsHome       30250 non-null  int64 
 5   datetime         30250 non-null  object
 6   game_id          30250 non-null  int64 
dtypes: int64(4), object(3)
memory usage: 1.6+ MB


> Removed unnecessary columns.

In [120]:
# Old camelCase name -> new snake_case name.
# NOTE: several of the new names end with a trailing space on purpose; they are
# kept exactly as written here.
dict_for_renaming_columns = dict(
    seasonStartYear="season_start_year ",
    awayTeam="away_team ",
    pointsAway="points_away ",
    homeTeam="home_team",
    pointsHome="points_home ",
    datetime="date_time",
)

df = df.rename(dict_for_renaming_columns, axis="columns")

# Show the renamed columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30250 entries, 0 to 30249
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   season_start_year   30250 non-null  int64 
 1   away_team           30250 non-null  object
 2   points_away         30250 non-null  int64 
 3   home_team           30250 non-null  object
 4   points_home         30250 non-null  int64 
 5   date_time           30250 non-null  object
 6   game_id             30250 non-null  int64 
dtypes: int64(4), object(3)
memory usage: 1.6+ MB


> Columns have been renamed to snake_case for consistency. Some of the new names intentionally include trailing whitespace so that there is more to clean in a later step.

In [121]:
# Flag every row that repeats an earlier row across all columns, then count the flags
duplicate_row_mask = df.duplicated()
duplicate_row_mask.sum()

np.int64(0)

> No duplicates in the dataset.

In [122]:
# Remove leading/trailing whitespace from the column names
df.columns = df.columns.str.strip()

# Remove leading/trailing whitespace from the values of every object-dtype column
text_columns = df.select_dtypes(include="object").columns
df[text_columns] = df[text_columns].apply(lambda column: column.str.strip())

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30250 entries, 0 to 30249
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   season_start_year  30250 non-null  int64 
 1   away_team          30250 non-null  object
 2   points_away        30250 non-null  int64 
 3   home_team          30250 non-null  object
 4   points_home        30250 non-null  int64 
 5   date_time          30250 non-null  object
 6   game_id            30250 non-null  int64 
dtypes: int64(4), object(3)
memory usage: 1.6+ MB


> All leading and trailing whitespace has been removed from the column names and from the values of the text columns.

In [123]:
# Parse the YYYY-MM-DD strings into datetimes
parsed_dates = pd.to_datetime(df["date_time"], format="%Y-%m-%d")

# Write them back as DD-MM-YYYY strings (the column ends up holding text, not datetimes)
df["date_time"] = parsed_dates.dt.strftime("%d-%m-%Y")

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30250 entries, 0 to 30249
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   season_start_year  30250 non-null  int64 
 1   away_team          30250 non-null  object
 2   points_away        30250 non-null  int64 
 3   home_team          30250 non-null  object
 4   points_home        30250 non-null  int64 
 5   date_time          30250 non-null  object
 6   game_id            30250 non-null  int64 
dtypes: int64(4), object(3)
memory usage: 1.6+ MB


> All dates have been converted to DD-MM-YYYY format.

In [128]:
# Keep only games whose season started in 2015 through 2019 (both bounds inclusive)
FIRST_SEASON_START_YEAR = 2015
LAST_SEASON_START_YEAR = 2019

# Series.between includes both endpoints by default, i.e. >= first and <= last
in_season_range = df["season_start_year"].between(
    FIRST_SEASON_START_YEAR, LAST_SEASON_START_YEAR
)
df_filtered = df[in_season_range]

df_filtered.head(10)

Unnamed: 0,season_start_year,away_team,points_away,home_team,points_home,date_time,game_id
23858,2015,Cleveland Cavaliers,95,Chicago Bulls,97,27-10-2015,23859
23859,2015,Detroit Pistons,106,Atlanta Hawks,94,27-10-2015,23860
23860,2015,New Orleans Pelicans,95,Golden State Warriors,111,27-10-2015,23861
23861,2015,Minnesota Timberwolves,112,Los Angeles Lakers,111,28-10-2015,23862
23862,2015,Los Angeles Clippers,111,Sacramento Kings,104,28-10-2015,23863
23863,2015,New Orleans Pelicans,94,Portland Trail Blazers,112,28-10-2015,23864
23864,2015,Dallas Mavericks,111,Phoenix Suns,95,28-10-2015,23865
23865,2015,Denver Nuggets,105,Houston Rockets,85,28-10-2015,23866
23866,2015,San Antonio Spurs,106,Oklahoma City Thunder,112,28-10-2015,23867
23867,2015,Cleveland Cavaliers,106,Memphis Grizzlies,76,28-10-2015,23868


> Kept only games from seasons starting in 2015 through 2019.

In [129]:
# Write the filtered games to CSV; index=False leaves the row index out of the file
FILE_PATH = "../tests/test_data/test_cleaned_games.csv"
df_filtered.to_csv(path_or_buf=FILE_PATH, index=False)