# EDA of Box Scores

In [None]:
import pandas as pd

# Set each display option to None so pandas does not cap the number of columns
# shown or the length of cell text (for display.width, None leaves the width
# to pandas instead of a fixed character count).
for _display_option in ("display.max_columns", "display.width", "display.max_colwidth"):
    pd.set_option(_display_option, None)

In [None]:
# Location of the raw box-score CSV (relative path).
RAW_BOXSCORE_PATH = "../data/raw/boxscore.csv"

# Read the raw data into the working frame used by every cell below.
df = pd.read_csv(RAW_BOXSCORE_PATH)

# Print column names, dtypes and non-null counts.
df.info()

In [None]:
# Columns treated as unnecessary for this analysis.
columns_to_drop = ["ORB", "DRB", "STL", "BLK", "TOV", "PF", "+/-"]

# `columns=` already identifies what to drop; `axis` is only consulted together
# with `labels`, so passing both was redundant and has been removed.
df = df.drop(columns=columns_to_drop)

# Re-check dtypes and non-null counts after the drop.
df.info()

> After removing unnecessary columns, there are no null or missing values in the data.

In [None]:
# Status strings that can appear in "MP" in place of minutes played.
conditions = ["Did Not Play", "Player Suspended", "Not With Team", "Did Not Dress"]

# Stat columns that are reset for the rows carrying one of those statuses.
columns_to_replace = ["FG", "FGA", "3P", "3PA", "FT", "FTA", "TRB", "AST", "PTS"]

# True for every row whose "MP" value is one of the status strings above.
mask = df["MP"].isin(conditions)

# Overwrite the stats of the flagged rows with 0.
df.loc[mask, columns_to_replace] = 0

df.head(33)

> In the initial dataset, the "MP" column can hold a status string ("Did Not Play", "Player Suspended", "Not With Team" or "Did Not Dress") instead of the minutes played. In those rows the numeric columns hold the same status string. Left as-is, that text would be converted to nulls during type conversion and cause errors, so those values are replaced with 0 first.

In [None]:
columns_to_convert_to_int = ["FG", "FGA", "3P", "3PA", "FT", "FTA", "TRB", "AST", "PTS", "isStarter"]

# Parse the columns as numbers; values that cannot be parsed become NaN here
# instead of raising.
converted = df[columns_to_convert_to_int].apply(pd.to_numeric, errors="coerce")

# NaN cannot be stored in int64. Fail with a message that names the affected
# columns rather than with the generic cast error raised by astype.
columns_with_unparsed_values = converted.columns[converted.isna().any()].tolist()
if columns_with_unparsed_values:
    raise ValueError(
        f"Missing or non-numeric values remain in {columns_with_unparsed_values}; "
        "handle them before converting to int64."
    )

# Change to numeric data type int64
df[columns_to_convert_to_int] = converted.astype("int64")

df.info()

> Numeric columns have been converted from `object` type to `int64`.

In [None]:
# Old column name -> new snake_case name.
# Several new names carry stray leading/trailing spaces; those strings are
# kept exactly as they are.
# NOTE(review): "ORB" and "DRB" are dropped by an earlier cell, so their
# entries look like no-ops at this point — confirm.
dict_for_renaming_columns = {
    # identifiers
    "teamName": "team_name",
    "playerName": "player_name ",
    # playing time
    "MP": "minutes_played",
    # shooting
    "FG": "field_goals",
    "FGA": " field_goals_attempted ",
    "3P": "three_pointers",
    "3PA": "three_pointers_attempted ",
    "FT": "free_throws",
    "FTA": "free_throws_attempted",
    # rebounds
    "ORB": "offensive_rebounds",
    "DRB": " defensive_rebounds ",
    "TRB": "total_rebounds",
    # other stats
    "AST": "assists",
    "PTS": "points",
    "isStarter": "is_starter",
}

# Apply the mapping to the column labels.
df = df.rename(dict_for_renaming_columns, axis="columns")

df.info()

> Columns have been renamed to be more readable. Some of the new names intentionally contain stray leading or trailing whitespace, so that there is more to clean up in the later steps.

In [None]:
# Flag each row that repeats an earlier row across all columns, then count the flags.
duplicate_row_flags = df.duplicated()
duplicate_row_flags.sum()

> There are no duplicated rows in the dataset.

In [None]:
# Trim leading and trailing whitespace in column names
df.columns = df.columns.str.strip()

# Trim leading and trailing whitespace in text values.
# "string" is selected alongside "object" so text columns are still found when
# pandas stores them with its dedicated string dtype rather than as object.
# The selection is computed once and reused on both sides of the assignment.
text_columns = df.select_dtypes(include=["object", "string"]).columns
df[text_columns] = df[text_columns].apply(lambda column: column.str.strip())

df.head()

> All leading and trailing whitespace has been removed from the column names and from the text values in the dataset.

In [None]:
# Field goal percentage: field goals divided by attempts, times 100,
# rounded to two decimals.
fg_attempted = df["field_goals_attempted"]
fg_percentage = df["field_goals"].div(fg_attempted).mul(100).round(2)

# Rows with zero attempts have no defined ratio; store 0 for them instead.
df["field_goals_percentage_%"] = fg_percentage.mask(fg_attempted == 0, 0)

df.head()


> Added a new column which calculates the field goal percentage.

In [None]:
# Three point percentage: three pointers divided by attempts, times 100,
# rounded to two decimals.
three_attempted = df["three_pointers_attempted"]
three_percentage = df["three_pointers"].div(three_attempted).mul(100).round(2)

# Rows with zero attempts have no defined ratio; store 0 for them instead.
df["three_point_percentage_%"] = three_percentage.mask(three_attempted == 0, 0)

df.head()

> Added a new column which calculates the three point percentage.

In [None]:
# Destination for the transformed dataset (relative path into the tests' data folder).
FILE_PATH = "../tests/test_data/test_cleaned_boxscores.csv"

# Write the frame as CSV without the index column.
df.to_csv(path_or_buf=FILE_PATH, index=False)