# EDA of Box Scores

In [1]:
import pandas as pd

# Lift pandas' display limits so wide frames and long cell values are shown in full.
for option in ("display.max_columns", "display.width", "display.max_colwidth"):
    pd.set_option(option, None)

In [68]:
# Load the raw box-score CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="../data/raw/boxscore.csv")

# Summarise column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 741569 entries, 0 to 741568
Data columns (total 21 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   game_id     741569 non-null  int64 
 1   teamName    741569 non-null  object
 2   playerName  741569 non-null  object
 3   MP          741569 non-null  object
 4   FG          741569 non-null  object
 5   FGA         741569 non-null  object
 6   3P          741569 non-null  object
 7   3PA         741569 non-null  object
 8   FT          741569 non-null  object
 9   FTA         741569 non-null  object
 10  ORB         741569 non-null  object
 11  DRB         741569 non-null  object
 12  TRB         741569 non-null  object
 13  AST         741569 non-null  object
 14  STL         741569 non-null  object
 15  BLK         741569 non-null  object
 16  TOV         741569 non-null  object
 17  PF          741569 non-null  object
 18  PTS         741569 non-null  object
 19  +/-         741449 non-

In [69]:
# Stat columns that are not used in the rest of this analysis.
columns_to_drop = ["ORB", "DRB", "STL", "BLK", "TOV", "PF", "+/-"]

# `columns=` already selects the column axis, so no separate `axis` argument is needed.
# Missing labels still raise KeyError, which surfaces typos in the list above.
df = df.drop(columns=columns_to_drop)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 741569 entries, 0 to 741568
Data columns (total 14 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   game_id     741569 non-null  int64 
 1   teamName    741569 non-null  object
 2   playerName  741569 non-null  object
 3   MP          741569 non-null  object
 4   FG          741569 non-null  object
 5   FGA         741569 non-null  object
 6   3P          741569 non-null  object
 7   3PA         741569 non-null  object
 8   FT          741569 non-null  object
 9   FTA         741569 non-null  object
 10  TRB         741569 non-null  object
 11  AST         741569 non-null  object
 12  PTS         741569 non-null  object
 13  isStarter   741569 non-null  int64 
dtypes: int64(2), object(12)
memory usage: 79.2+ MB


> After removing unnecessary columns, there are no null or missing values in the data.

In [77]:
# Stat columns that get overwritten for rows where the player did not play.
columns_to_replace = [
    "FG", "FGA",
    "3P", "3PA",
    "FT", "FTA",
    "TRB", "AST", "PTS",
]

# Status strings that can appear in "MP" in place of a playing time.
conditions = [
    "Did Not Play",
    "Player Suspended",
    "Not With Team",
    "Did Not Dress",
]

# True for every row whose "MP" value is one of the status strings above.
mask = df["MP"].isin(conditions)

# Overwrite the stat columns of those rows with 0.
df.loc[mask, columns_to_replace] = 0

df.head(33)

Unnamed: 0,game_id,teamName,playerName,MP,FG,FGA,3P,3PA,FT,FTA,TRB,AST,PTS,isStarter
0,1,Sacramento Kings,Corliss Williamson,37:20,7,11,0,0,0,0,4,4,14,1
1,1,Sacramento Kings,Mitch Richmond,32:00,6,12,1,4,1,1,5,3,14,1
2,1,Sacramento Kings,Olden Polynice,31:34,0,4,0,0,1,4,7,3,1,1
3,1,Sacramento Kings,Mahmoud Abdul-Rauf,29:27,7,13,1,2,2,2,2,5,17,1
4,1,Sacramento Kings,Brian Grant,25:13,3,11,0,0,2,2,6,0,8,1
5,1,Sacramento Kings,Tyus Edney,25:29,4,11,0,2,6,8,1,0,14,0
6,1,Sacramento Kings,Michael Smith,24:45,2,3,0,0,4,4,10,1,8,0
7,1,Sacramento Kings,Lionel Simmons,18:00,2,6,2,4,0,0,2,1,6,0
8,1,Sacramento Kings,Duane Causwell,16:26,1,1,0,0,1,2,3,1,3,0
9,1,Houston Rockets,Clyde Drexler,42:36,8,21,2,5,7,10,10,9,25,1


> In the initial dataset, the "MP" column can contain a status string ("Did Not Play", "Player Suspended", "Not With Team" or "Did Not Dress") instead of a playing time. In those rows, the numeric columns contain the same status string. This would cause errors during type conversion, because the text would be converted to nulls.

In [80]:
columns_to_convert_to_int = ["FG", "FGA", "3P", "3PA", "FT", "FTA", "TRB", "AST", "PTS", "isStarter"]

# Parse every column to numbers; values that cannot be parsed become NaN.
numeric_values = df[columns_to_convert_to_int].apply(pd.to_numeric, errors="coerce")

# NaN cannot be stored in int64, so report the affected columns explicitly
# instead of letting the cast below fail with an opaque error.
unparseable_counts = numeric_values.isna().sum()
if unparseable_counts.any():
    raise ValueError(
        "Non-numeric values remain in columns (column -> count): "
        f"{unparseable_counts[unparseable_counts > 0].to_dict()}"
    )

# Change to numeric data type int64
df[columns_to_convert_to_int] = numeric_values.astype("int64")

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 741569 entries, 0 to 741568
Data columns (total 14 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   game_id     741569 non-null  int64 
 1   teamName    741569 non-null  string
 2   playerName  741569 non-null  string
 3   MP          741569 non-null  string
 4   FG          741569 non-null  int64 
 5   FGA         741569 non-null  int64 
 6   3P          741569 non-null  int64 
 7   3PA         741569 non-null  int64 
 8   FT          741569 non-null  int64 
 9   FTA         741569 non-null  int64 
 10  TRB         741569 non-null  int64 
 11  AST         741569 non-null  int64 
 12  PTS         741569 non-null  int64 
 13  isStarter   741569 non-null  int64 
dtypes: int64(11), string(3)
memory usage: 79.2 MB


> Numeric columns have been converted from `object` type to `int64`.

In [81]:
columns_to_convert_to_string = ["teamName", "playerName", "MP"]

# Cast the text columns to pandas' dedicated "string" dtype via a column -> dtype mapping.
df = df.astype(dict.fromkeys(columns_to_convert_to_string, "string"))

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 741569 entries, 0 to 741568
Data columns (total 14 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   game_id     741569 non-null  int64 
 1   teamName    741569 non-null  string
 2   playerName  741569 non-null  string
 3   MP          741569 non-null  string
 4   FG          741569 non-null  int64 
 5   FGA         741569 non-null  int64 
 6   3P          741569 non-null  int64 
 7   3PA         741569 non-null  int64 
 8   FT          741569 non-null  int64 
 9   FTA         741569 non-null  int64 
 10  TRB         741569 non-null  int64 
 11  AST         741569 non-null  int64 
 12  PTS         741569 non-null  int64 
 13  isStarter   741569 non-null  int64 
dtypes: int64(11), string(3)
memory usage: 79.2 MB


> Text columns have been converted to `string` data type.

In [93]:
# Map the raw abbreviated headers to descriptive snake_case names.
# Only columns still present in the frame are listed ("ORB" and "DRB" were
# dropped earlier, and rename() would silently ignore them anyway).
dict_for_renaming_columns = {
    "teamName": "team_name",
    "playerName": "player_name",
    "MP": "minutes_played",
    "FG": "field_goals",
    "FGA": "field_goals_attempted",
    "3P": "three_pointers",
    "3PA": "three_pointers_attempted",
    "FT": "free_throws",
    "FTA": "free_throws_attempted",
    "TRB": "total_rebounds",
    "AST": "assists",
    "PTS": "points",
    "isStarter": "is_starter",
}

df = df.rename(columns=dict_for_renaming_columns)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 741569 entries, 0 to 741568
Data columns (total 14 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   game_id                   741569 non-null  int64 
 1   team_name                 741569 non-null  string
 2   player_name               741569 non-null  string
 3   minutes_played            741569 non-null  string
 4   field_goals               741569 non-null  int64 
 5   field_goals_attempted     741569 non-null  int64 
 6   three_pointers            741569 non-null  int64 
 7   three_pointers_attempted  741569 non-null  int64 
 8   free_throws               741569 non-null  int64 
 9   free_throws_attempted     741569 non-null  int64 
 10  total_rebounds            741569 non-null  int64 
 11  assists                   741569 non-null  int64 
 12  points                    741569 non-null  int64 
 13  is_starter                741569 non-null  int64 
dtypes: i

> Columns have been renamed for better understanding.

In [94]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep="first").sum()

np.int64(0)

> There are no duplicated rows in the dataset.

In [95]:
# Remove leading/trailing whitespace from the column labels.
df.columns = df.columns.str.strip()

# Remove leading/trailing whitespace from the values of every "string"-dtype column.
for column_name in df.select_dtypes(include="string").columns:
    df[column_name] = df[column_name].str.strip()

df.head()

Unnamed: 0,game_id,team_name,player_name,minutes_played,field_goals,field_goals_attempted,three_pointers,three_pointers_attempted,free_throws,free_throws_attempted,total_rebounds,assists,points,is_starter
0,1,Sacramento Kings,Corliss Williamson,37:20,7,11,0,0,0,0,4,4,14,1
1,1,Sacramento Kings,Mitch Richmond,32:00,6,12,1,4,1,1,5,3,14,1
2,1,Sacramento Kings,Olden Polynice,31:34,0,4,0,0,1,4,7,3,1,1
3,1,Sacramento Kings,Mahmoud Abdul-Rauf,29:27,7,13,1,2,2,2,2,5,17,1
4,1,Sacramento Kings,Brian Grant,25:13,3,11,0,0,2,2,6,0,8,1


In [122]:
# Field-goal percentage: made / attempted * 100, rounded to two decimals.
# Rows with zero attempts (where the division is undefined) are set to 0.
df["field_goals_percentage_%"] = (
    df["field_goals"]
    .div(df["field_goals_attempted"])
    .mul(100)
    .round(2)
    .mask(df["field_goals_attempted"].eq(0), 0)
)

df.head()


Unnamed: 0,game_id,team_name,player_name,minutes_played,field_goals,field_goals_attempted,three_pointers,three_pointers_attempted,free_throws,free_throws_attempted,total_rebounds,assists,points,is_starter,field_goals_percentage_%
0,1,Sacramento Kings,Corliss Williamson,37:20,7,11,0,0,0,0,4,4,14,1,63.64
1,1,Sacramento Kings,Mitch Richmond,32:00,6,12,1,4,1,1,5,3,14,1,50.0
2,1,Sacramento Kings,Olden Polynice,31:34,0,4,0,0,1,4,7,3,1,1,0.0
3,1,Sacramento Kings,Mahmoud Abdul-Rauf,29:27,7,13,1,2,2,2,2,5,17,1,53.85
4,1,Sacramento Kings,Brian Grant,25:13,3,11,0,0,2,2,6,0,8,1,27.27


> Added a new column which calculates the field goal percentage.

In [123]:
# Three-point percentage: made / attempted * 100, rounded to two decimals.
# Rows with zero attempts (where the division is undefined) are set to 0.
df["three_point_percentage_%"] = (
    df["three_pointers"]
    .div(df["three_pointers_attempted"])
    .mul(100)
    .round(2)
    .mask(df["three_pointers_attempted"].eq(0), 0)
)

df.head()

Unnamed: 0,game_id,team_name,player_name,minutes_played,field_goals,field_goals_attempted,three_pointers,three_pointers_attempted,free_throws,free_throws_attempted,total_rebounds,assists,points,is_starter,field_goals_percentage_%,three_point_percentage_%
0,1,Sacramento Kings,Corliss Williamson,37:20,7,11,0,0,0,0,4,4,14,1,63.64,0.0
1,1,Sacramento Kings,Mitch Richmond,32:00,6,12,1,4,1,1,5,3,14,1,50.0,25.0
2,1,Sacramento Kings,Olden Polynice,31:34,0,4,0,0,1,4,7,3,1,1,0.0,0.0
3,1,Sacramento Kings,Mahmoud Abdul-Rauf,29:27,7,13,1,2,2,2,2,5,17,1,53.85,50.0
4,1,Sacramento Kings,Brian Grant,25:13,3,11,0,0,2,2,6,0,8,1,27.27,0.0


> Added a new column which calculates the three-point percentage.