# EDA for Salaries

In [1]:
import pandas as pd

# Set each pandas display option to None so that wide frames and long cell
# values are not truncated in the rendered cell output.
for _display_option in ("display.max_columns", "display.width", "display.max_colwidth"):
    pd.set_option(_display_option, None)

In [2]:
# Location of the raw salaries file, relative to the notebook's directory.
RAW_SALARIES_PATH = "../data/raw/salaries.csv"

# Load the raw data and preview the first rows.
df = pd.read_csv(RAW_SALARIES_PATH)
df.head()

Unnamed: 0,playerName,seasonStartYear,salary,inflationAdjSalary
0,Michael Jordan,1996,"$30,140,000","$52,258,566"
1,Horace Grant,1996,"$14,857,000","$25,759,971"
2,Reggie Miller,1996,"$11,250,000","$19,505,934"
3,Shaquille O'Neal,1996,"$10,714,000","$18,576,585"
4,Gary Payton,1996,"$10,212,000","$17,706,187"


In [3]:
# Map the original camelCase headers to snake_case names.
# NOTE: the first three target names deliberately end with a trailing space;
# the whitespace-trimming cell further down is what cleans them up.
dict_for_renaming_columns = dict(
    playerName="player_name ",
    seasonStartYear="season_start_year ",
    salary="salary ",
    inflationAdjSalary="inflation_adjusted_salary",
)

df = df.rename(columns=dict_for_renaming_columns)

# Summarise column names, non-null counts and dtypes after the rename.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11583 entries, 0 to 11582
Data columns (total 4 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   player_name                11583 non-null  object
 1   season_start_year          11583 non-null  int64 
 2   salary                     11583 non-null  object
 3   inflation_adjusted_salary  11583 non-null  object
dtypes: int64(1), object(3)
memory usage: 362.1+ KB


> Columns have been renamed to snake_case for consistency. Some of the new column names intentionally include trailing whitespace so that the cleaning steps below have something to fix.

In [4]:
# Flag rows that repeat an earlier row across all columns, then count them.
duplicate_row_mask = df.duplicated()
duplicate_row_mask.sum()

np.int64(0)

> No duplicates in the dataset.

In [5]:
# Strip leading/trailing whitespace from the column labels first, so the
# columns can be addressed by their clean names from here on.
df.columns = df.columns.str.strip()

# Then strip leading/trailing whitespace from the values of every
# object-dtype (string) column.
text_columns = df.select_dtypes(include="object").columns
df[text_columns] = df[text_columns].apply(lambda values: values.str.strip())

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11583 entries, 0 to 11582
Data columns (total 4 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   player_name                11583 non-null  object
 1   season_start_year          11583 non-null  int64 
 2   salary                     11583 non-null  object
 3   inflation_adjusted_salary  11583 non-null  object
dtypes: int64(1), object(3)
memory usage: 362.1+ KB


> All leading and trailing whitespace has been removed from the column names and from the string values in the dataset.

In [6]:
# Strip currency formatting from both salary columns.
# The raw values look like "$30,140,000", so the thousands separators (",")
# must be removed along with the dollar sign; otherwise the cleaned values
# would still contain commas and could not be parsed as numbers later.
# Inside a regex character class "$" is a literal character, not an anchor.
# The values are left as digit-only strings (dtype is not changed here).
columns_to_remove_dollar_sign = ["salary", "inflation_adjusted_salary"]
df[columns_to_remove_dollar_sign] = df[columns_to_remove_dollar_sign].apply(
    lambda col: col.str.replace(r"[$,]", "", regex=True)
)

df.head()

Unnamed: 0,player_name,season_start_year,salary,inflation_adjusted_salary
0,Michael Jordan,1996,30140000,52258566
1,Horace Grant,1996,14857000,25759971
2,Reggie Miller,1996,11250000,19505934
3,Shaquille O'Neal,1996,10714000,18576585
4,Gary Payton,1996,10212000,17706187


> Removed the dollar sign ($) from both salary columns.

In [7]:
# Make sure the season year is numeric before it is used in range filters.
# errors="coerce" turns any value that cannot be parsed into NaN instead of raising.
year_column = "season_start_year"
df[year_column] = pd.to_numeric(df[year_column], errors="coerce")

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11583 entries, 0 to 11582
Data columns (total 4 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   player_name                11583 non-null  object
 1   season_start_year          11583 non-null  int64 
 2   salary                     11583 non-null  object
 3   inflation_adjusted_salary  11583 non-null  object
dtypes: int64(1), object(3)
memory usage: 362.1+ KB


> Ensured that the `season_start_year` column is numeric so it can be filtered by range (it was already `int64`, so the dtype is unchanged).

In [8]:
# Remove special characters from player names. The pattern keeps ASCII letters,
# digits, whitespace, hyphens and apostrophes; everything else is deleted.
# NOTE(review): this also deletes periods and non-ASCII (e.g. accented) letters,
# so such names are shortened rather than transliterated -- confirm this is intended.
disallowed_name_chars = r"[^a-zA-Z0-9\s\-']"
df["player_name"] = df["player_name"].str.replace(disallowed_name_chars, "", regex=True)

df.head()

Unnamed: 0,player_name,season_start_year,salary,inflation_adjusted_salary
0,Michael Jordan,1996,30140000,52258566
1,Horace Grant,1996,14857000,25759971
2,Reggie Miller,1996,11250000,19505934
3,Shaquille O'Neal,1996,10714000,18576585
4,Gary Payton,1996,10212000,17706187


> Removed special characters from player names, keeping only ASCII letters, digits, whitespace, hyphens and apostrophes.

In [9]:
# Keep only the rows whose season started in 2015 through 2019 (both ends included).
season_in_range = df["season_start_year"].between(2015, 2019)
df_filtered = df[season_in_range]

df_filtered.head(10)

Unnamed: 0,player_name,season_start_year,salary,inflation_adjusted_salary
8861,Kobe Bryant,2015,25000000,28463195
8862,LeBron James,2015,22970500,26152553
8863,Carmelo Anthony,2015,22875000,26043823
8864,Dwight Howard,2015,22359364,25456757
8865,Joe Johnson,2015,22309344,25399808
8866,Chris Bosh,2015,22192730,25267040
8867,Kevin Durant,2015,21971850,25015562
8868,Chris Paul,2015,21468695,24442706
8869,Derrick Rose,2015,20093064,22876512
8870,Dwyane Wade,2015,20000000,22770556


> Kept only the salaries for seasons starting between 2015 and 2019 (inclusive).

In [10]:
# Write the cleaned, filtered dataset to a CSV file.
# index=False leaves the DataFrame's row index out of the file.
FILE_PATH = "../tests/test_data/test_cleaned_salaries.csv"
df_filtered.to_csv(path_or_buf=FILE_PATH, index=False)