# EDA for Player Information

In [1]:
import pandas as pd

# Lift pandas' display limits: no cap on shown columns, auto-detected width,
# and untruncated cell contents.
for display_option in ("display.max_columns", "display.width", "display.max_colwidth"):
    pd.set_option(display_option, None)

In [2]:
# Load the raw player information and preview the first five rows
RAW_PLAYER_INFO_PATH = "../data/raw/player_info.csv"
df = pd.read_csv(RAW_PLAYER_INFO_PATH)

df.head()

Unnamed: 0,playerName,From,To,Pos,Ht,Wt,birthDate,Colleges
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke
1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State
2,Kareem Abdul-Jabbar*,1970,1989,C,7-2,225.0,"April 16, 1947",UCLA
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",LSU
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974","Michigan, San Jose State"


In [3]:
# Columns that are not used in the rest of this notebook
columns_to_drop = ["Colleges", "From", "To"]

# `columns=` already identifies the axis, so no separate `axis` argument is needed
df = df.drop(columns=columns_to_drop)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5018 entries, 0 to 5017
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   playerName  5018 non-null   object 
 1   Pos         5018 non-null   object 
 2   Ht          5018 non-null   object 
 3   Wt          5013 non-null   float64
 4   birthDate   5000 non-null   object 
dtypes: float64(1), object(4)
memory usage: 196.1+ KB


> Dropped unnecessary columns.

In [4]:
# Map the raw column names to consistent snake_case names.
# NOTE: "From", "To" and "Colleges" are dropped in an earlier cell, so those
# entries currently have no effect (rename ignores keys that are not present).
# Their target names must not carry trailing spaces, otherwise re-enabling
# those columns would produce malformed column names.
dict_for_renaming_columns = {
    "playerName": "player_name",
    "From": "from",
    "To": "to",
    "Pos": "position",
    "Ht": "height",
    "Wt": "weight",
    "birthDate": "birth_date",
    "Colleges": "colleges"
}

df = df.rename(columns=dict_for_renaming_columns)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5018 entries, 0 to 5017
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   player_name  5018 non-null   object 
 1   position     5018 non-null   object 
 2   height       5018 non-null   object 
 3   weight       5013 non-null   float64
 4   birth_date   5000 non-null   object 
dtypes: float64(1), object(4)
memory usage: 196.1+ KB


> Column names have been changed for consistency.

In [5]:
# Count the rows that are exact duplicates of an earlier row
duplicate_row_count = df.duplicated().sum()

duplicate_row_count

np.int64(0)

There are some null values in the `weight` and `birth_date` columns. They affect only about **0.1%** and **0.4%** of the rows respectively, so those rows can be safely removed from the DataFrame.

In [6]:
# Remove every row that has a null value in the weight or birth_date column
df = df.dropna(subset=["weight", "birth_date"])

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4996 entries, 0 to 5017
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   player_name  4996 non-null   object 
 1   position     4996 non-null   object 
 2   height       4996 non-null   object 
 3   weight       4996 non-null   float64
 4   birth_date   4996 non-null   object 
dtypes: float64(1), object(4)
memory usage: 234.2+ KB


> Missing values have been removed from the DataFrame.

In [7]:
# Trim leading and trailing whitespace in column names
df.columns = df.columns.str.strip()

# Trim leading and trailing whitespace in the text columns.
# df.info() reports the text columns as `object` dtype, and
# select_dtypes(include="string") only matches the dedicated pandas string
# dtype, so selecting "string" alone finds no columns and trims nothing.
# Both dtypes are requested so the trim is actually applied.
# NOTE(review): assumes every object column holds text only — confirm if new
# object columns are added upstream.
text_columns = df.select_dtypes(include=["object", "string"]).columns
df[text_columns] = df[text_columns].apply(lambda column: column.str.strip())

df.head()

Unnamed: 0,player_name,position,height,weight,birth_date
0,Alaa Abdelnaby,F-C,6-10,240.0,"June 24, 1968"
1,Zaid Abdul-Aziz,C-F,6-9,235.0,"April 7, 1946"
2,Kareem Abdul-Jabbar*,C,7-2,225.0,"April 16, 1947"
3,Mahmoud Abdul-Rauf,G,6-1,162.0,"March 9, 1969"
4,Tariq Abdul-Wahad,F,6-6,223.0,"November 3, 1974"


> All leading and trailing whitespace has been removed from the DataFrame.

In [8]:
# Collect the distinct position codes so they can be replaced with readable names
unique_values = df.loc[:, "position"].unique()

unique_values

array(['F-C', 'C-F', 'C', 'G', 'F', 'G-F', 'F-G'], dtype=object)

In [9]:
# Full names for the single-letter position codes
position_names = {"C": "Center", "F": "Forward", "G": "Guard"}

# Build the replacement for every code found in the position column,
# e.g. "F-C" -> "Forward, Center"
mapping_dict = {
    code: ", ".join(position_names[letter] for letter in code.split("-"))
    for code in ("F-C", "C-F", "C", "G", "F", "G-F", "F-G")
}

df["position"] = df["position"].replace(mapping_dict)

df.head()

Unnamed: 0,player_name,position,height,weight,birth_date
0,Alaa Abdelnaby,"Forward, Center",6-10,240.0,"June 24, 1968"
1,Zaid Abdul-Aziz,"Center, Forward",6-9,235.0,"April 7, 1946"
2,Kareem Abdul-Jabbar*,Center,7-2,225.0,"April 16, 1947"
3,Mahmoud Abdul-Rauf,Guard,6-1,162.0,"March 9, 1969"
4,Tariq Abdul-Wahad,Forward,6-6,223.0,"November 3, 1974"


> Position column values have been changed.

In [10]:
# Add a new column holding the player's weight in kilograms (kg), rounded to
# one decimal place; 0.45359237 is the number of kilograms in one pound
KG_PER_POUND = 0.45359237
df["weight_kg"] = df["weight"].mul(KG_PER_POUND).round(1)

df.head()

Unnamed: 0,player_name,position,height,weight,birth_date,weight_kg
0,Alaa Abdelnaby,"Forward, Center",6-10,240.0,"June 24, 1968",108.9
1,Zaid Abdul-Aziz,"Center, Forward",6-9,235.0,"April 7, 1946",106.6
2,Kareem Abdul-Jabbar*,Center,7-2,225.0,"April 16, 1947",102.1
3,Mahmoud Abdul-Rauf,Guard,6-1,162.0,"March 9, 1969",73.5
4,Tariq Abdul-Wahad,Forward,6-6,223.0,"November 3, 1974",101.2


> Added a new column representing each player's weight in kilograms (kg).

In [11]:
# Remove special characters (such as the trailing "*" marker) from player names.
# Letters, digits, whitespace, hyphens and apostrophes are kept. `\w` is used
# instead of `a-zA-Z0-9` so that accented / non-ASCII letters in names are not
# deleted; `|_` removes the underscore that `\w` would otherwise let through.
df["player_name"] = df["player_name"].str.replace(r"[^\w\s\-']|_", "", regex=True)

df.head()

Unnamed: 0,player_name,position,height,weight,birth_date,weight_kg
0,Alaa Abdelnaby,"Forward, Center",6-10,240.0,"June 24, 1968",108.9
1,Zaid Abdul-Aziz,"Center, Forward",6-9,235.0,"April 7, 1946",106.6
2,Kareem Abdul-Jabbar,Center,7-2,225.0,"April 16, 1947",102.1
3,Mahmoud Abdul-Rauf,Guard,6-1,162.0,"March 9, 1969",73.5
4,Tariq Abdul-Wahad,Forward,6-6,223.0,"November 3, 1974",101.2


> Removed special characters from player names.

In [12]:
# Function to convert "feet-inches" to metres in height column
def height_to_metres(height_string):
    """Convert a height written as "feet-inches" (e.g. "6-10") to metres.

    Args:
        height_string: Height in the form "<feet>-<inches>".

    Returns:
        The height in metres rounded to 2 decimal places, or None when the
        value cannot be parsed as "<feet>-<inches>".
    """
    try:
        # Split into exactly two integer parts: feet and inches
        feet, inches = map(int, height_string.split("-"))
        # Find the total value of inches
        total_inches = feet * 12 + inches
        # Convert to metres rounded to 2 decimal places
        return round(total_inches * 0.0254, 2)
    except (AttributeError, TypeError, ValueError, OverflowError):
        # Only expected parsing failures are treated as "no height":
        #   AttributeError - value has no .split (e.g. None or NaN)
        #   TypeError      - bytes-like value split with a str separator
        #   ValueError     - wrong number of parts or non-numeric parts
        #   OverflowError  - number too large to convert to a float
        return None

# Convert every "feet-inches" height and store the result in a new column
df["height_m"] = df["height"].map(height_to_metres)

df.head()

Unnamed: 0,player_name,position,height,weight,birth_date,weight_kg,height_m
0,Alaa Abdelnaby,"Forward, Center",6-10,240.0,"June 24, 1968",108.9,2.08
1,Zaid Abdul-Aziz,"Center, Forward",6-9,235.0,"April 7, 1946",106.6,2.06
2,Kareem Abdul-Jabbar,Center,7-2,225.0,"April 16, 1947",102.1,2.18
3,Mahmoud Abdul-Rauf,Guard,6-1,162.0,"March 9, 1969",73.5,1.85
4,Tariq Abdul-Wahad,Forward,6-6,223.0,"November 3, 1974",101.2,1.98


> Added a new column to represent the height in metres.

In [13]:
# Write the cleaned player data to CSV, leaving out the DataFrame index
FILE_PATH = "../tests/test_data/test_cleaned_playerinfo.csv"
df.to_csv(path_or_buf=FILE_PATH, index=False)