In [25]:
# Import the Python modules used in this notebook

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [26]:
# Load the dataset from 'nba.csv' into a DataFrame.
# The path is relative, so the file is resolved against the current working directory.

df = pd.read_csv('nba.csv')

In [27]:
# Preview the top of the dataset.
# head() returns the first five rows when no count is given.

df.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [28]:
# Preview the bottom of the dataset.
# tail() returns the last five rows when no count is given.

df.tail()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
457,,,,,,,,,


In [9]:
# Size of the dataset as a (rows, columns) tuple

df.shape

(458, 9)

In [10]:
# Column labels of the DataFrame

df.columns

Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
       'College', 'Salary'],
      dtype='object')

In [11]:
# Row index of the DataFrame (start, stop and step of the row labels)

df.index

RangeIndex(start=0, stop=458, step=1)

In [13]:
# Raw cell values of the DataFrame as a NumPy array (column labels and index are not included)
df.values

array([['Avery Bradley', 'Boston Celtics', 0.0, ..., 180.0, 'Texas',
        7730337.0],
       ['Jae Crowder', 'Boston Celtics', 99.0, ..., 235.0, 'Marquette',
        6796117.0],
       ['John Holland', 'Boston Celtics', 30.0, ..., 205.0,
        'Boston University', nan],
       ...,
       ['Tibor Pleiss', 'Utah Jazz', 21.0, ..., 256.0, nan, 2900000.0],
       ['Jeff Withey', 'Utah Jazz', 24.0, ..., 231.0, 'Kansas', 947276.0],
       [nan, nan, nan, ..., nan, nan, nan]], dtype=object)

In [14]:
# Summary of the DataFrame: non-null count and dtype for every column, plus memory usage.
# Used here to spot columns with missing values or an unexpected dtype.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   373 non-null    object 
 8   Salary    446 non-null    float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB


From the general information above, we can see that our dataset has the following issues:

- Missing data
- Inconsistencies in data types

## Handling Missing Values


In [36]:
# Count the missing values in each column.
# NOTE(review): the recorded output shows 0 for every column, which contradicts
# df.info() above (e.g. only 373 of 458 'College' entries are non-null). It looks
# like this cell was last run after the placeholders had already been filled in;
# re-run the notebook from a fresh kernel to confirm.

df.isnull().sum()

Name        0
Team        0
Number      0
Position    0
Age         0
Height      0
Weight      0
College     0
Salary      0
dtype: int64

In [38]:
# Replace every missing value with a column-specific placeholder.
#
# The fill is done with one DataFrame-level call instead of
# df[col].fillna(..., inplace=True): that pattern mutates a temporary Series,
# raises a chained-assignment FutureWarning on recent pandas 2.x releases and
# leaves df unchanged once Copy-on-Write is active (the default from pandas 3.0).
#
# NOTE(review): row 457 is empty in every column (see the df.tail() output), so
# it becomes a row made purely of placeholders; consider df.dropna(how='all')
# beforehand if a fake "Unknown" player is not wanted.
placeholders = {
    # Text columns: 'Unknown' marks a missing entry.
    'Name': 'Unknown',
    'Team': 'Unknown',
    'Position': 'Unknown',
    'College': 'Unknown',
    # -1 marks a missing jersey number. 0 cannot be used because it is a real
    # jersey number in this dataset (row 0), and -1 matches 'Age' and 'Weight'.
    'Number': -1,
    # -1 marks a missing age or weight (no real value can be negative).
    'Age': -1,
    'Weight': -1,
    # '0-0' keeps the "feet-inches" text format so the height parser still works.
    'Height': '0-0',
    # 0 marks a missing salary.
    'Salary': 0,
}
df = df.fillna(value=placeholders)

In [39]:
 # Re-count the missing values per column; every count should be 0 after the placeholder fill
df.isnull().sum()

Name        0
Team        0
Number      0
Position    0
Age         0
Height      0
Weight      0
College     0
Salary      0
dtype: int64

In [40]:
# Data has no missing values

## Handling Data Inconsistency

 - The 'Number' and 'Age' columns should contain integers, not floats.
 
 
 - The 'Salary' column should contain integers or be formatted as currency values (e.g., using commas to separate thousands).
 
 
 - The 'Height' and 'Weight' columns have inconsistent formats. Height is given in feet and inches (e.g., "6-2"), which would be better represented as separate columns for feet and inches, or as a single column in a consistent format.

In [52]:
# Jersey numbers and ages are whole numbers, so store both columns as integers
# instead of floats. This requires the columns to be free of NaN.
for whole_number_col in ('Number', 'Age'):
    df[whole_number_col] = df[whole_number_col].astype(int)

In [53]:
# Confirm that 'Number' and 'Age' are now integer columns.
# NOTE(review): the recorded output already lists 'Feet' and 'Inches', which are
# only created in a later cell, so it appears to come from an out-of-order run;
# re-run the notebook top to bottom to refresh it.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      458 non-null    object 
 1   Team      458 non-null    object 
 2   Number    458 non-null    int64  
 3   Position  458 non-null    object 
 4   Age       458 non-null    int64  
 5   Weight    458 non-null    float64
 6   College   458 non-null    object 
 7   Salary    458 non-null    object 
 8   Feet      458 non-null    int64  
 9   Inches    458 non-null    int64  
dtypes: float64(1), int64(4), object(5)
memory usage: 35.9+ KB


In [54]:
# Split 'Height' ("feet-inches" text such as "6-2") into integer 'Feet' and
# 'Inches' columns, then drop the original text column.
#
# Assumes 'Height' has no missing values left (they were filled with '0-0').
# The guard makes the cell safe to re-run after 'Height' has been dropped.
if 'Height' in df.columns:
    # n must be passed by keyword: pandas 2.0 no longer accepts it positionally.
    # reindex guarantees both part columns exist even if no value contains '-'.
    height_parts = (
        df['Height']
        .str.split('-', n=1, expand=True)
        .reindex(columns=[0, 1])
    )
    df['Feet'] = height_parts[0].astype(int)
    # A height without an inches part counts as 0 inches.
    df['Inches'] = height_parts[1].fillna('0').astype(int)
    df = df.drop(columns='Height')

In [55]:
# Render each salary as a dollar amount with thousands separators and no
# decimals (e.g. 7730337.0 -> "$7,730,337").
# After this step 'Salary' holds text, so it can no longer be used in numeric
# operations, and re-running the cell on the formatted text will fail.
salary_as_currency = df['Salary'].map('${:,.0f}'.format)
df['Salary'] = salary_as_currency

In [46]:
# Display the cleaned DataFrame (the rendered output is truncated in the middle for long frames)
df

Unnamed: 0,Name,Team,Number,Position,Age,Weight,College,Salary,Feet,Inches
0,Avery Bradley,Boston Celtics,0,PG,25,180.0,Texas,"$7,730,337",6,2
1,Jae Crowder,Boston Celtics,99,SF,25,235.0,Marquette,"$6,796,117",6,6
2,John Holland,Boston Celtics,30,SG,27,205.0,Boston University,$0,6,5
3,R.J. Hunter,Boston Celtics,28,SG,22,185.0,Georgia State,"$1,148,640",6,5
4,Jonas Jerebko,Boston Celtics,8,PF,29,231.0,Unknown,"$5,000,000",6,10
...,...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8,PG,26,203.0,Butler,"$2,433,333",6,3
454,Raul Neto,Utah Jazz,25,PG,24,179.0,Unknown,"$900,000",6,1
455,Tibor Pleiss,Utah Jazz,21,C,26,256.0,Unknown,"$2,900,000",7,3
456,Jeff Withey,Utah Jazz,24,C,26,231.0,Kansas,"$947,276",7,0


In [56]:
# Final check of the column dtypes and non-null counts after cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      458 non-null    object 
 1   Team      458 non-null    object 
 2   Number    458 non-null    int64  
 3   Position  458 non-null    object 
 4   Age       458 non-null    int64  
 5   Weight    458 non-null    float64
 6   College   458 non-null    object 
 7   Salary    458 non-null    object 
 8   Feet      458 non-null    int64  
 9   Inches    458 non-null    int64  
dtypes: float64(1), int64(4), object(5)
memory usage: 35.9+ KB


## Checking for Duplicates

In [60]:
# Count the rows that exactly repeat an earlier row across all columns
df.duplicated().sum()

0