# Data Cleaning

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 
from matplotlib import cm
# Needed, otherwise plots won't be shown inline
%matplotlib inline

In [2]:
# Read the raw salary dataset (a CSV file in the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='salary_error.csv')
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13.0,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40.0,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40.0,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40.0,Cuba,<=50K


In [3]:
# Display the frame to re-check how many rows we start with before cleaning (32561).
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13.0,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40.0,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40.0,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40.0,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38.0,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40.0,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40.0,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20.0,United-States,<=50K


In [4]:
# Count the missing (NaN) entries in every column of the dataframe.
df.isna().sum()

age               0
workclass         1
fnlwgt            0
education         1
education-num     0
marital-status    0
occupation        1
relationship      1
race              1
sex               0
capital-gain      0
capital-loss      0
hours-per-week    1
native-country    1
salary            0
dtype: int64

In [5]:
# Look at the underlying array of values (mixed column types, so the dtype is object).
df.to_numpy()

array([[39, ' State-gov', 77516, ..., 40.0, ' United-States', ' <=50K'],
       [50, ' Self-emp-not-inc', 83311, ..., 13.0, ' United-States',
        ' <=50K'],
       [38, ' Private', 215646, ..., 40.0, ' United-States', ' <=50K'],
       ...,
       [58, ' Private', 151910, ..., 40.0, ' United-States', ' <=50K'],
       [22, ' Private', 201490, ..., 20.0, ' United-States', ' <=50K'],
       [52, ' Self-emp-inc', 287927, ..., 40.0, ' United-States',
        ' >50K']], dtype=object)

In [6]:
# Number of rows that contain at least one missing value.
int(df.isna().any(axis=1).sum())

7

In [7]:
# Show the rows that contain at least one missing value, to see where the gaps are.
# `axis` is passed by keyword: DataFrame.any() no longer accepts it positionally
# (pandas >= 2.0 raises TypeError for `any(1)`).
df[df.isnull().any(axis=1)]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
30,23,Local-gov,190709,Assoc-acdm,12,Never-married,Protective-serv,Not-in-family,White,Male,0,0,52.0,,<=50K
44,25,Private,289980,HS-grad,9,Never-married,Handlers-cleaners,Not-in-family,,Male,0,0,35.0,United-States,<=50K
136,43,Self-emp-not-inc,56920,HS-grad,9,Married-civ-spouse,,Husband,White,Male,0,0,60.0,United-States,<=50K
219,25,,255004,10th,6,Never-married,Craft-repair,Not-in-family,White,Male,0,0,40.0,United-States,<=50K
477,22,Private,281432,Some-college,10,Never-married,Handlers-cleaners,,White,Male,0,0,30.0,United-States,<=50K
625,30,Private,124187,,9,Never-married,Farming-fishing,Own-child,Black,Male,0,0,60.0,United-States,<=50K
16232,18,Private,145005,Some-college,10,Never-married,Other-service,Own-child,White,Male,0,0,,United-States,<=50K


In [8]:
# Estimate the missing 'hours-per-week' value with the column mean.
# Series.mean() skips NaN by default, so the mean is taken over the observed values only.
x = df["hours-per-week"].mean()
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# df["hours-per-week"]: that in-place call acts on an intermediate object, triggers a
# FutureWarning in pandas 2.x and leaves `df` unchanged under Copy-on-Write.
df["hours-per-week"] = df["hours-per-week"].fillna(x)
print(x)

40.43808353808354


In [9]:
# List the rows that still contain missing values, to confirm that 'hours-per-week'
# was filled in and to see which gaps remain.
# `axis` is passed by keyword: DataFrame.any() no longer accepts it positionally
# (pandas >= 2.0 raises TypeError for `any(1)`).
df[df.isnull().any(axis=1)]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
30,23,Local-gov,190709,Assoc-acdm,12,Never-married,Protective-serv,Not-in-family,White,Male,0,0,52.0,,<=50K
44,25,Private,289980,HS-grad,9,Never-married,Handlers-cleaners,Not-in-family,,Male,0,0,35.0,United-States,<=50K
136,43,Self-emp-not-inc,56920,HS-grad,9,Married-civ-spouse,,Husband,White,Male,0,0,60.0,United-States,<=50K
219,25,,255004,10th,6,Never-married,Craft-repair,Not-in-family,White,Male,0,0,40.0,United-States,<=50K
477,22,Private,281432,Some-college,10,Never-married,Handlers-cleaners,,White,Male,0,0,30.0,United-States,<=50K
625,30,Private,124187,,9,Never-married,Farming-fishing,Own-child,Black,Male,0,0,60.0,United-States,<=50K


In [11]:
#Further possibilities for cleaning data

#If the value is higher than 120, set it to 120:

#for x in df.index:
#  if df.loc[x, "attribute"] > 120:
#    df.loc[x, "attribute"] = 120

#Delete rows where "attribute" is higher than 120:

#for x in df.index:
#  if df.loc[x, "attribute"] > 120:
#    df.drop(x, inplace = True)

In [12]:
# Check for duplicated rows: df.duplicated() flags every row that repeats an earlier one.
# NOTE(review): the printed Series is truncated to its first and last entries, so this
# output alone does not confirm that there are no duplicates -- verify with
# df.duplicated().sum() (or .any()) before relying on that conclusion.
print(df.duplicated())

0        False
1        False
2        False
3        False
4        False
         ...  
32556    False
32557    False
32558    False
32559    False
32560    False
Length: 32561, dtype: bool


In [13]:
# Discard every row that still has at least one missing value (the 6 rows listed
# earlier) and keep the result as the cleaned frame; `df` itself is left untouched.
df_new = df.dropna(axis='index', how='any')
df_new

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13.0,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40.0,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40.0,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40.0,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38.0,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40.0,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40.0,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20.0,United-States,<=50K


In [14]:
# The meaning of 'fnlwgt' is unclear to us, so the column is removed from the cleaned frame.
# NOTE(review): presumably a census sampling weight ("final weight") -- confirm against
# the dataset documentation before discarding it for good.
df_new = df_new.drop(labels='fnlwgt', axis='columns')
df_new

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13.0,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40.0,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40.0,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40.0,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38.0,United-States,<=50K
32557,40,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40.0,United-States,>50K
32558,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40.0,United-States,<=50K
32559,22,Private,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20.0,United-States,<=50K


In [15]:
# Rename the 'sex' column to 'gender'; all other column names stay as they are.
df_new = df_new.rename({'sex': 'gender'}, axis='columns')
df_new

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13.0,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40.0,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40.0,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40.0,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38.0,United-States,<=50K
32557,40,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40.0,United-States,>50K
32558,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40.0,United-States,<=50K
32559,22,Private,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20.0,United-States,<=50K


In [17]:
# Persist the cleaned frame as a CSV file in the working directory.
# NOTE(review): with the default index=True the row index is written as an unnamed
# first column; pass index=False if downstream readers should not see it.
df_new.to_csv(path_or_buf='salary_cleaned.csv')