In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

In [2]:
df = pd.read_csv('adult_csv.csv')

In [3]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,class
0,2,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,1,0,2,United-States,<=50K
1,3,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States,<=50K
2,2,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,2,United-States,<=50K
3,3,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,2,United-States,<=50K
4,1,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,2,Cuba,<=50K


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
age               48842 non-null int64
workclass         46043 non-null object
fnlwgt            48842 non-null int64
education         48842 non-null object
education-num     48842 non-null int64
marital-status    48842 non-null object
occupation        46033 non-null object
relationship      48842 non-null object
race              48842 non-null object
sex               48842 non-null object
capitalgain       48842 non-null int64
capitalloss       48842 non-null int64
hoursperweek      48842 non-null int64
native-country    47985 non-null object
class             48842 non-null object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [5]:
# take a look at the outcome/independent variable 'income'
df['class'].value_counts()

<=50K    37155
>50K     11687
Name: class, dtype: int64

In [6]:
df['class'] = [0 if x == '<=50k' else 1 for x in df['class']]

In [18]:
X = df.drop('class',axis=1)
y= df['class']

In [19]:
X.isnull().sum()

age                  0
workclass         2799
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        2809
relationship         0
race                 0
sex                  0
capitalgain          0
capitalloss          0
hoursperweek         0
native-country     857
dtype: int64

## Basic Data cleaning

#### 1. Convert categorical features into numerical

In [8]:
# Decide which categorical variables you want to use in the model
for col_name in X.columns:
    if X[col_name].dtypes == 'object':
        unique_cat = len(X[col_name].unique())
        print("Feature '{col_name}' has '{unique_cat}' unique categories".format(col_name=col_name,unique_cat=unique_cat))

Feature 'workclass' has '9' unique categories
Feature 'education' has '16' unique categories
Feature 'marital-status' has '7' unique categories
Feature 'occupation' has '15' unique categories
Feature 'relationship' has '6' unique categories
Feature 'race' has '5' unique categories
Feature 'sex' has '2' unique categories
Feature 'native-country' has '42' unique categories


In [9]:
# Although native country has a lot ot of unique categories most categories have few observations
print(X['native-country'].value_counts().sort_values(ascending=False).head())

United-States    43832
Mexico             951
Philippines        295
Germany            206
Puerto-Rico        184
Name: native-country, dtype: int64


In [10]:
# In this case, bucket low frequency categories as 'other'
X['native-country'] = ['United-States' if x=='United-States' else 'Other' for x in X['native-country']]
X['native-country'].value_counts()

United-States    43832
Other             5010
Name: native-country, dtype: int64

In [11]:
dummy_list = ['workclass','education','marital-status','occupation','relationship','race','sex','native-country']

In [20]:
# Function to dummy all the categorical variables used for modeling
def dummy_df(df,dummy_list):
    for x in dummy_list:
        dummies = pd.get_dummies(df[x],prefix=x,dummy_na=True)
        df=df.drop(x, 1)
        df = pd.concat([df, dummies],axis=1)
    return df

In [21]:
X=dummy_df(X, dummy_list)

In [22]:
X.head()

Unnamed: 0,age,fnlwgt,education-num,capitalgain,capitalloss,hoursperweek,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,...,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,native-country_nan
0,2,77516,13,1,0,2,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,3,83311,13,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,2,215646,9,0,0,2,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,3,234721,7,0,0,2,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,1,338409,13,0,0,2,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


## Handling missing data

In [26]:
# How much of your data is missing?
X.isnull().sum().sort_values(ascending=False).head()

native-country_nan        0
relationship_Husband      0
education_Prof-school     0
education_Some-college    0
education_nan             0
dtype: int64

In [25]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Columns: 113 entries, age to native-country_nan
dtypes: int64(6), uint8(107)
memory usage: 7.2 MB
