In [1]:
# import libraries
import os
import numpy as np
import pandas as pd

In [4]:
# import data and take a look
df = pd.read_csv('adult.csv', na_values = ["?"])
#print(df.head())
#print(df.info())
print(df.tail())

       age     workclass  fnlwgt   education  educational-num  \
48837   27       Private  257302  Assoc-acdm               12   
48838   40       Private  154374     HS-grad                9   
48839   58       Private  151910     HS-grad                9   
48840   22       Private  201490     HS-grad                9   
48841   52  Self-emp-inc  287927     HS-grad                9   

           marital-status         occupation relationship   race  gender  \
48837  Married-civ-spouse       Tech-support         Wife  White  Female   
48838  Married-civ-spouse  Machine-op-inspct      Husband  White    Male   
48839             Widowed       Adm-clerical    Unmarried  White  Female   
48840       Never-married       Adm-clerical    Own-child  White    Male   
48841  Married-civ-spouse    Exec-managerial         Wife  White  Female   

       capital-gain  capital-loss  hours-per-week native-country income  
48837             0             0              38  United-States  <=50K  
4883

In [5]:
#take a look at outcome variable
print(df['income'].value_counts())

<=50K    37155
>50K     11687
Name: income, dtype: int64


In [6]:
#Assign outcome as 0 if income <= 50K and 1 if income > 50K
#also on rhs in square brackets it is a lambda function
df['income'] = [0 if x== '<=50K' else 1 for x in df['income']]

In [7]:
#Assign X as a dataframe of features and Y as a Series of the outcome variable
X = df.drop('income',1)
Y = df.income

In [8]:
print(X.head())

   age  workclass  fnlwgt     education  educational-num      marital-status  \
0   25    Private  226802          11th                7       Never-married   
1   38    Private   89814       HS-grad                9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
3   44    Private  160323  Some-college               10  Married-civ-spouse   
4   18        NaN  103497  Some-college               10       Never-married   

          occupation relationship   race  gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   
4                NaN    Own-child  White  Female             0             0   

   hours-per-week native-country  
0  

In [9]:
print(Y.head())

0    <=50K
1    <=50K
2     >50K
3     >50K
4    <=50K
Name: income, dtype: object


Data Cleaning
Dealing with Datatypes
- 3 Main Datatypes
-----Numeric, Eg. income, age etc.
-----Catagorical, Eg. Gender, Nationality etc.
-----Ordinal, Eg. Low/Medium/High
-Models can only handles numeric features.
-Must convert categorical and ordinal features into numeric features

In [10]:
#Education is a catagorical features:
print(X['education'].head())

0            11th
1         HS-grad
2      Assoc-acdm
3    Some-college
4    Some-college
Name: education, dtype: object


In [15]:
# Use get dummies in pandas
# Another option: OneHotEncoder in sci-kit learn
print(pd.get_dummies(X['education'].head()))

   11th  Assoc-acdm  HS-grad  Some-college
0     1           0        0             0
1     0           0        1             0
2     0           1        0             0
3     0           0        0             1
4     0           0        0             1


In [18]:
# Decide which categorical Variable you want to use in model.

for col_name in X.columns:
    if X[col_name].dtypes == 'Object':
        unique_cat = len(X[col_name.names].unique())
        print("Feature '{col_name}' has '{unique_cat}'unique categories".format(col_name = col_name, unique_cat = unique_cat))

TypeError: data type "Object" not understood

In [19]:
#Although 'native_country' has a lot of unique categories , most categories only have a few observations
print(X['native-country'].value_counts().sort_values(ascending = False).head())

United-States    43832
Mexico             951
Philippines        295
Germany            206
Puerto-Rico        184
Name: native-country, dtype: int64


In [24]:
X['native-country'] = ['United-States' if x == 'United-States' else 'Other' for x in X['native-country']]

In [25]:
print(X['native-country'].value_counts().sort_values(ascending = False))

United-States    43832
Other             5010
Name: native-country, dtype: int64
