In [305]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import numpy as np
%matplotlib inline

In [306]:
data = pd.read_csv('./dataset/income.csv')
data.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income >50K
0,66,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [307]:
## Check that there are no missing values
data.isnull().sum()

age                  0
workclass         1836
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
income >50K          0
dtype: int64

## Data processing
Going through our data we noticed that it had some empty values in the workclass and occupation columns. Given the size of our dataset we feel safe removing them

In [308]:
# Replace all NaN in workclass, occupation and native-country to Unemployed
data.dropna(subset = ["workclass", "occupation", "native-country"], axis = 0, inplace = True)
data.isnull().sum()

age               0
workclass         0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income >50K       0
dtype: int64

## Discretization
There are 4 non-discrete columns: education-num, capital-gains, capital-loss, hours-per-week. 

Firstly, we will be dropping the education-num column as it is just a coded version of the education column.

capital-gains: <5000, 5000-10000, >10000  
capital-loss: this feature doesn't seem to distinguish anything so we will yeet it.  
hours-per-week: <20, 20-40, 40-60, >60  


In [309]:
# Start by dropping columns we don't need
data.drop(['education-num'], axis=1, inplace=True)

# Now we need to discretized our values
ageBins = (0, 18, 25, 35, 45, 55, 65, 999)
ageLabels = ['<18', '18-25', '25-35', '35-45', '45-55', '55-65', '>65']

## set our lower bound to -1 to include 0 in the first bin
capitalGainBins = (-1, 5000, 10000, 99999999)
capitalGainLabels = ['<5000', '5000-10000', '>10000']

hoursPerWeekBins = (-1, 20, 30, 40, 50, 60, 999999)
hoursPerWeekLabels = ['<20', '20-30', '30-40', '40-50', '40-60', '>60']

capitalLossBins = (-1, 5000, 10000, 99999999)
capitalLossLabels = ['<5000', '5000-10000', '>10000']


data['age'] = pd.cut(data['age'], bins=ageBins, labels=ageLabels)
data['capital-gain'] = pd.cut(data['capital-gain'], bins=capitalGainBins, labels=capitalGainLabels)
data['hours-per-week'] = pd.cut(data['hours-per-week'], bins=hoursPerWeekBins, labels=hoursPerWeekLabels)
data['capital-loss'] = pd.cut(data['capital-loss'], bins=capitalLossBins, labels=capitalLossLabels)


Now that our preprocessing is done, we are ready to classify

In [310]:
encode = LabelEncoder()

data['capital-gain'] = encode.fit_transform(data['capital-gain'])
data['workclass'] = encode.fit_transform(data['workclass'])
data['education'] = encode.fit_transform(data['education'])
data['occupation'] = encode.fit_transform(data['occupation'])
data['race'] = encode.fit_transform(data['race'])
data['sex'] = encode.fit_transform(data['sex'])
data['native-country'] = encode.fit_transform(data['native-country'])
data['marital-status'] = encode.fit_transform(data['marital-status'])
data['relationship'] = encode.fit_transform(data['relationship'])
data['hours-per-week'] = encode.fit_transform(data['hours-per-week'])
data['capital-loss'] = encode.fit_transform(data['capital-loss'])
data['capital-gain'] = encode.fit_transform(data['capital-gain'])

In [312]:
X = data.drop('income >50K', axis=1)
y = data['income >50K']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

gnb = GaussianNB()

y_pred = gnb.fit(X_train, y_train).predict(X_test)

print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))


Number of mislabeled points out of a total 9049 points : 1896
