In [71]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier

## Part 1 (Preprocessing)

In [72]:
df = pd.read_csv("adult_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,age,workClass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,0,36,State-gov,77516,Bachelors,1,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,74,United-States,<=50K
1,1,54,Self-emp-not-inc,83311,Bachelors,25,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,39,United-States,<=50K
2,2,26,Private,215646,HS-grad,3,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,71,United-States,<=50K
3,3,71,Private,234721,11th,9,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,10,United-States,<=50K
4,4,33,Private,338409,Bachelors,20,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,75,Cuba,<=50K


- Convert income values to 0 or 1
- Drop columns which are redundant or do not contribute much to the prediction

In [73]:
print(df.income.unique()) #See what types of income we have; <=50K, >50K, <=50K., >50K.
df.income = df.income.replace(to_replace=['<=50K', '>50K', '<=50K.', '>50K.'], value=[0,1,0,1])
df.drop(['fnlwgt', 'education', 'capital-gain','capital-loss', 'native-country'], axis=1, inplace=True)

['<=50K' '>50K' '<=50K.' '>50K.']


- Replace missing values with the most frequent occuring value of that column 

In [74]:
print(df.isnull().sum(axis=0)) #See which columns contain missing values; workClass and occupation 
print("Values of workClass column", df.workClass.unique())
print("\nValues of occupation column", df.occupation.unique())
df.workClass.fillna(df.workClass.value_counts().idxmax(),inplace=True) #Fill missing values with most frequent one
df.occupation.fillna(df.occupation.value_counts().idxmax(),inplace=True)
df.isnull().sum(axis=0) #No null values now

Unnamed: 0           0
age                  0
workClass         2799
education-num        0
marital-status       0
occupation        2809
relationship         0
race                 0
sex                  0
hours-per-week       0
income               0
dtype: int64
Values of workClass column ['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' nan
 'Self-emp-inc' 'Without-pay' 'Never-worked']

Values of occupation column ['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' nan
 'Protective-serv' 'Armed-Forces' 'Priv-house-serv']


Unnamed: 0        0
age               0
workClass         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
hours-per-week    0
income            0
dtype: int64

- Transfer the categorical values to numerical values through one-hot encoding

In [75]:
df = pd.get_dummies(df,["workClass", "marital-status	", "occupation", "relationship", "race", "sex"])

- Split the dataset to train and test (70/30)

In [76]:
Train_X, Test_X, Train_y, Test_y = train_test_split(df.drop(columns=['income']), df.income, test_size=0.3)
print(Train_X.shape, Train_y.shape)
print(Test_X.shape, Test_y.shape)

(34189, 46) (34189,)
(14653, 46) (14653,)


## Part 2. Classification with K-Nearest Neighbour Classifier (KNN)

In [77]:
#https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
knncl = KNeighborsClassifier()
knncl.fit(Train_X, Train_y)
knn_y_pred = knncl.predict(Test_X)
knn_acc = accuracy_score(Test_y, knn_y_pred)

## Part 3. Classification with Decision Tree Classifier

In [78]:
#https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
dtcl = DecisionTreeClassifier()
dtcl.fit(Train_X, Train_y)
dtcl_y_pred = dtcl.predict(Test_X)
dtcl_acc = accuracy_score(Test_y, dtcl_y_pred)

Compare the accuracy results from the KNN and Decision Tree classifiers.

In [79]:
print("KNN accuracy ", knn_acc)
print("KNN confusion matrix: \n", confusion_matrix(Test_y, y_pred), "\n")
print("Decision Tree accuracy ", dtcl_acc)
print("Decision tree confusion matrix: \n", confusion_matrix(Test_y, y_pred))

KNN accuracy  0.7128915580427216
KNN confusion matrix: 
 [[8449 2724]
 [2586  894]] 

Decision Tree accuracy  0.7385518323892718
Decision tree confusion matrix: 
 [[8449 2724]
 [2586  894]]
