In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix,accuracy_score, r2_score,f1_score, mean_squared_error as mse

In [2]:
df2 = pd.read_csv("data/adult.csv")
df2.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,<=50K
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,<=50K
7,74,State-gov,88638,Doctorate,16,Never-married,Prof-specialty,Other-relative,White,Female,0,3683,20,United-States,>50K
8,68,Federal-gov,422013,HS-grad,9,Divorced,Prof-specialty,Not-in-family,White,Female,0,3683,40,United-States,<=50K
9,41,Private,70037,Some-college,10,Never-married,Craft-repair,Unmarried,White,Male,0,3004,60,?,>50K


In [3]:
df2['income'].replace(['<=50K','>50K'],[1,0],inplace=True)
df2['sex'].replace(['Female','Male'],[1,0],inplace=True)
df2

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,1,0,4356,40,United-States,1
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,1,0,4356,18,United-States,1
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,1,0,4356,40,United-States,1
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,1,0,3900,40,United-States,1
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,1,0,3900,40,United-States,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,0,0,0,40,United-States,1
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,1,0,0,38,United-States,1
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,0,0,0,40,United-States,0
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,1,0,0,40,United-States,1


In [4]:
cols = ['workclass','education','marital.status','occupation','relationship','native.country','race']
df2 = df2.drop(cols, axis=1)
df2

Unnamed: 0,age,fnlwgt,education.num,sex,capital.gain,capital.loss,hours.per.week,income
0,90,77053,9,1,0,4356,40,1
1,82,132870,9,1,0,4356,18,1
2,66,186061,10,1,0,4356,40,1
3,54,140359,4,1,0,3900,40,1
4,41,264663,10,1,0,3900,40,1
...,...,...,...,...,...,...,...,...
32556,22,310152,10,0,0,0,40,1
32557,27,257302,12,1,0,0,38,1
32558,40,154374,9,0,0,0,40,0
32559,58,151910,9,1,0,0,40,1


In [5]:
df2x = df2.iloc[:, :-1].values
df2x

array([[    90,  77053,      9, ...,      0,   4356,     40],
       [    82, 132870,      9, ...,      0,   4356,     18],
       [    66, 186061,     10, ...,      0,   4356,     40],
       ...,
       [    40, 154374,      9, ...,      0,      0,     40],
       [    58, 151910,      9, ...,      0,      0,     40],
       [    22, 201490,      9, ...,      0,      0,     20]], dtype=int64)

In [6]:
df2y = df2.iloc[:,-1].values
df2y

array([1, 1, 1, ..., 0, 1, 1], dtype=int64)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df2x, df2y, test_size=0.3,random_state=5)

In [8]:
clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)

SVC()

In [9]:
yhat = clf.predict(X_test)
yhat [0:5]

array([1, 1, 1, 1, 1], dtype=int64)

In [10]:
r2_score(yhat, y_test)

-4.803812988612246

In [11]:
f1_score(y_test, yhat, average='weighted')

0.7312101402113644

In [12]:
mse(yhat, y_test)

0.2054457979322346

In [13]:
cnf_matrix = confusion_matrix(y_test, yhat)
cnf_matrix

array([[ 351, 1999],
       [   8, 7411]], dtype=int64)

In [14]:
accuracy_score(y_test, yhat)

0.7945542020677654

In [15]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [16]:
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB()

In [17]:
y_pred  =  classifier.predict(X_test)
y_pred[0:5]

array([1, 1, 1, 1, 1], dtype=int64)

In [18]:
confusion_matrix(y_test, y_pred)

array([[ 752, 1598],
       [ 359, 7060]], dtype=int64)

In [19]:
accuracy_score(y_test,y_pred)

0.7996724332070836

In [20]:
f1_score(y_test, y_pred, average='weighted')

0.7715341824208631

In [21]:
r2_score(y_pred, y_test)

-0.9875098736484873

In [22]:
mse(y_pred, y_test)

0.20032756679291636

In [27]:
from sklearn.neighbors import KNeighborsClassifier
k = 6
neigh6 = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)
yhat6 = neigh6.predict(X_test)
print("Train set Accuracy: ", accuracy_score(y_train, neigh6.predict(X_train)))
print("Test set Accuracy: ", accuracy_score(y_test, yhat6))

Train set Accuracy:  0.8545542295542295
Test set Accuracy:  0.8086805200122837
