<h1>Import and preparing library</h1>

In [0]:
import itertools

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np

import pandas as pd
import scipy.optimize as opt
from sklearn import metrics
from sklearn import preprocessing
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics import log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
% matplotlib inline

<h1>Get Data and push into dataframe</h1>

In [0]:
link_train = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
link_test = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
headers = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race',\
           'sex','capital-gain','capital-loss','hours-per-week','native-country', 'possible']

train_dataframe = pd.read_csv(link_train, header=None, names=headers)
test_dataframe = pd.read_csv(link_test, header=None, names=headers)

test_dataframe.drop([0, 0], inplace=True)
test_dataframe = test_dataframe.reset_index()

labels = ['education','workclass','marital-status','occupation','relationship','race','native-country']
for label in labels:
  train_dataframe[label] = train_dataframe[label].astype('str')
  test_dataframe[label] = test_dataframe[label].astype('str')

train_dataframe['sex'] = preprocessing.LabelEncoder().fit([' Male',' Female']).transform(train_dataframe['sex'])
test_dataframe['sex'] = preprocessing.LabelEncoder().fit([' Male',' Female']).transform(test_dataframe['sex'])
train_dataframe['possible'] = preprocessing.LabelEncoder().fit([' <=50K',' >50K']).transform(train_dataframe['possible'])
test_dataframe['possible'] = preprocessing.LabelEncoder().fit([' <=50K.',' >50K.']).transform(test_dataframe['possible'])

<h2>One-hot encoding within a DataFrame </h2>

In [0]:
def oneHotEncoding(label):
  Feature = train_dataframe[[label]] #Other columns to be included
  Feature = pd.concat([train_dataframe[[label]],pd.get_dummies(train_dataframe[label])], axis=1) #One-hot encoding all categorical data in [label] column
  Feature.drop([label], axis = 1,inplace=True) #Remove column named label
  return Feature

train_data = train_dataframe
test_data = test_dataframe

for label in labels:
  train_data = pd.concat([train_data, oneHotEncoding(label)], axis=1)
  test_data = pd.concat([test_data, oneHotEncoding(label)], axis=1)
  train_data.drop(label, axis = 1,inplace=True)
  test_data.drop(label, axis = 1,inplace=True)

train_data.drop([' ?'], axis=1, inplace=True)
test_data.drop([' ?'], axis=1, inplace=True)
train_data = train_data.fillna(0)
test_data = test_data.fillna(0)

In [0]:
columns_for_datas = train_data.columns.values.tolist()
columns_for_datas.remove('possible')
columns_for_datas

#KNN

##Prepare Data

In [0]:
y_train = train_data['possible']
X_train = train_data[['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']]
y_test = test_data['possible']
X_test = test_data[['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']]

##Centering and scaling

In [0]:
X_train = preprocessing.StandardScaler().fit(X_train).transform(X_train.astype(float))
X_test = preprocessing.StandardScaler().fit(X_test).transform(X_train.astype(float))

##Modeling

In [0]:
k = 4
neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)

##Prediction

In [0]:
yhat = neigh.predict(X_test)

##Evalution

In [9]:
jaccard_similarity_score(y_test, yhat)

0.8818832345443935

In [10]:
print("Train set Accuracy: ", metrics.accuracy_score(y_train, neigh.predict(X_train)))
print("Test set Accuracy : ", metrics.accuracy_score(y_test, yhat))

Train set Accuracy:  0.8602929885445779
Test set Accuracy :  0.8818832345443935


#Decision Tree

##Prepare Data

In [0]:
y_train = train_data['possible']
X_train = train_data[['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']]
y_test = test_data['possible']
X_test = test_data[['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']]

##Centraling and Scalling

In [0]:
X_train = preprocessing.StandardScaler().fit(X_train).transform(X_train.astype(float))
X_test = preprocessing.StandardScaler().fit(X_test).transform(X_train.astype(float))

##Modeling

In [0]:
incomeTree = DecisionTreeClassifier(criterion="entropy", max_depth = 4)

In [14]:
incomeTree.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

##Prediction

In [0]:
yhat = incomeTree.predict(X_test)

##Evalution

###Jaccard Score

In [16]:
jaccard_similarity_score(y_test, yhat)

0.8818832345443935

In [17]:
print("DecisionTrees's Accuracy: ", metrics.accuracy_score(y_test, yhat))

DecisionTrees's Accuracy:  0.8818832345443935


#Logistic Regression

##Prepare Data

In [0]:
y_train = train_data['possible']
X_train = train_data[['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']]
y_test = test_data['possible']
X_test = test_data[['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']]

##Scaling

In [0]:
X_train = preprocessing.StandardScaler().fit(X_train).transform(X_train.astype(float))
X_test = preprocessing.StandardScaler().fit(X_test).transform(X_train.astype(float))

##Modeling

In [0]:
LR = LogisticRegression(C=0.01, solver='liblinear').fit(X_train,y_train)

##Prediction

In [0]:
yhat = LR.predict(X_test)

In [0]:
yhat_prob = LR.predict_proba(X_test)

##Evalution

###Log loss

In [23]:
log_loss(y_test, yhat_prob)

0.4253673255125806

###F1 Score

In [24]:
f1_score(y_test, yhat, average='weighted') 

  'precision', 'predicted', average, warn_for)


0.8265316626392801

###Jaccard Index

In [25]:
jaccard_similarity_score(y_test, yhat)

0.8818832345443935

#SVM

##Prepare Data

In [0]:
# y_train = train_data['possible']
# X_train = train_data[columns_for_datas]
# y_test = test_data['possible']
# X_test = test_data[columns_for_datas]

y_train = train_data['possible']
X_train = train_data[['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']]
y_test = test_data['possible']
X_test = test_data[['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']]

##Scaling Data

In [0]:
X_train = preprocessing.StandardScaler().fit(X_train).transform(X_train.astype(float))
X_test = preprocessing.StandardScaler().fit(X_test).transform(X_train.astype(float))

##Modeling

In [28]:
clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

##Prediction

In [0]:
yhat = clf.predict(X_test)

##Evaluation

###F1 Score

In [30]:
f1_score(y_test, yhat, average='weighted') 

  'precision', 'predicted', average, warn_for)


0.8265316626392801

###Jaccard Score

In [31]:
jaccard_similarity_score(y_test, yhat)

0.8818832345443935

<h1></h1>

<h1>Choosing Logistic Regression</h1>
<h2>Because this model have more properties for evaluation than others 
  Jaccard index
  F1-score
  LogLoss
</h2>