Libraries

In [7]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

Reading training and test data

In [8]:
column=['age', 'workclass', 'fnlwgt', 'education', 'education_num','marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income']
train_data = pd.read_csv('adult.data', names=column)
test_data = pd.read_csv('adult.test', names=column)

Displaying data

In [23]:
print("training data\n")
print(train_data.head())
print("testing data\n")
print(test_data.head())

training data

   age  fnlwgt  education_num  capital_gain  capital_loss  hours_per_week  \
0   39   77516             13          2174             0              40   
1   50   83311             13             0             0              13   
2   38  215646              9             0             0              40   
3   53  234721              7             0             0              40   
4   28  338409             13             0             0              40   

   income  workclass_?  workclass_Federal-gov  workclass_Local-gov  ...  \
0       0            0                      0                    0  ...   
1       0            0                      0                    0  ...   
2       0            0                      0                    0  ...   
3       0            0                      0                    0  ...   
4       0            0                      0                    0  ...   

   native_country_Portugal  native_country_Puerto-Rico  \
0            

Training samples data cleaning

In [9]:
#Dropping coloums with duplicte data as both column education and education num contain data of same kind the only difference
#is that the data of education is categorical and of education_num is numerical
train_data.drop('education', inplace=True, axis=1)
columns = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country', 'income']
#removing spaces from data so that data can be accessed easily
for column in columns:
    train_data[column] = train_data[column].str.strip()
#assigning binary lables to the output class 
train_data['income'] = train_data.income.replace({"<=50K": 0, ">50K": 1})
# Using get_dummies to convert the categorical variable/feature to numerical variable
train_data = pd.get_dummies(train_data)
#splitting in x_train and y_train sets
X_train = train_data.drop('income', axis=1)
X_train = X_train.drop('native_country_Holand-Netherlands', axis=1)
y_train = train_data['income']

Test samples data cleaning

In [10]:
#Dropping coloums with duplicte data as both column education and education num contain data of same kind the only difference
#is that the data of education is categorical and of education_num is numerical
test_data.drop('education', inplace=True, axis=1)
#removing spaces from data so that data can be accessed easily
columns = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country', 'income']
for column in columns:
    test_data[column] = test_data[column].str.strip()
#assigning binary lables to the output class 
test_data['income'] = test_data.income.replace({"<=50K.": 0, ">50K.": 1})
# Using get_dummies to convert the categorical variable/feature to numerical variable
test_data = pd.get_dummies(test_data)
#splitting in x_test and y_test sets
X_test = test_data.drop('income', axis=1)
y_test = test_data['income']

Sklearn KNN

In [11]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
predict = knn.predict(X_test)
print("KNN classifier")
print(f"Accuracy: {accuracy_score(y_test, predict)}")
tp, fn, fp, tn = confusion_matrix(y_test, predict,labels=[1,0]).reshape(-1)
print("Precision:", tp/(tp+fp))
print("Recall:", tp/(tp+fn))
print("Specificity:", tn/(tn+fp))
print("F1-score:", f1_score(y_test, predict))

KNN classifier
Accuracy: 0.7766107732940237
Precision: 0.5461776403004861
Recall: 0.3213728549141966
Specificity: 0.9174105347808604
F1-score: 0.40464887870355215


Sklearn Naive Bayes

In [12]:
nb = GaussianNB()
nb_fit = nb.fit(X_train,y_train)
prdeict = nb_fit.predict(X_test)
print("Naive Bayes classifier")
print(f"Accuracy: {accuracy_score(y_test, prdeict)}")
tp, fn, fp, tn = confusion_matrix(y_test, prdeict,labels=[1,0]).reshape(-1)
print("Precision:", tp/(tp+fp))
print("Recall:", tp/(tp+fn))
print("Specificity:", tn/(tn+fp))
print("F1-score:", f1_score(y_test, prdeict))

Naive Bayes classifier
Accuracy: 0.7957742153430379
Precision: 0.6427397260273973
Recall: 0.3049921996879875
Specificity: 0.94756735022115
F1-score: 0.41368365367660026


Sklearn Logistic Regression 

In [13]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
predict = lr.predict(X_test)
print("Logistic Regression classifier")
print(f"Accuracy: {accuracy_score(y_test, predict)}")
tp, fn, fp, tn = confusion_matrix(y_test, predict,labels=[1,0]).reshape(-1)
print("Precision:", tp/(tp+fp))
print("Recall:", tp/(tp+fn))
print("Specificity:", tn/(tn+fp))
print("F1-score:", f1_score(y_test, predict))

Logistic Regression classifier
Accuracy: 0.7978011178674529
Precision: 0.6915629322268326
Recall: 0.26001040041601664
Specificity: 0.9641334941696823
F1-score: 0.37792894935752075


Logistic Regression Model from scratch

In [21]:
def train(x,y,iteration,lr):
    m,n = x.shape
    w = np.zeros(n)      
    b=0
    for i in range(iteration):            
        update_weights(m,n,w,b,x,y,lr)     
    return w,x,b

def update_weights(m,n,w,b,X,Y,lr):
    #wx+b
    wxb = X.dot(w) + b
    #taking exp
    exp_wxb = np.exp(- wxb)
    inv = 1/(1+exp_wxb)
    
    #gradient calculation
    g = inv - Y.T
    res = np.reshape(g,m)
    dw = np.dot(X.T, res)
    dw /= m
    db = np.sum(res) / m
    
    #weight updation
    w = w - lr * dw   
    b = b - lr * db

def predict( w,X,b ) :    
    #wx+b
    wxb = X.dot(w) + b
    #taking exp
    exp_wxb = np.exp(- wxb)
    inv = 1/(1+exp_wxb)  
    res = np.where( inv > 0.5, 1, 0 )        
    return res
    
w,x,b=train( X_train, y_train,100,0.01 )    
prediction = predict(w,X_test,b)  
print(f"Accuracy: {accuracy_score(y_test, prediction)}") 

Accuracy: 0.7637737239727289


# Comparative Analysis

Logistic regression classifier produces the maximum accuracy for the given census income dataset. Logistic regression classifier implemented from scratch also produces good results which are almost similar to the ones produced by the sklearn model which is an accuracy of 76%. 

Evaluation of all three models have been drawn using the evaluation metrics accuracy,precision,recall,f1 score and specificity. Confusion matrix is obtained and on the basis of the true positives,true negatives, false positives and false negatives obtained we can see that the logistic regression has the max precision of 0.69 which tells us that the model has classified the actual labels 1 correctly for most cases. Similarly naive bayes has the highest recall value and logistic regression has the lowest. As 1 F1-score is the best and our model naives bayes produces the max f1 score of 0.41 out of the three models which tells us that naive bayes classifier worked best in classification of both the classes that is income > 50k and income <= 50k correctly.