In [97]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score, precision_score, f1_score, roc_auc_score, roc_curve
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import keras
from keras.models import Sequential
from keras.layers import Dense

In [98]:
# Read the three Adult-dataset CSV files (features, class labels, test rows)
# from the Kaggle input directory in a single pass.
train, train_labels, test = map(
    pd.read_csv,
    (
        '../input/adultdataset/train.csv',
        '../input/adultdataset/train_class_labels.csv',
        '../input/adultdataset/test.csv',
    ),
)
                           
                           

In [99]:
# Preview the first rows of the feature table as loaded from train.csv.
train.head()

In [100]:

# Attach the class labels to the feature rows. Both frames are joined on their
# shared 'Unnamed: 0' column, which then becomes the index of the merged frame.
train = train.merge(train_labels, on='Unnamed: 0').set_index('Unnamed: 0')

# Persist the merged table as train.csv in the working directory.
train.to_csv('train.csv')

In [101]:
# Preview the merged frame (features plus label column, indexed by 'Unnamed: 0').
train.head()

In [102]:
# Column dtypes and non-null counts of the merged frame.
train.info()

In [103]:
# Number of missing values per column, before imputation.
train.isnull().sum()

## Data imputation: replacing missing values with the column mode

In [104]:
# Fill the missing values of these three columns with the column's most
# frequent value (its mode).
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on `train[column]`: that chained form acts on an
# intermediate object, is deprecated in pandas 2.x and leaves `train` unchanged
# when copy-on-write is enabled.
for column in ['workclass', 'occupation', 'native-country']:
    train[column] = train[column].fillna(train[column].mode()[0])

In [105]:
# Re-check the per-column missing-value counts after the mode imputation.
train.isnull().sum()

In [106]:
# Summary statistics of the numeric columns.
train.describe()

## Outliers

In [107]:
# One box plot per numeric column, laid out on a 2x3 grid, to eyeball outliers.
clist = ['fnlwgt','age','capital-gain','capital-loss','hours-per-week']
plt.figure(figsize = (12,6))
for position, column in enumerate(clist, start=1):
    plt.subplot(2, 3, position)
    # Pass the series as `x=` explicitly: the first positional argument of
    # sns.boxplot is `x` in older seaborn releases and `data` in newer ones,
    # so a positional call changes meaning (and plot orientation) by version.
    sns.boxplot(x=train[column], color='skyblue')
# Keep the axis labels of neighbouring panels from overlapping.
plt.tight_layout()
print("BoxPlots of the features:")
plt.show()

In [108]:
# Column names of the merged frame.
train.columns

In [109]:
# (rows, columns) of the merged frame.
train.shape

In [110]:
# Per-column dtypes of the merged frame.
train.dtypes

In [111]:
# The 14 predictor columns used for modelling, in the order they are indexed
# by position further down.
features = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'educational-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
]

# Extract the predictors as a NumPy array so columns can be addressed by position.
X = train[features].to_numpy()

In [112]:
from sklearn.preprocessing import LabelEncoder

# A single encoder instance; it is refitted for every column it is applied to,
# so only the mapping of the last fitted column remains stored on `enc`.
enc = LabelEncoder()

In [113]:
# Positions within `features` of the columns that get label-encoded:
# workclass, education, marital-status, occupation, relationship, race,
# gender and native-country.
encoded_positions = (1, 3, 5, 6, 7, 8, 9, 13)

# Refit the shared encoder on each of those columns and overwrite the column
# of X in place with its integer codes.
for position in encoded_positions:
    X[:, position] = enc.fit_transform(X[:, position])

In [114]:

# Write the encoded values back into `train`.
# The target columns are named through `features` (the list X was built from),
# so values and columns stay aligned even if the frame's column order differs
# from the order of `features`. X is an object array, so it is cast to float to
# avoid leaving every predictor column with dtype=object.
train[features] = X.astype(float)


In [115]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every predictor column (the label slice 'age'..'native-country') to
# MinMaxScaler's default [0, 1] range.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so min/max statistics of the later hold-out rows leak into training; consider
# fitting it on the training split only.
scale = MinMaxScaler()
feature_columns = train.loc[:, 'age':'native-country'].columns
# Replace whole columns instead of writing through `.loc[:, ...]`: the latter
# sets values into the existing columns, so the float results are coerced to
# (or clash with) the previous column dtype.
train[feature_columns] = scale.fit_transform(train[feature_columns])
# Show a preview rather than the whole frame.
train.head()

In [116]:
# Display the label-encoded feature array. Note that X is NOT min-max scaled:
# the scaling step above rewrote the columns of `train` only.
X

In [117]:
# Target vector: the 'income_>50K' label column of the merged frame.
y = train['income_>50K']
y.shape

In [118]:
# Hold out 25% of the rows for evaluation (seeded for repeatability).
# The label column is removed with the `columns=` keyword: passing the axis
# positionally (`drop(label, 1)`) was deprecated in pandas 1.3 and raises a
# TypeError since pandas 2.0.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns='income_>50K'), y, test_size=0.25, random_state=67
)


In [119]:
# Distinct label values and how often each occurs (class balance check).
np.unique(y, return_counts =True)

## Naive Bayes

In [120]:
# Gaussian Naive Bayes: fit on the training split, predict the hold-out split.
nb = GaussianNB().fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

# Apply each metric to the predicted labels, in the order of the names on the left.
# NOTE(review): roc_auc_score is fed hard class predictions here, not scores.
accuracy_nb, recall_nb, precision_nb, f1_nb, roc_nb = (
    metric(y_test, y_pred_nb)
    for metric in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

print(classification_report(y_test,y_pred_nb))

In [121]:
# Annotated confusion-matrix heatmap for the Naive Bayes predictions.
ax = sns.heatmap(confusion_matrix(y_test, y_pred_nb), annot=True, fmt='d', cmap="YlGnBu")
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

## Decision Tree

In [122]:
# Entropy-criterion decision tree with a fixed seed: fit on the training
# split, then predict the hold-out split and print the per-class report.
dct = DecisionTreeClassifier(criterion='entropy', random_state=42).fit(X_train, y_train)
y_pred_dct = dct.predict(X_test)
print(classification_report(y_test, y_pred_dct))

In [123]:
# Annotated confusion-matrix heatmap for the decision-tree predictions.
ax = sns.heatmap(confusion_matrix(y_test, y_pred_dct), annot=True, fmt='d', cmap="YlGnBu")
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

In [124]:
# Decision-tree scores on the hold-out split, one per metric in tuple order.
# NOTE(review): roc_auc_score is fed hard class predictions here, not scores.
accuracy_dct, recall_dct, precision_dct, f1_dct, roc_dct = (
    metric(y_test, y_pred_dct)
    for metric in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

## Random Forest

In [125]:
# Random forest of 50 entropy-criterion trees with a fixed seed: fit on the
# training split, predict the hold-out split and print the per-class report.
rft = RandomForestClassifier(
    n_estimators=50,
    criterion='entropy',
    random_state=67,
).fit(X_train, y_train)
y_pred_rft = rft.predict(X_test)
print(classification_report(y_test,y_pred_rft))

In [126]:
# Annotated confusion-matrix heatmap for the random-forest predictions.
ax = sns.heatmap(confusion_matrix(y_test, y_pred_rft), annot=True, fmt='d', cmap="YlGnBu")
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

In [127]:
# Random-forest scores on the hold-out split, one per metric in tuple order.
# NOTE(review): roc_auc_score is fed hard class predictions here, not scores.
accuracy_rft, recall_rft, precision_rft, f1_rft, roc_rft = (
    metric(y_test, y_pred_rft)
    for metric in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

## kNN

In [128]:
# k-nearest neighbours with k=10 and a Minkowski distance of order p=4:
# fit on the training split, predict the hold-out split, print the report.
knn = KNeighborsClassifier(
    n_neighbors=10,
    metric='minkowski',
    p=4,
).fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
print(classification_report(y_test, y_pred_knn))

In [129]:
# kNN scores on the hold-out split, one per metric in tuple order.
# NOTE(review): roc_auc_score is fed hard class predictions here, not scores.
accuracy_knn, recall_knn, precision_knn, f1_knn, roc_knn = (
    metric(y_test, y_pred_knn)
    for metric in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [130]:
# Annotated confusion-matrix heatmap for the kNN predictions.
ax = sns.heatmap(confusion_matrix(y_test, y_pred_knn), annot=True, fmt='d', cmap="YlGnBu")
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

## Logistic Regression

In [131]:
# Logistic regression with a fixed seed.
lr = LogisticRegression(random_state = 42)

# Fit on the TRAINING split. The model used to be fitted on X_test / y_test and
# then scored on that same data, which reports training performance as if it
# were hold-out performance and makes it incomparable with the other models.
lr.fit(X_train, y_train)

# Predict the untouched hold-out split and print the per-class report.
y_pred_lr = lr.predict(X_test)

print(classification_report(y_test, y_pred_lr))


In [132]:
# Logistic-regression scores against y_test, one per metric in tuple order.
# NOTE(review): roc_auc_score is fed hard class predictions here, not scores.
accuracy_lr, recall_lr, precision_lr, f1_lr, roc_lr = (
    metric(y_test, y_pred_lr)
    for metric in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [133]:
# Annotated confusion-matrix heatmap for the logistic-regression predictions.
ax = sns.heatmap(confusion_matrix(y_test, y_pred_lr), annot=True, fmt='d', cmap="YlGnBu")
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

## SVM(Support Vector Machine)

In [134]:
# Linear-kernel support vector classifier with a fixed seed: fit on the
# training split, predict the hold-out split and print the per-class report.
svm = SVC(kernel='linear', random_state=42).fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
print(classification_report(y_test,y_pred_svm))

In [135]:
# Annotated confusion-matrix heatmap for the SVM predictions.
ax = sns.heatmap(confusion_matrix(y_test, y_pred_svm), annot=True, fmt='d', cmap="YlGnBu")
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

In [136]:
# SVM scores on the hold-out split, one per metric in tuple order.
# NOTE(review): roc_auc_score is fed hard class predictions here, not scores.
accuracy_svm, recall_svm, precision_svm, f1_svm, roc_svm = (
    metric(y_test, y_pred_svm)
    for metric in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

## Comparing the results of all six models

In [137]:
# One row per baseline model: (name, accuracy, recall, precision, F1, ROC AUC).
models = [('Naive Bayes', accuracy_nb, recall_nb, precision_nb, f1_nb, roc_nb),
          ('Decision Tree', accuracy_dct, recall_dct, precision_dct, f1_dct, roc_dct),
          ('Random Forest', accuracy_rft, recall_rft, precision_rft, f1_rft, roc_rft),
          ('kNN', accuracy_knn, recall_knn, precision_knn, f1_knn, roc_knn),
          ('Logistic Regression', accuracy_lr, recall_lr, precision_lr, f1_lr, roc_lr),
          ('SVM', accuracy_svm, recall_svm, precision_svm, f1_svm, roc_svm)]

percent_columns = ['Accuracy (%)', 'Recall (%)', 'Precision (%)', 'F1 (%)']
df_all_models = pd.DataFrame(models, columns = ['Model'] + percent_columns + ['AUC'])
# The sklearn metric functions return fractions in [0, 1] while these column
# headers promise percentages, so scale the four "(%)" columns by 100.
# AUC keeps its native 0-1 scale.
df_all_models[percent_columns] = df_all_models[percent_columns] * 100
df_all_models

In [138]:
# Bar chart of the accuracy column of the comparison table, one bar per model.
fig, ax = plt.subplots(figsize=(12, 10))
sns.barplot(data=df_all_models, x='Model', y='Accuracy (%)', palette='icefire', ax=ax)
ax.set_xlabel("Models")
ax.set_title('Accuracy')
plt.show()

In [139]:
# Reference curve: a constant score of 0 for every hold-out sample.
r_probs = [0] * len(y_test)
r_auc = roc_auc_score(y_test, r_probs)
r_fpr, r_tpr, _ = roc_curve(y_test, r_probs)

# ROC points of each model, derived from its predicted class labels
# (so each curve has only a single interior operating point).
(
    (fpr_nb, tpr_nb, _),
    (fpr_dct, tpr_dct, _),
    (fpr_rft, tpr_rft, _),
    (fpr_knn, tpr_knn, _),
    (fpr_lr, tpr_lr, _),
    (fpr_svm, tpr_svm, _),
) = (
    roc_curve(y_test, predicted)
    for predicted in (y_pred_nb, y_pred_dct, y_pred_rft, y_pred_knn, y_pred_lr, y_pred_svm)
)

In [140]:
sns.set_style('darkgrid')


def _positive_class_scores(model, features):
    """Return a continuous positive-class score for a fitted binary classifier.

    Uses ``predict_proba`` when the model offers it and falls back to
    ``decision_function`` otherwise (e.g. an SVC built without probability=True).
    """
    if hasattr(model, 'predict_proba'):
        return model.predict_proba(features)[:, 1]
    return model.decision_function(features)


# Baseline computed in the previous cell (constant score for every sample).
plt.plot(r_fpr, r_tpr, label='Random prediction (AUROC = %0.3f)' % r_auc)

# A ROC curve needs continuous scores: built from hard 0/1 predictions it
# collapses to a single operating point joined to the corners. Each curve and
# the AUROC in its legend entry are therefore computed here from the same
# model scores, so the two always agree with each other.
fitted_models = [
    ('Naive Bayes', nb),
    ('Decision Tree', dct),
    ('Random Forest', rft),
    ('kNN', knn),
    ('Logistic Regression', lr),
    ('SVM', svm),
]
for model_name, model in fitted_models:
    scores = _positive_class_scores(model, X_test)
    fpr, tpr, _ = roc_curve(y_test, scores)
    plt.plot(fpr, tpr, label='%s (AUROC = %0.3f)' % (model_name, roc_auc_score(y_test, scores)))

plt.title('ROC Plot')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

## SMOTE: balancing the target classes with SMOTETomek

In [142]:
# Bar chart of how many rows fall into each value of the income label.
fig, ax = plt.subplots(figsize=(8, 4))
ax.set_title("Values distribution in target class: Income")
sns.countplot(data=train, x='income_>50K', ax=ax)
plt.show()

In [144]:
from imblearn.combine import SMOTETomek

# Labels and predictors both come from `train`, i.e. the min-max scaled
# features the baseline models were trained on. The raw array `X` was only
# label-encoded and never scaled, so resampling it would make the "after SMOTE"
# models differ from the baseline in feature scaling as well as class balance.
y = train.loc[:,"income_>50K"]
X_scaled = train.drop(columns="income_>50K").to_numpy(dtype=float)

# Seed the resampler so the synthetic samples are the same on every run.
# NOTE(review): resampling happens before the train/test split below, so
# synthetic points derived from future test rows can end up in the training
# set; consider splitting first and resampling the training part only.
smk = SMOTETomek(random_state=42)
X_new, y_new = smk.fit_resample(X_scaled, y)

In [145]:
# Bar chart of the class counts in the resampled label vector.
fig, ax = plt.subplots(figsize=(8, 4))
ax.set_title("Values in target class after using SMOTETomek")
sns.countplot(x=y_new, ax=ax)
plt.show()

As the count plot above shows, the target classes are now balanced after resampling with SMOTETomek.

In [146]:
# (rows, columns) of the resampled feature matrix.
X_new.shape

In [147]:
# 70/30 split of the resampled data (seeded). Note the naming: the features go
# into lowercase x_train / x_test, but y_train / y_test are overwritten, so the
# earlier uppercase X_train / X_test no longer match the current labels.
x_train, x_test, y_train, y_test = train_test_split(X_new, y_new, test_size = 0.30, random_state =42)

##  Logistic Regression

In [148]:
# Logistic regression with a fixed seed, on the resampled data.
lr = LogisticRegression(random_state = 42)

# Fit on the TRAINING split. The model used to be fitted on x_test / y_test and
# then scored on that same data, which reports training performance as if it
# were hold-out performance and makes it incomparable with the other models.
lr.fit(x_train, y_train)

# Predict the untouched hold-out split and print the per-class report.
y_pred_lr = lr.predict(x_test)

print(classification_report(y_test, y_pred_lr))


In [149]:
# Logistic-regression scores against y_test, one per metric in tuple order.
# NOTE(review): roc_auc_score is fed hard class predictions here, not scores.
accuracy_lr, recall_lr, precision_lr, f1_lr, roc_lr = (
    metric(y_test, y_pred_lr)
    for metric in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

##  Decision Tree Classifier


In [150]:
# Entropy-criterion decision tree with a fixed seed, refitted on the resampled
# training split; predicts the resampled hold-out split and prints the report.
dct = DecisionTreeClassifier(criterion='entropy', random_state=42).fit(x_train, y_train)
y_pred_dct = dct.predict(x_test)
print(classification_report(y_test, y_pred_dct))

In [151]:
# Decision-tree scores on the resampled hold-out split, in tuple order.
# NOTE(review): roc_auc_score is fed hard class predictions here, not scores.
accuracy_dct, recall_dct, precision_dct, f1_dct, roc_dct = (
    metric(y_test, y_pred_dct)
    for metric in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

##  K-Nearest Neighbour Classifier


In [152]:
# k-nearest neighbours (k=10, Minkowski distance of order p=4), refitted on the
# resampled training split; predicts the resampled hold-out split.
knn = KNeighborsClassifier(
    n_neighbors=10,
    metric='minkowski',
    p=4,
).fit(x_train, y_train)
y_pred_knn = knn.predict(x_test)
print(classification_report(y_test, y_pred_knn))

In [153]:
# kNN scores on the resampled hold-out split, in tuple order.
# NOTE(review): roc_auc_score is fed hard class predictions here, not scores.
accuracy_knn, recall_knn, precision_knn, f1_knn, roc_knn = (
    metric(y_test, y_pred_knn)
    for metric in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

## Random Forest Classifier


In [154]:
# Random forest of 50 entropy-criterion trees with a fixed seed, refitted on
# the resampled training split; predicts the resampled hold-out split.
rft = RandomForestClassifier(
    n_estimators=50,
    criterion='entropy',
    random_state=67,
).fit(x_train, y_train)
y_pred_rft = rft.predict(x_test)
print(classification_report(y_test,y_pred_rft))

In [155]:
# Random-forest scores on the resampled hold-out split, in tuple order.
# NOTE(review): roc_auc_score is fed hard class predictions here, not scores.
accuracy_rft, recall_rft, precision_rft, f1_rft, roc_rft = (
    metric(y_test, y_pred_rft)
    for metric in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

##  GaussianNB


In [156]:
# Gaussian Naive Bayes refitted on the resampled training split; predicts the
# resampled hold-out split and prints the per-class report.
nb = GaussianNB().fit(x_train, y_train)
y_pred_nb = nb.predict(x_test)
print(classification_report(y_test,y_pred_nb))

In [157]:
# Naive Bayes scores on the resampled hold-out split, in tuple order.
# NOTE(review): roc_auc_score is fed hard class predictions here, not scores.
accuracy_nb, recall_nb, precision_nb, f1_nb, roc_nb = (
    metric(y_test, y_pred_nb)
    for metric in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [159]:
# One row per model: (name, accuracy, recall, precision, F1, ROC AUC), using
# the scores recomputed after SMOTETomek resampling.
models = [('Naive Bayes', accuracy_nb, recall_nb, precision_nb, f1_nb, roc_nb),
          ('Decision Tree', accuracy_dct, recall_dct, precision_dct, f1_dct, roc_dct),
          ('Random Forest', accuracy_rft, recall_rft, precision_rft, f1_rft, roc_rft),
          ('kNN', accuracy_knn, recall_knn, precision_knn, f1_knn, roc_knn),
          ('Logistic Regression', accuracy_lr, recall_lr, precision_lr, f1_lr, roc_lr),
          # The SVM was never retrained on the resampled data: its scores still
          # come from the model fitted before SMOTETomek, on the earlier split.
          # The row is labelled accordingly so it is not read as a like-for-like
          # result.
          ('SVM (pre-SMOTE)', accuracy_svm, recall_svm, precision_svm, f1_svm, roc_svm)]

percent_columns = ['Accuracy (%)', 'Recall (%)', 'Precision (%)', 'F1 (%)']
df_all_models = pd.DataFrame(models, columns = ['Model'] + percent_columns + ['AUC'])
# The sklearn metric functions return fractions in [0, 1] while these column
# headers promise percentages, so scale the four "(%)" columns by 100.
# AUC keeps its native 0-1 scale.
df_all_models[percent_columns] = df_all_models[percent_columns] * 100
df_all_models

In [160]:
# Bar chart of the accuracy column of the comparison table, one bar per model.
fig, ax = plt.subplots(figsize=(12, 10))
sns.barplot(data=df_all_models, x='Model', y='Accuracy (%)', palette='icefire', ax=ax)
ax.set_xlabel("Models")
ax.set_title('Accuracy')
plt.show()

In [161]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of each classifier on the x_test / y_test partition.
# For every model the per-fold scores are printed, followed by their mean
# expressed in percent.
cv_arguments = dict(X=x_test, y=y_test, cv=10)

lr_scores = cross_val_score(lr, **cv_arguments)
print(lr_scores)
print(f"Mean of accuracy scores is for Logistic Regression is {lr_scores.mean()*100}\n")

dct_scores = cross_val_score(dct, **cv_arguments)
print(dct_scores)
print(f"Mean of accuracy scores is for Decision Tree Classifier is {dct_scores.mean()*100}\n")

knn_scores = cross_val_score(knn, **cv_arguments)
print(knn_scores)
print(f"Mean of accuracy scores is for KNN Classifier is {knn_scores.mean()*100}\n")

rft_scores = cross_val_score(rft, **cv_arguments)
print(rft_scores)
print(f"Mean of accuracy scores is for Random Forest Classifier is {rft_scores.mean()*100}\n")

nb_scores = cross_val_score(nb, **cv_arguments)
print(nb_scores)
print(f"Mean of accuracy scores is for GaussianNB is {nb_scores.mean()*100}\n")



In [162]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the random forest `rft` on x_test / y_test,
# stored under the name `rfc_scores`; prints the fold scores and their mean in percent.
rfc_scores = cross_val_score(estimator=rft, X=x_test, y=y_test, cv=10)
print(rfc_scores)
print(f"Mean of accuracy scores is for Random Forest Classifier is {rfc_scores.mean()*100}\n")

# Hyper Parameter Tuning for models

## 1) Random Forest

In [163]:
from sklearn.model_selection import GridSearchCV

# Seed the forest so the search result is repeatable between runs.
rfc = RandomForestClassifier(random_state=42)

# Candidate hyperparameters for the exhaustive search.
param = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [1, 2, 10, 100],
    # min_samples_split must be an integer >= 2 (or a fraction in (0, 1]).
    # The former candidate 1 is rejected by scikit-learn, so every fit using it
    # failed and only produced NaN scores and warnings.
    'min_samples_split': [2, 5, 8, 10],
}

# 5-fold grid search on the training split, ranked by F1 score.
gs = GridSearchCV(estimator = rfc, param_grid = param, scoring='f1', cv = 5, n_jobs = 3)
gs.fit(x_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

In [165]:
# Build the tuned forest from the grid-search result instead of a hand-copied
# set of values, so this cell cannot drift out of sync with the search above,
# and seed it so the reported score is repeatable.
rfc = RandomForestClassifier(**gs.best_params_, random_state=42)
rfc.fit(x_train, y_train)
# Mean accuracy on the hold-out split.
print(rfc.score(x_test, y_test))
pred_rfc = rfc.predict(x_test)


In [166]:
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import confusion_matrix, classification_report

# Text summary of the tuned random forest on the hold-out split.
print("Accuracy Score of RFC model is", accuracy_score(y_test, pred_rfc)*100)
print("Confusion matrix for RFC Model is")
print(confusion_matrix(y_test, pred_rfc))
print("Classification Report of the RFC Model is")
print(classification_report(y_test, pred_rfc))

# ROC curve of the fitted model. `plot_roc_curve` was deprecated in
# scikit-learn 1.0 and removed in 1.2; RocCurveDisplay.from_estimator is its
# replacement. The old helper is used only where the new one is unavailable.
if hasattr(RocCurveDisplay, "from_estimator"):
    RocCurveDisplay.from_estimator(rfc, x_test, y_test)
else:  # scikit-learn < 1.0
    from sklearn.metrics import plot_roc_curve
    plot_roc_curve(rfc, x_test, y_test)
plt.title("Recevier's Operating Characteristic")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()

In [167]:
from sklearn.metrics import confusion_matrix

# Annotated confusion-matrix heatmap for the tuned random forest.
cm = confusion_matrix(y_test, pred_rfc)
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt="d", ax=ax)
ax.set_xlabel("Predicted Value")
ax.set_ylabel("Truth")
plt.show()

In [168]:
# Scores of the tuned random forest, computed from its hold-out predictions in
# the same way as the other rows. Without this row the table would simply
# repeat the previous comparison, because none of the *_rft variables were
# updated by the tuning step.
tuned_rf_row = ('Random Forest (tuned)',
                accuracy_score(y_test, pred_rfc),
                recall_score(y_test, pred_rfc),
                precision_score(y_test, pred_rfc),
                f1_score(y_test, pred_rfc),
                roc_auc_score(y_test, pred_rfc))

# One row per model: (name, accuracy, recall, precision, F1, ROC AUC).
models = [('Naive Bayes', accuracy_nb, recall_nb, precision_nb, f1_nb, roc_nb),
          ('Decision Tree', accuracy_dct, recall_dct, precision_dct, f1_dct, roc_dct),
          ('Random Forest', accuracy_rft, recall_rft, precision_rft, f1_rft, roc_rft),
          tuned_rf_row,
          ('kNN', accuracy_knn, recall_knn, precision_knn, f1_knn, roc_knn),
          ('Logistic Regression', accuracy_lr, recall_lr, precision_lr, f1_lr, roc_lr),
          # The SVM was never retrained on the resampled data: its scores still
          # come from the model fitted before SMOTETomek, on the earlier split.
          ('SVM (pre-SMOTE)', accuracy_svm, recall_svm, precision_svm, f1_svm, roc_svm)]

percent_columns = ['Accuracy (%)', 'Recall (%)', 'Precision (%)', 'F1 (%)']
df_all_models = pd.DataFrame(models, columns = ['Model'] + percent_columns + ['AUC'])
# The sklearn metric functions return fractions in [0, 1] while these column
# headers promise percentages, so scale the four "(%)" columns by 100.
# AUC keeps its native 0-1 scale.
df_all_models[percent_columns] = df_all_models[percent_columns] * 100
df_all_models

## 2) Decision Tree Classifier

In [170]:
from sklearn.model_selection import GridSearchCV

# Base estimator; its `criterion` is overridden by the grid below.
dct = DecisionTreeClassifier(criterion = 'entropy', random_state = 42)

# Candidate hyperparameters for the exhaustive search.
params = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100,150],
    'criterion': ["gini", "entropy"]
}

# 5-fold grid search ranked by accuracy.
grid_search = GridSearchCV(estimator=dct, param_grid=params,cv=5, n_jobs=1, verbose=1, scoring = "accuracy")
# Tune on the TRAINING split. Searching on x_test / y_test selects the
# hyperparameters using the very data the tuned tree is evaluated on
# afterwards, which makes that evaluation optimistically biased.
grid_search.fit(x_train, y_train)

In [171]:
# Hyperparameter combination with the best mean cross-validated score.
grid_search.best_params_

In [172]:
# Mean cross-validated score achieved by that best combination.
grid_search.best_score_

In [173]:

# Build the tuned tree from the grid-search result instead of a hand-copied set
# of values, so this cell cannot drift out of sync with the search above, and
# seed it so the fitted tree is repeatable.
dct = DecisionTreeClassifier(**grid_search.best_params_, random_state=42)
dct.fit(x_train,y_train)

In [174]:
# Hold-out predictions of the tuned decision tree.
y_pred = dct.predict(x_test)

In [175]:
from sklearn.metrics import confusion_matrix

# Annotated confusion-matrix heatmap for the tuned decision tree.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt="d", ax=ax)
ax.set_xlabel("Predicted Value")
ax.set_ylabel("Truth")
plt.show()

In [177]:
# Text summary of the tuned decision tree on the hold-out split: accuracy in
# percent, then the confusion matrix and the per-class report, each under its heading.
print("Accuracy Score of Decision Tree model is", 100 * accuracy_score(y_test, y_pred))
for heading, detail in (
    ("Confusion matrix for Decision Tree Model is", confusion_matrix(y_test, y_pred)),
    ("Classification Report of the Decision Tree Model is", classification_report(y_test, y_pred)),
):
    print(heading)
    print(detail)

# PCA on data 

In [178]:
# Performing PCA
from sklearn.decomposition import PCA

# First pass: fit an unrestricted PCA only to see how the explained variance
# accumulates as components are added.
pca = PCA()
pca.fit(x_train)

plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of components')
plt.ylabel('Cumulative variance')
# Pass a real boolean: the string 'True' only worked because any non-empty
# string is truthy ('False' would have switched the grid on as well).
plt.grid(True)

# Second pass: keep just as many components as are needed to capture 95% of
# the variance, and project the training features onto them.
pca = PCA(0.95)
pca.fit(x_train)

training_features_transformed = pca.transform(x_train)

train_data_array=np.asarray(training_features_transformed)
# The labels must come from the same split as the features: these rows are the
# x_train samples, so they pair with y_train (y_test has a different length and
# belongs to different rows).
training_label_array=np.asarray(y_train)