In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'            # set before tensorflow is imported below; intended to quiet TensorFlow's native log output
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
# NOTE(review): plot_confusion_matrix is not used in the visible cells and, as far
# as I know, was removed from scikit-learn in 1.2 (superseded by
# ConfusionMatrixDisplay). Confirm the pinned scikit-learn version; on a newer
# release this import fails and the notebook cannot run from a fresh kernel.
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

# Plotting libraries used for the training curves and the confusion-matrix heatmap.
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the train and test splits of the income dataset into DataFrames.
train, test = map(
    pd.read_csv,
    ('../input/income/train.csv', '../input/income/test.csv'),
)

In [None]:
# Store the values of 'educational-num' under the name 'education', then
# remove the now-redundant 'educational-num' column from both frames.
train = (
    train
    .assign(education=train['educational-num'])
    .drop(columns=['educational-num'])
)
test = (
    test
    .assign(education=test['educational-num'])
    .drop(columns=['educational-num'])
)

In [None]:
def _report_columns(frame, title):
    """Print per-column distinct-value and null counts for ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        The data set to inspect.
    title : str
        Header line printed before the per-column report.

    Returns
    -------
    list
        Names of the columns that contain at least one null value.
    """
    columns_with_null = []
    print(title)
    for col in frame.columns:
        # nunique(dropna=False) counts a missing value once; len(set(...)) would
        # count every float NaN separately because NaN != NaN.
        unique_count = frame[col].nunique(dropna=False)
        null_count = frame[col].isnull().sum()
        if null_count != 0:
            columns_with_null.append(col)
        print(f'[{col}] unique data: {unique_count}. with {null_count} nulls')
    return columns_with_null


train_null = _report_columns(train, '=== Train Data ===')

print()
# The test report must inspect `test` itself (the earlier version re-read `train`
# here, so the test statistics were silently a copy of the train statistics).
test_null = _report_columns(test, '=== Test Data ===')

print()
print(f'Train data with null: {train_null}')
print(f'Test data with null: {test_null}')

In [None]:
# Discard every row that contains at least one missing value, in both splits.
train, test = (frame.dropna() for frame in (train, test))

def dictionarize(data):
    """Build a value -> integer-code mapping for the distinct values in ``data``.

    Codes start at 1 and are assigned in sorted order of the distinct values, so
    the mapping is identical on every run. (Enumerating a bare ``set`` of strings
    yields an order that changes between interpreter sessions because of hash
    randomisation, which made the encoding non-reproducible.)

    Parameters
    ----------
    data : iterable of hashable
        Values to encode, e.g. a DataFrame column.

    Returns
    -------
    dict
        ``{value: code}`` with codes ``1..n`` for the ``n`` distinct values.
    """
    distinct = set(data)
    try:
        ordered = sorted(distinct)
    except TypeError:
        # Values of mixed, mutually incomparable types: fall back to ordering by
        # their string form so the result is still deterministic.
        ordered = sorted(distinct, key=str)
    return {value: code for code, value in enumerate(ordered, start=1)}

# Categorical columns that are replaced by integer codes.
columns_to_classify = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']

# dictionarize() hands out codes starting at 1, so 0 is free to represent a
# category that occurs in the test data but was never seen in the train data.
UNSEEN_CATEGORY_CODE = 0

for col in columns_to_classify:
    # The mapping is learned from the train split only and reused for test.
    temp_dict = dictionarize(train[col])
    train[col] = train[col].map(temp_dict)
    # Series.map yields NaN for values missing from the mapping; a plain dict
    # lookup would raise KeyError on the first unseen test category instead.
    test[col] = test[col].map(temp_dict).fillna(UNSEEN_CATEGORY_CODE).astype(int)

# Give the label column ('income_>50K') the shorter name 'target'.
train = train.rename(columns={'income_>50K': 'target'})

# Preview the encoded training frame.
train.head()

In [None]:
# Fixed seed so the train/validation/test partition is the same on every
# Restart & Run All; without it every run evaluates on different rows.
SPLIT_SEED = 42

# Features are every column except the label.
x = train.drop(columns=['target'])
y = train['target']

# 20% of the rows are held out for testing; 12.5% of the remaining 80% (10% of
# the total) are used for validation. Stratifying on the label keeps the class
# ratio the same in all three partitions.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=.2, random_state=SPLIT_SEED, stratify=y)
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=.125, random_state=SPLIT_SEED, stratify=y_train)

# Standardise using statistics fitted on the training partition only, then
# apply the same transform to the held-out partitions.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
x_val = scaler.transform(x_val)

In [None]:
# Sanity check: each feature matrix is printed next to its own label vector.
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)  # was y_train.shape, which hid any x_test/y_test mismatch
print(x_val.shape, y_val.shape)

In [None]:
# Baseline network: 13 -> 2 -> 1 dense layers with a sigmoid output for binary
# classification. The input width is taken from the prepared feature matrix
# rather than hard-coded, so the model keeps working if columns are added or
# removed during preprocessing.
n_features = x_train.shape[1]

base_model = Sequential()
base_model.add(Dense(13, input_dim=n_features, activation='relu'))
base_model.add(Dense(2, activation='relu'))
base_model.add(Dense(1, activation='sigmoid'))
base_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
base_model.summary()


# Custom network: wider hidden layers (13 -> 32 -> 16 -> 1), He-uniform
# initialisation on the ReLU layers and batch normalisation after the first two
# hidden layers. The input width is read from the prepared feature matrix
# instead of being hard-coded.
n_features = x_train.shape[1]

cust_model = Sequential()
cust_model.add(Dense(13, kernel_initializer='he_uniform', input_dim=n_features, activation='relu'))
cust_model.add(BatchNormalization())
cust_model.add(Dense(32, kernel_initializer='he_uniform', activation='relu'))
cust_model.add(BatchNormalization())
cust_model.add(Dense(16, kernel_initializer='he_uniform', activation='relu'))
cust_model.add(Dense(1, kernel_initializer='glorot_uniform', activation='sigmoid'))
cust_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
cust_model.summary()

# Both models are trained with identical settings so their curves are comparable.
fit_kwargs = dict(validation_data=(x_val, y_val), batch_size=10, epochs=20)

base_model.fit(x_train, y_train, **fit_kwargs)
print('=' * 50)
cust_model.fit(x_train, y_train, **fit_kwargs)

In [None]:
# Training curves of the base model, read from the History recorded by fit().
train_loss = base_model.history.history['loss']
valid_loss = base_model.history.history['val_loss']
train_acc = base_model.history.history['accuracy']
valid_acc = base_model.history.history['val_accuracy']
epochs = len(train_loss)
epoch_axis = range(1, epochs + 1)  # epochs are numbered from 1 on the x-axis

# Loss per epoch.
plt.plot(epoch_axis, train_loss, label='train')
plt.plot(epoch_axis, valid_loss, label='validation')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.title('Train and Validation Loss Plot (Base Model)')
plt.legend()
plt.show()

# Accuracy per epoch (the y-axis was previously mislabelled as 'loss').
plt.plot(epoch_axis, train_acc, label='train')
plt.plot(epoch_axis, valid_acc, label='validation')
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.title('Train and Validation Accuracy Plot (Base Model)')
plt.legend()
plt.show()

In [None]:
# Training curves of the custom model, read from the History recorded by fit().
train_loss = cust_model.history.history['loss']
valid_loss = cust_model.history.history['val_loss']
train_acc = cust_model.history.history['accuracy']
valid_acc = cust_model.history.history['val_accuracy']
epochs = len(train_loss)
epoch_axis = range(1, epochs + 1)  # epochs are numbered from 1 on the x-axis

# Loss per epoch.
plt.plot(epoch_axis, train_loss, label='train')
plt.plot(epoch_axis, valid_loss, label='validation')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.title('Train and Validation Loss Plot (Custom Model)')
plt.legend()
plt.show()

# Accuracy per epoch (the y-axis was previously mislabelled as 'loss').
plt.plot(epoch_axis, train_acc, label='train')
plt.plot(epoch_axis, valid_acc, label='validation')
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.title('Train and Validation Accuracy Plot (Custom Model)')
plt.legend()
plt.show()

In [None]:
# Evaluate the base model on the held-out test partition.
# Keep the raw sigmoid outputs: ROC AUC is a ranking metric and has to be
# computed from scores. Feeding it the thresholded 0/1 labels collapses the ROC
# curve to a single operating point and under-reports the model.
base_proba = base_model.predict(x_test).flatten()
prediction = np.round(base_proba).astype(int)  # hard class labels (threshold 0.5)

print("=========================================================\n")
print("                    Base Model Result")
print("=========================================================\n")

acc = accuracy_score(y_test, prediction)
print("=========================================================\n")
print("Predicted Class (20 Samples):")
print(prediction[:20])
print("\nGround Truth (20 Samples):")
print(y_test.values[:20])

print("\n=========================================================\n")
accuracy = acc * 100  # reuse the score computed above, expressed as a percentage
print(f"Accuracy: {accuracy}%")

f1 = f1_score(y_test, prediction, average='macro')
print(f"F1 Score: {f1}")

auc = roc_auc_score(y_test, base_proba)
print(f"AUC Score: {auc}")

print('\n\nClassification Report:')
cr = classification_report(y_test, prediction)
print(cr)

In [None]:
# Evaluate the custom model on the held-out test partition.
# Keep the raw sigmoid outputs: ROC AUC is a ranking metric and has to be
# computed from scores. Feeding it the thresholded 0/1 labels collapses the ROC
# curve to a single operating point and under-reports the model.
cust_proba = cust_model.predict(x_test).flatten()
prediction = np.round(cust_proba).astype(int)  # hard class labels (threshold 0.5)

print("=========================================================\n")
print("                 Custom Model Result")
print("=========================================================\n")

acc = accuracy_score(y_test, prediction)
print("=========================================================\n")
print("Predicted Class (20 Samples):")
print(prediction[:20])
print("\nGround Truth (20 Samples):")
print(y_test.values[:20])

print("\n=========================================================\n")
accuracy = acc * 100  # reuse the score computed above, expressed as a percentage
print(f"Accuracy: {accuracy}%")

f1 = f1_score(y_test, prediction, average='macro')
print(f"F1 Score: {f1}")

auc = roc_auc_score(y_test, cust_proba)
print(f"AUC Score: {auc}")

print('\n\nClassification Report:')
cr = classification_report(y_test, prediction)
print(cr)

In [None]:
# Confusion matrix for `prediction`, i.e. for whichever evaluation cell ran last
# (the custom model when the notebook is executed top to bottom).
# scikit-learn puts the actual classes on the rows and the predicted classes on
# the columns, both in label order [0, 1]. The target column was 'income_>50K',
# so class 0 means <=50K and class 1 means >50K. The earlier
# 'positive/negative' x 'true/false' labels did not match that layout.
class_labels = [0, 1]
tmp = pd.DataFrame(
    confusion_matrix(y_test, prediction, labels=class_labels),
    index=['actual <=50K', 'actual >50K'],
    columns=['predicted <=50K', 'predicted >50K'],
)
ax = sns.heatmap(tmp, annot=True, fmt='g')
ax.set_title('Confusion Matrix')
plt.show()