In [1]:
# Read Data

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Column labels handed to read_csv through `names`; because they are supplied
# explicitly, pandas treats the first line of the file as data, not a header.
name = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]
df = pd.read_csv('adult.data', names=name)

In [2]:
def preprocess_data(df, test_size=0.2, random_state=2):
    """Clean and encode the census frame, then split it into train/test sets.

    Steps, in order:

    1. Replace the ``' ?'`` placeholder (note the leading space) with NaN and
       drop every row that contains a missing value.
    2. Drop the ``'education'`` column.
    3. One-hot encode the categorical columns, dropping the first level of
       each one.
    4. Build the target from ``'income'`` as a 0/1 array.
    5. Split features and target with ``train_test_split``.

    The frame passed in is not modified; every step works on a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns referenced in the body ('education',
        'income' and the seven categorical columns listed below).
    test_size : float or int, default 0.2
        Forwarded unchanged to ``train_test_split``.
    random_state : int or None, default 2
        Forwarded unchanged to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` exactly as returned by
        ``train_test_split`` for the encoded features and the target array.
    """
    # Replace junk data with null and drop the affected rows.
    cleaned = df.replace(' ?', np.nan).dropna().reset_index(drop=True)

    # NOTE(review): 'education' is presumably dropped because 'education-num'
    # carries the same information -- confirm against the dataset description.
    cleaned = cleaned.drop(['education'], axis=1)

    # Column encoding.
    # pandas 2.0 changed the get_dummies default dtype from uint8 to bool;
    # pinning it keeps the indicators numeric 0/1 on every pandas version.
    column_names_for_onehot = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
    encoded = pd.get_dummies(
        cleaned,
        columns=column_names_for_onehot,
        drop_first=True,
        dtype=np.uint8,
    )

    X = encoded.drop(['income'], axis=1)

    # The target is the SECOND indicator column of 'income'. get_dummies
    # orders its columns by sorted label, so the label that sorts last of the
    # two becomes the positive class (1).
    income_indicators = pd.get_dummies(encoded['income'], dtype=np.uint8)
    y = income_indicators.iloc[:, 1].values

    # Train Test Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [3]:
def evaluate(y_test, pred, text):
    """Print classification scores for ``pred`` and plot its confusion matrix.

    Accuracy, precision, recall and F1 are printed in that order, each computed
    by the matching ``sklearn.metrics`` function called with ``(y_test, pred)``
    and no other arguments. The confusion matrix is then drawn as an annotated
    seaborn heatmap; no axes object is passed, so the plot goes wherever
    seaborn/pyplot place it by default. The plot title is
    "Confusion matrix - " followed by ``text``. A blank line is printed last.

    Parameters
    ----------
    y_test : array-like
        True labels.
    pred : array-like
        Predicted labels.
    text : str
        Suffix appended to the plot title.

    Returns
    -------
    None
    """
    from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix, f1_score
    import matplotlib.pyplot as plt
    import seaborn as sns

    # (print template, scorer) pairs, listed in the order they are reported.
    report = (
        ('Accuracy: %f', accuracy_score),
        ('Precision: %f', precision_score),
        ('Recall: %f', recall_score),
        ('F1 score: %f', f1_score),
    )
    for template, scorer in report:
        print(template % scorer(y_test, pred))

    matrix = confusion_matrix(y_test, pred)
    sns.heatmap(matrix, annot=True, cmap="Blues", fmt='d')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title("Confusion matrix - " + text, size=16)
    print()