In [1]:
import numpy as np
import pandas as pd

In [2]:
def data_processing(data):
    #replace all "?" with NAN
    data.replace(" ?", np.nan, inplace=True)
    modes = data.mode().iloc[0]
    data.fillna(modes, inplace=True)
    
    #abstract income from data
    income = data['income_per_year']
    data.drop(['income_per_year'], axis=1, inplace=True)
    
    #one-hot-coding for data
    cat_attrs = [attr for attr in data if type(data[attr][0]) is str and not (data[attr][0]).isdigit()]
    
    for cat_attr in cat_attrs:
        data[cat_attr] = pd.Categorical(data[cat_attr])
        dummies = pd.get_dummies(data[cat_attr], prefix='{}_category'.format(cat_attr))
        data = pd.concat([data, dummies], axis=1)

    data.drop(cat_attrs,axis=1,inplace=True)
    
    
    #one-hot-coding for income
    income = pd.Categorical(income)
    income_dummy = pd.get_dummies(income)
    todrop = " >50K"
    income_dummy.drop(todrop, axis=1, inplace=True)
    
    return data, income_dummy

In [3]:
attr = ["age",
        "workclass",
        "fnlwgt",
        "education",
        "education-num",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "capital-gain",
        "capital-loss",
        "hours-per-week",
        "native-country",
        "income_per_year"]

data = pd.read_csv("dataset/adult.data", names=attr)

In [4]:
data, income = data_processing(data)

In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(data)
X = scaler.transform(data)
Y = income

split = 32651 #size of the training data

X_train = X[:split]
X_test = X[split:]

Y_train = Y[:split]
Y_test = Y[split:]

In [6]:
#evaluation
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score

  from numpy.core.umath_tests import inner1d


In [7]:
models = [LogisticRegression(), RandomForestClassifier(), DecisionTreeClassifier(), MLPClassifier()]

for model in models:
    model.fit(X_train, Y_train.values.reshape(-1,))
    Y_pred = model.predict(X_test)
    print('accuracy：', accuracy_score(Y_pred, Y_test.values))

accuracy： 0.8501636711753443
accuracy： 0.8418257056389352
accuracy： 0.8136001482304984
accuracy： 0.8417639429312581
