In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Load/Clean Datasets Helpers

In [2]:
def strip_spaces(in_str):
    return in_str.strip()

def transform_salary(in_str):
    return 1 if in_str.strip() == '>50K' else 0


def load_adult(path):
    dataset = pd.read_csv(path, names=["age", "workclass", "fnlwgt", "education", "education-num", 
                                              "marital-status", "occupation", "relationship", "race", 
                                              "sex", "capital-gain", "capital-loss", "hours-per-week", 
                                              "native-country", "salary"], converters={"workclass": strip_spaces, 
                                                                                       "education": strip_spaces, 
                                                                                       "marital-status": strip_spaces, 
                                                                                       "occupation": strip_spaces, 
                                                                                       "relationship": strip_spaces, 
                                                                                       "race": strip_spaces, 
                                                                                       "sex": strip_spaces, 
                                                                                       "native-country": strip_spaces, 
                                                                                       "salary": transform_salary})
    valid_workclass = ["Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov", 
                       "Local-gov", "State-gov", "Without-pay", "Never-worked"]
    valid_education = ["Bachelors", "Some-college", "11th", "HS-grad", "Prof-school", 
                       "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters", 
                       "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"]
    valid_marital_status = ["Married-civ-spouse", "Divorced", "Never-married", 
                            "Separated", "Widowed", "Married-spouse-absent", "Married-AF-spouse"]
    valid_occupation = ["Tech-support", "Craft-repair", "Other-service", "Sales", "Exec-managerial", 
                        "Prof-specialty", "Handlers-cleaners", "Machine-op-inspct", "Adm-clerical", 
                        "Farming-fishing", "Transport-moving", "Priv-house-serv", "Protective-serv", "Armed-Forces"]
    valid_relationship = ["Wife", "Own-child", "Husband", "Not-in-family", "Other-relative", "Unmarried"]
    valid_race = ["White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"]
    valid_sex = ["Female", "Male"]
    valid_native_country = ["United-States", "Cambodia", "England", "Puerto-Rico", "Canada", "Germany", 
                            "Outlying-US(Guam-USVI-etc)", "India", "Japan", "Greece", "South", "China", 
                            "Cuba", "Iran", "Honduras", "Philippines", "Italy", "Poland", "Jamaica", 
                            "Vietnam", "Mexico", "Portugal", "Ireland", "France", "Dominican-Republic", 
                            "Laos", "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary", "Guatemala", 
                            "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador", "Trinadad&Tobago", 
                            "Peru", "Hong", "Holand-Netherlands"]
    original_size = len(dataset)
    rows_to_keep = [val in valid_workclass for val in dataset["workclass"]]
    dataset = dataset[rows_to_keep]
    rows_to_keep = [val in valid_education for val in dataset["education"]]
    dataset = dataset[rows_to_keep]
    rows_to_keep = [val in valid_marital_status for val in dataset["marital-status"]]
    dataset = dataset[rows_to_keep]
    rows_to_keep = [val in valid_occupation for val in dataset["occupation"]]
    dataset = dataset[rows_to_keep]
    rows_to_keep = [val in valid_relationship for val in dataset["relationship"]]
    dataset = dataset[rows_to_keep]
    rows_to_keep = [val in valid_race for val in dataset["race"]]
    dataset = dataset[rows_to_keep]
    rows_to_keep = [val in valid_sex for val in dataset["sex"]]
    dataset = dataset[rows_to_keep]
    rows_to_keep = [val in valid_native_country for val in dataset["native-country"]]
    dataset = dataset[rows_to_keep]
    size = len(dataset)
    return dataset, size/original_size


def load_german(path):
    dataset = pd.read_csv(path, header=None, delimiter=r'\s+')
    return dataset

# Run Helper

In [3]:
def run(X, y, test_split=0.3):
    kfold = model_selection.KFold(n_splits=5,random_state=7)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_split, random_state=42)
    for i, model in enumerate(models):
        cv_result = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
        score = model.fit(X_train, y_train)
        prediction = model.predict(X_test)
        acc_score = accuracy_score(y_test, prediction)
        print ('-'*40)
        print ('{0}: {1}'.format(names[i], acc_score))

In [4]:
# load adult dataset
df, pct = load_adult('datasets/adult/adult.data')
X_orig = df.iloc[:, :-1]
y_orig = df.iloc[:, -1]
print('percentage of corrupt rows: {0:.1f}%'.format((1-pct)*100))

percentage of corrupt rows: 7.4%


# Models to Run

In [5]:
models = []
names = ['LR', 'Random Forest', 'Neural Network', 'GaussianNB', 'DecisionTreeClassifier', 'SVM']

models.append((LogisticRegression()))
models.append((RandomForestClassifier(n_estimators=100)))
models.append((MLPClassifier()))
models.append((GaussianNB()))
models.append((DecisionTreeClassifier()))
models.append((SVC()))

# Adult Dataset

In [6]:
df_adult, pct = load_adult('datasets/adult/adult.data')
X_adult = df_adult.iloc[:, :-1]
y_adult = df_adult.iloc[:, -1]
print('percentage of corrupt rows: {0:.1f}%'.format((1-pct)*100))

percentage of corrupt rows: 7.4%


# expanding

In [7]:
X_expand = pd.get_dummies(X_adult)
run(X_expand, y_adult)



----------------------------------------
LR: 0.7848381036578628
----------------------------------------
Random Forest: 0.8528014145209415




----------------------------------------
Neural Network: 0.8010829925958669
----------------------------------------
GaussianNB: 0.7867167642833462
----------------------------------------
DecisionTreeClassifier: 0.8104762957232844




----------------------------------------
SVM: 0.745938777765499


# LabelEncoder

In [8]:
encoders = {"workclass": preprocessing.LabelEncoder(), 
            "education": preprocessing.LabelEncoder(), 
            "marital-status": preprocessing.LabelEncoder(), 
            "occupation": preprocessing.LabelEncoder(), 
            "relationship": preprocessing.LabelEncoder(), 
            "race": preprocessing.LabelEncoder(), 
            "sex": preprocessing.LabelEncoder(), 
            "native-country": preprocessing.LabelEncoder()}

X_encode = X_adult.copy()
for col, encoder in encoders.items():
    X_encode[col] = encoder.fit_transform(X_encode[col])

In [9]:
run(X_encode, y_adult)



----------------------------------------
LR: 0.7903635760857554
----------------------------------------
Random Forest: 0.8550116034920986




----------------------------------------
Neural Network: 0.7748922532876561
----------------------------------------
GaussianNB: 0.786164217040557
----------------------------------------
DecisionTreeClassifier: 0.8045087855011603




----------------------------------------
SVM: 0.7475964194938667


# German Dataset

In [10]:
# load german dataset
df_german = load_german('datasets/german/german.data-numeric')
X_german = df_german.iloc[:, :-1]
y_german = df_german.iloc[:, -1]

# transform y to binary
y_map = {2: 1, 1: 0}
y_german = y_german.map(y_map).astype(int)

In [11]:
run(X_german, y_german)

----------------------------------------
LR: 0.76




----------------------------------------
Random Forest: 0.7666666666666667




----------------------------------------
Neural Network: 0.7766666666666666
----------------------------------------
GaussianNB: 0.7633333333333333
----------------------------------------
DecisionTreeClassifier: 0.6666666666666666
----------------------------------------
SVM: 0.7


