In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import csv 
import sklearn
from sklearn import model_selection
from sklearn import metrics
from sklearn import tree
from sklearn import naive_bayes

In [2]:
# Numeric encoding of the raw CSV tokens.
# Votes use y=2, ?=1, n=0 so that "?" sits between "n" and "y": a "?" means the
# member did not vote, which is read here as an abstention, i.e. something in
# between a yes and a no. This is not the only possible interpretation (the
# member may simply have been absent), and other encodings are defensible.
# The first field of each row is the party label (republican=0, democrat=1).
to_num = {"republican": 0, "democrat": 1, "y": 2, "n": 0, "?": 1}

X = []
y = []
# `with` guarantees the file handle is closed; newline="" is what the csv
# module asks for when it is handed a file object.
with open("house-votes-84.data", newline="") as data_file:
    for line in csv.reader(data_file):
        if not line:
            # csv.reader yields [] for blank lines (e.g. a trailing newline);
            # skip them instead of failing on line[0].
            continue
        line = [to_num[v] for v in line]
        y.append(line[0])   # first field: party label
        X.append(line[1:])  # remaining fields: encoded votes
X = np.array(X)
y = np.array(y)

In [3]:
def drop_missing(X, y):
    """Return only the samples whose feature row contains no missing value.

    A missing vote ("?") is encoded as 1 in the feature matrix, so any row
    containing a 1 is removed together with its label.

    Parameters
    ----------
    X : 2-D array-like of encoded votes, one row per sample.
    y : 1-D array-like of labels aligned with the rows of ``X``.

    Returns
    -------
    (Xout, yout) : tuple of numpy arrays
        New arrays holding the retained rows and their labels. When no row
        survives, ``Xout`` still has shape ``(0, n_features)`` and the input
        dtype, so callers can keep indexing it as a 2-D matrix.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    # Mirror zip(): if the lengths differ, unmatched trailing entries are ignored.
    n_rows = min(len(X), len(y))
    X, y = X[:n_rows], y[:n_rows]

    # True for every row in which no entry equals the "?" code (1).
    complete = ~(X == 1).any(axis=1)

    # Boolean-mask indexing returns fresh arrays, leaving the inputs untouched.
    return X[complete], y[complete]

# "missing" variant of the data set: only the rows without any "?" (encoded as 1).
X_missing, y_missing = drop_missing(X, y)

In [4]:
def impute_missing(X, y):
    """Fill missing votes with the most common vote of the same column.

    Works on copies, so the arrays passed in are left unchanged. Within each
    column, every entry equal to 1 ("?") is overwritten with 2 ("y") when the
    column holds strictly more 2s than 0s, and with 0 ("n") otherwise (ties
    therefore resolve to 0).

    Parameters
    ----------
    X : 2-D numpy array of encoded votes (0 = n, 1 = ?, 2 = y).
    y : numpy array of labels; returned as a copy, otherwise untouched.

    Returns
    -------
    (Xout, yout) : the imputed copy of ``X`` and a copy of ``y``.
    """
    Xout = X.copy()
    yout = y.copy()

    n_columns = X.shape[1]
    for col in range(n_columns):
        column = Xout[:, col]  # basic slice -> view, so writes land in Xout
        n_yes = np.count_nonzero(column == 2)
        n_no = np.count_nonzero(column == 0)
        fill = 2 if n_yes > n_no else 0
        # Overwrite only the "?" entries of this column.
        column[column == 1] = fill

    return Xout, yout

# "impute" variant of the data set: each "?" replaced by its column's majority vote ("n" on ties).
X_impute, y_impute = impute_missing(X, y)

In [5]:
def eval_one(X, y, model, version=""):
    """Cross-validate ``model`` on ``(X, y)`` and print summary scores.

    The data is split with ``KFold(n_splits=5)`` (no other options are
    passed). For each fold the model is fitted on the training part and its
    predictions on the held-out part are scored with F1, precision and recall
    using the metric functions' default settings. The mean and standard
    deviation of each score across the folds are printed.

    Parameters
    ----------
    X : numpy array of features, indexable by an array of row indices.
    y : numpy array of labels aligned with ``X``.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
        refitted in place on every fold.
    version : free-form tag printed next to the model's class name.

    Returns
    -------
    None. The results are only printed.
    """
    splitter = model_selection.KFold(n_splits=5)

    # Scorers in the order in which they are evaluated for every fold.
    scorers = (
        ("f1", metrics.f1_score),
        ("precision", metrics.precision_score),
        ("recall", metrics.recall_score),
    )
    scores = {name: [] for name, _ in scorers}

    for fit_idx, held_out_idx in splitter.split(X):
        X_fit, X_held_out = X[fit_idx], X[held_out_idx]
        y_fit, y_held_out = y[fit_idx], y[held_out_idx]
        model.fit(X_fit, y_fit)
        predicted = model.predict(X_held_out)
        for name, scorer in scorers:
            scores[name].append(scorer(y_held_out, predicted))

    template = "{}:{}:\n    prec={} +/-{}\n    recall={} +/-{}\n    f1={} +/-{}"
    summary = []
    for name in ("precision", "recall", "f1"):  # order expected by the template
        summary.append(np.mean(scores[name]))
        summary.append(np.std(scores[name]))
    print(template.format(model.__class__.__name__, version, *summary))
    
    

In [6]:
# Evaluate both classifiers on every variant of the data set, in the order
# ternary -> missing -> impute, with BernoulliNB before the decision tree.
# A fresh estimator is created for every run.
# NOTE(review): BernoulliNB binarizes its inputs (default threshold 0.0), so on
# the "ternary" data it presumably treats "?" (1) and "y" (2) alike - confirm
# this is intended.
for features, labels, tag in (
    (X, y, "ternary"),
    (X_missing, y_missing, "missing"),
    (X_impute, y_impute, "impute"),
):
    for make_model in (naive_bayes.BernoulliNB, tree.DecisionTreeClassifier):
        eval_one(features, labels, make_model(), tag)


BernoulliNB:ternary:
    prec=0.9398006379585327 +/-0.018600560093907845
    recall=0.8865591955214598 +/-0.07850105772547056
    f1=0.9109087450666425 +/-0.0468727447446568
DecisionTreeClassifier:ternary:
    prec=0.9672559093150668 +/-0.029977200982392847
    recall=0.947299398714493 +/-0.04212800700083204
    f1=0.9560404889228419 +/-0.01736908307853755
BernoulliNB:missing:
    prec=0.9505027156751295 +/-0.03747089010954538
    recall=0.8892857142857142 +/-0.09141120866882128
    f1=0.9173666489455963 +/-0.06173334206629656
DecisionTreeClassifier:missing:
    prec=0.9575303257462178 +/-0.039991484015640434
    recall=0.9583333333333333 +/-0.052704627669472995
    f1=0.9569951706277721 +/-0.036155169177220794
BernoulliNB:impute:
    prec=0.944111437189207 +/-0.008932482593477833
    recall=0.8903327804271199 +/-0.0732580349605667
    f1=0.915065233176399 +/-0.0413602104797454
DecisionTreeClassifier:impute:
    prec=0.9671388819378409 +/-0.029523498006898435
    recall=0.9441322828115