# HW2 Pandas method

## imports

In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd 
import sklearn
from sklearn import model_selection
from sklearn import metrics
from sklearn import tree
from sklearn import naive_bayes

## Load data

Here we load the data using the `na_values` option so we can take advantage of functions like `isna` and `fillna`.  There are plenty of ways to do this, but this is my preferred way.

In [2]:
# The file has no header row, and "?" entries are parsed as NaN so that the
# later cells can use isna / fillna / dropna directly.
DATA_PATH = "house-votes-84.data"
df = pd.read_csv(DATA_PATH, header=None, na_values=["?"])
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,republican,n,y,n,y,y,y,n,n,n,y,,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,
2,democrat,,y,y,,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,,y,y,y,y


Again, there is more than one way to do this, but `replace` makes it easy to turn your string values into numerical values in a pandas DataFrame.  Why choose 0 and 2?  Keep reading!

In [3]:
# Numeric codes: the party becomes the 0/1 label, votes become n=0 / y=2.
to_num = {"republican": 0, "democrat": 1, "y": 2, "n": 0}
# replace() swaps the strings for numbers; infer_objects() then converts any
# column still stored as generic `object` to a proper int/float dtype.
# Older pandas performed this downcast implicitly inside replace(), but that
# implicit behaviour is deprecated (pandas 2.2 warns about it), so it is
# requested explicitly here to keep the columns numeric.
df_num = df.replace(to_num).infer_objects()
df_num

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0,0.0,2.0,0.0,2.0,2.0,2.0,0.0,0.0,0.0,2.0,,2.0,2.0,2.0,0.0,2.0
1,0,0.0,2.0,0.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,0.0,
2,1,,2.0,2.0,,2.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,2.0,0.0,0.0
3,1,0.0,2.0,2.0,0.0,,2.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,2.0
4,1,2.0,2.0,2.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,2.0,2.0,0.0,2.0,2.0,2.0,0.0,2.0
431,1,0.0,0.0,2.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0
432,0,0.0,,0.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,2.0
433,0,0.0,0.0,0.0,2.0,2.0,2.0,,,,,0.0,2.0,2.0,2.0,0.0,2.0


## Data processing

### Drop rows

In [4]:
# originally I did this like in the numpy example where I split off X and y but it made using dropna difficult 
# It's worth looking at this code and understanding what it does. 
def drop_missing_old(X, y):
    """Drop every row of ``X`` that has a missing value, plus the matching ``y`` entries.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; a row is kept only if none of its values is NA.
    y : pandas.Series
        Labels, positionally aligned with the rows of ``X``.

    Returns
    -------
    tuple
        ``(Xout, yout)``: ``X`` and ``y`` restricted to the complete rows.
    """
    # Build a mask of rows where all values are not NA (one boolean per row).
    # Converting to a NumPy array keeps the selection positional, so the
    # indexes of X and y are never aligned against each other, and it is still
    # treated as a row mask when X has no rows (an empty Python list would be
    # read as "select no columns" instead).
    mask = X.notna().all(axis=1).to_numpy()
    Xout = X[mask]
    yout = y[mask]
    return Xout, yout


def drop_missing(df):
    """Return a new DataFrame holding only the rows of ``df`` with no missing values."""
    complete_rows = df.dropna(axis="index", how="any")
    return complete_rows
# Apply the row-dropping strategy to the numeric frame and display the result.
df_missing = drop_missing(df_num)
df_missing

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
5,1,0.0,2.0,2.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0
8,0,0.0,2.0,0.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,0.0,2.0
19,1,2.0,2.0,2.0,0.0,0.0,0.0,2.0,2.0,2.0,0.0,2.0,0.0,0.0,0.0,2.0,2.0
23,1,2.0,2.0,2.0,0.0,0.0,0.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0
25,1,2.0,0.0,2.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423,1,0.0,2.0,2.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,2.0,0.0,0.0,2.0,2.0,2.0
426,1,2.0,0.0,2.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0
427,0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0,0.0,2.0,0.0,2.0,2.0,2.0,0.0,2.0
430,0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,2.0,2.0,0.0,2.0,2.0,2.0,0.0,2.0


### Imputation

Here we go over each column, find its mode, and then use `fillna` to replace the missing values with the mode.

In [5]:
def impute_missing(df):
    """Fill the missing values of every feature column with that column's mode.

    The first column (by position) is skipped and left as it is. The input
    frame is not modified; the filled copy is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns after the first may contain NaN.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with NaN in those columns replaced by the column mode.
        A column with no observed values at all is left unchanged.
    """
    df_impute = df.copy()

    for col in df_impute.columns[1:]:
        # mode() returns a Series since several values can tie; take the first.
        modes = df_impute[col].mode(dropna=True)
        if modes.empty:
            # Nothing was observed in this column, so there is no mode to fill with.
            continue
        # Assign the filled column back rather than calling
        # fillna(inplace=True) on df_impute[col]: that form acts on a temporary
        # object and does not reliably update df_impute (pandas warns about it,
        # and under Copy-on-Write it has no effect at all).
        df_impute[col] = df_impute[col].fillna(modes.iloc[0])
    return df_impute

# Build the mode-imputed variant of the numeric frame and display it.
df_impute = impute_missing(df_num)
df_impute

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0,0.0,2.0,0.0,2.0,2.0,2.0,0.0,0.0,0.0,2.0,0.0,2.0,2.0,2.0,0.0,2.0
1,0,0.0,2.0,0.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,0.0,2.0
2,1,0.0,2.0,2.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,2.0,0.0,0.0
3,1,0.0,2.0,2.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,2.0
4,1,2.0,2.0,2.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,2.0,2.0,0.0,2.0,2.0,2.0,0.0,2.0
431,1,0.0,0.0,2.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0
432,0,0.0,2.0,0.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,2.0
433,0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,2.0,2.0,2.0,0.0,2.0


### Ternary

This one is simple enough that I won't even write a function for it: fill the missing values with 1.  Why did I choose n=0, y=2, and ?=1?  Because the algorithm that sklearn uses splits each feature by ranges.  In this case a ? means "did not vote", which seems to me in many cases to be an abstention, and that lies in between a yes and a no.  This is not the only interpretation: it might also mean absent, e.g. if a congressperson was sick.  For what it's worth, there are arguments for other choices.

In [6]:
# Encode a missing vote as its own middle value: n=0, missing=1, y=2.
df_ternary = df_num.fillna(1.)

## Evaluate 

This goes into a function because I want to run the same code multiple times in multiple ways so I parameterize what changes and reuse code.  

In [7]:
def eval_one(df, model, version=""):
    """Cross-validate ``model`` on ``df`` over 5 folds and print the scores.

    The column labelled ``0`` is used as the target and every column after the
    first (by position) as a feature. For each fold ``model.fit`` is called on
    the training part of the object passed in, and precision, recall and F1 are
    computed on the held-out part. The mean and standard deviation of each
    score across the folds are printed; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with the target in column ``0``.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    version : str, optional
        Free-text tag printed next to the model's class name.
    """
    features = df.iloc[:, 1:]
    target = df[0]

    # Scorers in the order they are evaluated for each fold.
    scorers = {
        "f1": metrics.f1_score,
        "precision": metrics.precision_score,
        "recall": metrics.recall_score,
    }
    fold_scores = {name: [] for name in scorers}

    splitter = model_selection.KFold(n_splits=5)
    for train_pos, test_pos in splitter.split(features):
        model.fit(features.iloc[train_pos], target.iloc[train_pos])
        predicted = model.predict(features.iloc[test_pos])
        actual = target.iloc[test_pos]
        for name, scorer in scorers.items():
            fold_scores[name].append(scorer(actual, predicted))

    # Flatten to (mean, std) pairs in the order the report prints them.
    summary = []
    for name in ("precision", "recall", "f1"):
        summary.append(np.mean(fold_scores[name]))
        summary.append(np.std(fold_scores[name]))

    report = "{}:{}:\n    prec={} +/-{}\n    recall={} +/-{}\n    f1={} +/-{}"
    print(report.format(model.__class__.__name__, version, *summary))
    
    

In [15]:
# Preview only: the shifted frame is displayed but not assigned, so df_ternary is unchanged.
# Note that the +1 is applied to every column, including the label column 0.
df_ternary.astype(int) + 1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,1,1,3,1,3,3,3,1,1,1,3,2,3,3,3,1,3
1,1,1,3,1,3,3,3,1,1,1,1,1,3,3,3,1,2
2,2,2,3,3,2,3,3,1,1,1,1,3,1,3,3,1,1
3,2,1,3,3,1,2,3,1,1,1,1,3,1,3,1,1,3
4,2,3,3,3,1,3,3,1,1,1,1,3,2,3,3,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,1,1,1,3,3,3,3,1,1,3,3,1,3,3,3,1,3
431,2,1,1,3,1,1,1,3,3,3,3,1,1,1,1,1,3
432,1,1,2,1,3,3,3,1,1,1,1,3,3,3,3,1,3
433,1,1,1,1,3,3,3,2,2,2,2,1,3,3,3,1,3


In [18]:
# DecisionTreeClassifier permutes the features at every split, so with no seed
# its scores can change from one run to the next. A fixed random_state keeps
# the reported numbers reproducible.
TREE_SEED = 0

# (version label, data frame, models to evaluate on it), in reporting order.
experiments = [
    ("ternary", df_ternary, [
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),
        naive_bayes.MultinomialNB(),
        tree.DecisionTreeClassifier(random_state=TREE_SEED),
    ]),
    ("missing", df_missing, [
        naive_bayes.BernoulliNB(),
        tree.DecisionTreeClassifier(random_state=TREE_SEED),
    ]),
    ("impute", df_impute, [
        naive_bayes.BernoulliNB(),
        tree.DecisionTreeClassifier(random_state=TREE_SEED),
    ]),
]

for position, (version, data, models) in enumerate(experiments):
    if position > 0:
        print()  # blank line between the groups of results
    for model in models:
        eval_one(data, model, version)

BernoulliNB:ternary:
    prec=0.9398006379585327 +/-0.018600560093907845
    recall=0.8865591955214598 +/-0.07850105772547056
    f1=0.9109087450666425 +/-0.0468727447446568
GaussianNB:ternary:
    prec=0.9553120842351255 +/-0.014147245376651204
    recall=0.950855276798673 +/-0.04098770855241024
    f1=0.9525539809187171 +/-0.021179224243151936
MultinomialNB:ternary:
    prec=0.9551915031320066 +/-0.016064060082525734
    recall=0.8827130416753057 +/-0.07897176744404288
    f1=0.9158678832081888 +/-0.04565130236678046
DecisionTreeClassifier:ternary:
    prec=0.9630291387810186 +/-0.029409496397046316
    recall=0.9434532448683391 +/-0.03970491562666246
    f1=0.952239908218359 +/-0.01985176360883642

BernoulliNB:missing:
    prec=0.9505027156751295 +/-0.03747089010954538
    recall=0.8892857142857142 +/-0.09141120866882128
    f1=0.9173666489455963 +/-0.06173334206629656
DecisionTreeClassifier:missing:
    prec=0.9575303257462178 +/-0.039991484015640434
    recall=0.944047619047619 +/