In [1]:
import numpy as np
import pandas as pd
import operator

In [2]:
train = pd.read_csv('hw1data/propublicaTrain.csv')
test = pd.read_csv('hw1data/propublicaTest.csv')

In [3]:
train.head()

Unnamed: 0,two_year_recid,sex,age,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree_F,c_charge_degree_M
0,0,1,64,0,0,0,0,13,0,1
1,0,1,28,0,0,0,0,1,1,0
2,0,1,32,0,0,0,0,1,1,0
3,1,1,20,0,0,1,1,2,1,0
4,0,1,43,1,0,0,0,8,1,0


In [4]:
test.head()

Unnamed: 0,two_year_recid,sex,age,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree_F,c_charge_degree_M
0,1,1,27,1,0,0,1,18,1,0
1,1,0,29,1,0,0,0,11,1,0
2,1,1,25,1,0,0,1,6,0,1
3,0,1,26,1,1,0,0,2,1,0
4,0,0,33,1,0,0,0,2,1,0


# Training

In [5]:
# Preprocessing
def preprocess(df):
    '''Remove c_charge_degree_M column due to redunduncy'''
    return df.drop(labels=['c_charge_degree_M'], axis=1)

train = preprocess(train)
test = preprocess(test)
features = list(train)[1:]
train.head()

Unnamed: 0,two_year_recid,sex,age,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree_F
0,0,1,64,0,0,0,0,13,0
1,0,1,28,0,0,0,0,1,1
2,0,1,32,0,0,0,0,1,1
3,1,1,20,0,0,1,1,2,1
4,0,1,43,1,0,0,0,8,1


In [6]:
for index, row in train.iterrows():
    for thing in row:
        print(thing)
    break

0
1
64
0
0
0
0
13
0


In [7]:
print(features)

['sex', 'age', 'race', 'juv_fel_count', 'juv_misd_count', 'juv_other_count', 'priors_count', 'c_charge_degree_F']


In [8]:
def naive_bayes_train(df, features, target='two_year_recid', k=1):
    """returns class conditional probability distribution and class prior as dictionaries"""
    # initialize variables
    sample_size, feature_dim = df.shape[0], len(features)
    # print('sample_size, feature_dim: ', sample_size, feature_dim)
    y_count = {} # {key = label : value = count of label in sample},
    x_count = {} # dict of dicts of dicts = {key=label:value={key=feature:value={key=feature_val:value=count}}}
    feature_given_class = {} # Pr[X=x|Y=y]
    class_prior = {} # Pr[Y=y]
    
    # initialize dictionary keys
    for label in df[target].unique():
        y_count[label], class_prior[label] = 0, 0
        x_count[label], feature_given_class[label] = {}, {}
        for feature in features:
            x_count[label][feature], feature_given_class[label][feature] = {}, {}
    
    # print('y_count: ', y_count)
    # print('x_count: ', x_count)
    
    # update dictionary values
    for index, row in df.iterrows():
        label = row[target]
        y_count[label] += 1
        for feature in features:
            feature_val = row[feature]
            if feature_val in x_count[label][feature].keys():
                x_count[label][feature][feature_val] += 1
            else:
                x_count[label][feature][feature_val] = 1
    
    # print('y_count: ', y_count)
    # print('x_count: ', x_count)
    
    # find class prior probabilities
    for label in class_prior.keys():
        class_prior[label] = y_count[label]/sample_size
    
    # find feature given class probabilities with additive smoothing
    for label in feature_given_class.keys():
        for feature in feature_given_class[label].keys():
            for feature_val in x_count[label][feature].keys():
                feature_given_class[label][feature][feature_val] = \
                (x_count[label][feature][feature_val] + k) / (y_count[label] + k * feature_dim)
    # print('class prior: ', class_prior)
    # print('f|c: ', feature_given_class)
    return class_prior, feature_given_class, y_count

In [9]:
class_prior, feature_given_class, y_count = naive_bayes_train(train, features)

In [10]:
def predict(series, features, class_prior, feature_given_class, y_count, k=1):
    """given a series, return a belief distribution over possible labels"""
    belief = {}
    for label in class_prior.keys():
        prob = class_prior[label]
        for feature in features:
            feature_val = series[feature]
            # print(feature, feature_val)
            if feature_val in feature_given_class[label][feature]:
                prob *= feature_given_class[label][feature][feature_val]
            else:
                prob *= k/(y_count[label] + k * len(features))
        belief[label] = prob
    return belief

In [11]:
for index, row in test.iterrows():
    print(row)
    print(predict(row, features, class_prior, feature_given_class, y_count))
    break

two_year_recid        1
sex                   1
age                  27
race                  1
juv_fel_count         0
juv_misd_count        0
juv_other_count       1
priors_count         18
c_charge_degree_F     1
Name: 0, dtype: int64
{0: 1.1501565434922019e-07, 1: 1.6887029622681945e-06}


In [12]:
def evaluate(df, features, class_prior, feature_given_class, y_count, target = 'two_year_recid'):
    total_pred, accurate_pred = 0, 0
    for index, row in df.iterrows():
        y = row[target]
        belief = predict(row, features, class_prior, feature_given_class, y_count)
        y_pred = max(belief.items(), key=operator.itemgetter(1))[0]
        total_pred += 1
        if y_pred == y:
            accurate_pred += 1
    return accurate_pred/total_pred
evaluate(test, features, class_prior, feature_given_class, y_count)

0.6875