In [12]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [302]:
from sklearn.cross_validation import train_test_split
from collections import Counter, defaultdict
import math

class NaiveBayesProb:
    
    def __init__(self):
        self.raw_data = False
        pass
    
    def read_raw_data(self):
        self.target_domain = set()
        self.attribute_domains = defaultdict(set)
        self.raw_x = []
        self.raw_y = []
        with open("/Users/rbekbolatov/data/uci/mushroom/agaricus-lepiota.data", "r") as datafile:
            for line in datafile:
                els = line.rstrip('\n').split(',')
                self.raw_y.append(els[0])
                self.raw_x.append(els[1:])
        self.raw_data = True
        
    def train_and_evaluate(self, test_fraction=0.2): 
        self.xs_log_prob_cache = {}
        self.train(test_fraction)
        return self.evaluate()
        
    def train(self, test_fraction=0.2): 
        if not self.raw_data:
            self.read_raw_data()
        x_train, x_test, y_train, y_test = train_test_split(self.raw_x, self.raw_y, test_size=test_fraction, random_state=42)
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test
        
        self.n_train = len(self.y_train)
        self.n_test = len(self.y_test)
        self.count()
        
    def count(self):
        self.counts = Counter()
        self.y_counts = Counter()
        for (y, xs) in zip(self.y_train, self.x_train):
            self.y_counts[y] += 1
            self.target_domain.add(y)
            for (i, x) in enumerate(xs):
                self.counts[(i, y, x)] += 1
                self.attribute_domains[i].add(x)
                
    def y_prior_log_prob(self, y):
        return math.log(self.y_counts[y] * 1.0 / self.n_train)
    
    def x_i_class_cond_log_prob(self, idx, x, given_y, lap=1.0):
        return math.log((self.counts[(idx, given_y, x)] + lap) * 1.0 / (self.y_counts[given_y] + lap*len(self.attribute_domains[idx])))
    
    def xs_class_cond_log_prob(self, xs, given_y):
        # Naive Bayes assumption p((x1,..., xk)|y)=p(x1|y)*p(x2|y)*...*p(xk|y)
        return self.xs_class_cond_NB_log_prob(xs, given_y)

    def xs_class_cond_NB_log_prob(self, xs, given_y):
        return sum([self.x_i_class_cond_log_prob(i, xs[i], given_y) for i in range(len(self.attribute_domains))])

    def xs_y_joint_log_prob(self, xs, y):
        return self.y_prior_log_prob(y) + self.xs_class_cond_log_prob(xs, y)

    def xs_log_prob(self, xs):
        cached = self.xs_log_prob_cache.get(tuple(xs))
        if cached is None:
            cached = math.log(sum([math.exp(self.xs_y_joint_log_prob(xs, y)) for y in self.target_domain]))
            self.xs_log_prob_cache[tuple(xs)] = cached
        return cached

    # NBC, After Bayes
    def y_posterior_log_prob_NBC(self, y, given_xs):
        return self.xs_y_joint_log_prob(given_xs, y) - self.xs_log_prob(given_xs)
    
    def y_posterior_prob_NBC(self, y, given_xs):
        return math.exp(self.y_posterior_log_prob_NBC(y, given_xs))
    
    def y_posterior_prob_dist_NBC(self, given_xs):
        return {y: self.y_posterior_prob_NBC(y, given_xs) for y in self.target_domain}
    
    def y_predict_NBC(self, given_xs):
        max_log_prob = None
        max_log_prob_target = None
        for y in self.target_domain:
            prob = self.y_posterior_log_prob_NBC(y, given_xs)
            if max_log_prob is None or max_log_prob < prob:
                max_log_prob = prob
                max_log_prob_target = y
        return (max_log_prob_target, math.exp(max_log_prob))
    
    # Evaluate against test set:
    def test_accuracy_NBC(self):
        return sum([1 if self.y_predict_NBC(xs)[0] == y else 0 for (xs, y) in zip(self.x_test, self.y_test)]) * 1.0 / self.n_test
    
    def test_accuracy_LOGREG(self):
        # placeholder
        return self.test_accuracy_NBC()
    
    
    def evaluate(self):
        baseline_accuracy = 1.0 - math.exp(max([self.y_prior_log_prob(y) for y in self.target_domain]))
        nbc_accuracy = self.test_accuracy_NBC()
        logreg_accuracy = self.test_accuracy_LOGREG()
        print('Baseline accuracy: ' + str(baseline_accuracy))
        print('Naive Bayes classifier accuracy: ' + str(nbc_accuracy))
        print('Logistic Regression classifier accuracy: ' + str(logreg_accuracy))
        return (baseline_accuracy, nbc_accuracy, logreg_accuracy)
        

In [303]:

for i in [0.8, 0.5, 0.4, 0.3, 0.2, 0.1]:
    print(i)
    %time NaiveBayesProb().train_and_evaluate(i)

0.8
Baseline accuracy: 0.479064039409
Naive Bayes classifier accuracy: 0.941384615385
Logistic Regression classifier accuracy: 0.941384615385
CPU times: user 1.02 s, sys: 5.58 ms, total: 1.03 s
Wall time: 1.03 s
0.5
Baseline accuracy: 0.477351058592
Naive Bayes classifier accuracy: 0.946085672083
Logistic Regression classifier accuracy: 0.946085672083
CPU times: user 696 ms, sys: 3.33 ms, total: 700 ms
Wall time: 702 ms
0.4
Baseline accuracy: 0.480919162905
Naive Bayes classifier accuracy: 0.947076923077
Logistic Regression classifier accuracy: 0.947076923077
CPU times: user 584 ms, sys: 1.94 ms, total: 586 ms
Wall time: 587 ms
0.3
Baseline accuracy: 0.481005979599
Naive Bayes classifier accuracy: 0.945857260049
Logistic Regression classifier accuracy: 0.945857260049
CPU times: user 467 ms, sys: 1.21 ms, total: 469 ms
Wall time: 469 ms
0.2
Baseline accuracy: 0.482228035082
Naive Bayes classifier accuracy: 0.950769230769
Logistic Regression classifier accuracy: 0.950769230769
CPU times: