In [5]:
import numpy as np
import matplotlib.pyplot as plt
import os
import re

In [6]:
def preprocess(line):
    """Turn one raw review line into a list of lowercase word tokens.

    Steps:
      1. Replace self-closing tags such as ``<br />`` with a space so the
         words on either side stay separate.
      2. Drop every character that is neither an ASCII letter nor whitespace
         (digits, punctuation, apostrophes: "It's" becomes "its").
      3. Lowercase and split on runs of whitespace.

    Parameters
    ----------
    line : str
        Raw text of a review.

    Returns
    -------
    list of str
        The tokens in order. Never contains empty strings; an empty or
        letter-free input yields ``[]``.
    """
    # TODO stemming for later parts
    line = re.sub(r"<[a-z ]*/>", " ", line)
    # Whitespace is kept (not only ' ') so tabs/newlines still separate words
    # instead of being deleted and gluing neighbours together.
    line = re.sub(r"[^a-zA-Z\s]", "", line)
    # str.split() with no argument discards leading/trailing/repeated
    # whitespace, so no '' tokens leak into the vocabulary.
    return line.lower().split()


In [26]:
# Vocabulary class
# Should support:
# 1. Getting a word by index
# 2. Getting an index by word
# 3. Adding a word 

class Vocabulary:
    """Two-way mapping between words and dense, 0-based integer ids.

    ``word_map`` maps word -> id and ``index_map`` maps id -> word (by list
    position). Ids are handed out in insertion order, so the two structures
    always agree: ``index_map[word_map[w]] == w``.
    """

    def __init__(self):
        self.word_map = {}
        self.index_map = []
        self.next_word_idx = 0

    def add(self, word):
        """Register ``word`` if it is new and return its id.

        Adding a word that is already present is a no-op that returns the
        id assigned the first time.
        """
        if word not in self.word_map:
            # The id must equal the position the word gets in index_map;
            # using next_word_idx + 1 here would desynchronise the two maps.
            self.word_map[word] = self.next_word_idx
            self.index_map.append(word)
            self.next_word_idx += 1
        return self.word_map[word]

    def __contains__(self, other):
        """Support ``word in vocab``."""
        return other in self.word_map

    def get_index(self, index):
        """Return the word stored at id ``index`` (IndexError if out of range)."""
        return self.index_map[index]

    def get_word(self, word):
        """Return the id assigned to ``word`` (KeyError if it was never added)."""
        return self.word_map[word]
    
# model
class MultinomialEventModel:
    """Multinomial-event naive Bayes classifier with add-one (Laplace) smoothing.

    After ``fit``:
      * ``params[c]`` is the fraction of training documents labelled ``c``.
      * ``cond_params[c][k]`` is the smoothed probability of vocabulary word
        ``k`` under class ``c``:
        ``(count(k, c) + 1) / (total words in class c + vocabulary size)``.
      * ``word_map`` maps each training word to its column ``k``.
    """

    def __init__(self, n):
        """Create an unfitted model for ``n`` classes labelled ``0 .. n-1``."""
        self.n = n
        self._reset()

    def _reset(self):
        """(Re)initialise every learned quantity to its empty state."""
        self.cond_params = [[] for _ in range(self.n)]
        self.cond_params_denom = np.zeros(self.n)
        self.params = np.zeros(self.n)

        self.word_map = {}
        self.index_map = []  # kept for compatibility; not populated by fit
        self.next_word_idx = 0

    def fit(self, dataset):
        """Estimate class priors and per-class word probabilities.

        Parameters
        ----------
        dataset : iterable of (words, label)
            ``words`` is a sequence of tokens, ``label`` an int in
            ``range(n)``.

        Any previously learned state is discarded first, so calling ``fit``
        again (e.g. re-running a notebook cell) gives the same result as
        fitting a fresh model instead of mixing counts with probabilities.
        """
        self._reset()

        n_docs = 0
        for (e, c) in dataset:
            n_docs += 1
            self.params[c] += 1
            self.cond_params_denom[c] += len(e)
            for w in e:
                if w not in self.word_map:
                    self.word_map[w] = self.next_word_idx
                    self.next_word_idx += 1
                    # Every class gets a column for the new word so all rows
                    # stay the same length.
                    for counts in self.cond_params:
                        counts.append(0)
                self.cond_params[c][self.word_map[w]] += 1

        vocab_size = self.next_word_idx
        for i in range(self.n):
            # Add-one smoothing: a word seen only in the other class must not
            # get probability 0 here, or log() in predict would be -inf.
            denom = self.cond_params_denom[i] + vocab_size
            self.cond_params[i] = [(count + 1) / denom for count in self.cond_params[i]]
            self.params[i] = self.params[i] / n_docs

    def predict(self, reviews):
        """Return the most likely class for each tokenised review.

        Parameters
        ----------
        reviews : sequence of sequences of str

        Returns
        -------
        numpy.ndarray
            Float array of length ``len(reviews)`` holding class indices.

        Words that never occurred in training are skipped: they have no
        column in ``cond_params`` and would contribute the same smoothed
        factor structure to no class we can look up.
        """
        log_prior = np.log(self.params)
        # Take logs once for the whole table instead of once per word per class.
        log_cond = np.log(np.array(self.cond_params, dtype=float))

        preds = np.zeros(len(reviews))
        for j, review in enumerate(reviews):
            known = np.array(
                [self.word_map[w] for w in review if w in self.word_map],
                dtype=np.intp,
            )
            # Gather the columns of the known words (repeats included) and
            # add their log-likelihoods to the log prior of each class.
            scores = log_prior + log_cond[:, known].sum(axis=1)
            preds[j] = np.argmax(scores)

        return preds

## Testing

In [18]:
# load data
# Each file under <datapath>/pos and <datapath>/neg holds one review; only its
# first line is read and tokenised.
dataset = []
# 1 - positive, 0 - negative
datapath = '../data/part1_data/train'
for label_dir, label in (('pos', 1), ('neg', 0)):
    for f in os.listdir(f'{datapath}/{label_dir}'):
        # `with` closes the handle right away; opening thousands of files
        # without closing them can exhaust the process's file descriptors.
        with open(f'{datapath}/{label_dir}/{f}') as file:
            review = preprocess(file.readline())
        dataset.append((review, label))


In [27]:
# Two classes, indexed to match the labels stored in `dataset`
# (0 = negative, 1 = positive).
model = MultinomialEventModel(n=2)
model.fit(dataset)

In [20]:
# Held-out reviews, kept in separate lists per true label so accuracy can be
# reported for each class.
datapath = '../data/part1_data/test'

testdata_pos = []
for f in os.listdir(f'{datapath}/pos'):
    # `with` guarantees the handle is closed even if preprocessing fails.
    with open(f'{datapath}/pos/{f}') as file:
        review = preprocess(file.readline())
    testdata_pos.append(review)

testdata_neg = []
for f in os.listdir(f'{datapath}/neg'):
    with open(f'{datapath}/neg/{f}') as file:
        review = preprocess(file.readline())
    testdata_neg.append(review)

In [30]:
# Score each split on its own: a positive review is correct when the model
# answers 1, a negative review when it answers 0.
pos_predictions = model.predict(testdata_pos)
neg_predictions = model.predict(testdata_neg)
predcnt_pos = np.count_nonzero(pos_predictions == 1)
predcnt_neg = np.count_nonzero(neg_predictions == 0)

# Overall accuracy = correct predictions over all test reviews.
acc = (predcnt_pos+predcnt_neg)/(len(testdata_pos)+len(testdata_neg))
for correct_count in (predcnt_pos, predcnt_neg):
    print(correct_count)
print(f"({predcnt_pos}+{predcnt_neg})/({len(testdata_pos)}+{len(testdata_neg)}) = {acc}")

7501
4396
(7501+4396)/(10000+5000) = 0.7931333333333334


In [29]:
# Inspect the fitted word probabilities: one inner list per class, one entry
# per vocabulary word.
# NOTE(review): this echoes the entire table; consider a slice such as
# `[row[:10] for row in model.cond_params]` to keep the notebook output short.
model.cond_params

[[0.007388558433362986,
  0.026816054865633966,
  0.00822712358545285,
  0.012053566113122948,
  0.0005878460782195884,
  0.0026861299963089526,
  7.858359031754915e-05,
  0.002982774545040134,
  0.00048306795779618953,
  0.004962672794599164,
  0.0006412557045393079,
  0.023393076139334487,
  6.769755183200121e-05,
  1.530849162030178e-05,
  2.5854341403176342e-05,
  0.013820846423511121,
  2.4153397889809476e-05,
  0.00013879699069073616,
  0.0010698934699077578,
  9.79743463699314e-05,
  1.3607548106934916e-06,
  0.016986982679292204,
  0.0008290398684150098,
  0.0007742694872845967,
  4.082264432080475e-06,
  2.0411322160402377e-06,
  0.0009375600645678158,
  3.401887026733729e-07,
  0.055270118334640225,
  1.0205661080201188e-06,
  0.0011185404543900502,
  0.0034804706170512784,
  6.701717442665446e-05,
  5.340962631971955e-05,
  0.001195082912491559,
  4.2183399131498244e-05,
  1.0205661080201188e-06,
  3.0616983240603563e-06,
  3.4359058970010665e-05,
  0.006977270291830879,
  0