In [1]:
import pandas as pd
import numpy as np
import itertools
from collections import Counter
import numpy as np
from matplotlib import pyplot as plt
import nltk
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
import re
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer  
from nltk.corpus import words
from copy import deepcopy

In [2]:
data = pd.read_csv("movie_data.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
def preproc_text(text):
    text = re.sub(r"\d+", "", text.lower())
    for i in string.punctuation:
        text = text.replace(i,"")
    stop_words = set(stopwords.words("english"))
    tokens = word_tokenize(text)
    sample = [i for i in tokens if not i in stop_words]
    lancaster_stemmer = LancasterStemmer() 
    
    return [lancaster_stemmer.stem(word) for word in sample]


In [4]:
review = data["review"] # create a data frame with only data 
review = review.apply(preproc_text)

In [5]:
review

0        [on, review, ment, watch, oz, episod, youl, ho...
1        [wond, littl, produc, br, br, film, techn, una...
2        [thought, wond, way, spend, tim, hot, sum, wee...
3        [bas, ther, famy, littl, boy, jak, think, ther...
4        [pet, mat, lov, tim, money, vis, stun, film, w...
                               ...                        
49995    [thought, movy, right, good, job, wasnt, cre, ...
49996    [bad, plot, bad, dialog, bad, act, idiot, dire...
49997    [cathol, taught, paroch, el, schools, nun, tau...
49998    [im, going, disagr, prevy, com, sid, maltin, o...
49999    [on, expect, star, trek, movy, high, art, fan,...
Name: review, Length: 50000, dtype: object

In [6]:
def create_wordlist(original_train_data, threshold=26):
    """
    Create a word list from the original training set.
    Only get a word if it appears in at least $threshold emails.
    Returns:
        * a python list containing all the words that occur in at least $threshold emails.
    """
    
    c = Counter()
    for i in original_train_data:
        c.update(i)

    c = dict(c)
    c = {k:v for k,v in c.items() if v >= threshold}
    
    return list(c.keys())

In [7]:
wordlist = create_wordlist(review[:40000], 26)
wordlist.sort()

In [8]:
def split_train(original_train_data, size=30000):
    return original_train_data[:size], original_train_data[size:]
def binary_transform(sentiment):
    if sentiment == "positive":
        return 1
    return 0

In [9]:
label = data["sentiment"]
label = label.apply(binary_transform)
cleandata = pd.DataFrame({"label":label, "wordlist":review})
train, validation = split_train(cleandata)
validation

Unnamed: 0,label,wordlist
30000,1,"[on, quit, cartoon, scooby, doo, film, scooby,..."
30001,0,"[on, silliest, movy, ev, misfortun, watch, exp..."
30002,0,"[reason, fatherinlaw, gav, cop, tap, think, gr..."
30003,1,"[may, hard, explain, film, masterpiec, perhap,..."
30004,0,"[show, worst, show, ev, nor, famy, writ, produ..."
...,...,...
49995,1,"[thought, movy, right, good, job, wasnt, cre, ..."
49996,0,"[bad, plot, bad, dialog, bad, act, idiot, dire..."
49997,0,"[cathol, taught, paroch, el, schools, nun, tau..."
49998,0,"[im, going, disagr, prevy, com, sid, maltin, o..."


In [10]:
class Model:
    def __init__(self, wordlist):
        self.wordlist = wordlist

    def count_labels(self, data):
        """
        Count the number of positive labels and negative labels.
        Returns (a tuple or a numpy array of two elements):
            * negative_count: a non-negative integer, which represents the number of negative labels (non-spam emails);
            * positive_count: a non-negative integer, which represents the number of positive labels (spam emails).
        """
        # TODO
        c = Counter(data[:,0])
        return np.array([c[0], c[1]])

    def count_words(self, wordlist, data):
        """
        Count the number of times that each word appears in emails under a given label.
        Returns (a numpy array):
            * word_counts: a numpy array with shape (2, L), where L is the length of $wordlist,
                - word_counts[0, i] represents the number of times that word $wordlist[i] appears in non-spam (negative) emails, and
                - word_counts[1, i] represents the number of times that word $wordlist[i] appears in spam (positive) emails.
        """
        L = len(wordlist)
        positive = data[data[:,0]==1][:,1]
        negative = data[data[:,0]==0][:,1]
        
        output = np.zeros(2*L).reshape(2, L)
        
        for review in negative:
            review = set(review)
            for i in range(L):
                if wordlist[i] in review:
                    output[0, i] += 1
            
        for review in positive:
            review = set(review)
            for j in range(L):
                if wordlist[j] in review:
                    output[1, j] += 1
        
        return output
            

    def calculate_probability(self, label_counts, word_counts):
        """
        Calculate the probabilities, both the prior and likelihood.
        Returns (a pair of numpy array):
            * prior_probs: a numpy array with shape (2, ), only two elements, where
                - prior_probs[0] is the prior probability of negative labels, and
                - prior_probs[1] is the prior probability of positive labels.
            * likelihood_probs: a numpy array with shape (2, L), where L is the length of the word list,
                - likelihood_probs[0, i] represents the likelihood probability of the $i-th word in the word list, given that the email is non-spam (negative), and
                - likelihood_probs[1, i] represents the likelihood probability of the $i-th word in the word list, given that the email is spam (positive).
        """
        # TODO
        prior_probs = deepcopy(label_counts)/sum(label_counts)
        
        L = word_counts.shape[1]
        likelihood_probs = np.zeros(2*L).reshape(2, L)
        
        for i in range(L):
            likelihood_probs[0, i] = (word_counts[0, i] + 1)/(label_counts[0] + 2)
            likelihood_probs[1, i] = (word_counts[1, i] + 1)/(label_counts[1] + 2)
            
        return prior_probs, likelihood_probs

    def fit(self, data):
        label_counts = self.count_labels(data)
        word_counts = self.count_words(self.wordlist, data)

        self.prior_probs, self.likelihood_probs = self.calculate_probability(label_counts, word_counts)

        # TO AVOID NUMBER OVERFLOW here we use log probability instead.
        
        self.log_prior_probs = np.log(self.prior_probs)
        self.log_likelihood_probs = np.dstack([np.log(1 - self.likelihood_probs), np.log(self.likelihood_probs)])

    def predict(self, x):
        """
        Predict whether email $x is a spam or not.
        Returns:
            * y: a boolean value indicating whether $x is a spam or not.
        """        
        # proc the data
        L = len(self.wordlist)
        feature_vector = np.zeros(L, dtype="int8")
        word_set = set(x)
        for i in range(L):
            if self.wordlist[i] in word_set:
                feature_vector[i] = 1
        
        neg, pos = self.log_prior_probs[0], self.log_prior_probs[1]
        
        for i in range(L):
            neg += self.log_likelihood_probs[0, i, feature_vector[i]]
            pos += self.log_likelihood_probs[1, i, feature_vector[i]]
        
        if pos > neg:
            return 1
        else:
            return 0
            

In [11]:
train = np.array(train)
classifer = Model(wordlist)
classifer.fit(train)

In [12]:
val_data = np.array(validation)[:1000]
error_count = 0
total = len(val_data)
for mail in val_data:
    if mail[0] != classifer.predict(mail[1]):
        error_count += 1
        
error_percentage = error_count*100/total
    

print("Validation error, # = {:>4d}, % = {:>8.4f}%.".format(error_count, error_percentage))

Validation error, # =  157, % =  15.7000%.


In [14]:
a = "Fuck your asshole stupid"
a = preproc_text(a)

print("positive" if classifer.predict(a)==1 else "negative")

negative
