In [1]:
import string
import re
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize import word_tokenize
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,accuracy_score
import string 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\16514\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\16514\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


##### Read Dataset

In [2]:
imdb_data = pd.read_csv('./data/IMDB Dataset.csv')
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


##### Split the data

In [3]:
train_reviews=imdb_data.review[:30000]
train_sentiments=imdb_data.sentiment[:30000]

val_reviews=imdb_data.review[30000:40000]
val_sentiments=imdb_data.sentiment[30000:40000]

test_reviews=imdb_data.review[40000:]
test_sentiments=imdb_data.sentiment[40000:]

##### Preprocess Dataset

In [4]:
def preprocess(reviews):
    container = []
    for review in reviews:
        review = review.replace("<br />", "")
        for ele in string.punctuation:
                if ele in review:
                        review = review.replace(ele, "")
        container.append(review)
    return container

def preprocess_sentiment(sentiments):
    container = []
    for sentiment in sentiments:
        container.append(sentiment)
    return container

In [5]:
train_reviews = preprocess(train_reviews)
val_reviews = preprocess(val_reviews)
test_reviews = preprocess(test_reviews)
train_sentiments = preprocess_sentiment(train_sentiments)
val_sentiments = preprocess_sentiment(val_sentiments)
test_sentiments = preprocess_sentiment(test_sentiments)

##### Naive Bayes Model Train

In [6]:
'''
Method that take in the training dataset, then return the positive and negative words log probability.
Input: train_reviews: reviews (sentences) for training
       train_sentiments: sentiments (label) for training
       val_reviews: reviews (sentences) for validation
       test_reviews: reviews (sentences) for testing
       tfidf: boolean variable indicating whether using bow or tfidf
       alpha: laplance smoothing variable, default to be 1.0
       ngram_range: the scale of ngram model will be used, default = (1,1) unigram
return: negative_word_log_prob_dict: dictionary that contains the word:log probability pair for negative class
        positive_word_log_prob_dict: dictionary that contains the word:log probability pair for positive class
        mnb: the trained multinomial naive bayes model, later can be used for testing
        transformed_val_reviews: transformed val reviews that later can be used for validation
        transformed_test_reviews: transformed test reviews that later can be used for testing
        vec: either the tfidfVectorize build from tfidf model or the CountVectorizer build from Bag of word model.
'''

def generate_log_prob(train_reviews, train_sentiments,  test_reviews, val_reviews=None, tfidf=False, alpha=1.0, ngram_range = (1,1)):

    if (tfidf):
        #Tfidf vectorizer
        vec=TfidfVectorizer(use_idf=tfidf, ngram_range=ngram_range)
        transformed_train_reviews=vec.fit_transform(train_reviews)
        if val_reviews is not None:
            transformed_val_reviews = vec.transform(val_reviews)
        transformed_test_reviews=vec.transform(test_reviews)
    else:
        vec=CountVectorizer(ngram_range=(1,1))
        transformed_train_reviews=vec.fit_transform(train_reviews)
        if val_reviews is not None:
            transformed_val_reviews = vec.transform(val_reviews)
        transformed_test_reviews=vec.transform(test_reviews)

    #training the model
    mnb = MultinomialNB(alpha=alpha)

    #fitting the naive bayes for bag of words
    mnb = mnb.fit(transformed_train_reviews, train_sentiments)
    
    negative_log_prob = mnb.feature_log_prob_[0]
    positive_log_prob = mnb.feature_log_prob_[1]

    # Generate two dict: word:log_prob
    negative_word_log_prob_dict = {}
    positive_word_log_prob_dict = {}
    for word, index in vec.vocabulary_.items():
        negative_word_log_prob_dict[word] = negative_log_prob[index]
        positive_word_log_prob_dict[word] = positive_log_prob[index]
    if val_reviews is None:
        return negative_word_log_prob_dict, positive_word_log_prob_dict, mnb, transformed_test_reviews, vec
    else:
        return negative_word_log_prob_dict, positive_word_log_prob_dict, mnb, transformed_val_reviews, transformed_test_reviews, vec 

##### Naive Bayes Model Inference

In [7]:
def naive_inference(reweight, pos, neg, unseen_pos, unseen_neg, test_reviews, test_sentiments):
    correct = 0
    prediction = []
    pattern = r'[^A-Za-z0-9]+'
    for i in range(len(test_reviews)):
        word_list = test_reviews[i]
        word_list = re.sub(pattern, " ", word_list.lower()).split()
        # word_list = test_reviews[i].strip().lower().split()
        final_result = 0
        
        for word in word_list:
            weight = 1
            pprob = unseen_pos
            nprob = unseen_neg
            if word in reweight:
                weight = reweight[word]
            if word in pos:
                pprob = pos[word]
            if word in neg:
                nprob = neg[word]
            final_result += weight*pprob - weight*nprob
        if final_result > 0:
            prediction.append("positive")
        elif final_result < 0:
            prediction.append("negative")
        if (final_result > 0 and test_sentiments[i] == "positive") or (final_result < 0 and test_sentiments[i] == "negative"):
            correct += 1
    return prediction, correct/len(test_reviews)

##### Reweight by Count Algorithm

In [None]:
# Todo

##### Reweight by Value Algorithm

In [8]:
def calculate_reweight_dict(pos, neg, dataset, threshold=0.1):
    wrongly_classified_dict = {} # counts how many times a token has negative impact on wrongly classified sentence
    token_dict = {} # counts how many times a token has appeared in total
    reweight_dict = {}
    for label, sentence in dataset:
        real_label = label
        tokens = sentence
        for token in tokens:
            if token not in token_dict:
                token_dict[token] = 1
            else:
                token_dict[token] += 1
        if real_label == "positive": # marked as negative
            for token in tokens:
                if token not in pos: continue
                elif pos[token] < neg[token]:
                    if token not in wrongly_classified_dict:
                        wrongly_classified_dict[token] = 1
                    else:
                        wrongly_classified_dict[token] += 1
        elif real_label == "negative": # marked as positive
            for token in tokens:
                if token not in pos: continue
                elif neg[token] < pos[token]:
                    if token not in wrongly_classified_dict:
                        wrongly_classified_dict[token] = 1
                    else:
                        wrongly_classified_dict[token] += 1
    for word, prob in wrongly_classified_dict.items():
        weight = 0
        if pos[word] < neg[word]:
            weight = pos[word] - neg[word]
        elif neg[word] < pos[word]:
            weight = neg[word] - pos[word]
        multiplier = 1 - wrongly_classified_dict[word] / token_dict[word]
        if multiplier == 0:
            reweight_dict[word] = 0
        else:
            reweight_dict[word] = np.exp(weight) * multiplier
    return reweight_dict

##### Perceptron Reweighting Algorithm

In [16]:
def perceptron_update(positive_prob_dict, negative_prob_dict, unseen_pos, unseen_neg, val_reviews, val_sentiments, test_reviews, test_sentiments, max_iter=10, learning_rate=0.02):
    """
    need to handle unseen word here
    has to be a method
    """
    initial_weight = {}
    for i in range(max_iter):
        random_array = np.arange(len(val_sentiments))
        np.random.shuffle(random_array)
        correct = 0
        for index in random_array:
            word_list = val_reviews[index]
            word_list = re.sub(pattern, " ", word_list.lower()).split()
            final_result = 0
            for word in word_list:
                weight = 1
                pprob = unseen_pos
                nprob = unseen_neg
                if word not in initial_weight:
                    initial_weight[word] = 1
                if word in initial_weight:
                    weight = initial_weight[word]
                if word in positive_prob_dict:
                    pprob = positive_prob_dict[word]
                if word in negative_prob_dict:
                    nprob = negative_prob_dict[word]
                final_result += weight*pprob - weight*nprob
            if (final_result > 0 and val_sentiments[index] == "positive") or (final_result < 0 and val_sentiments[index] == "negative"):
                correct += 1
                for word in word_list:
                    initial_weight[word] -= learning_rate/len(val_sentiments)*initial_weight[word]
            else:
                for word in word_list:
                    pprob = unseen_pos
                    nprob = unseen_neg
                    if word in positive_prob_dict:
                        pprob = positive_prob_dict[word]
                    if word in negative_prob_dict:
                        nprob = negative_prob_dict[word]
                    if val_sentiments[index] == "positive":
                        initial_weight[word] += learning_rate*((pprob-nprob) - 1/len(val_sentiments)*initial_weight[word])
                    else:
                        initial_weight[word] += learning_rate*((nprob-pprob) - 1/len(val_sentiments)*initial_weight[word])
        test_pred, test_acc = naive_inference(initial_weight, positive_prob_dict, negative_prob_dict, unseen_pos, unseen_neg, test_reviews, test_sentiments)
        val_accuracy = correct/len(val_sentiments)
        print(f"finish training epoch {i}, the val accuracy is {val_accuracy}, the test accuracy is {test_acc}")
        
    return initial_weight, val_accuracy, test_acc

##### Running - Training

In [10]:
neg, pos, mnb, transformed_val_reviews, transformed_test_reviews, vec= generate_log_prob(train_reviews, train_sentiments, test_reviews, val_reviews=val_reviews, alpha = 1)
unseen_neg = mnb.feature_log_prob_[0].min()
unseen_pos = mnb.feature_log_prob_[1].min()
print(unseen_neg, unseen_pos)

-15.032443811652895 -15.049387153559886


##### Running - Baseline Inference

In [11]:
val_pred_baseline, val_acc_baseline = naive_inference({}, pos, neg, unseen_pos, unseen_neg, val_reviews, val_sentiments)
test_pred_baseline, test_acc_baseline = naive_inference({}, pos, neg, unseen_pos, unseen_neg, test_reviews, test_sentiments)
print(f"val accuracy is {val_acc_baseline}, test accuracy is {test_acc_baseline}")


0.8441
0.8472
val accuracy is 0.8441, test accuracy is 0.8472


##### Running - Count Reweight Inference

##### Running - Value Reweight Inference

In [12]:
wrong_labeled = []
pattern = r'[^A-Za-z0-9]+'

for i in range(len(val_reviews)):
    start_index = 30000
    if(val_pred_baseline[i] != val_sentiments[i]):
        sentiment = val_sentiments[i]
        review = str(imdb_data['review'][start_index + i])
        review = re.sub(pattern, " ", review.lower()).split()
        wrong_labeled.append((sentiment, review))

reweight_dict_value = calculate_reweight_dict(pos, neg, wrong_labeled)
val_pred_value, val_acc_value = naive_inference(reweight_dict_value, pos, neg, unseen_pos, unseen_neg, val_reviews, val_sentiments)
test_pred_value, test_acc_value = naive_inference(reweight_dict_value, pos, neg, unseen_pos, unseen_neg, test_reviews, test_sentiments)
print(f"val accuracy is {val_acc_value}, test accuracy is {test_acc_value}")

0.9031
0.841
val accuracy is 0.9031, test accuracy is 0.841


##### Running - Perceptron Reweight Inference

In [17]:
reweight, val_acc_perceptron, test_acc_perceptron = perceptron_update(pos, neg, unseen_pos, unseen_neg, val_reviews, val_sentiments, test_reviews, test_sentiments)

0.873
finish training epoch 0, the val accuracy is 0.8623, the test accuracy is 0.873
0.8821
finish training epoch 1, the val accuracy is 0.8836, the test accuracy is 0.8821
0.8825
finish training epoch 2, the val accuracy is 0.894, the test accuracy is 0.8825
0.8832
finish training epoch 3, the val accuracy is 0.9055, the test accuracy is 0.8832
0.8844
finish training epoch 4, the val accuracy is 0.9099, the test accuracy is 0.8844
0.884
finish training epoch 5, the val accuracy is 0.9195, the test accuracy is 0.884
0.8862
finish training epoch 6, the val accuracy is 0.9238, the test accuracy is 0.8862
0.8855
finish training epoch 7, the val accuracy is 0.9316, the test accuracy is 0.8855
0.8858
finish training epoch 8, the val accuracy is 0.9346, the test accuracy is 0.8858
0.8862
finish training epoch 9, the val accuracy is 0.9414, the test accuracy is 0.8862


##### Record Weight Dict

In [14]:
weight_printed_version = dict(sorted(reweight.items(), key=lambda item: item[1]))
with open("weight_dict.txt", "w") as f:
    for k, v in weight_printed_version.items():
        f.write(str(k))
        f.write(": ")
        f.write(str(v))
        f.write('\n')


In [15]:
import pickle

with open('weight.pickle', 'wb') as handle:
    pickle.dump(reweight, handle, protocol=pickle.HIGHEST_PROTOCOL)

##### Graphing

In [None]:
# Todo
val_set = (val_acc_baseline, val_acc_value, val_acc_perceptron)
test_set = (test_acc_baseline, test_acc_value, test_acc_perceptron)