In [1]:
import string
import re
import math
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,accuracy_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Bo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
imdb_data = pd.read_csv('./IMDB_Dataset.csv')
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
imdb_data['sentiment'].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [4]:
train_reviews=imdb_data.review[:40000]
train_sentiments=imdb_data.sentiment[:40000]

test_reviews=imdb_data.review[40000:]
test_sentiments=imdb_data.sentiment[40000:]

train_reviews.shape,train_sentiments.shape, test_reviews.shape,test_sentiments.shape


((40000,), (40000,), (10000,), (10000,))

In [5]:
train_sentiments.value_counts()

negative    20007
positive    19993
Name: sentiment, dtype: int64

In [6]:
'''
Method that take in the training dataset, then return the positive and negative words log probability.
Input: train_reviews: reviews (sentences) for training
       train_sentiments: sentiments (label) for training
       tfidf: boolean variable indicating whether using bow or tfidf
       alpha: laplance smoothing variable, default to be 1.0
       ngram_range: the scale of ngram model will be used, default = (1,1) unigram
return: negative_word_log_prob_dict: dictionary that contains the word:log probability pair for negative class
        positive_word_log_prob_dict: dictionary that contains the word:log probability pair for positive class
        mnb: the trained multinomial naive bayes model, later can be used for testing
        transformed_test_reviews: transformed test reviews that later can be used for testing
        vec: either the tfidfVectorize build from tfidf model or the CountVectorizer build from Bag of word model.
'''

def generate_log_prob(train_reviews, train_sentiments, tfidf=False, alpha=1.0, ngram_range = (1,1)):

    if (tfidf):
        #Tfidf vectorizer
        vec=TfidfVectorizer(use_idf=tfidf, ngram_range=ngram_range)
        #transformed train reviews
        transformed_train_reviews=vec.fit_transform(train_reviews)
        #transformed test reviews
        transformed_test_reviews=vec.transform(test_reviews)
    else:
        vec=CountVectorizer(ngram_range=(1,1))
        transformed_train_reviews=vec.fit_transform(train_reviews)
        transformed_test_reviews=vec.transform(test_reviews)

    #training the model
    mnb = MultinomialNB(alpha=alpha)

    #fitting the naive bayes for bag of words
    mnb = mnb.fit(transformed_train_reviews, train_sentiments)
    negative_log_prob = mnb.feature_log_prob_[0]
    positive_log_prob = mnb.feature_log_prob_[1]

    # Generate two dict: word:log_prob
    negative_word_log_prob_dict = {}
    positive_word_log_prob_dict = {}
    for word, index in vec.vocabulary_.items():
        negative_word_log_prob_dict[word] = negative_log_prob[index]
        positive_word_log_prob_dict[word] = positive_log_prob[index]
    
    return negative_word_log_prob_dict, positive_word_log_prob_dict, mnb, transformed_test_reviews, vec

In [7]:
'''
This method will take in a word:scale dict, then take in the negative and positive word:log_probability dict, manually change the weight of the words in the model and the dict
Input: word_change_scale: this is the word-scale dictionary, how much the weight of the word should be changed, For example if the value is 0.5, we will say 
                          the probability of the word in negative class should multiply 0.5, in original probability, we take power to the scale
       model: the trained naive bayes model, which the feature_log_prob_ attribute will be manually changed based on previous two params
       negative_word_log_prob_dict: dictionary that contains the word:log probability pair for negative class, which some values will be changed
       positive_word_log_prob_dict: dictionary that contains the word:log probability pair for positive class, which some values will be changed
       vec: either the tfidfVectorize build from tfidf model or the CountVectorizer build from Bag of word model.
return: negative_word_change_scale: The modified negative dict
        positive_word_change_scale: The modified positive dict
        model: The modified naive bayes model
'''

def change_weight(word_change_scale, model, negative_word_log_prob_dict, positive_word_log_prob_dict, vec):
    for word, scale in word_change_scale.items():
        # change the weight of words in negative and positive word:log_prob dict
        negative_word_log_prob_dict[word] *= scale
        positive_word_log_prob_dict[word] *= scale

        # change the weight of words in the model
        index_in_model = vec.vocabulary_[word]
        model.feature_log_prob_[0][index_in_model] *= scale
        model.feature_log_prob_[1][index_in_model] *= scale

    return negative_word_log_prob_dict, positive_word_log_prob_dict, model

In [8]:
# Page 3 count wrong by value

def calculate_reweight_dict(positive_prob_dict, negative_prob_dict, dataset, threshold=0.1):
    wrongly_classified_token_dict = {} # counts how many times a token has negative impact on wrongly classified sentence
    token_dict = {} # counts how many times a token has appeared in total
    reweight_dict = {}
    for sentence in dataset:
        real_label = sentence[6:14]
        tokens = sentence[17:].split(" ")
        for token in tokens:
            if token not in token_dict:
                token_dict[token] = 1
            else:
                token_dict[token] += 1
        if real_label == "positive": # marked as negative
            for token in tokens:
                if token not in positive_prob_dict: continue
                elif positive_prob_dict[token] < negative_prob_dict[token]:
                    if token not in wrongly_classified_token_dict:
                        wrongly_classified_token_dict[token] = 1
                    else:
                        wrongly_classified_token_dict[token] += 1
        elif real_label == "negative": # marked as positive
            for token in tokens:
                if token not in positive_prob_dict: continue
                elif negative_prob_dict[token] < positive_prob_dict[token]:
                    if token not in wrongly_classified_token_dict:
                        wrongly_classified_token_dict[token] = 1
                    else:
                        wrongly_classified_token_dict[token] += 1
    # print(wrongly_classified_token_dict)
    for word, prob in wrongly_classified_token_dict.items():
        # print(word, prob)
        weight = 0
        if positive_prob_dict[word] < negative_prob_dict[word]:
            weight = positive_prob_dict[word] - negative_prob_dict[word]
        elif negative_prob_dict[word] < positive_prob_dict[word]:
            weight = negative_prob_dict[word] - positive_prob_dict[word]
        multiplier = 1 - wrongly_classified_token_dict[word] / token_dict[word]
        if multiplier == 0:
            reweight_dict[word] = 0
        else:
            reweight_dict[word] = np.exp(weight * multiplier)
    return reweight_dict
    
    

#### Count correct by value

In [9]:
# Page 4 count correct by value

def calc_cctd(positive_prob_dict, negative_prob_dict, dataset):
    correct_classified_token_dict = {}
    token_dict = {}
    reweight_dict = {}
    for sentence in dataset:
        real_label = sentence[6:14]
        tokens = sentence[17:].split(" ")
        for token in tokens:
            if token not in token_dict:
                token_dict[token] = 1
            else:
                token_dict[token] += 1
        if real_label == "positive": # true positive
            for token in tokens:
                if token not in positive_prob_dict: continue
                elif positive_prob_dict[token] > negative_prob_dict[token]:
                    if token not in correct_classified_token_dict:
                        correct_classified_token_dict[token] = 1
                    else:
                        correct_classified_token_dict[token] += 1
        elif real_label == "negative": # true negative
            for token in tokens:
                if token not in positive_prob_dict: continue
                elif negative_prob_dict[token] > positive_prob_dict[token]:
                    if token not in correct_classified_token_dict:
                        correct_classified_token_dict[token] = 1
                    else:
                        correct_classified_token_dict[token] += 1

    correct_classified_token_dict = {k: v for k, v in sorted(
        correct_classified_token_dict.items(), key=lambda item: item[1], reverse=True)}

    # print(correct_classified_token_dict)
    for word, prob in correct_classified_token_dict.items():
        # print(word, prob)
        weight = 0
        if positive_prob_dict[word] > negative_prob_dict[word]:
            weight = positive_prob_dict[word] - negative_prob_dict[word]
        elif negative_prob_dict[word] > positive_prob_dict[word]:
            weight = negative_prob_dict[word] - positive_prob_dict[word]
        multiplier = 1 - correct_classified_token_dict[word] / token_dict[word]
        if multiplier == 0:
            reweight_dict[word] = 0
        else:
            reweight_dict[word] = np.exp(weight * multiplier)
    
    reweight_dict = {k: v for k, v in sorted(
        reweight_dict.items(), key=lambda item: item[1], reverse=True)}
    return reweight_dict
    

#### Train

In [10]:
neg, pos, mnb, transformed_test_reviews, vec= generate_log_prob(train_reviews, train_sentiments, alpha = 0.05)

In [11]:
len(vec.vocabulary_.items())

92887

In [12]:
pos

{'one': -5.333685171423538,
 'of': -3.6198354732420626,
 'the': -2.8096494809026176,
 'other': -6.386320942398431,
 'reviewers': -10.153959865789862,
 'has': -5.758669553214041,
 'mentioned': -9.335641189068905,
 'that': -4.399652213313702,
 'after': -6.637783506604633,
 'watching': -7.296908749939966,
 'just': -6.004901047316345,
 'oz': -10.098712456598292,
 'episode': -8.01682161912439,
 'you': -5.131895294940147,
 'll': -7.623052945025863,
 'be': -5.4397073285388675,
 'hooked': -10.17114734130957,
 'they': -5.678907422463478,
 'are': -5.271095548964395,
 'right': -7.43450097887549,
 'as': -4.707138029132709,
 'this': -4.3999942933979845,
 'is': -3.923840205331846,
 'exactly': -8.707258090730683,
 'what': -5.9141401153374655,
 'happened': -8.741249535330123,
 'with': -4.816294808809797,
 'me': -6.320667828155354,
 'br': -4.055313897267334,
 'first': -6.428680563292813,
 'thing': -7.407537701136814,
 'struck': -10.368548803090725,
 'about': -5.847184998307,
 'was': -4.870462078333148,

In [13]:
neg

{'one': -5.362672742166099,
 'of': -3.705237717254226,
 'the': -2.8382086413399765,
 'other': -6.4681956528204605,
 'reviewers': -9.929812568216615,
 'has': -5.919109222826936,
 'mentioned': -9.18514751586239,
 'that': -4.318732067464774,
 'after': -6.599477379083632,
 'watching': -6.95173112677573,
 'just': -5.579435061125974,
 'oz': -11.032579987516685,
 'episode': -8.441913873209078,
 'you': -5.053402625186582,
 'll': -7.495924433143283,
 'be': -5.261398323286592,
 'hooked': -11.283696160444048,
 'they': -5.365596363497501,
 'are': -5.244331782101574,
 'right': -7.447039037337252,
 'as': -4.923407972415873,
 'this': -4.2269819401107345,
 'is': -4.0249944461676765,
 'exactly': -8.561122147541298,
 'what': -5.790054862271154,
 'happened': -8.511944094972819,
 'with': -4.89527569542175,
 'me': -6.194344777625229,
 'br': -3.9814059418073526,
 'first': -6.511025273010345,
 'thing': -6.885070762491521,
 'struck': -10.927288870519494,
 'about': -5.754518439073513,
 'was': -4.67353093062366

In [14]:
mnb_bow_predict = mnb.predict(transformed_test_reviews)


In [15]:
mnb.predict_proba(transformed_test_reviews[0]), mnb.predict_log_proba(transformed_test_reviews[0])

(array([[1.00000000e+00, 5.15735135e-17]]),
 array([[  0.        , -37.50352344]]))

In [16]:
mnb.predict(transformed_test_reviews[0]), mnb.predict(transformed_test_reviews[1])

(array(['negative'], dtype='<U8'), array(['negative'], dtype='<U8'))

In [17]:
mnb.classes_, mnb.class_log_prior_

(array(['negative', 'positive'], dtype='<U8'),
 array([-0.69279724, -0.69349724]))

In [18]:
mnb_bow_predict = mnb.predict(transformed_test_reviews)
mnb_bow_score = accuracy_score(test_sentiments, mnb_bow_predict)
print("mnb_bow_score :",mnb_bow_score)


mnb_bow_score : 0.8488


#### Classification report

In [19]:
#Classification report for bag of words 
mnb_bow_report=classification_report(test_sentiments,mnb_bow_predict,target_names=['Positive','Negative'])
print(mnb_bow_report)

              precision    recall  f1-score   support

    Positive       0.83      0.88      0.85      4993
    Negative       0.87      0.82      0.84      5007

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [20]:
correct_labeled_NB_BOW = ""

for i in range(mnb_bow_predict.size):
    start_index = test_sentiments.index.start
    if(mnb_bow_predict[i] == test_sentiments[start_index + i]):
        correct_labeled_NB_BOW += str(start_index + i) + " " + str(imdb_data['sentiment'][start_index + i]) + " | " + str(imdb_data['review'][start_index + i]) +  "\n"

f = open("Naive Bayes - Bag of Words Correctly labeled sentences.txt", "w", encoding="utf-8")
f.write(correct_labeled_NB_BOW)
f.close()

#### Use reweight(correct by value)

In [21]:
f = open("Naive Bayes - Bag of Words Correctly labeled sentences.txt", encoding="utf-8")
dataset = []
for line in f:
    pattern = r'[^A-Za-z0-9]+'
    line = re.sub(pattern, " ", line.lower())
    dataset.append(line)

reweight_dict = calc_cctd(pos, neg, dataset)
reweight_dict

{'macha': 47.374407730259755,
 'hussein': 44.69951147848365,
 'acre': 37.30069555146994,
 'danton': 35.38415961928481,
 'ungodly': 34.653370939844436,
 'clubbed': 32.31472757756017,
 'parkinson': 32.214964760413096,
 'primordial': 31.508258515325913,
 'snickering': 29.888126991263903,
 'inspirations': 29.304600256409252,
 'newbern': 27.35862022759071,
 'resurrecting': 27.35862022759071,
 'crossword': 27.35862022759071,
 'ria': 27.301115634153952,
 'plural': 27.301115634153952,
 'regan': 26.824478816313892,
 'meanness': 26.824478816313892,
 'stressing': 26.70220549844544,
 'dramatizations': 26.70220549844544,
 'flemish': 24.70614746814137,
 'ge': 24.428167162324666,
 'rivalries': 24.223792131283528,
 'gravedigger': 22.83580458830561,
 'frightful': 22.07059322602099,
 'tremble': 22.07059322602099,
 'shortness': 22.07059322602099,
 'walkman': 21.902601334813905,
 'somersaults': 21.902601334813905,
 'bleibtreu': 21.586426125991295,
 'scissorhands': 21.586426125991295,
 'reliefs': 21.474981

In [22]:
import itertools
from scipy.interpolate import interp1d

reweight_dict1 = dict(itertools.islice(reweight_dict.items(), 100))

m = interp1d([list(reweight_dict1.values())[99],list(reweight_dict1.values())[0]],[1,2])

for key, value in reweight_dict1.items():
    reweight_dict1[key] = float(m(value))
    # reweight_dict1[key] = 2

reweight_dict1

{'macha': 2.0,
 'hussein': 1.924864254745529,
 'acre': 1.7170372976036572,
 'danton': 1.663203301060394,
 'ungodly': 1.6426760181735065,
 'clubbed': 1.5769853537998968,
 'parkinson': 1.5741830942318653,
 'primordial': 1.554332268105298,
 'snickering': 1.508824039743232,
 'inspirations': 1.4924332297930887,
 'newbern': 1.4377721715862464,
 'resurrecting': 1.4377721715862464,
 'crossword': 1.4377721715862464,
 'ria': 1.4361569124970173,
 'plural': 1.4361569124970173,
 'regan': 1.422768556780285,
 'meanness': 1.422768556780285,
 'stressing': 1.4193339948391692,
 'dramatizations': 1.4193339948391692,
 'flemish': 1.3632662847070527,
 'ge': 1.3554580351610241,
 'rivalries': 1.34971730026019,
 'gravedigger': 1.3107298149039552,
 'frightful': 1.2892356257576405,
 'tremble': 1.2892356257576405,
 'shortness': 1.2892356257576405,
 'walkman': 1.2845168648209606,
 'somersaults': 1.2845168648209606,
 'bleibtreu': 1.2756357502991915,
 'scissorhands': 1.2756357502991915,
 'reliefs': 1.2725053601857714

In [23]:

neg, pos, mnb = change_weight(reweight_dict, mnb, neg, pos, vec)

In [24]:
mnb_tfidf_predict = mnb.predict(transformed_test_reviews)
mnb_tfidf_score = accuracy_score(test_sentiments, mnb_tfidf_predict)
print("mnb_tfidf_score :",mnb_tfidf_score)

mnb_tfidf_score : 0.7792


In [25]:
#Classification report for TF-IDF
mnb_tfidf_report=classification_report(test_sentiments,mnb_tfidf_predict,target_names=['Positive','Negative'])
print(mnb_tfidf_report)

              precision    recall  f1-score   support

    Positive       0.76      0.81      0.78      4993
    Negative       0.80      0.75      0.77      5007

    accuracy                           0.78     10000
   macro avg       0.78      0.78      0.78     10000
weighted avg       0.78      0.78      0.78     10000



#### Use reweight (wrong by value)

In [26]:
f = open("Naive Bayes - Bag of Words Wrongly labeled sentences.txt", encoding="utf-8")
dataset = []
for line in f:
    pattern = r'[^A-Za-z0-9]+'
    line = re.sub(pattern, " ", line.lower())
    dataset.append(line)
reweight_dict = calculate_reweight_dict(pos, neg, dataset)
reweight_dict

{'movie': 0.9339850175014867,
 'stuffed': 0.7181255339422751,
 'stock': 0.6890431128837246,
 'horror': 0.9615346101227336,
 'goodies': 0,
 'chained': 0.9425008516689314,
 'meditated': 0,
 'mad': 0.989682435876356,
 'vaguely': 0,
 'lesbian': 0.7966418627494021,
 'female': 0.8379295711499458,
 'scientist': 0.8578390160688382,
 'even': 0.8435691488915968,
 'wears': 0.8042197614924795,
 'mask': 0,
 'because': 0.9393811153358029,
 'horrible': 0.7053047102836953,
 'disfigurement': 0,
 'werewolves': 0,
 'male': 0.8697034025608927,
 'mystics': 0,
 'half': 0.8619445692510281,
 'victim': 0.9616193442863747,
 'some': 0.947824329259358,
 'experiment': 0.924639670505878,
 'grave': 0.8638663281918609,
 'up': 0.9590570295601268,
 'bodies': 0.8840411684374406,
 'car': 0.897049128180949,
 'crash': 0,
 'on': 0.9888244114998532,
 'all': 0.9857806251612996,
 'off': 0.8606053617584719,
 'incredibly': 0.8905139252834435,
 'awful': 0.4425503510167507,
 'worst': 0.5385085147728386,
 'foley': 0,
 'ever': 0.958