In [69]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,accuracy_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import math



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [70]:
# Load the review dataset from a CSV located next to the notebook and preview the first rows.
# NOTE(review): the "Unnamed: 0" column in the preview suggests the CSV was written with its
# index included; passing index_col=0 would drop it — confirm before changing.
imdb_data = pd.read_csv('./test_Dataset.csv')
imdb_data.head()

Unnamed: 0,review,sentiment
0,just plain boring,negative
1,entirely predictable and lacks energy,negative
2,no surprises and very few laughs,negative
3,very powerful,positive
4,the most fun film of the summer,positive


In [71]:
# Class balance over the whole dataset.
imdb_data['sentiment'].value_counts()

negative    3
positive    3
Name: sentiment, dtype: int64

In [72]:
# Positional split: the first TRAIN_ROWS rows are used for training, the remainder for testing.
TRAIN_ROWS = 5

train_reviews = imdb_data["review"].iloc[:TRAIN_ROWS]
test_reviews = imdb_data["review"].iloc[TRAIN_ROWS:]

train_sentiments = imdb_data["sentiment"].iloc[:TRAIN_ROWS]
test_sentiments = imdb_data["sentiment"].iloc[TRAIN_ROWS:]

# Show the four shapes as a sanity check on the split.
tuple(part.shape for part in (train_reviews, train_sentiments, test_reviews, test_sentiments))

((5,), (5,), (1,), (1,))

In [73]:
# Class balance within the training split only.
train_sentiments.value_counts()

negative    3
positive    2
Name: sentiment, dtype: int64

In [74]:
def generate_log_prob(train_reviews, train_sentiments, tfidf=False, alpha=1.0, ngram_range=(1, 1), eval_reviews=None):
    '''
    Fit a multinomial naive Bayes model on the training reviews and return the per-class
    word log probabilities together with the fitted objects.

    Input: train_reviews: reviews (sentences) for training
           train_sentiments: sentiments (labels) for training
           tfidf: boolean, True to use TF-IDF features, False to use bag-of-words counts
           alpha: Laplace smoothing parameter, default 1.0
           ngram_range: n-gram range passed to the vectorizer, default (1, 1) = unigrams
           eval_reviews: reviews to transform with the fitted vectorizer; when None, the
                         notebook-level `test_reviews` variable is used (the behaviour
                         earlier callers rely on)
    return: negative_word_log_prob_dict: word -> log probability, taken from row 0 of
                                         mnb.feature_log_prob_
            positive_word_log_prob_dict: word -> log probability, taken from row 1 of
                                         mnb.feature_log_prob_
            mnb: the trained multinomial naive Bayes model, later usable for testing
            transformed_test_reviews: the transformed evaluation reviews
            vec: the fitted vectorizer (its vocabulary_ maps words to feature columns)

    Rows 0 and 1 follow the order of mnb.classes_; naming them negative/positive assumes
    the labels are exactly 'negative' and 'positive'.
    '''
    if eval_reviews is None:
        # Backward compatibility: fall back to the notebook global used by existing calls.
        eval_reviews = test_reviews

    if tfidf:
        vec = TfidfVectorizer(use_idf=True, ngram_range=ngram_range)
    else:
        # Honour the caller's ngram_range here too (it used to be hard-coded to (1, 1)).
        vec = CountVectorizer(ngram_range=ngram_range)

    # The vocabulary is learned from the training reviews only, then reused for evaluation.
    transformed_train_reviews = vec.fit_transform(train_reviews)
    transformed_test_reviews = vec.transform(eval_reviews)

    # Train the model on the vectorized reviews.
    mnb = MultinomialNB(alpha=alpha)
    mnb = mnb.fit(transformed_train_reviews, train_sentiments)
    negative_log_prob = mnb.feature_log_prob_[0]
    positive_log_prob = mnb.feature_log_prob_[1]

    # Build the two word -> log probability lookups from the vectorizer's column indices.
    negative_word_log_prob_dict = {
        word: negative_log_prob[index] for word, index in vec.vocabulary_.items()
    }
    positive_word_log_prob_dict = {
        word: positive_log_prob[index] for word, index in vec.vocabulary_.items()
    }

    return negative_word_log_prob_dict, positive_word_log_prob_dict, mnb, transformed_test_reviews, vec

In [None]:
# Page 3 count wrong by value

def func(positive_prob_dict, negative_prob_dict, dataset, threshold=0.1, reweight_scale=2):
    '''
    Score every token in `dataset` by how strongly it favours each sentence's real label,
    and return the tokens whose accumulated score reaches the threshold.

    Input: positive_prob_dict: word -> value for the positive class
           negative_prob_dict: word -> value for the negative class
           dataset: iterable of strings; characters 6-13 hold the real label
                    ("positive" or "negative") and the text starts at character 17,
                    with tokens separated by single spaces
                    (presumably these are the wrongly classified sentences — confirm against caller)
           threshold: per-sentence cut-off; a token is kept when its accumulated score is
                      at least threshold * len(dataset)
           reweight_scale: value assigned to every kept token, default 2
    return: reweight_dict: token -> reweight_scale for every token that passed the cut-off

    Tokens missing from either probability dict contribute nothing to the score, and
    sentences whose label is neither "positive" nor "negative" are only registered with
    a score of 0.
    '''
    wrongly_classified_token_dict = {}
    for sentence in dataset:
        real_label = sentence[6:14]
        tokens = sentence[17:].split(" ")
        for token in tokens:
            wrongly_classified_token_dict.setdefault(token, 0)

        if real_label == "positive":  # marked as negative
            own_class, other_class = positive_prob_dict, negative_prob_dict
        elif real_label == "negative":  # marked as positive
            own_class, other_class = negative_prob_dict, positive_prob_dict
        else:
            continue

        for token in tokens:
            # Skip tokens the model has no value for instead of failing with KeyError.
            if token in own_class and token in other_class:
                wrongly_classified_token_dict[token] += own_class[token] - other_class[token]

    cutoff = threshold * len(dataset)
    reweight_dict = {
        word: reweight_scale
        for word, score in wrongly_classified_token_dict.items()
        if score >= cutoff
    }
    return reweight_dict


    

In [75]:
# Fit the bag-of-words model (tfidf defaults to False) with smoothing alpha=1 and keep the
# log-probability dicts, the fitted model, the transformed test reviews and the vectorizer.
neg, pos, mnb, transformed_test_reviews, vec= generate_log_prob(train_reviews, train_sentiments, alpha = 1)

In [76]:
# Per-word log probabilities returned for the negative class.
neg

{'just': -2.833213344056216,
 'plain': -2.833213344056216,
 'boring': -2.833213344056216,
 'entirely': -2.833213344056216,
 'predictable': -2.833213344056216,
 'and': -2.4277482359480516,
 'lacks': -2.833213344056216,
 'energy': -2.833213344056216,
 'no': -2.833213344056216,
 'surprises': -2.833213344056216,
 'very': -2.833213344056216,
 'few': -2.833213344056216,
 'laughs': -2.833213344056216,
 'powerful': -3.5263605246161616,
 'the': -3.5263605246161616,
 'most': -3.5263605246161616,
 'fun': -3.5263605246161616,
 'film': -3.5263605246161616,
 'of': -3.5263605246161616,
 'summer': -3.5263605246161616}

In [77]:
# Per-word log probabilities returned for the positive class.
pos

{'just': -3.367295829986474,
 'plain': -3.367295829986474,
 'boring': -3.367295829986474,
 'entirely': -3.367295829986474,
 'predictable': -3.367295829986474,
 'and': -3.367295829986474,
 'lacks': -3.367295829986474,
 'energy': -3.367295829986474,
 'no': -3.367295829986474,
 'surprises': -3.367295829986474,
 'very': -2.6741486494265287,
 'few': -3.367295829986474,
 'laughs': -3.367295829986474,
 'powerful': -2.6741486494265287,
 'the': -2.2686835413183646,
 'most': -2.6741486494265287,
 'fun': -2.6741486494265287,
 'film': -2.6741486494265287,
 'of': -2.6741486494265287,
 'summer': -2.6741486494265287}

In [78]:
# Predict labels for all transformed test reviews, then exponentiate the log posteriors of
# the first test review so they read as probabilities.
mnb_bow_predict = mnb.predict(transformed_test_reviews)
math.e ** mnb.predict_log_proba(transformed_test_reviews[0]) 

array([[0.65054103, 0.34945897]])

In [79]:
# Class probabilities for the first test review, taken directly from predict_proba.
mnb.predict_proba(transformed_test_reviews[0]) 

array([[0.65054103, 0.34945897]])

In [80]:
# Predicted label for the first test review.
mnb.predict(transformed_test_reviews[0])

array(['negative'], dtype='<U8')

In [81]:
# Class order used by the model, and the class priors converted out of log space.
mnb.classes_, math.e ** mnb.class_log_prior_

(array(['negative', 'positive'], dtype='<U8'), array([0.6, 0.4]))

In [82]:
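'''
Rescale the log probabilities of selected words, both in the fitted model and in the word:log_probability dicts.
Takes one word:scale dict per class (negative and positive). For every listed word, log(scale) is added to that
class's entry in the matching dict and in model.feature_log_prob_. Returns the two dicts and the model.

'''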

def change_weight(negative_word_change_scale, positive_word_change_scale, model, negative_word_log_prob_dict, positive_word_log_prob_dict, vec):
    '''
    Add log(scale) to the log probability of each listed word, per class.

    Input: negative_word_change_scale: word -> scale applied to row 0 of model.feature_log_prob_
           positive_word_change_scale: word -> scale applied to row 1 of model.feature_log_prob_
           model: object exposing feature_log_prob_ (modified in place)
           negative_word_log_prob_dict: word -> log probability for row 0 (modified in place)
           positive_word_log_prob_dict: word -> log probability for row 1 (modified in place)
           vec: object whose vocabulary_ maps each word to its column in feature_log_prob_
    return: negative_word_log_prob_dict, positive_word_log_prob_dict, model (the same objects)

    Every call adds the shift again, so repeating a call compounds the scaling.
    '''
    # (row in feature_log_prob_, requested scales, lookup dict kept in sync with that row)
    per_class_updates = (
        (0, negative_word_change_scale, negative_word_log_prob_dict),
        (1, positive_word_change_scale, positive_word_log_prob_dict),
    )

    for class_row, scales, log_prob_by_word in per_class_updates:
        for word, scale in scales.items():
            log_shift = np.log(scale)

            # Update the lookup dict first, then the matching cell in the model.
            log_prob_by_word[word] += log_shift
            column = vec.vocabulary_[word]
            model.feature_log_prob_[class_row][column] += log_shift

    return negative_word_log_prob_dict, positive_word_log_prob_dict, model


In [83]:
# Negative-class probabilities: exponentiate each stored log probability.
neg_prob = {word: math.e ** log_p for word, log_p in neg.items()}

In [84]:
# Positive-class probabilities: exponentiate each stored log probability.
pos_prob = {word: math.e ** log_p for word, log_p in pos.items()}

In [85]:
# Look up the column of "just" in the vectorizer's vocabulary and read its row-0
# (negative-class) log probability straight from the model.
index = vec.vocabulary_["just"]
mnb.feature_log_prob_[0][index]

-2.833213344056216

In [86]:
# Negative-class log probabilities before any reweighting is applied.
neg

{'just': -2.833213344056216,
 'plain': -2.833213344056216,
 'boring': -2.833213344056216,
 'entirely': -2.833213344056216,
 'predictable': -2.833213344056216,
 'and': -2.4277482359480516,
 'lacks': -2.833213344056216,
 'energy': -2.833213344056216,
 'no': -2.833213344056216,
 'surprises': -2.833213344056216,
 'very': -2.833213344056216,
 'few': -2.833213344056216,
 'laughs': -2.833213344056216,
 'powerful': -3.5263605246161616,
 'the': -3.5263605246161616,
 'most': -3.5263605246161616,
 'fun': -3.5263605246161616,
 'film': -3.5263605246161616,
 'of': -3.5263605246161616,
 'summer': -3.5263605246161616}

In [87]:
# Halve the negative-class weight of "just" and double its positive-class weight.
# Re-running this cell applies the scaling again on top of the previous run.
negative_word_change_scale = {"just": 0.5}
positive_word_change_scale = {"just": 2}

neg, pos, model = change_weight(
    negative_word_change_scale,
    positive_word_change_scale,
    mnb,
    neg,
    pos,
    vec,
)

In [88]:
# Subtracting log(2) from the model's row-1 (positive-class) entry for "just" should give
# back the value it had before the scale of 2 was applied.
index = vec.vocabulary_["just"]
mnb.feature_log_prob_[1][index] - np.log(2)

-3.367295829986474

In [89]:
# Positive-class log probabilities after reweighting: the entry for "just" has moved by log(2).
pos

{'just': -2.6741486494265287,
 'plain': -3.367295829986474,
 'boring': -3.367295829986474,
 'entirely': -3.367295829986474,
 'predictable': -3.367295829986474,
 'and': -3.367295829986474,
 'lacks': -3.367295829986474,
 'energy': -3.367295829986474,
 'no': -3.367295829986474,
 'surprises': -3.367295829986474,
 'very': -2.6741486494265287,
 'few': -3.367295829986474,
 'laughs': -3.367295829986474,
 'powerful': -2.6741486494265287,
 'the': -2.2686835413183646,
 'most': -2.6741486494265287,
 'fun': -2.6741486494265287,
 'film': -2.6741486494265287,
 'of': -2.6741486494265287,
 'summer': -2.6741486494265287}

In [90]:
# for word, prob in neg.items():
#     neg[word] = math.e ** neg[word]

In [91]:
# for word, prob in pos.items():
#     pos[word] = math.e ** pos[word]

In [92]:
# Negative-class probabilities as computed before reweighting (change_weight does not touch neg_prob).
neg_prob

{'just': 0.05882352941176471,
 'plain': 0.05882352941176471,
 'boring': 0.05882352941176471,
 'entirely': 0.05882352941176471,
 'predictable': 0.05882352941176471,
 'and': 0.08823529411764708,
 'lacks': 0.05882352941176471,
 'energy': 0.05882352941176471,
 'no': 0.05882352941176471,
 'surprises': 0.05882352941176471,
 'very': 0.05882352941176471,
 'few': 0.05882352941176471,
 'laughs': 0.05882352941176471,
 'powerful': 0.029411764705882353,
 'the': 0.029411764705882353,
 'most': 0.029411764705882353,
 'fun': 0.029411764705882353,
 'film': 0.029411764705882353,
 'of': 0.029411764705882353,
 'summer': 0.029411764705882353}

In [93]:
# Negative-class log probabilities after reweighting: the entry for "just" has moved by log(0.5).
neg

{'just': -3.5263605246161616,
 'plain': -2.833213344056216,
 'boring': -2.833213344056216,
 'entirely': -2.833213344056216,
 'predictable': -2.833213344056216,
 'and': -2.4277482359480516,
 'lacks': -2.833213344056216,
 'energy': -2.833213344056216,
 'no': -2.833213344056216,
 'surprises': -2.833213344056216,
 'very': -2.833213344056216,
 'few': -2.833213344056216,
 'laughs': -2.833213344056216,
 'powerful': -3.5263605246161616,
 'the': -3.5263605246161616,
 'most': -3.5263605246161616,
 'fun': -3.5263605246161616,
 'film': -3.5263605246161616,
 'of': -3.5263605246161616,
 'summer': -3.5263605246161616}

In [94]:
# Positive-class probabilities as computed before reweighting (change_weight does not touch pos_prob).
pos_prob

{'just': 0.034482758620689655,
 'plain': 0.034482758620689655,
 'boring': 0.034482758620689655,
 'entirely': 0.034482758620689655,
 'predictable': 0.034482758620689655,
 'and': 0.034482758620689655,
 'lacks': 0.034482758620689655,
 'energy': 0.034482758620689655,
 'no': 0.034482758620689655,
 'surprises': 0.034482758620689655,
 'very': 0.06896551724137932,
 'few': 0.034482758620689655,
 'laughs': 0.034482758620689655,
 'powerful': 0.06896551724137932,
 'the': 0.10344827586206895,
 'most': 0.06896551724137932,
 'fun': 0.06896551724137932,
 'film': 0.06896551724137932,
 'of': 0.06896551724137932,
 'summer': 0.06896551724137932}

In [95]:
# Positive-class log probabilities after reweighting, shown next to pos_prob for comparison.
pos

{'just': -2.6741486494265287,
 'plain': -3.367295829986474,
 'boring': -3.367295829986474,
 'entirely': -3.367295829986474,
 'predictable': -3.367295829986474,
 'and': -3.367295829986474,
 'lacks': -3.367295829986474,
 'energy': -3.367295829986474,
 'no': -3.367295829986474,
 'surprises': -3.367295829986474,
 'very': -2.6741486494265287,
 'few': -3.367295829986474,
 'laughs': -3.367295829986474,
 'powerful': -2.6741486494265287,
 'the': -2.2686835413183646,
 'most': -2.6741486494265287,
 'fun': -2.6741486494265287,
 'film': -2.6741486494265287,
 'of': -2.6741486494265287,
 'summer': -2.6741486494265287}