In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,accuracy_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Import the dataset

In [2]:
imdb_data = pd.read_csv('./IMDB_Dataset.csv')
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
imdb_data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

### Generate the Training and Testing data by dividing the dataset as 4:1

In [4]:
train_reviews=imdb_data.review[:40000]
train_sentiments=imdb_data.sentiment[:40000]

test_reviews=imdb_data.review[40000:]
test_sentiments=imdb_data.sentiment[40000:]

train_reviews.shape,train_sentiments.shape, test_reviews.shape,test_sentiments.shape


((40000,), (40000,), (10000,), (10000,))

### Make sure that the split is balanced in general

In [5]:
train_sentiments.value_counts()

negative    20007
positive    19993
Name: sentiment, dtype: int64

### Preprocessing the dataset

In [6]:
# tokenizer = ToktokTokenizer()
# stopword_list=nltk.corpus.stopwords.words('english')

# #Stemming the text, e.g. am, are, is -> be
# def simple_stemmer(text):
#     ps=nltk.porter.PorterStemmer()
#     text= ' '.join([ps.stem(word) for word in text.split()])
#     return text

# train_reviews = train_reviews.apply(simple_stemmer)
# test_reviews = test_reviews.apply(simple_stemmer)

In [7]:
# set stopwords to english
# stop = set(stopwords.words('english'))
# print(stop)

In [8]:
# #removing the stopwords
# def remove_stopwords(text, is_lower_case=False):
#     tokens = tokenizer.tokenize(text)
#     tokens = [token.strip() for token in tokens]
#     if is_lower_case:
#         filtered_tokens = [token for token in tokens if token not in stopword_list]
#     else:
#         filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
#     filtered_text = ' '.join(filtered_tokens)    
#     return filtered_text

# train_reviews = train_reviews.apply(remove_stopwords)
# test_reviews = test_reviews.apply(remove_stopwords)


如果在max_df中超过百分之50的一个词其中大部分都在positive，positive indicator？

 # Naive bayes model
 https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html

## Method that take in the training dataset, then return the positive and negative words-log probability dictionary.


In [9]:
'''
Method that take in the training dataset, then return the positive and negative words log probability.
Input: train_reviews: reviews (sentences) for training
       train_sentiments: sentiments (label) for training
       tfidf: boolean variable indicating whether using bow or tfidf
       alpha: laplance smoothing variable, default to be 1.0
       ngram_range: the scale of ngram model will be used, default = (1,1) unigram
return: negative_word_log_prob_dict: dictionary that contains the word:log probability pair for negative class
        positive_word_log_prob_dict: dictionary that contains the word:log probability pair for positive class
        mnb: the trained multinomial naive bayes model, later can be used for testing
        transformed_test_reviews: transformed test reviews that later can be used for testing
        vec: either the tfidfVectorize build from tfidf model or the CountVectorizer build from Bag of word model.
'''

def generate_log_prob(train_reviews, train_sentiments, tfidf=False, alpha=1.0, ngram_range = (1,1)):

    if (tfidf):
        #Tfidf vectorizer
        vec=TfidfVectorizer(use_idf=tfidf, ngram_range=ngram_range)
        #transformed train reviews
        transformed_train_reviews=vec.fit_transform(train_reviews)
        #transformed test reviews
        transformed_test_reviews=vec.transform(test_reviews)
    else:
        vec=CountVectorizer(ngram_range=(1,1))
        transformed_train_reviews=vec.fit_transform(train_reviews)
        transformed_test_reviews=vec.transform(test_reviews)

    #training the model
    mnb = MultinomialNB(alpha=alpha)

    #fitting the naive bayes for bag of words
    mnb = mnb.fit(transformed_train_reviews, train_sentiments)
    negative_log_prob = mnb.feature_log_prob_[0]
    positive_log_prob = mnb.feature_log_prob_[1]

    # Generate two dict: word:log_prob
    negative_word_log_prob_dict = {}
    positive_word_log_prob_dict = {}
    for word, index in vec.vocabulary_.items():
        negative_word_log_prob_dict[word] = negative_log_prob[index]
        positive_word_log_prob_dict[word] = positive_log_prob[index]
    
    return negative_word_log_prob_dict, positive_word_log_prob_dict, mnb, transformed_test_reviews, vec

## Method that given input word, manually change the weight of the word in naive bayes model

In [10]:
'''
This method will take in a word:scale dict, then take in the negative and positive word:log_probability dict, manually change the weight of the words in the model and the dict
Input: word_change_scale: this is the word-scale dictionary, how much the weight of the word should be changed, For example if the value is 0.5, we will say 
                          the probability of the word in negative class should multiply 0.5, in original probability, we take power to the scale
       model: the trained naive bayes model, which the feature_log_prob_ attribute will be manually changed based on previous two params
       negative_word_log_prob_dict: dictionary that contains the word:log probability pair for negative class, which some values will be changed
       positive_word_log_prob_dict: dictionary that contains the word:log probability pair for positive class, which some values will be changed
       vec: either the tfidfVectorize build from tfidf model or the CountVectorizer build from Bag of word model.
return: negative_word_change_scale: The modified negative dict
        positive_word_change_scale: The modified positive dict
        model: The modified naive bayes model
'''

def change_weight(word_change_scale, model, negative_word_log_prob_dict, positive_word_log_prob_dict, vec):
    for word, scale in word_change_scale.items():
        # change the weight of words in negative and positive word:log_prob dict
        negative_word_log_prob_dict[word] *= scale
        positive_word_log_prob_dict[word] *= scale

        # change the weight of words in the model
        index_in_model = vec.vocabulary_[word]
        model.feature_log_prob_[0][index_in_model] *= scale
        model.feature_log_prob_[1][index_in_model] *= scale

    return negative_word_log_prob_dict, positive_word_log_prob_dict, model

### Predict the result using bag of word methods

In [11]:
neg, pos, mnb, transformed_test_reviews, vec= generate_log_prob(train_reviews, train_sentiments, alpha = 0.05)

This block represent the length of the vocabulary

In [12]:
len(vec.vocabulary_.items())

92887

Following two blocks shows the word:log_prob key value pair for each phrase in the vocabulary

In [13]:
pos

{'one': -5.333685171423538,
 'of': -3.6198354732420626,
 'the': -2.8096494809026176,
 'other': -6.386320942398431,
 'reviewers': -10.153959865789862,
 'has': -5.758669553214041,
 'mentioned': -9.335641189068905,
 'that': -4.399652213313702,
 'after': -6.637783506604633,
 'watching': -7.296908749939966,
 'just': -6.004901047316345,
 'oz': -10.098712456598292,
 'episode': -8.01682161912439,
 'you': -5.131895294940147,
 'll': -7.623052945025863,
 'be': -5.4397073285388675,
 'hooked': -10.17114734130957,
 'they': -5.678907422463478,
 'are': -5.271095548964395,
 'right': -7.43450097887549,
 'as': -4.707138029132709,
 'this': -4.3999942933979845,
 'is': -3.923840205331846,
 'exactly': -8.707258090730683,
 'what': -5.9141401153374655,
 'happened': -8.741249535330123,
 'with': -4.816294808809797,
 'me': -6.320667828155354,
 'br': -4.055313897267334,
 'first': -6.428680563292813,
 'thing': -7.407537701136814,
 'struck': -10.368548803090725,
 'about': -5.847184998307,
 'was': -4.870462078333148,

In [14]:
neg

{'one': -5.362672742166099,
 'of': -3.705237717254226,
 'the': -2.8382086413399765,
 'other': -6.4681956528204605,
 'reviewers': -9.929812568216615,
 'has': -5.919109222826936,
 'mentioned': -9.18514751586239,
 'that': -4.318732067464774,
 'after': -6.599477379083632,
 'watching': -6.95173112677573,
 'just': -5.579435061125974,
 'oz': -11.032579987516685,
 'episode': -8.441913873209078,
 'you': -5.053402625186582,
 'll': -7.495924433143283,
 'be': -5.261398323286592,
 'hooked': -11.283696160444048,
 'they': -5.365596363497501,
 'are': -5.244331782101574,
 'right': -7.447039037337252,
 'as': -4.923407972415873,
 'this': -4.2269819401107345,
 'is': -4.0249944461676765,
 'exactly': -8.561122147541298,
 'what': -5.790054862271154,
 'happened': -8.511944094972819,
 'with': -4.89527569542175,
 'me': -6.194344777625229,
 'br': -3.9814059418073526,
 'first': -6.511025273010345,
 'thing': -6.885070762491521,
 'struck': -10.927288870519494,
 'about': -5.754518439073513,
 'was': -4.67353093062366

Following block use the trained model to do the prediction

In [15]:
mnb_bow_predict = mnb.predict(transformed_test_reviews)


To check the corresponding test reviews log probability and original probability.


NOTICE: this probability was being normalized across all classes, so the log_probability was not the summation of log class probability and log probability of all phrases in the sentence. You can see that the summation of predict_proba() method is 1. The predict_log_proba() method calculate the log probability based on the normalized one.

In [16]:
mnb.predict_proba(transformed_test_reviews[0]), mnb.predict_log_proba(transformed_test_reviews[0])

(array([[1.00000000e+00, 5.15735135e-17]]),
 array([[  0.        , -37.50352344]]))

This predict() method will return the predict value of given transformed review

In [17]:
mnb.predict(transformed_test_reviews[0]), mnb.predict(transformed_test_reviews[1])

(array(['negative'], dtype='<U8'), array(['negative'], dtype='<U8'))

This class_log_prior_ attribute of the MultinomialNB() model gives the prior class probability. In the order of the classes_ attribute

In [18]:
mnb.classes_, mnb.class_log_prior_

(array(['negative', 'positive'], dtype='<U8'),
 array([-0.69279724, -0.69349724]))

Now we put the entire training set into the trained Naive Bayes model, and ge the score

In [19]:
mnb_bow_predict = mnb.predict(transformed_test_reviews)
mnb_bow_score = accuracy_score(test_sentiments, mnb_bow_predict)
print("mnb_bow_score :",mnb_bow_score)


mnb_bow_score : 0.8488


Classification report

In [20]:
#Classification report for bag of words 
mnb_bow_report=classification_report(test_sentiments,mnb_bow_predict,target_names=['Positive','Negative'])
print(mnb_bow_report)

              precision    recall  f1-score   support

    Positive       0.83      0.88      0.85      4993
    Negative       0.87      0.82      0.84      5007

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [21]:
wrong_labeled_NB_BOW = ""

for i in range(mnb_bow_predict.size):
    start_index = test_sentiments.index.start
    if(mnb_bow_predict[i] != test_sentiments[start_index + i]):
        wrong_labeled_NB_BOW += str(start_index + i) + " " + str(imdb_data['sentiment'][start_index + i]) + " | " + str(imdb_data['review'][start_index + i]) +  "\n"

f = open("Naive Bayes - Bag of Words Wrongly labeled sentences.txt", "w")
f.write(wrong_labeled_NB_BOW)
f.close()

### Predict the model using TF-IDF method

In [22]:
neg, pos, mnb, transformed_test_reviews, vec = generate_log_prob(train_reviews, train_sentiments, tfidf = True, alpha = 0.05)

In [23]:
mnb_tfidf_predict = mnb.predict(transformed_test_reviews)
mnb_tfidf_score = accuracy_score(test_sentiments, mnb_tfidf_predict)
print("mnb_tfidf_score :",mnb_tfidf_score)


mnb_tfidf_score : 0.8623


In [24]:
#Classification report for TF-IDF
mnb_tfidf_report=classification_report(test_sentiments,mnb_tfidf_predict,target_names=['Positive','Negative'])
print(mnb_tfidf_report)

              precision    recall  f1-score   support

    Positive       0.85      0.87      0.86      4993
    Negative       0.87      0.85      0.86      5007

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [25]:
wrong_labeled_NB_TFIDF = ""
for i in range(mnb_tfidf_predict.size):
    start_index = test_sentiments.index.start
    if(mnb_tfidf_predict[i] != test_sentiments[start_index + i]):
        wrong_labeled_NB_TFIDF += str(start_index + i) + " " + str(imdb_data['sentiment'][start_index + i]) + " | " + str(imdb_data['review'][start_index + i]) +  "\n"

f = open("Naive Bayes - TF-IDF Wrongly labeled sentences.txt", "w")
f.write(wrong_labeled_NB_TFIDF)
f.close()

# fastText method

In [26]:
import fasttext
test_reviews.index.stop

50000

In [27]:
# #preprocess the data so it can feed into fasttext
# def transform_instance(doc, label):
#     processed_text = []
#     #Prefix the index-ed label with __label__
#     for i in range(doc.index.start, doc.index.stop):
#         cur_row = "__label__" + label[i] + " " + doc[i]
#         processed_text.append(cur_row)
#     return processed_text

# training_text = transform_instance(train_reviews, train_sentiments)
# test_text = transform_instance(test_reviews, test_sentiments)

# # Put the training and test dataset into file, so that the fasttext model can read them.
# f = open("fasttext.train", "w")
# output = ""
# for text in training_text:
#     output += text
#     output += "\n"
# f.write(output)
# f.close()

# f = open("fasttext.test", "w")
# output = ""
# for text in test_text:
#     output += text
#     output += "\n"
# f.write(output)
# f.close()

In [28]:
model = fasttext.train_supervised(input = "fasttext.train", lr=0.1, epoch=45, wordNgrams=1)

In [29]:
model.save_model("model_movie.bin")

In [30]:
model.test("fasttext.test")


(10000, 0.8872, 0.8872)

In [31]:
# get the misclassified examples by fasttext algorithm
wrong_labeled_fasttext = ""
for i in range(test_reviews.size):
    start_index = test_sentiments.index.start
    if(model.predict(test_reviews[start_index + i])[0][0] != str('__label__' + test_sentiments[start_index + i])):
        wrong_labeled_fasttext += str(start_index + i) + " " + str(imdb_data['sentiment'][start_index + i]) + " | " + str(imdb_data['review'][start_index + i]) +  "\n"

f = open("fastText Wrongly labeled sentences.txt", "w")
f.write(wrong_labeled_fasttext)
f.close


<function TextIOWrapper.close()>

# Fasttext manually modify the sentence embedding method

In [56]:
train_reviews
import torch
import torch.nn as nn

In [57]:
# def output_train_reviews_to_doc(train_reviews):
#     processed_doc = ""
#     for review in train_reviews:
#         processed_doc += review + "\n"
#     return processed_doc

# f = open("wordRepresentation.train", "w")
# f.write(output_train_reviews_to_doc(train_reviews))
# f.close()

In [58]:
model = fasttext.train_unsupervised("wordRepresentation.train")


In [59]:
len(model.words)

66511

In [60]:
model.get_word_vector("the")

array([ 0.14664552, -0.29104635, -0.07032287, -0.37329933, -0.11337025,
        0.43572164, -0.22441661,  0.27147275,  0.09118447,  0.04874698,
       -0.0404829 , -0.0686131 , -0.21637541, -0.02395459,  0.18106005,
       -0.20253116,  0.02315689, -0.35788366, -0.07311332,  0.03788741,
        0.31552774,  0.2118799 ,  0.1378474 ,  0.13186155, -0.13492858,
       -0.03948062,  0.3257345 , -0.18307404,  0.05143523, -0.10038086,
       -0.2705226 , -0.07179162, -0.3223201 , -0.08034267,  0.04449475,
       -0.07331786, -0.15339617, -0.16857836,  0.04257391,  0.22850811,
       -0.0332642 , -0.12644215,  0.15857434, -0.31591707,  0.07537498,
       -0.1818736 ,  0.07813412, -0.00420003, -0.17508355,  0.09263556,
        0.25608024,  0.09699171, -0.44609445,  0.01972334, -0.09092503,
       -0.01152563,  0.0238095 , -0.24305181,  0.212957  , -0.11295515,
        0.11501586, -0.0095752 ,  0.08285155, -0.23337784,  0.38656214,
        0.01765997, -0.17822821,  0.10957053,  0.08833213,  0.09

For each train and test review, use all the embeddings of words, calculate an average to represent the sentence

In [61]:
def transform_sentence_embedding(data, word_list):
    representations = []
    for sentence in data:
        nltk_tokens = nltk.word_tokenize(sentence)
        sum = 0
        count = 0
        for token in nltk_tokens:
            if token not in word_list:
                sum += model.get_word_vector(token)
                count += 1
        representations.append(sum / count)
    return representations



In [62]:
def transform_label(data):
    representations = []
    for label in data:
        if(label == 'negative'):
            representations.append(0)
        else:
            representations.append(1)
    return representations

In [63]:
word_list = []
fasttext_train_reviews = torch.Tensor(np.array(transform_sentence_embedding(train_reviews, word_list)))
fasttext_test_reviews = torch.Tensor(np.array(transform_sentence_embedding(test_reviews, word_list)))

In [64]:
fasttext_train_sentiments = nn.functional.one_hot(torch.Tensor(np.array(transform_label(train_sentiments))).to(torch.int64)).to(torch.float)
fasttext_test_sentiments = nn.functional.one_hot(torch.Tensor(np.array(transform_label(test_sentiments))).to(torch.int64)).to(torch.float)

In [65]:
fasttext_train_reviews.shape, fasttext_train_sentiments.shape, fasttext_test_reviews.shape, fasttext_test_sentiments.shape

(torch.Size([40000, 100]),
 torch.Size([40000, 2]),
 torch.Size([10000, 100]),
 torch.Size([10000, 2]))

In [66]:
class FeedforwardNeuralNetModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(FeedforwardNeuralNetModel, self).__init__()
        # Linear function
        self.fc1 = nn.Linear(input_dim, hidden_dim) 

        # Non-linearity
        self.sigmoid_1 = nn.Sigmoid()

        # Linear function (readout)
        self.fc2 = nn.Linear(hidden_dim, output_dim)  

        # Non-linearity
        self.sigmoid_2 = nn.Sigmoid()

    def forward(self, x):
        # Linear function  # LINEAR
        out = self.fc1(x)

        # Non-linearity  # NON-LINEAR
        out = self.sigmoid_1(out)

        # Linear function (readout)  # LINEAR
        out = self.fc2(out)

        out = self.sigmoid_2(out)
        return out

In [68]:
input_dim = 100
hidden_dim = 100
output_dim = 2

learning_rate = 0.1
n_epochs = 40
batch_size = 32

net = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate)  


In [69]:
net.train()
# Placeholder for loss
track_loss = []

print('Epoch', '\t', 'Loss train', '\t', 'Loss test')
for i in range(n_epochs):

    # shuffle the permutation, so that the batches each time are different
    shuffle_idx = np.random.permutation(len(fasttext_train_reviews))
    #split the tensor into smaller batches
    review_batches = torch.split(fasttext_train_reviews[shuffle_idx], batch_size)
    sentiment_batches = torch.split(fasttext_train_sentiments[shuffle_idx], batch_size)
    # Mini batches
    for j in range(len(review_batches)):
        review_batch = review_batches[j]

        sentiment_batch = sentiment_batches[j]
        
        #where we train and build the network
        output_train = net(review_batch)


        loss = criterion(output_train, sentiment_batch)
    
        # compute gradients
        optimizer.zero_grad()
        loss.backward()
    
        # update the weights
        optimizer.step()

    # Keep track of loss at each epoch
    track_loss += [float(loss)]

    loss_epoch = f'{i + 1} / {n_epochs}'
    with torch.no_grad():
        output_train = net(fasttext_train_reviews)
        loss_train = criterion(output_train, fasttext_train_sentiments)
        loss_epoch += f'\t {loss_train:.4f}'

        output_test = net(fasttext_test_reviews)
        loss_test = criterion(output_test, fasttext_test_sentiments)
        loss_epoch += f'\t\t {loss_test:.4f}'

    print(loss_epoch)

Epoch 	 Loss train 	 Loss test
1 / 40	 0.6941		 0.6940
2 / 40	 0.6923		 0.6924
3 / 40	 0.6911		 0.6912
4 / 40	 0.6899		 0.6899
5 / 40	 0.6868		 0.6868
6 / 40	 0.6823		 0.6825
7 / 40	 0.6754		 0.6756
8 / 40	 0.6659		 0.6663
9 / 40	 0.6528		 0.6534
10 / 40	 0.6359		 0.6367
11 / 40	 0.6186		 0.6196
12 / 40	 0.6026		 0.6038
13 / 40	 0.5850		 0.5862
14 / 40	 0.5698		 0.5713
15 / 40	 0.5551		 0.5568
16 / 40	 0.5432		 0.5450
17 / 40	 0.5324		 0.5343
18 / 40	 0.5230		 0.5250
19 / 40	 0.5136		 0.5157
20 / 40	 0.5063		 0.5085
21 / 40	 0.5001		 0.5024
22 / 40	 0.4954		 0.4976
23 / 40	 0.4905		 0.4929
24 / 40	 0.4858		 0.4881
25 / 40	 0.4823		 0.4847
26 / 40	 0.4788		 0.4811
27 / 40	 0.4761		 0.4785
28 / 40	 0.4739		 0.4764
29 / 40	 0.4708		 0.4732
30 / 40	 0.4688		 0.4712
31 / 40	 0.4667		 0.4691
32 / 40	 0.4659		 0.4681
33 / 40	 0.4632		 0.4656
34 / 40	 0.4632		 0.4657
35 / 40	 0.4606		 0.4630
36 / 40	 0.4603		 0.4628
37 / 40	 0.4579		 0.4603
38 / 40	 0.4568		 0.4592
39 / 40	 0.4557		 0.4581
40 

In [70]:
predict_val = net(fasttext_test_reviews)

In [71]:
predict_result = []
for (neg_prob, pos_prob) in predict_val:
    if (neg_prob > pos_prob):
        predict = "negative"
    else:
        predict = "positive"
    predict_result.append(predict)


In [72]:
predict_result

['negative',
 'negative',
 'positive',
 'negative',
 'negative',
 'positive',
 'positive',
 'negative',
 'negative',
 'negative',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'negative',
 'positive',
 'positive',
 'negative',
 'positive',
 'negative',
 'negative',
 'positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'negative',
 'negative',
 'positive',
 'negative',
 'negative',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'negative',
 'negative',
 'positive',
 'positive',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'positive',
 'negative',
 'positive',
 'positive',
 'negative',
 'negative',
 'positive',
 'positive',
 'positive',

In [73]:
ffnn_score = accuracy_score(test_sentiments, predict_result)
ffnn_score

0.8617