In [1]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,accuracy_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Import the dataset

In [2]:
imdb_data = pd.read_csv('./IMDB_Dataset.csv')
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
imdb_data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

### Generate the Training and Testing data by dividing the dataset as 4:1

In [4]:
train_reviews=imdb_data.review[:40000]
train_sentiments=imdb_data.sentiment[:40000]

test_reviews=imdb_data.review[40000:]
test_sentiments=imdb_data.sentiment[40000:]

train_reviews.shape,train_sentiments.shape, test_reviews.shape,test_sentiments.shape


((40000,), (40000,), (10000,), (10000,))

### Make sure that the split is balanced in general

In [5]:
train_sentiments.value_counts()

negative    20007
positive    19993
Name: sentiment, dtype: int64

### Preprocessing the dataset

In [6]:
# tokenizer = ToktokTokenizer()
# stopword_list=nltk.corpus.stopwords.words('english')

# #Stemming the text, e.g. am, are, is -> be
# def simple_stemmer(text):
#     ps=nltk.porter.PorterStemmer()
#     text= ' '.join([ps.stem(word) for word in text.split()])
#     return text

# train_reviews = train_reviews.apply(simple_stemmer)
# test_reviews = test_reviews.apply(simple_stemmer)

In [7]:
# set stopwords to english
# stop = set(stopwords.words('english'))
# print(stop)

In [8]:
# #removing the stopwords
# def remove_stopwords(text, is_lower_case=False):
#     tokens = tokenizer.tokenize(text)
#     tokens = [token.strip() for token in tokens]
#     if is_lower_case:
#         filtered_tokens = [token for token in tokens if token not in stopword_list]
#     else:
#         filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
#     filtered_text = ' '.join(filtered_tokens)    
#     return filtered_text

# train_reviews = train_reviews.apply(remove_stopwords)
# test_reviews = test_reviews.apply(remove_stopwords)


如果在max_df中超过百分之50的一个词其中大部分都在positive，positive indicator？

 # Naive bayes model
 https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html

### build into method

In [9]:
'''
Method that take in the training dataset, then return the positive and negative words log probability.
Input: train_reviews: reviews (sentences) for training
       train_sentiments: sentiments (label) for training
       tfidf: boolean variable indicating whether using bow or tfidf
       alpha: laplance smoothing variable, default to be 1.0
       ngram_range: the scale of ngram model will be used, default = (1,1) unigram
return: negative_word_log_prob_dict: dictionary that contains the word:log probability pair for negative class
        positive_word_log_prob_dict: dictionary that contains the word:log probability pair for positive class
        mnb: the trained multinomial naive bayes model, later can be used for testing
        transformed_test_reviews: transformed test reviews that later can be used for testing
'''

def generate_log_prob(train_reviews, train_sentiments, tfidf=False, alpha=1.0, ngram_range = (1,1)):

    if (tfidf):
        #Tfidf vectorizer
        vec=TfidfVectorizer(min_df=2,max_df=0.5,use_idf=tfidf, ngram_range=ngram_range)
        #transformed train reviews
        transformed_train_reviews=vec.fit_transform(train_reviews)
        #transformed test reviews
        transformed_test_reviews=vec.transform(test_reviews)
    else:
        vec=CountVectorizer(min_df=2,max_df=0.5,ngram_range=(1,1))
        transformed_train_reviews=vec.fit_transform(train_reviews)
        transformed_test_reviews=vec.transform(test_reviews)

    #training the model
    mnb = MultinomialNB(alpha=alpha)

    #fitting the naive bayes for bag of words
    mnb = mnb.fit(transformed_train_reviews, train_sentiments)
    negative_log_prob = mnb.feature_log_prob_[0]
    positive_log_prob = mnb.feature_log_prob_[1]

    # Generate two dict: word:log_prob
    negative_word_log_prob_dict = {}
    positive_word_log_prob_dict = {}
    for word, index in vec.vocabulary_.items():
        negative_word_log_prob_dict[word] = negative_log_prob[index]
        positive_word_log_prob_dict[word] = positive_log_prob[index]
    
    return negative_word_log_prob_dict, positive_word_log_prob_dict, mnb, transformed_test_reviews

In [10]:
neg, pos, mnb, transformed_test_reviews = generate_log_prob(train_reviews, train_sentiments, alpha = 0.05)

In [11]:
mnb_bow_predict = mnb.predict(transformed_test_reviews)
mnb_bow_score = accuracy_score(test_sentiments, mnb_bow_predict)
print("mnb_bow_score :",mnb_bow_score)


mnb_bow_score : 0.8532


In [12]:
#Classification report for bag of words 
mnb_bow_report=classification_report(test_sentiments,mnb_bow_predict,target_names=['Positive','Negative'])
print(mnb_bow_report)

              precision    recall  f1-score   support

    Positive       0.84      0.88      0.86      4993
    Negative       0.87      0.83      0.85      5007

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [13]:
neg

{'other': -6.05339419502314,
 'reviewers': -9.515011110419294,
 'has': -5.5043077650296155,
 'mentioned': -8.77034605806507,
 'after': -6.184675921286312,
 'watching': -6.536929668978409,
 'just': -5.164633603328653,
 'oz': -10.617778529719365,
 'episode': -8.027112415411757,
 'll': -7.081122975345962,
 'hooked': -10.868894702646728,
 'they': -4.95079490570018,
 'right': -7.032237579539931,
 'exactly': -8.146320689743977,
 'what': -5.375253404473833,
 'happened': -8.097142637175498,
 'me': -5.7795433198279085,
 'first': -6.096223815213024,
 'thing': -6.470269304694201,
 'struck': -10.512487412722173,
 'about': -5.339716981276192,
 'its': -6.253597555730206,
 'brutality': -11.2562377678572,
 'unflinching': -13.496421971046448,
 'scenes': -6.488866048334003,
 'violence': -8.143978905405834,
 'which': -5.778664203722478,
 'set': -7.377590828018025,
 'from': -5.23442055639075,
 'word': -8.158112583940905,
 'go': -6.546589189727644,
 'trust': -9.401871668673433,
 'show': -6.421678136613664,

In [14]:
neg, pos, mnb, transformed_test_reviews = generate_log_prob(train_reviews, train_sentiments, tfidf = True, alpha = 0.05)

In [15]:
mnb_tfidf_predict = mnb.predict(transformed_test_reviews)
mnb_tfidf_score = accuracy_score(test_sentiments, mnb_tfidf_predict)
print("mnb_tfidf_score :",mnb_tfidf_score)


mnb_tfidf_score : 0.861


In [16]:
#Classification report for TF-IDF
mnb_tfidf_report=classification_report(test_sentiments,mnb_tfidf_predict,target_names=['Positive','Negative'])
print(mnb_tfidf_report)

              precision    recall  f1-score   support

    Positive       0.85      0.87      0.86      4993
    Negative       0.87      0.85      0.86      5007

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [17]:
pos

{'other': -6.61373323316104,
 'reviewers': -9.4493756842956,
 'has': -6.177054775590546,
 'mentioned': -8.901721837235222,
 'after': -6.835491609841304,
 'watching': -7.113564944400986,
 'just': -6.336349598037546,
 'oz': -9.393933257120523,
 'episode': -7.586088873734383,
 'll': -7.352460318013578,
 'hooked': -9.186006208037954,
 'they': -6.138145330261967,
 'right': -7.310093382746379,
 'exactly': -8.30259030947042,
 'what': -6.2820004594009475,
 'happened': -8.31549642660645,
 'me': -6.4188592235262805,
 'first': -6.599485998439569,
 'thing': -7.3459267014483105,
 'struck': -9.614909588143169,
 'about': -6.224767588846124,
 'its': -6.5568338633999055,
 'brutality': -10.327843326522812,
 'unflinching': -11.478177485285944,
 'scenes': -7.056571605090123,
 'violence': -8.153016959001066,
 'which': -6.480663241122133,
 'set': -7.607914476900691,
 'from': -6.130654026069499,
 'word': -8.412850883818319,
 'go': -7.051425228155263,
 'trust': -9.132609727070754,
 'show': -6.607391208848362,

#### Bag of Words model

In [None]:
#Count vectorizer for bag of words
# This part will transform the reviews into a sparce matrix where each row represent the appearance of each word/phrase in the vocabulary
cv=CountVectorizer(min_df=2,max_df=0.5,ngram_range=(1,1))
cv_train_reviews=cv.fit_transform(train_reviews)
cv_test_reviews=cv.transform(test_reviews)

print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)

In [None]:
#training the model
mnb_1 = MultinomialNB(alpha=0.05)

#fitting the naive bayes for bag of words
mnb_bow = mnb_1.fit(cv_train_reviews, train_sentiments)
bow_negative_log_prob = mnb_bow.feature_log_prob_[0]
bow_positive_log_prob = mnb_bow.feature_log_prob_[1]

# Generate two dict: word:log_prob
bow_negative_word_log_prob_dict = {}
bow_positive_word_log_prob_dict = {}
for word, index in cv.vocabulary_.items():
    bow_negative_word_log_prob_dict[word] = bow_negative_log_prob[index]
    bow_positive_word_log_prob_dict[word] = bow_positive_log_prob[index]
    

In [None]:
mnb_bow_predict = mnb.predict(transformed_test_reviews)
mnb_bow_score = accuracy_score(test_sentiments, mnb_bow_predict)
print("mnb_bow_score :",mnb_bow_score)

In [None]:
#Predicting the model for bag of words
mnb_bow_predict = mnb_bow.predict(cv_test_reviews)
#Accuracy score for bag of words
mnb_bow_score = accuracy_score(test_sentiments, mnb_bow_predict)
print("mnb_bow_score :",mnb_bow_score)

In [None]:
wrong_labeled_NB_BOW = ""

for i in range(mnb_bow_predict.size):
    start_index = test_sentiments.index.start
    if(mnb_bow_predict[i] != test_sentiments[start_index + i]):
        wrong_labeled_NB_BOW += str(start_index + i) + " " + str(imdb_data['sentiment'][start_index + i]) + " | " + str(imdb_data['review'][start_index + i]) +  "\n"

f = open("Naive Bayes - Bag of Words Wrongly labeled sentences.txt", "w")
f.write(wrong_labeled_NB_BOW)
f.close()

#### TF-IDF model
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html


In [None]:
#Tfidf vectorizer
tv=TfidfVectorizer(min_df=2,max_df=0.5,use_idf=True, ngram_range=(1,4))
#transformed train reviews
tv_train_reviews=tv.fit_transform(train_reviews)
#transformed test reviews
tv_test_reviews=tv.transform(test_reviews)
print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

In [None]:
mnb_2 = MultinomialNB(alpha = 0.05)
#fitting the naive bayes for tfidf features
mnb_tfidf = mnb_2.fit(tv_train_reviews, train_sentiments)
tfidf_negative_log_prob = mnb_tfidf.feature_log_prob_[0]
tfidf_positive_log_prob = mnb_tfidf.feature_log_prob_[1]

# Generate two dict: word:log_prob
tfidf_negative_word_log_prob_dict = {}
tfidf_positive_word_log_prob_dict = {}
for word, index in cv.vocabulary_.items():
    tfidf_negative_word_log_prob_dict[word] = tfidf_negative_log_prob[index]
    tfidf_positive_word_log_prob_dict[word] = tfidf_positive_log_prob[index]

In [None]:
#Predicting the model for tfidf features
mnb_tfidf_predict = mnb_tfidf.predict(tv_test_reviews)
#Accuracy score for tfidf features
mnb_tfidf_score=accuracy_score(test_sentiments,mnb_tfidf_predict)
print("mnb_tfidf_score :",mnb_tfidf_score)

In [None]:
#Classification report for tfidf features
mnb_tfidf_report=classification_report(test_sentiments,mnb_tfidf_predict,target_names=['Positive','Negative'])
print(mnb_tfidf_report)

In [None]:
wrong_labeled_NB_TFIDF = ""
for i in range(mnb_tfidf_predict.size):
    start_index = test_sentiments.index.start
    if(mnb_tfidf_predict[i] != test_sentiments[start_index + i]):
        wrong_labeled_NB_TFIDF += str(start_index + i) + " " + str(imdb_data['sentiment'][start_index + i]) + " | " + str(imdb_data['review'][start_index + i]) +  "\n"

f = open("Naive Bayes - TF-IDF Wrongly labeled sentences.txt", "w")
f.write(wrong_labeled_NB_TFIDF)
f.close()

In [None]:
bow_negative_word_log_prob_dict

In [None]:
tfidf_negative_word_log_prob_dict
tfidf_positive_word_log_prob_dict

# fastText method

In [None]:
import fasttext
test_reviews.index.stop

In [None]:
#preprocess the data so it can feed into fasttext
def transform_instance(doc, label):
    processed_text = []
    #Prefix the index-ed label with __label__
    for i in range(doc.index.start, doc.index.stop):
        cur_row = "__label__" + label[i] + " " + doc[i]
        processed_text.append(cur_row)
    return processed_text

training_text = transform_instance(train_reviews, train_sentiments)
test_text = transform_instance(test_reviews, test_sentiments)

# Put the training and test dataset into file, so that the fasttext model can read them.
f = open("fasttext.train", "w")
output = ""
for text in training_text:
    output += text
    output += "\n"
f.write(output)
f.close()

f = open("fasttext.test", "w")
output = ""
for text in test_text:
    output += text
    output += "\n"
f.write(output)
f.close()

In [None]:
len(training_text), len(test_text)

In [None]:
model = fasttext.train_supervised(input = "fasttext.train", lr=0.1, epoch=25, wordNgrams=3)

In [None]:
model.save_model("model_movie.bin")

In [None]:
model.test("fasttext.test")


In [None]:
# get the misclassified examples by fasttext algorithm
wrong_labeled_fasttext = ""
for i in range(test_reviews.size):
    start_index = test_sentiments.index.start
    if(model.predict(test_reviews[start_index + i])[0][0] != str('__label__' + test_sentiments[start_index + i])):
        wrong_labeled_fasttext += str(start_index + i) + " " + str(imdb_data['sentiment'][start_index + i]) + " | " + str(imdb_data['review'][start_index + i]) +  "\n"

f = open("fastText Wrongly labeled sentences.txt", "w")
f.write(wrong_labeled_fasttext)
f.close


# LSTM with pytorch
https://www.kaggle.com/code/fantaszzhang/deep-learning-for-sentiment-analysis/edit