In [1]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,accuracy_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Import the dataset

In [2]:
imdb_data = pd.read_csv('./IMDB_Dataset.csv')
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
imdb_data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

### Generate the Training and Testing data by dividing the dataset as 4:1

In [4]:
train_reviews=imdb_data.review[:40000]
train_sentiments=imdb_data.sentiment[:40000]

test_reviews=imdb_data.review[40000:]
test_sentiments=imdb_data.sentiment[40000:]

train_reviews.shape,train_sentiments.shape, test_reviews.shape,test_sentiments.shape


((40000,), (40000,), (10000,), (10000,))

### Make sure that the split is balanced in general

In [5]:
train_sentiments.value_counts()

negative    20007
positive    19993
Name: sentiment, dtype: int64

### Preprocessing the dataset

In [6]:
tokenizer = ToktokTokenizer()
stopword_list=nltk.corpus.stopwords.words('english')

#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text

train_reviews = train_reviews.apply(remove_special_characters)
test_reviews = test_reviews.apply(remove_special_characters)

#Stemming the text, e.g. am, are, is -> be
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text

train_reviews = train_reviews.apply(simple_stemmer)
test_reviews = test_reviews.apply(simple_stemmer)



In [7]:
#set stopwords to english
stop = set(stopwords.words('english'))
print(stop)

{'if', 'has', 'were', 'her', 'shouldn', 'this', 'then', 'no', 'it', 'ourselves', 'with', 'where', 'mustn', 'be', "shouldn't", 'some', 'isn', 'was', 'by', 'am', 'ma', 'other', 'have', "couldn't", "you'll", 'theirs', 'why', 'they', 'down', 'their', 'shan', 'yourself', 'your', 'being', 'which', 'we', 'who', 'on', 'you', 'm', 'to', "she's", 'are', 'myself', 'weren', 'itself', 'from', 'yourselves', 'during', 'both', 'should', 'before', 'don', "wasn't", 'me', 'been', 'had', 'below', 'off', 'too', 'an', 'our', 'himself', 'the', 'until', 't', 'there', 'very', 'any', 'but', 'aren', 'against', "you've", 'in', 'hasn', 'own', 're', 'most', 'd', 'o', 'herself', 'into', "didn't", 'nor', 'further', 'not', "it's", "wouldn't", 'ain', 'over', 'is', "hasn't", "mustn't", 'more', "don't", 'didn', 'hers', "you'd", 'will', 'than', 'of', 'doing', 'ours', 'doesn', "doesn't", 'just', 'such', 'having', 'my', 'above', 'each', 'same', 'again', "mightn't", 'for', 'i', 'under', "weren't", 'at', "shan't", 'so', 'him'

In [8]:
#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

train_reviews = train_reviews.apply(remove_stopwords)
test_reviews = test_reviews.apply(remove_stopwords)


* Problem: the stopwords used here include negative words like "not"

## Param explanation
### min_df
When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.
### max_df
When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.

https://stackoverflow.com/questions/27697766/understanding-min-df-and-max-df-in-scikit-countvectorizer

In [9]:
#Count vectorizer for bag of words
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
cv=CountVectorizer(min_df=2,max_df=0.5,binary=False,ngram_range=(1,3))
cv_train_reviews=cv.fit_transform(train_reviews)
cv_test_reviews=cv.transform(test_reviews)

print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)


BOW_cv_train: (40000, 809019)
BOW_cv_test: (10000, 809019)


In [10]:
#Tfidf vectorizer
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
tv=TfidfVectorizer(min_df=2,max_df=0.5,use_idf=True, ngram_range=(1,3))
#transformed train reviews
tv_train_reviews=tv.fit_transform(train_reviews)
#transformed test reviews
tv_test_reviews=tv.transform(test_reviews)
print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

Tfidf_train: (40000, 809019)
Tfidf_test: (10000, 809019)


 # Naive bayes model
 https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html

In [11]:
#training the model
mnb=MultinomialNB()
#fitting the naive bayes for bag of words
mnb_bow = mnb.fit(cv_train_reviews, train_sentiments)
#fitting the naive bayes for tfidf features
mnb_tfidf = mnb.fit(tv_train_reviews, train_sentiments)


In [12]:
#Predicting the model for bag of words
mnb_bow_predict=mnb.predict(cv_test_reviews)
print(mnb_bow_predict)
#Predicting the model for tfidf features
mnb_tfidf_predict=mnb.predict(tv_test_reviews)
print(mnb_tfidf_predict)

['negative' 'negative' 'negative' ... 'negative' 'negative' 'negative']
['negative' 'negative' 'negative' ... 'negative' 'negative' 'negative']


### Tracking those wrong labeled data
The doc with their index in test set, and the actual sentiment (label) will be output to text

In [13]:
wrong_labeled_NB_BOW = ""

for i in range(mnb_bow_predict.size):
    start_index = test_sentiments.index.start
    if(mnb_bow_predict[i] != test_sentiments[start_index + i]):
        wrong_labeled_NB_BOW += str(start_index + i) + " " + str(test_sentiments[start_index + i]) + " | " + str(test_reviews[start_index + i]) +  "\n"

f = open("Naive Bayes - Bag of Words Wrongly labeled sentences.txt", "w")
f.write(wrong_labeled_NB_BOW)
f.close

wrong_labeled_NB_TFIDF = ""
for i in range(mnb_tfidf_predict.size):
    start_index = test_sentiments.index.start
    if(mnb_tfidf_predict[i] != test_sentiments[start_index + i]):
        wrong_labeled_NB_TFIDF += str(start_index + i) + " " + str(test_sentiments[start_index + i]) + " | " + str(test_reviews[start_index + i]) +  "\n"

f = open("Naive Bayes - TF-IDF Wrongly labeled sentences.txt", "w")
f.write(wrong_labeled_NB_TFIDF)
f.close


<function TextIOWrapper.close()>

In [14]:
#Accuracy score for bag of words
mnb_bow_score = accuracy_score(test_sentiments, mnb_bow_predict)
print("mnb_bow_score :",mnb_bow_score)

#Accuracy score for tfidf features
mnb_tfidf_score=accuracy_score(test_sentiments,mnb_tfidf_predict)
print("mnb_tfidf_score :",mnb_tfidf_score)

mnb_bow_score : 0.8871
mnb_tfidf_score : 0.8931


In [15]:
#Classification report for bag of words 
mnb_bow_report=classification_report(test_sentiments,mnb_bow_predict,target_names=['Positive','Negative'])
print(mnb_bow_report)

#Classification report for tfidf features
mnb_tfidf_report=classification_report(test_sentiments,mnb_tfidf_predict,target_names=['Positive','Negative'])
print(mnb_tfidf_report)

              precision    recall  f1-score   support

    Positive       0.88      0.90      0.89      4993
    Negative       0.90      0.88      0.89      5007

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

              precision    recall  f1-score   support

    Positive       0.89      0.89      0.89      4993
    Negative       0.89      0.89      0.89      5007

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [16]:
f = open("output.txt", "w")
output = ""

output += str(cv.vocabulary_)
f.write(output)
f.close()


# fastText method

In [17]:
import fasttext
test_reviews.index.stop

50000

In [18]:
#preprocess the data so it can feed into fasttext
def transform_instance(doc, label):
    processed_text = []
    #Prefix the index-ed label with __label__
    for i in range(doc.index.start, doc.index.stop):
        cur_row = "__label__" + label[i] + " " + doc[i]
        processed_text.append(cur_row)
    return processed_text

training_text = transform_instance(train_reviews, train_sentiments)
test_text = transform_instance(test_reviews, test_sentiments)

# Put the training and test dataset into file, so that the fasttext model can read them.
f = open("fasttext.train", "w")
output = ""
for text in training_text:
    output += text
    output += "\n"
f.write(output)
f.close()

f = open("fasttext.test", "w")
output = ""
for text in test_text:
    output += text
    output += "\n"
f.write(output)
f.close()

In [19]:
len(training_text), len(test_text)

(40000, 10000)

In [20]:
model = fasttext.train_supervised(input = "fasttext.train", lr=0.1, epoch=25, wordNgrams=3)

In [21]:
model.save_model("model_movie.bin")

In [22]:
model.test("fasttext.test")


(10000, 0.9019, 0.9019)

In [23]:
# get the misclassified examples by fasttext algorithm
wrong_labeled_fasttext = ""
for i in range(test_reviews.size):
    start_index = test_sentiments.index.start
    if(model.predict(test_reviews[start_index + i])[0][0] != str('__label__' + test_sentiments[start_index + i])):
        wrong_labeled_fasttext += str(start_index + i) + " " + str(test_sentiments[start_index + i]) + " | " + str(test_reviews[start_index + i]) +  "\n"

f = open("fastText Wrongly labeled sentences.txt", "w")
f.write(wrong_labeled_fasttext)
f.close


<function TextIOWrapper.close()>

# LSTM with pytorch
https://www.kaggle.com/code/fantaszzhang/deep-learning-for-sentiment-analysis/edit