In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud, STOPWORDS
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import re, string, os, warnings
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
data = pd.read_csv('IMDB Dataset.csv')

In [None]:
data.head()

In [None]:
data.tail()

In [None]:
data.shape

In [None]:
data.describe()

In [None]:
data.info()

In [None]:
data.isnull().any()

In [None]:
data['sentiment'].value_counts()

In [None]:
# Positional hold-out split on the raw frame: the first 40000 rows are used
# for training, everything after that for testing (no shuffling).
train_reviews, test_reviews = data['review'].iloc[:40000], data['review'].iloc[40000:]
train_sentiments, test_sentiments = data['sentiment'].iloc[:40000], data['sentiment'].iloc[40000:]

print(train_reviews.shape, train_sentiments.shape)
print(test_reviews.shape, test_sentiments.shape)

In [None]:
import nltk
nltk.download('stopwords')

In [None]:
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')

In [None]:
len(stopword_list)

In [None]:
stopword_list

In [None]:
def strip_html(text):
    """Parse ``text`` with BeautifulSoup's ``html.parser`` and return its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

def remove_between_square_brackets(text):
    """Delete every ``[...]`` span (brackets included) from ``text``.

    The pattern is a raw string so that ``\\[`` and ``\\]`` reach the regex
    engine as written instead of being treated as (invalid) Python string
    escapes, which newer interpreters warn about.
    """
    return re.sub(r'\[[^]]*\]', '', text)

def denoise_text(text):
    """Strip HTML from ``text``, then drop any ``[...]`` spans from the result."""
    return remove_between_square_brackets(strip_html(text))

# Clean every review in place.
data['review'] = data['review'].map(denoise_text)

In [None]:
def remove_special_characters(text, remove_digits=True):
    """Remove every character that is not an ASCII letter, a digit or whitespace.

    Parameters
    ----------
    text : str
        Text to clean.
    remove_digits : bool
        Accepted for interface compatibility but not acted upon: digits are
        always kept, exactly as before.
        NOTE(review): decide whether this flag should really strip digits;
        honouring it with the current default would change the pipeline output.

    Returns
    -------
    str
        ``text`` with all other characters deleted.
    """
    # `A-Z`, not `A-z`: the latter also spans  [ \ ] ^ _ `  which sit between
    # 'Z' and 'a' in ASCII, so those characters were never stripped.
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

# Run the character filter over every review.
data['review'] = data['review'].map(remove_special_characters)

In [None]:
# One stemmer shared by all calls: building a new PorterStemmer for every
# review is loop-invariant work. Uses the class imported at the top of the
# notebook instead of the incidental `nltk.porter` alias.
_porter_stemmer = PorterStemmer()

def simple_stemmer(text):
    """Return ``text`` with every whitespace-separated word Porter-stemmed.

    Words are split on whitespace and re-joined with single spaces.
    """
    return ' '.join(_porter_stemmer.stem(word) for word in text.split())

data['review'] = data['review'].apply(simple_stemmer)

In [None]:
stop = set(stopwords.words('english'))

def remove_stopwords(text, is_lower_case = False):
    """Tokenize ``text`` and drop every token found in ``stopword_list``.

    Parameters
    ----------
    text : str
        Text to filter.
    is_lower_case : bool
        When True, tokens are compared to the stopword list as-is; otherwise
        each token is lower-cased for the comparison only (the kept tokens
        retain their original case).

    Returns
    -------
    str
        The surviving tokens joined with single spaces.
    """
    # Set membership instead of scanning the list once per token.
    stopword_set = set(stopword_list)
    tokens = [token.strip() for token in tokenizer.tokenize(text)]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_set]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_set]
    return ' '.join(filtered_tokens)

# NOTE(review): the reviews were already stemmed in an earlier cell, so
# stopwords whose stem differs from the dictionary form may no longer match
# here - consider removing stopwords before stemming.
data['review'] = data['review'].apply(remove_stopwords)

In [None]:
norm_train_reviews = data.review[:40000]
norm_train_reviews[0]

In [None]:
norm_test_reviews = data.review[40000:]
norm_test_reviews[45005]

In [None]:
cv = CountVectorizer(min_df = 0, max_df = 1, binary = False, ngram_range = (1,3))

cv_train_reviews = cv.fit_transform(norm_train_reviews)
cv_test_reviews = cv.transform(norm_test_reviews)

print('BOW_cv_train:', cv_train_reviews.shape)
print('BOW_cv_test:', cv_test_reviews.shape)
#vocab=cv.get_feature_names()-toget feature names

In [None]:
tv = TfidfVectorizer(min_df = 0, max_df = 1, use_idf = True, ngram_range = (1,3))

tv_train_reviews = tv.fit_transform(norm_train_reviews)
tv_test_reviews = tv.transform(norm_test_reviews)

print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

In [None]:
lb = LabelBinarizer()

sentiment_data = lb.fit_transform(data['sentiment'])
print(sentiment_data.shape)

In [None]:
data['sentiment'].head()

In [None]:
sentiment_data

In [None]:
train_sentiments = sentiment_data[:40000]
test_sentiments = sentiment_data[40000:]

print(train_sentiments)
print(test_sentiments)

# Logistic Regression

In [None]:
lr = LogisticRegression(penalty = 'l2', max_iter = 500, C = 1, random_state = 42)

lr_cv = lr.fit(cv_train_reviews, train_sentiments)
print(lr_cv)

lr_tfidf = lr.fit(tv_train_reviews, train_sentiments)
print(lr_tfidf)

In [None]:
lr_cv_predict = lr.predict(cv_test_reviews)
print(lr_cv_predict)

lr_tfidf_predict = lr.predict(tv_test_reviews)
print(lr_tfidf_predict)

In [None]:
# Test-set accuracy for the bag-of-words and the TF-IDF predictions, in that order.
lr_cv_score, lr_tfidf_score = (
    accuracy_score(test_sentiments, predicted)
    for predicted in (lr_cv_predict, lr_tfidf_predict)
)
print("lr_cv_score :", lr_cv_score)
print("lr_tfidf_score :", lr_tfidf_score)

In [None]:
# Without `labels`, the report lists classes in sorted order (0, then 1), so
# target_names = ['Positive','Negative'] attached "Positive" to class 0.
# Passing labels = [1,0] pins the row order to the names and matches the
# ordering used for the confusion matrices.
# NOTE(review): assumes the LabelBinarizer encoded the positive class as 1 -
# confirm with `lb.classes_`.
lr_cv_report = classification_report(test_sentiments, lr_cv_predict, labels = [1,0], target_names = ['Positive','Negative'])
print(lr_cv_report)

lr_tfidf_report = classification_report(test_sentiments, lr_tfidf_predict, labels = [1,0], target_names = ['Positive','Negative'])
print(lr_tfidf_report)

In [None]:
# Confusion matrices with rows/columns ordered as label 1, then label 0.
cm_cv, cm_tfidf = (
    confusion_matrix(test_sentiments, predicted, labels = [1,0])
    for predicted in (lr_cv_predict, lr_tfidf_predict)
)
print(cm_cv)
print(cm_tfidf)

# Support Vector Machine (linear SVM trained with SGD)

In [None]:
svm = SGDClassifier(loss = 'hinge', max_iter = 500, random_state = 42)

svm_cv = svm.fit(cv_train_reviews, train_sentiments)
print(svm_cv)

svm_tfidf = svm.fit(tv_train_reviews, train_sentiments)
print(svm_tfidf)

In [None]:
svm_cv_predict = svm.predict(cv_test_reviews)
print(svm_cv_predict)

svm_tfidf_predict = svm.predict(tv_test_reviews)
print(svm_tfidf_predict)

In [None]:
# Test-set accuracy for the bag-of-words and the TF-IDF predictions, in that order.
svm_cv_score, svm_tfidf_score = (
    accuracy_score(test_sentiments, predicted)
    for predicted in (svm_cv_predict, svm_tfidf_predict)
)
print("svm_cv_score :", svm_cv_score)
print("svm_tfidf_score :", svm_tfidf_score)

In [None]:
# Without `labels`, the report lists classes in sorted order (0, then 1), so
# target_names = ['Positive','Negative'] attached "Positive" to class 0.
# Passing labels = [1,0] pins the row order to the names and matches the
# ordering used for the confusion matrices.
# NOTE(review): assumes the LabelBinarizer encoded the positive class as 1 -
# confirm with `lb.classes_`.
svm_cv_report = classification_report(test_sentiments, svm_cv_predict, labels = [1,0], target_names = ['Positive','Negative'])
print(svm_cv_report)

svm_tfidf_report = classification_report(test_sentiments, svm_tfidf_predict, labels = [1,0], target_names = ['Positive','Negative'])
print(svm_tfidf_report)

In [None]:
# Confusion matrices with rows/columns ordered as label 1, then label 0.
cm_cv, cm_tfidf = (
    confusion_matrix(test_sentiments, predicted, labels = [1,0])
    for predicted in (svm_cv_predict, svm_tfidf_predict)
)
print(cm_cv)
print(cm_tfidf)

# Multinomial Naive Bayes

In [None]:
mnb = MultinomialNB()

mnb_cv = mnb.fit(cv_train_reviews, train_sentiments)
print(mnb_cv)

mnb_tfidf = mnb.fit(tv_train_reviews, train_sentiments)
print(mnb_tfidf)

In [None]:
mnb_cv_predict = mnb.predict(cv_test_reviews)
print(mnb_cv_predict)

mnb_tfidf_predict = mnb.predict(tv_test_reviews)
print(mnb_tfidf_predict)

In [None]:
# Test-set accuracy for the bag-of-words and the TF-IDF predictions, in that order.
mnb_cv_score, mnb_tfidf_score = (
    accuracy_score(test_sentiments, predicted)
    for predicted in (mnb_cv_predict, mnb_tfidf_predict)
)
print("mnb_cv_score :", mnb_cv_score)
print("mnb_tfidf_score :", mnb_tfidf_score)

In [None]:
# Without `labels`, the report lists classes in sorted order (0, then 1), so
# target_names = ['Positive','Negative'] attached "Positive" to class 0.
# Passing labels = [1,0] pins the row order to the names and matches the
# ordering used for the confusion matrices.
# NOTE(review): assumes the LabelBinarizer encoded the positive class as 1 -
# confirm with `lb.classes_`.
mnb_cv_report = classification_report(test_sentiments, mnb_cv_predict, labels = [1,0], target_names = ['Positive','Negative'])
print(mnb_cv_report)

mnb_tfidf_report = classification_report(test_sentiments, mnb_tfidf_predict, labels = [1,0], target_names = ['Positive','Negative'])
print(mnb_tfidf_report)

In [None]:
# Confusion matrices with rows/columns ordered as label 1, then label 0.
cm_cv, cm_tfidf = (
    confusion_matrix(test_sentiments, predicted, labels = [1,0])
    for predicted in (mnb_cv_predict, mnb_tfidf_predict)
)
print(cm_cv)
print(cm_tfidf)

# Word-cloud visualization of the words in a positive and a negative review

In [None]:
# Word cloud built from a single normalized training review.
plt.figure(figsize = (10,10))
# NOTE(review): the review labelled 1 is presumed to be a positive one - confirm
# against data['sentiment'].
positive_text = norm_train_reviews[1]
WC = WordCloud(width = 1000, height = 500, max_words = 500, min_font_size = 5)
positive_words = WC.generate(positive_text)
plt.imshow(positive_words, interpolation = 'bilinear')
# Call show(): a bare `plt.show` only references the function and makes the
# cell echo its repr instead of rendering explicitly.
plt.show()

In [None]:
# Word cloud built from a single normalized training review.
plt.figure(figsize = (10,10))
# NOTE(review): the review labelled 8 is presumed to be a negative one - confirm
# against data['sentiment'].
negative_text = norm_train_reviews[8]
WC = WordCloud(width = 1000, height = 500, max_words = 500, min_font_size = 5)
negative_words = WC.generate(negative_text)
plt.imshow(negative_words, interpolation = 'bilinear')
# Call show(): a bare `plt.show` only references the function and makes the
# cell echo its repr instead of rendering explicitly.
plt.show()