# **Import libraries**

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Description of IMDB dataset
# IMDB dataset having 50K movie reviews for natural language processing or Text analytics. 

data = pd.read_csv("IMDB Dataset.csv")

In [3]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
data.columns

Index(['review', 'sentiment'], dtype='object')

In [5]:
data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [6]:
data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,negative
freq,5,25000


In [7]:
data['sentiment'].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [8]:
data.shape

(50000, 2)

# **Text Normalization**

## **Tokenization**

In [9]:
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud, STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize

In [10]:
import spacy 
import re, string, unicodedata
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from bs4 import BeautifulSoup

In [12]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [17]:
#tokenization of text
from nltk.tokenize.toktok import ToktokTokenizer
tokenizers = ToktokTokenizer()

#Setting English stopwords
stopwords = nltk.corpus.stopwords.words('english')

In [18]:
#Removing the noisy text
def noiseremoval_text(text):
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    text = re.sub('\[[^]]*\]', '', text)
    return text

In [20]:
#Apply function on review column 
data['review'] = data['review'].apply(noiseremoval_text)

In [21]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Stemming

In [25]:
# Stemming the text
def stemmer(text):
    ps = nltk.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

In [26]:
#Apply function on review column
data['review'] = data['review'].apply(stemmer)

In [27]:
data.head()

Unnamed: 0,review,sentiment
0,one of the other review ha mention that after ...,positive
1,A wonder littl production. the film techniqu i...,positive
2,I thought thi wa a wonder way to spend time on...,positive
3,basic there' a famili where a littl boy (jake)...,negative
4,"petter mattei' ""love in the time of money"" is ...",positive


# Removing stop words

In [28]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [29]:
#set stopwords to english

stop_wr = set(stopwords.words('english'))
print(stop_wr)

{'where', 'the', 'doesn', 'their', "didn't", 'i', 'then', 'out', 'isn', 'are', "wouldn't", 'as', 'them', 'each', 'shan', 'themselves', "couldn't", 'on', 'but', 'whom', 'her', 'nor', "doesn't", 'has', 'through', 'ma', 'too', 'hasn', 'up', 'him', 'against', 'can', 'few', 'above', 'both', 'yours', 'into', 'ours', 'its', 'such', 'of', 'down', 'we', 'ain', 'shouldn', 'do', 'most', 'how', "won't", 'having', 'who', 'll', 're', 'needn', "shouldn't", 'when', 'she', 'our', 'they', 'itself', 'weren', "hasn't", 'while', 'won', 'about', "hadn't", "haven't", 'aren', 'didn', 's', "you'll", 'after', 'being', 'why', 'd', 'by', 'because', 'your', 'does', 'now', 'what', 'don', 'a', "you'd", "wasn't", 'below', 'is', 'this', 'his', 'or', 'was', 'own', 'those', 'be', 'not', 'that', 'myself', "she's", 'very', 'only', 'in', 'during', 'other', 'to', 'if', 'just', 'were', 'off', 't', 'will', 'before', 'between', 'same', "you've", "should've", 'it', 'did', 'than', 'you', 'my', 'herself', "shan't", 'under', 'furt

In [30]:
#removing the stopwords

def removing_stopwords(text, is_lower_case = False):
    #Tokenization of text
    tokenizers = ToktokTokenizer()
    
    #Setting english stopwords 
    tokens = tokenizers.tokenize(text)
    tokens = [token.strip() for token in tokens]
    
    if is_lower_case:
        filter_tokens = [token for token in tokens if token not in stop_wr]
    else:
        filter_tokens = [token for token in tokens if token.lower() not in stop_wr]
    filtered_text = ' '.join(filter_tokens)
    return filtered_text

In [31]:
#Apply function on review column 
data['review'] = data['review'].apply(removing_stopwords)

In [32]:
data.head()

Unnamed: 0,review,sentiment
0,one review ha mention watch 1 Oz episod ' hook...,positive
1,wonder littl production. film techniqu veri un...,positive
2,thought thi wa wonder way spend time hot summe...,positive
3,basic ' famili littl boy ( jake ) think ' zomb...,negative
4,"petter mattei ' "" love time money "" visual stu...",positive


# Train test split

In [33]:
#split the dataset
#train dataset

train_reviews_data = data.review[:30000]

In [35]:
#test dataset
test_reviews_data = data.review[30000:]

# Bag of words

In [36]:
#count vectorizer for bag of words
cv = CountVectorizer(min_df=0, max_df=1, binary=False, ngram_range=(1,3))

#transformed train reviews
cv_train = cv.fit_transform(train_reviews_data)

#transformed test reviews
cv_test = cv.transform(test_reviews_data)

print('BOW_cv_train:', cv_train.shape)
print('BOW_cv_train:', cv_train.shape)

BOW_cv_train: (30000, 4954557)
BOW_cv_train: (30000, 4954557)


# TF_IDF

In [38]:
#Tfidf vectorizer
tf = TfidfVectorizer(min_df=0, max_df=1, use_idf=True, ngram_range=(1,3))

#transformed train reviews
tf_train= tf.fit_transform(train_reviews_data)

#transformed test reviews
tf_test = tf.transform(test_reviews_data)

print('Tfidf_train:', tf_train.shape)
print('Tfidf_test:', tf_test.shape)

Tfidf_train: (30000, 4954557)
Tfidf_test: (20000, 4954557)


# Lable encoding

In [39]:
#labelling the sentiment data
label = LabelBinarizer()

#transformed sentiment data
sentiment_data = label.fit_transform(data['sentiment'])
print(sentiment_data.shape)

(50000, 1)


In [40]:
train_data = data.sentiment[:30000]

In [41]:
test_data = data.sentiment[30000:]

In [42]:
#train the model
logistic = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=42)

#fitting the model for bag of words
lr_bow = logistic.fit(cv_train,train_data)
print(lr_bow)

#fitting the model for tfidf features
lr_tfidf= logistic.fit(tf_train,train_data)
print(lr_tfidf)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


In [43]:
#Predicting the model for bag of words
lr_bow_predict = logistic.predict(cv_test)
print(lr_bow_predict)

['negative' 'negative' 'negative' ... 'negative' 'positive' 'positive']


In [44]:
#Predicting the model for tfid features
lr_tfidf_predict = logistic.predict(tf_test)
print(lr_tfidf_predict)

['negative' 'negative' 'negative' ... 'negative' 'positive' 'positive']


In [45]:
#Accuracy score of bag of words
lr_bow_score = accuracy_score(test_data, lr_bow_predict)
print("lr_bow_score : ",lr_bow_score)

lr_bow_score :  0.74255


In [46]:
#accuracy score for tfidf features
lr_tfidf_score = accuracy_score(test_data, lr_tfidf_predict)
print("lr_tfidf_score : ", lr_tfidf_score)

lr_tfidf_score :  0.7426
