In [1]:
#Proprietary content. © Great Learning. All Rights Reserved. Unauthorized use or distribution prohibited.

## Dataset description: <br>
IMDB dataset having 50K movie reviews for natural language processing or Text analytics.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training and 25,000 for testing. So, predict the number of positive and negative reviews using either classification or deep learning algorithms.

In [4]:
import numpy as np
import pandas as pd

In [5]:
#https://www.kaggle.com/datasets/columbine/imdb-dataset-sentiment-analysis-in-csv-format

data=pd.read_csv("IMDB Dataset.csv")


In [6]:
data.head()

Unnamed: 0,text,label
0,I always wrote this series off as being a comp...,0
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,0
2,This movie was so poorly written and directed ...,0
3,The most interesting thing about Miryang (Secr...,1
4,"when i first read about ""berlin am meer"" i did...",0


In [7]:
data.columns

Index(['text', 'label'], dtype='object')

In [8]:
data.isnull().any()

text     False
label    False
dtype: bool

In [9]:
data.isnull().sum()

text     0
label    0
dtype: int64

In [10]:
data.describe()

Unnamed: 0,label
count,5000.0
mean,0.501
std,0.500049
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [11]:
data['label'].value_counts()

label
1    2505
0    2495
Name: count, dtype: int64

In [12]:
data.shape

(5000, 2)

# Text normalization
## tokenization

In [13]:
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize

In [14]:
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from bs4 import BeautifulSoup


In [15]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anusha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
#Tokenization of text
tokenizers=ToktokTokenizer()
#Setting English stopwords
stopwords=nltk.corpus.stopwords.words('english')

In [25]:
#Removing the noisy text
def noiseremoval_text(text):
  soup = BeautifulSoup(text, "html.parser")
  text = soup.get_text()
  text = re.sub('\[[^]]*\]', '', text)
  return text


In [26]:
#Apply function on review column
data['text']=data['text'].apply(noiseremoval_text)

  soup = BeautifulSoup(text, "html.parser")


In [27]:
data.head()

Unnamed: 0,text,label
0,i alway wrote thi seri off as be a complet sti...,0
1,1st watch 12/7/2002 - 3 out of 10(dir-stev pur...,0
2,thi movi wa so poorli written and direct i fel...,0
3,the most interest thing about miryang (secret ...,1
4,"when i first read about ""berlin am meer"" i did...",0


## Stemming

In [28]:
#Stemming the text
def stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text


In [29]:
#Apply function on review column
data['text']=data['text'].apply(stemmer)

In [30]:
data.head()

Unnamed: 0,text,label
0,i alway wrote thi seri off as be a complet sti...,0
1,1st watch 12/7/2002 - 3 out of 10(dir-stev pur...,0
2,thi movi wa so poorli written and direct i fel...,0
3,the most interest thing about miryang (secret ...,1
4,"when i first read about ""berlin am meer"" i did...",0


## Removing stop words

In [31]:
from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize  


In [34]:
#set stopwords to english

stop_wr=set(stopwords.words('english'))
print(stop_wr)

{'haven', 'shouldn', 'while', 'where', 'yourselves', 'other', 't', 'm', 'above', 'its', 'off', "won't", 'wouldn', 'shan', "wasn't", 'yours', 'into', 'we', 'has', 'on', 'too', "mightn't", 'himself', 'against', "you're", "you've", 'doesn', 'myself', 'theirs', 'between', 'so', 'needn', 'who', 'aren', 'hasn', 'now', 'after', 'am', 'any', 'it', 'how', 'at', 'nor', "should've", 'ours', "you'd", 'whom', 'just', 'had', 'she', "weren't", 'can', 'the', 'him', 'does', 'in', 'there', 'should', "needn't", 'same', 'our', 'very', "she's", 'was', 'by', 'then', 'this', 'more', 'don', 'of', 'as', "you'll", 's', "hadn't", 'an', 'during', 'll', 'ourselves', "doesn't", 'do', 'those', 'once', "didn't", 'most', "don't", 'me', 'is', 'some', 'or', 'are', 'again', 'weren', 'hers', "mustn't", 'which', 'no', 're', 'will', 'up', 'be', 'won', 'yourself', 'herself', 'with', 'out', 'didn', 'few', 'her', 'what', 'o', 'to', 'not', 'for', 'each', 'own', 'having', 'under', 'but', 'mustn', 'them', 'only', 'being', 'they',

In [35]:
#removing the stopwords
def removing_stopwords(text, is_lower_case=False):
    #Tokenization of text
    tokenizers=ToktokTokenizer()
    #Setting English stopwords
    tokens = tokenizers.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filter_tokens = [token for token in tokens if token not in stop_wr]
    else:
        filter_tokens = [token for token in tokens if token.lower() not in stop_wr]
    filtered_text = ' '.join(filter_tokens)    
    return filtered_text


In [36]:
#Apply function on review column
data['text']=data['text'].apply(removing_stopwords)

In [37]:
data.head()

Unnamed: 0,text,label
0,alway wrote thi seri complet stink-fest becau ...,0
1,1st watch 12/7/2002 - 3 10( dir-stev purcell )...,0
2,thi movi wa poorli written direct fell asleep ...,0
3,interest thing miryang ( secret sunshine ) act...,1
4,"first read "" berlin meer "" ' expect much. thou...",0


## Train test split

In [44]:
#split the dataset  
#train dataset
train_reviews_data=data.text[:2500]


In [45]:
#test dataset

test_reviews_data=data.text[2501:]


## Bag of words

In [46]:
#Count vectorizer for bag of words
cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
#transformed train reviews
cv_train=cv.fit_transform(train_reviews_data)
#transformed test reviews
cv_test=cv.transform(test_reviews_data)

print('BOW_cv_train:',cv_train.shape)
print('BOW_cv_test:',cv_test.shape)
#vocab=cv.get_feature_names()-toget feature names

BOW_cv_train: (2500, 532684)
BOW_cv_test: (2499, 532684)


## TF_IDF

In [47]:
#Tfidf vectorizer
tf=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))
#transformed train reviews
tf_train=tf.fit_transform(train_reviews_data)
#transformed test reviews
tf_test=tf.transform(test_reviews_data)
print('Tfidf_train:',tf_train.shape)
print('Tfidf_test:',tf_test.shape)

Tfidf_train: (2500, 532684)
Tfidf_test: (2499, 532684)


## Lable encoding

In [48]:
#labeling the sentient data
labeldata=LabelBinarizer()
#transformed sentiment data
sentiment_data=labeldata.fit_transform(data['label'])
print(sentiment_data.shape)

(5000, 1)


In [50]:
train_data=data.label[:2500]


In [51]:
test_data=data.label[2501:]


In [52]:
#training the model
logistic=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)
#Fitting the model for Bag of words
lr_bow=logistic.fit(cv_train,train_data)
print(lr_bow)
#Fitting the model for tfidf features
lr_tfidf=logistic.fit(tf_train,train_data)
print(lr_tfidf)

LogisticRegression(C=1, max_iter=500, random_state=42)
LogisticRegression(C=1, max_iter=500, random_state=42)


In [53]:
#Predicting the model for bag of words
lr_bow_predict=logistic.predict(cv_test)
print(lr_bow_predict)


[1 0 0 ... 0 1 1]


In [54]:
##Predicting the model for tfidf features
lr_tfidf_predict=logistic.predict(tf_test)
print(lr_tfidf_predict)

[0 0 0 ... 0 0 0]


In [55]:
#Accuracy score for bag of words
lr_bow_score=accuracy_score(test_data,lr_bow_predict)
print("lr_bow_score :",lr_bow_score)


lr_bow_score : 0.6654661864745899


In [56]:
#Accuracy score for tfidf features
lr_tfidf_score=accuracy_score(test_data,lr_tfidf_predict)
print("lr_tfidf_score :",lr_tfidf_score)

lr_tfidf_score : 0.553421368547419
