
Sentiment Analysis on Movie Reviews

Importing Libraries

In [3]:
#Sentiment Analysis on Movie Reviews

#Importing Libraries
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
import string
import spacy

Loading the dataset

In [4]:
#Loading the dataset
data = pd.read_csv('/content/labeledTrainData.tsv', sep ='\t')

In [5]:
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [6]:
data.columns

Index(['id', 'sentiment', 'review'], dtype='object')

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


Data Preprocessing

In [8]:
#Data Preprocessing

Check for null values

In [9]:
#Check for null values
data.isna().sum()

id           0
sentiment    0
review       0
dtype: int64

Text Cleaning
Lower Casing

In [10]:
#Text Cleaning
#Lower Casing

data['review'] = data['review'].str.lower()
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,with all this stuff going down at the moment w...
1,2381_9,1,"\the classic war of the worlds\"" by timothy hi..."
2,7759_3,0,the film starts with a manager (nicholas bell)...
3,3630_4,0,it must be assumed that those who praised this...
4,9495_8,1,superbly trashy and wondrously unpretentious 8...


Removal of Punctuation marks

In [11]:
#Removal of Punctuation marks

def remove_punctuation(text):
    return text.translate(str.maketrans('', '',string.punctuation))
data['review'] = data['review'].apply(lambda text: remove_punctuation(text))
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,with all this stuff going down at the moment w...
1,2381_9,1,the classic war of the worlds by timothy hines...
2,7759_3,0,the film starts with a manager nicholas bell g...
3,3630_4,0,it must be assumed that those who praised this...
4,9495_8,1,superbly trashy and wondrously unpretentious 8...


Removal of stop words

In [12]:
#Removal of stop words

nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stop_words])
data['review'] = data['review'].apply(lambda text: remove_stopwords(text))
data.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,id,sentiment,review
0,5814_8,1,stuff going moment mj ive started listening mu...
1,2381_9,1,classic war worlds timothy hines entertaining ...
2,7759_3,0,film starts manager nicholas bell giving welco...
3,3630_4,0,must assumed praised film greatest filmed oper...
4,9495_8,1,superbly trashy wondrously unpretentious 80s e...


Removal of frequent words

In [13]:
#Removal of frequent words

from collections import Counter
cnt = Counter()
for text in data['review'].values:
    for word in text.split():
        cnt[word] += 1

cnt.most_common(10)


[('br', 57143),
 ('movie', 41807),
 ('film', 37455),
 ('one', 25508),
 ('like', 19641),
 ('good', 14555),
 ('even', 12503),
 ('would', 12135),
 ('time', 11779),
 ('really', 11663)]

In [14]:
freq_words = set([w for (w, wc) in cnt.most_common(10)])
def remove_freqwords(text):
    return " ".join([word for word in str(text).split() if word not in freq_words])

data['review'] = data['review'].apply(lambda text: remove_freqwords(text))
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,stuff going moment mj ive started listening mu...
1,2381_9,1,classic war worlds timothy hines entertaining ...
2,7759_3,0,starts manager nicholas bell giving welcome in...
3,3630_4,0,must assumed praised greatest filmed opera eve...
4,9495_8,1,superbly trashy wondrously unpretentious 80s e...


Removal of Rare words

In [15]:
#Removal of Rare words

n_rare_words = 10
rare_words = set([w for (w, wc) in cnt.most_common()[:-n_rare_words-1:-1]])
def remove_rarewords(text):
    return " ".join([word for word in str(text).split() if word not in rare_words])

data['review'] = data['review'].apply(lambda text: remove_rarewords(text))
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,stuff going moment mj ive started listening mu...
1,2381_9,1,classic war worlds timothy hines entertaining ...
2,7759_3,0,starts manager nicholas bell giving welcome in...
3,3630_4,0,must assumed praised greatest filmed opera eve...
4,9495_8,1,superbly trashy wondrously unpretentious 80s e...


Lemmatization

In [16]:
#Lemmatization

nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

data['review'] = data['review'].apply(lambda text: lemmatize_words(text))
data.head()

[nltk_data] Downloading package wordnet to /root/nltk_data...


Unnamed: 0,id,sentiment,review
0,5814_8,1,stuff going moment mj ive started listening mu...
1,2381_9,1,classic war world timothy hines entertaining o...
2,7759_3,0,start manager nicholas bell giving welcome inv...
3,3630_4,0,must assumed praised greatest filmed opera eve...
4,9495_8,1,superbly trashy wondrously unpretentious 80 ex...


Stemming

In [17]:
#Stemming

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

data['review'] = data['review'].apply(lambda text: stem_words(text))
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,stuff go moment mj ive start listen music watc...
1,2381_9,1,classic war world timothi hine entertain obvio...
2,7759_3,0,start manag nichola bell give welcom investor ...
3,3630_4,0,must assum prais greatest film opera ever didn...
4,9495_8,1,superbl trashi wondrous unpretenti 80 exploit ...


Feature Extraction using CountVectorizer

In [18]:
#Feature Extraction using CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer
cvec = CountVectorizer(stop_words = 'english',max_features = 20000)
bow = cvec.fit_transform(data['review'])
len(cvec.vocabulary_)

20000

In [19]:
#data['review'] -- feature
#data['sentiment'] -- target variable

x = pd.DataFrame(bow.todense())
y = data['sentiment']

Model Building - Logistic Regression

In [20]:
#Model Building - Logistic Regression

from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(x , y, test_size = 0.2,random_state = 134)

In [21]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=200)
lr.fit(x_train,y_train)
y_pred = lr.predict(x_test)

Model Performance Evaluation

In [22]:
#Model Performance Evaluation

from sklearn.metrics import accuracy_score
accuracy_score(y_pred,y_test)

0.862

Feature Extraction using TfidfVectorizer

In [23]:
#Feature Extraction using TfidfVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
tfvec = TfidfVectorizer(stop_words = 'english',max_features = 20000)
tf = tfvec.fit_transform(data['review'])
len(tfvec.vocabulary_)

20000

In [24]:
#data['review'] -- feature
#data['sentiment'] -- target variable

x = pd.DataFrame(tf.todense())
y = data['sentiment']

Model Building - Logistic Regression

In [25]:
#Model Building - Logistic Regression
x_train , x_test , y_train , y_test = train_test_split(x , y, test_size = 0.2,random_state = 134)

In [26]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=200)
lr.fit(x_train,y_train)
y_pred = lr.predict(x_test)

Model Performance Evaluation

In [27]:
#Model Performance Evaluation

from sklearn.metrics import accuracy_score
accuracy_score(y_pred,y_test)

0.8812

CountVectorizer:
With max_features = 10000 accuracy_score = 84.72
With max_features = 20000 accuracy_score = 86.2

TfidfVectorizer:
With max_features = 10000 accuracy_score = 87.94
With max_features = 20000 accuracy_score = 88.12
