# Loading data and basic cleaning

In [21]:
# Loading data
import numpy as np
import pandas as pd
df = pd.read_csv('movie.csv')
df = df.head(10000) # We are considering only first 10k rows
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [22]:
df['sentiment'].value_counts() # Classes are balanced

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,5028
negative,4972


In [23]:
# Missing values check
df.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


In [24]:
# Duplicated values check
df.duplicated().sum()

np.int64(17)

In [25]:
# Dropping Duplicates
df = df.drop_duplicates()
df.duplicated().sum()

np.int64(0)

# Basic Text Preprocessing

In [26]:
# Remove tags
import re
def remove_tags(raw_text):
  cleaned_text = re.sub(re.compile('<.*?>'), '', raw_text)
  return cleaned_text
df['review'] = df['review'].apply(remove_tags)

In [27]:
# Lowercase
df['review'] = df['review'].apply(lambda x:x.lower())

In [28]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
# Remove stopwords
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
df['review'] = df['review'].apply(lambda x: [item for item in x.split() if item not in stop_words]).apply(lambda x:" ".join(x))

In [30]:
# Dividing data into X and Y
X = df.drop('sentiment', axis=1)
Y = df['sentiment']

In [31]:
# Encoding sentiment column
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
Y = le.fit_transform(Y)

In [32]:
# Train test split
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

# Using BoW and n-grams

In [15]:
# Applying BoW for text vectorization
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

In [16]:
# Applying Naive Bayes for classification
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train_bow, Y_train)
gnb.score(X_test_bow, Y_test)

0.6099148723084626

In [17]:
# Applying Random forest for classification
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train_bow, Y_train)
rf.score(X_test_bow, Y_test)

0.8442663995993991

In [18]:
# Considering only top 3000 words for text vectorization and then applying Random forest for classification
cv = CountVectorizer(max_features=3000)
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

rf = RandomForestClassifier()
rf.fit(X_train_bow, Y_train)
rf.score(X_test_bow, Y_test)

0.8352528793189785

In [19]:
# Applying bigrams for text vectorization and then applying Random forest for classification
cv = CountVectorizer(ngram_range=(2,2),max_features=5000)
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

rf = RandomForestClassifier()
rf.fit(X_train_bow, Y_train)
rf.score(X_test_bow, Y_test)

0.7466199298948423

# Using TF-IDF

In [20]:
# Applying TF-IDF for text vectorization and then applying Random forest for classification
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train['review']).toarray()
X_test_tfidf = tfidf.transform(X_test['review']).toarray()

rf = RandomForestClassifier()
rf.fit(X_train_tfidf, Y_train)
rf.score(X_test_tfidf, Y_test)

0.8357536304456685

# Using Word2Vec

In [35]:
!pip install gensim



In [38]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [39]:
# Building a corpus of reviews after preprocessing and tokenizing the reviews
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

reviews = []
for doc in df['review']:
  raw_sent = sent_tokenize(doc)
  for sent in raw_sent:
    reviews.append(simple_preprocess(sent))

In [41]:
# Creating the Word2Vec model
import gensim
model = gensim.models.Word2Vec(
    window=10,
    min_count=2
)

# Extracting unique words form corpus using build_vocab function
model.build_vocab(reviews)

# Training the deep learning model
model.train(reviews, total_examples=model.corpus_count, epochs=model.epochs)

(5849802, 6186875)

In [42]:
# Function to convert the reviews into vectors
def document_vector(doc):
  # Remove out of vocabulary words
  doc = [word for word in doc.split() if word in model.wv.index_to_key]
  return np.mean(model.wv[doc], axis=0)

In [43]:
# Converting the reviews into vectors
X = []
for doc in df['review'].values:
  X.append(document_vector(doc))
X = np.array(X)

In [50]:
# Train test split
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [51]:
# Applying Naive Bayes for classification
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, Y_train)
gnb.score(X_test, Y_test)

0.7230846269404106

In [52]:
# Applying Random forest for classification
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train, Y_train)
rf.score(X_test, Y_test)

0.7651477215823735