In [97]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,confusion_matrix
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
pd.set_option('display.max_colwidth',255)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\balaj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [74]:
dataset = pd.read_csv("D:\GenAI\Data\IMDB_Dataset.csv")
dataset.head()

Unnamed: 0,review,sentiment
0,"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of v...",positive
1,"A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen-...",positive
2,"I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well b...",positive
3,"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br...",negative
4,"Petter Mattei's ""Love in the Time of Money"" is a visually stunning film to watch. Mr. Mattei offers us a vivid portrait about human relations. This is a movie that seems to be telling us what money, power and success do to people in the different situ...",positive


In [75]:
dataset.shape

(50000, 2)

In [76]:
df = dataset.sample(n=10000)
df.shape

(10000, 2)

In [77]:
df['sentiment'].value_counts()

negative    5011
positive    4989
Name: sentiment, dtype: int64

In [78]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [79]:
print(f"Duplicates: {df.duplicated().sum()}")
df.drop_duplicates(inplace=True)
print(f"Duplicates: {df.duplicated().sum()}")

Duplicates: 22
Duplicates: 0


### Basic Preprocessing
- Remove Tags - HTML
- Lower case
- Remove Stopwords

In [80]:
import re
def remove_tags(raw_text):
    tag_removed_text = re.sub(re.compile("<.*?>"),"",raw_text)
    return tag_removed_text

def lower_case(text):
    lower_text = text.lower()
    return lower_text

sw_list = stopwords.words('english')
def remove_stopwords(text):
    clean_text_token = [item for item in text.split() if item not in sw_list]
    clean_text_sentence = " ".join(clean_text_token)
    return clean_text_sentence

In [81]:
df['review'] = df['review'].apply(remove_tags)
df['review'] = df['review'].apply(lower_case)
df['review'] = df['review'].apply(remove_stopwords)

X = df.iloc[:,0:1]
y = df['sentiment']

df.head()

Unnamed: 0,review,sentiment
46991,"lot wrong film. lie. say problems feel like stem budget chopped underneath flick, bad hack job editing.this office space. go expecting office space levels comedy. funny though. mess, funny time. funny mess film. way caddyshack funny. mess unrelated fu...",positive
5721,"first saw movie, thought typical ""love thy neighbour"" stuff....the movie going on, got involved. acting magnificent actors, direction great, story unusual. cried eyes off, first time life movie. real must serious videoteque. 11 10",positive
44495,"lawrence olivier merle oberon two movies together within two years. one considered one great romantic films time, movie made olivier great movie star (and gave oberon best performance role): wuthering heights. film, made england year earlier. divorce ...",positive
13778,"batman superman. iconic. better part century old. know two? must countless fans would die make film them. sandy collora went ahead put together trailer film(which exist, created, much less team). perhaps going polished is. throughout, cinematography s...",positive
30565,"prepared love ""where's poppa"", features nexus normal lear sitcom character actors who, growing up, felt like extended members raisenette-sized broken nuclear family. fun would see censor-free barnard hughes, vincent gardenia, ron liebman, rob reiner, ...",negative


In [82]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [83]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)
print(f"train_shape: {X_train.shape} {y_train.shape}")
print(f"test_shape: {X_test.shape} {y_test.shape}")

train_shape: (7982, 1) (7982,)
test_shape: (1996, 1) (1996,)


In [84]:
## Applying BOW
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

In [86]:
X_train_bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

Train with Naive Bayes Model

In [87]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train_bow,y_train)

0,1,2
,priors,
,var_smoothing,1e-09


In [91]:
y_pred = gnb.predict(X_test_bow)

accuracy = accuracy_score(y_test,y_pred)
print(f"Accuracy Score: {accuracy}")

Accuracy Score: 0.6292585170340681


Train with Random Forest Classifier

In [92]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
accuracy = accuracy_score(y_test,y_pred)
print(f"Accuracy Score: {accuracy}")

Accuracy Score: 0.8502004008016032


Restrict the feature count 3000 instead of taking all the features

In [93]:
cv = CountVectorizer(max_features =3000)
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

rf = RandomForestClassifier()
rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
accuracy = accuracy_score(y_test,y_pred)
print(f"Accuracy Score: {accuracy}")

Accuracy Score: 0.8406813627254509


N-GRAMS

In [96]:
cv = CountVectorizer(ngram_range=(2,2), max_features =5000)
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()
rf = RandomForestClassifier()
rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
accuracy = accuracy_score(y_test,y_pred)
print(f"Accuracy Score: {accuracy}")

Accuracy Score: 0.7575150300601202


TFIDF

In [169]:
tfidf = TfidfVectorizer()
X_train_vec = tfidf.fit_transform(X_train['review']).toarray()
X_test_vec = tfidf.transform(X_test['review']).toarray()

rf = RandomForestClassifier()
rf.fit(X_train_vec,y_train)
y_pred = rf.predict(X_test_vec)
accuracy = accuracy_score(y_test,y_pred)
print(f"Accuracy Score: {accuracy}")

Accuracy Score: 0.843186372745491


In [173]:
test_review = ["good movie"]
vec = tfidf.transform(test_review).toarray()
y_pred = rf.predict(vec)
y_pred

array([1])

Word2Vec

In [174]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

X_train_tokens = X_train['review'].apply(lambda x: simple_preprocess(x))
X_test_tokens = X_test['review'].apply(lambda x: simple_preprocess(x))

w2v_model = Word2Vec(sentences = X_train_tokens, window = 10, min_count=2)

def vectorize(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

X_train_vec = np.array([vectorize(token,w2v_model) for token in X_train_tokens])
X_test_vec = np.array([vectorize(token,w2v_model) for token in X_test_tokens])

rf =RandomForestClassifier()
rf.fit(X_train_vec,y_train)
y_pred = rf.predict(X_test_vec)
accuracy = accuracy_score(y_test,y_pred)
print(f"Accuracy Score: {accuracy}")

Accuracy Score: 0.7850701402805611


In [176]:
test_review = "good movie"
token = simple_preprocess(test_review)
vector = vectorize(token,w2v_model).reshape(1,-1)
rf.predict(vector)

array([1])