In [1]:
import numpy as np
import pandas as pd

## Data Acquisition

In [2]:
data = pd.read_csv("IMDB Dataset.csv")

In [3]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
data = data.iloc[:25000,:]

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     25000 non-null  object
 1   sentiment  25000 non-null  object
dtypes: object(2)
memory usage: 390.8+ KB


In [6]:
data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [7]:
data.duplicated().sum()

103

In [8]:
data.drop_duplicates(inplace=True)

In [9]:
data.duplicated().sum()

0

In [10]:
data['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

## Basic Preprocessing
    - Removal of tags
    - Lowercase
    - Removal of stopwords

In [11]:
import re

In [12]:
def removal_tag(raw_text):
    clean_text = re.sub(re.compile('<.*?>'),'',raw_text)
    return clean_text.replace('\\','')

In [13]:
data['review'] = data['review'].apply(removal_tag)

In [14]:
data['review'] = data['review'].apply(lambda x: x.lower())

In [15]:
from nltk.corpus import stopwords

In [16]:
stop_words = stopwords.words('english')

In [17]:
review = []

In [18]:
def removal_stopwords(raw_text):
    for word in raw_text.split():
        if word not in stop_words:
            review.append(word)
    return review

In [19]:
data['review'] = data['review'].apply(lambda x: [item for item in x.split() if item not in stop_words]).apply(lambda x:" ".join(x))

In [20]:
data.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production. filming technique...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there's family little boy (jake) thi...,negative
4,"petter mattei's ""love time money"" visually stu...",positive


In [21]:
import gensim

In [22]:
from gensim.utils import simple_preprocess

In [23]:
print(' '. join(simple_preprocess(data['review'][2])))

thought wonderful way spend time hot summer weekend sitting air conditioned theater watching light hearted comedy plot simplistic dialogue witty characters likable even well bread suspected serial killer may disappointed realize match point risk addiction thought proof woody allen still fully control style many us grown love this laughed one woody comedies years dare say decade ve never impressed scarlet johanson managed tone sexy image jumped right average spirited young woman this may crown jewel career wittier devil wears prada interesting superman great comedy go see friends


In [25]:
data['review'] = data['review'].apply(lambda x: simple_preprocess(x)).apply(lambda x: " ".join(x))

In [27]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

In [28]:
X = data.iloc[:,:1]

In [29]:
X.head()

Unnamed: 0,review
0,one reviewers mentioned watching oz episode ho...
1,wonderful little production filming technique ...
2,thought wonderful way spend time hot summer we...
3,basically there family little boy jake thinks ...
4,petter mattei love time money visually stunnin...


In [30]:
X.shape

(24897, 1)

In [31]:
y = data['sentiment']

In [32]:
y.shape

(24897,)

In [33]:
y = encoder.fit_transform(y)

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)

In [36]:
X_train.shape

(18672, 1)

In [37]:
X_test.shape

(6225, 1)

## Feature Extraction(Bag of Words/n-grams/bi-grams)

In [38]:
from sklearn.feature_extraction.text import CountVectorizer

In [39]:
cv = CountVectorizer(max_features=5000)

In [40]:
X_train_bow = cv.fit_transform(X_train['review']).toarray()

In [41]:
X_test_bow = cv.fit_transform(X_test['review']).toarray()

In [42]:
X_train_bow.shape

(18672, 5000)

In [43]:
X_test_bow.shape

(6225, 5000)

## Model Selection

In [44]:
from sklearn.ensemble import RandomForestClassifier

In [45]:
model = RandomForestClassifier()

In [46]:
model.fit(X_train_bow,y_train)

In [47]:
y_pred = model.predict(X_test_bow)

In [48]:
from sklearn.metrics import accuracy_score

In [49]:
accuracy_score(y_test, y_pred)

0.5794377510040161

In [50]:
from sklearn.naive_bayes import GaussianNB

In [51]:
gb = GaussianNB()

In [52]:
gb.fit(X_train_bow,y_train)

In [53]:
y_pred = gb.predict(X_test_bow)

In [54]:
accuracy_score(y_test, y_pred)

0.48690763052208835

## TF-Idf Feature Extraction

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [56]:
tdf = TfidfVectorizer(max_features=30000)

In [57]:
X_train_tdf = tdf.fit_transform(X_train['review']).toarray()

In [58]:
X_test_tdf = tdf.fit_transform(X_test['review']).toarray()

In [59]:
X_train_tdf.shape

(18672, 30000)

In [60]:
X_test_tdf.shape

(6225, 30000)

In [61]:
from sklearn.ensemble import RandomForestClassifier

In [62]:
rdf = RandomForestClassifier()

In [63]:
rdf.fit(X_train_tdf, y_train)

In [197]:
y_pred = rdf.predict(X_test_tdf)

In [201]:
accuracy_score(y_test, y_pred)

0.5004016064257029

In [202]:
from sklearn.naive_bayes import GaussianNB

In [203]:
gnb = GaussianNB()

In [204]:
gnb.fit(X_train_tdf, y_train)

In [205]:
y_pred = gnb.predict(X_test_tdf)

In [206]:
accuracy_score(y_test, y_pred)

0.526425702811245