#### Importing the Libraries

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords

#### Importing the Dataset

In [2]:
dataset = pd.read_csv('IMDB Dataset.csv')
dataset = dataset.iloc[:5000]
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


#### Checking the distribution of sentiments

In [3]:
dataset['sentiment'].value_counts()

negative    2532
positive    2468
Name: sentiment, dtype: int64

In [4]:
dataset.isnull().sum()

review       0
sentiment    0
dtype: int64

#### Checking and Deleting duplicate values

In [5]:
dataset.duplicated().sum()

3

In [6]:
dataset.drop_duplicates(inplace = True)

#### Basic preprocessing steps


In [7]:
def Lower_Case(text):
    text = text.lower()
    return text

dataset['review'] = dataset['review'].apply(Lower_Case)

In [8]:
def remove_tags_urls(text):
    pattern = re.compile('<.*?>')
    text = pattern.sub(r'', text)
    return text

dataset['review'] = dataset['review'].apply(remove_tags_urls)

In [9]:
def remove_punctuation(text):
    pattern = re.compile('[^\w\s]')
    text = pattern.sub(r'', text)
    return text

dataset['review'] = dataset['review'].apply(remove_punctuation)

In [10]:
stopwords_list = stopwords.words('english')
dataset['review'] = dataset['review'].apply(lambda x: [item for item in x.split() if item not in stopwords_list]).apply(lambda x: " ".join(x))

In [11]:
dataset['review'].head()

0    one reviewers mentioned watching 1 oz episode ...
1    wonderful little production filming technique ...
2    thought wonderful way spend time hot summer we...
3    basically theres family little boy jake thinks...
4    petter matteis love time money visually stunni...
Name: review, dtype: object

In [12]:
x = dataset.iloc[:, 0:1]
y = dataset['sentiment']

#### Enoding categorical columns

In [13]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(y)

#### Splitting the data into train and test set

In [14]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

### 1. Bag Of Words Implementation

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

In [16]:
x_train_bow = vectorizer.fit_transform(x_train['review']).toarray()
x_test_bow = vectorizer.transform(x_test['review']).toarray()

#### Using Naive Bayes Algorithm to train the model

In [17]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()

classifier.fit(x_train_bow, y_train)

GaussianNB()

In [18]:
y_pred = classifier.predict(x_test_bow)

#### Accuracy of the model

In [19]:
from sklearn.metrics import accuracy_score, confusion_matrix
accuracy_score(y_test, y_pred)

0.658

In [20]:
confusion_matrix(y_test, y_pred)

array([[374, 139],
       [203, 284]], dtype=int64)

#### Using Random Forest Algorithm to train the model and finding accuracy

In [21]:
from sklearn.ensemble import RandomForestClassifier
classifier2 = RandomForestClassifier()

classifier2.fit(x_train_bow, y_train)

y_pred2 = classifier2.predict(x_test_bow)

accuracy_score(y_test, y_pred2)

0.823

In [22]:
confusion_matrix(y_test, y_pred2)

array([[432,  81],
       [ 96, 391]], dtype=int64)

In [23]:
# More faster code implementation
vectorizer2 = CountVectorizer(max_features=3000)

x_train_bow = vectorizer2.fit_transform(x_train['review']).toarray()
x_test_bow = vectorizer2.transform(x_test['review']).toarray()

classifier2 = RandomForestClassifier()

classifier2.fit(x_train_bow, y_train)
y_pred3 = classifier2.predict(x_test_bow)

accuracy_score(y_test, y_pred3)

0.817

In [24]:
confusion_matrix(y_test, y_pred3)

array([[438,  75],
       [108, 379]], dtype=int64)

### 2. N-Grams Implementation

#### Using Random Forest Algorithm to train the model and finding accuracy for bi-grams

In [25]:
vectorizer3 = CountVectorizer(ngram_range = (1,2), max_features = 3000)

x_train_bow = vectorizer2.fit_transform(x_train['review']).toarray()
x_test_bow = vectorizer2.transform(x_test['review']).toarray()

classifier2 = RandomForestClassifier()

classifier2.fit(x_train_bow, y_train)
y_pred4 = classifier2.predict(x_test_bow)

accuracy_score(y_test, y_pred4)

0.819

In [26]:
confusion_matrix(y_test, y_pred4)

array([[439,  74],
       [107, 380]], dtype=int64)

### 3. TF-IDF Implementation

#### Using Random Forest Algorithm to train the model and finding accuracy

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer4 = TfidfVectorizer()

In [28]:
x_train_tfidf = vectorizer4.fit_transform(x_train['review']).toarray()
x_test_tfidf = vectorizer4.transform(x_test['review']).toarray()

In [29]:
classifier2 = RandomForestClassifier()

classifier2.fit(x_train_tfidf, y_train)
y_pred5 = classifier2.predict(x_test_tfidf)

accuracy_score(y_test, y_pred5)

0.834

In [30]:
confusion_matrix(y_test, y_pred5)

array([[451,  62],
       [104, 383]], dtype=int64)