# IMDB Movies Review Dataset

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd

## Importing the dataset

In [3]:
dataset = pd.read_csv('imdb_labelled.txt',delimiter='\t',quoting=3)

In [4]:
dataset

Unnamed: 0,Reviews,Label
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
5,"The rest of the movie lacks art, charm, meanin...",0
6,Wasted two hours.,0
7,Saw the movie today and thought it was a good ...,1
8,A bit predictable.,0
9,Loved the casting of Jimmy Buffet as the scien...,1


## Creating the bag of words model`

In [6]:
import re
import nltk

In [7]:
#nltk.download('stopwords') #  Uncomment this line if stopwords is not downloaded
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [8]:
corpus = []
for i in range(dataset.shape[0]):
    review = re.sub('[^A-z]', ' ' ,dataset['Reviews'][i]).lower().split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = dataset['Label']

## Train test Split

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Classification using Gaussian NB model

In [11]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train,y_train)

GaussianNB(priors=None)

In [12]:
y_pred = classifier.predict(X_test)

In [13]:
from sklearn.metrics import accuracy_score,confusion_matrix
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.675
[[71 15]
 [50 64]]


## Classification using Decision Tree model

In [33]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(max_depth=None)
classifier.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [34]:
y_pred = classifier.predict(X_test)

In [35]:
from sklearn.metrics import accuracy_score,confusion_matrix
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.715
[[60 26]
 [31 83]]


## Classification using Random Forest model

In [66]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=120)
classifier.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=120, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [67]:
y_pred = classifier.predict(X_test)

In [68]:
from sklearn.metrics import accuracy_score,confusion_matrix
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.78
[[68 18]
 [26 88]]


## Classification using Gradient Boosting model

In [114]:
from sklearn.ensemble import GradientBoostingClassifier
classifier = GradientBoostingClassifier(n_estimators=250,learning_rate=0.75)
classifier.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.75, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=250,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [115]:
y_pred = classifier.predict(X_test)

In [116]:
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.78
[[69 17]
 [27 87]]
