# IMDB Sentiment Analysis

## Importing Libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split as tts

In [2]:
# Read the IMDB review dataset from a CSV file given by a relative path
DATASET_PATH = "IMDB-Dataset.csv"
data = pd.read_csv(DATASET_PATH)

## Understanding the Dataset

In [3]:
# Preview the first five rows of the dataset
data.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Dimensions of the DataFrame as (number of rows, number of columns)
data.shape

(50000, 2)

In [5]:
# Column labels of the DataFrame
data.columns

Index(['review', 'sentiment'], dtype='object')

In [6]:
# Class balance: how many reviews carry each distinct sentiment label
sentiment_labels = data['sentiment']
sentiment_labels.value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [7]:
# Preview the first ten rows of the dataset
data.head(n=10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [8]:
# Preview the last five rows of the dataset
data.tail(n=5)

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


## Splitting the Data

In [9]:
# Column 0 (review text) is the model input; column 1 (sentiment label) is the target.
x = data.iloc[:, 0].values
y = data.iloc[:, 1].values

In [10]:
# Hold out 25% of the samples for evaluation; the fixed seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = tts(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

## Extracting the Features

In [11]:
# Learn the vocabulary from the training reviews only and convert them to a token-count matrix.
# NOTE: xtrain is rebound from raw text to the vectorized matrix, so this cell is not
# idempotent -- re-run the train/test split cell before running it again.
cv = CountVectorizer()
xtrain = cv.fit_transform(xtrain)

## Building the Model

In [12]:
# Train a multinomial Naive Bayes classifier on the training features and labels.
# fit() returns the estimator itself, so the trailing expression still displays it.
classifier = MultinomialNB().fit(xtrain, ytrain)
classifier

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [13]:
# Vectorize the test reviews with the already-fitted vectorizer (transform only, no refit),
# then predict a label for each of them.
# NOTE: xtest is rebound to the vectorized matrix, so re-run the train/test split cell
# before re-running this one.
xtest = cv.transform(xtest)
ypred = classifier.predict(xtest)

In [14]:
# Accuracy on the held-out test set.
# scikit-learn metrics take (y_true, y_pred) in that order; the value is the same either
# way for accuracy, but the conventional order matches the confusion_matrix call below.
accuracy_score(ytest, ypred)

0.84944

In [15]:
# Confusion matrix of the test predictions: rows are true labels, columns are predicted labels
A = confusion_matrix(y_true=ytest, y_pred=ypred)
print(A)

[[5430  727]
 [1155 5188]]


In [16]:
# F1 score for the class in row/column 0 of the confusion matrix A.
# (With labels=None scikit-learn orders the labels sorted, so this is presumably
# the 'negative' class -- confirm if the other class is the one of interest.)
#
# A has true labels on rows and predicted labels on columns, so for class 0:
#   true positives  = A[0][0]
#   false negatives = A[0][1]  (truly class 0, predicted as the other class)
#   false positives = A[1][0]  (truly the other class, predicted as class 0)
true_pos = A[0][0]
false_neg = A[0][1]
false_pos = A[1][0]

# precision = TP / (TP + FP): share of class-0 predictions that are correct
precision = true_pos / (true_pos + false_pos)
# recall = TP / (TP + FN): share of true class-0 samples that were found
recall = true_pos / (true_pos + false_neg)

# Harmonic mean of precision and recall
F1 = 2 * recall * precision / (recall + precision)
print(F1)

0.852299482027939
