In [1]:
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
import re
import os
import csv

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the review dataset from a CSV file located in the current working directory.
data=pd.read_csv('IMDBDataset.csv')

In [3]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Show the raw review text column.
data["review"]

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [5]:
# Count the missing values in each column.
data.isnull().sum()

review       0
sentiment    0
dtype: int64

### Text Cleaning

In [6]:
# Continue with a random 10,000-row subset of the reviews.
# A fixed random_state makes the subset, and everything derived from it,
# identical on every "Restart & Run All".
data = data.sample(10000, random_state=42)

In [7]:
# (rows, columns) of the sampled frame.
data.shape

(10000, 2)

In [8]:
# Column dtypes and non-null counts of the sampled frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 45543 to 12111
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  object
dtypes: object(2)
memory usage: 234.4+ KB


In [9]:
# Encode the labels as integers: "positive" -> 1, "negative" -> 0.
# The result is assigned back to the column rather than calling an in-place
# method on the `data["sentiment"]` selection: with pandas copy-on-write that
# selection is a temporary object, so an in-place replace would leave `data`
# untouched. The explicit int cast guarantees a numeric label dtype, and the
# cell stays safe to re-run (already-encoded values pass through unchanged).
data["sentiment"] = data["sentiment"].replace({"positive": 1, "negative": 0}).astype(int)

In [10]:
# Preview the frame after label encoding.
data.head()

Unnamed: 0,review,sentiment
45543,"""Markham,"" says urbane gentleman crime-solver ...",1
17173,The worst film I have seen in the last 12 mont...,0
47141,I first watched this movie back in the mid/lat...,0
34300,It's difficult to decide whether this movie su...,0
20711,One of the major successes to The Decline of W...,1


In [11]:
# Remove HTML tags
def clean_html(text):
    """Return `text` with every `<...>` span deleted.

    The match is non-greedy, so each tag is removed on its own and the text
    between two tags is kept.
    """
    tag_pattern = re.compile("<.*?>")
    return tag_pattern.sub('', text)

In [12]:
# Strip HTML tags from every review.
data['review']=data['review'].apply(clean_html)

In [13]:
# Inspect the reviews after HTML removal.
data["review"]

45543    "Markham," says urbane gentleman crime-solver ...
17173    The worst film I have seen in the last 12 mont...
47141    I first watched this movie back in the mid/lat...
34300    It's difficult to decide whether this movie su...
20711    One of the major successes to The Decline of W...
                               ...                        
11651    I feel like I've just watched a snuff film.......
43879    This is the ultimate, and I mean the ULTIMATE,...
10922    Snap, crackle, pop! The jarring sound of every...
14612    The video box for 'Joyride' says "starring sec...
12111    I generally love this type of movie. However, ...
Name: review, Length: 10000, dtype: object

In [14]:
# Convert to lowercase
def convert_lower(text):
    """Return a lowercased copy of `text`."""
    lowered = text.lower()
    return lowered

In [15]:
# Lowercase every review.
data['review']=data['review'].apply(convert_lower)

In [16]:
# Inspect the reviews after lowercasing.
data["review"]

45543    "markham," says urbane gentleman crime-solver ...
17173    the worst film i have seen in the last 12 mont...
47141    i first watched this movie back in the mid/lat...
34300    it's difficult to decide whether this movie su...
20711    one of the major successes to the decline of w...
                               ...                        
11651    i feel like i've just watched a snuff film.......
43879    this is the ultimate, and i mean the ultimate,...
10922    snap, crackle, pop! the jarring sound of every...
14612    the video box for 'joyride' says "starring sec...
12111    i generally love this type of movie. however, ...
Name: review, Length: 10000, dtype: object

In [17]:
# Remove square brackets
def remove_between_square_brackets(text):
    """Return `text` with every `[...]` span (brackets included) deleted.

    Only spans with both an opening and a closing bracket are removed; a lone
    `[` or `]` is left in place.
    """
    # Raw string: `\[` and `\]` are regex escapes, not Python string escapes.
    # In a plain literal they trigger an invalid-escape-sequence warning.
    return re.sub(r'\[[^]]*\]', '', text)

In [18]:
# Drop bracketed spans from every review, then display the column.
data['review']=data['review'].apply(remove_between_square_brackets)
data["review"]

45543    "markham," says urbane gentleman crime-solver ...
17173    the worst film i have seen in the last 12 mont...
47141    i first watched this movie back in the mid/lat...
34300    it's difficult to decide whether this movie su...
20711    one of the major successes to the decline of w...
                               ...                        
11651    i feel like i've just watched a snuff film.......
43879    this is the ultimate, and i mean the ultimate,...
10922    snap, crackle, pop! the jarring sound of every...
14612    the video box for 'joyride' says "starring sec...
12111    i generally love this type of movie. however, ...
Name: review, Length: 10000, dtype: object

In [19]:
def denoise_text(text):
    """Apply the basic cleanup steps to `text` and return the result.

    The steps run in this order: HTML tag removal, lowercasing, removal of
    square-bracketed spans.
    """
    for step in (clean_html, convert_lower, remove_between_square_brackets):
        text = step(text)
    return text

In [20]:
# Run the combined cleanup over every review, then display the column.
# NOTE(review): clean_html, convert_lower and remove_between_square_brackets
# were each already applied to this column in the cells above, so this pass
# re-applies them to text that has already been cleaned.
data['review']=data['review'].apply(denoise_text)
data["review"]

45543    "markham," says urbane gentleman crime-solver ...
17173    the worst film i have seen in the last 12 mont...
47141    i first watched this movie back in the mid/lat...
34300    it's difficult to decide whether this movie su...
20711    one of the major successes to the decline of w...
                               ...                        
11651    i feel like i've just watched a snuff film.......
43879    this is the ultimate, and i mean the ultimate,...
10922    snap, crackle, pop! the jarring sound of every...
14612    the video box for 'joyride' says "starring sec...
12111    i generally love this type of movie. however, ...
Name: review, Length: 10000, dtype: object

In [21]:
def remove_special_characters(text, remove_digits=True):
    """Return `text` with everything except ASCII letters, digits and whitespace removed.

    Parameters
    ----------
    text : str
        The string to clean.
    remove_digits : bool, default True
        Accepted so existing calls keep working, but not consulted: digits are
        always kept. NOTE(review): decide whether this flag should be honored
        or dropped; honoring the current default would change existing output.

    Returns
    -------
    str
        The filtered string. Removed characters are deleted, not replaced, so
        words joined only by punctuation are merged.
    """
    # `A-Z`, not `A-z`: the wider range also covers the six ASCII characters
    # that sit between 'Z' and 'a' ([ \ ] ^ _ `), letting them slip through.
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

In [22]:
# Remove punctuation and other special characters from every review, then display the column.
data['review']=data['review'].apply(remove_special_characters)
data["review"]

45543    markham says urbane gentleman crimesolver phil...
17173    the worst film i have seen in the last 12 mont...
47141    i first watched this movie back in the midlate...
34300    its difficult to decide whether this movie suf...
20711    one of the major successes to the decline of w...
                               ...                        
11651    i feel like ive just watched a snuff filma bea...
43879    this is the ultimate and i mean the ultimate a...
10922    snap crackle pop the jarring sound of every ch...
14612    the video box for joyride says starring second...
12111    i generally love this type of movie however th...
Name: review, Length: 10000, dtype: object

In [23]:
# Spot-check remove_special_characters on a string that ends in a run of symbols.
remove_special_characters("i have seen this play many times from olivier#$%&*%%*.")

'i have seen this play many times from olivier'

In [24]:
# Remove stopwords
def remove_stop_word(text):
    """Split `text` on whitespace and return the tokens that are not English stop words.

    Parameters
    ----------
    text : str
        The text to filter.

    Returns
    -------
    list of str
        The surviving tokens in their original order (a list, not a string).
    """
    # Build the stop-word set once per call. Calling stopwords.words() inside
    # the loop would rebuild the whole list for every token and then scan it
    # linearly; a set gives constant-time membership checks.
    english_stop_words = set(stopwords.words("english"))
    return [token for token in text.split() if token not in english_stop_words]


In [25]:
# Spot-check remove_stop_word on a sentence mixing stop words, contractions and content words.
remove_stop_word("i have seen i'm i've if in into is isn't it it's its itself let's play many times from olivier")

['seen', "i'm", "i've", "let's", 'play', 'many', 'times', 'olivier']

In [None]:
# Replace each review string with its list of non-stop-word tokens, then display the column.
data['review']=data['review'].apply(remove_stop_word)
data["review"]

In [None]:
# Display the frame; the review column now holds token lists instead of strings.
data

In [None]:
ps=PorterStemmer()
def stem_words(text):
    """Return a new list holding the Porter stem of each token in `text`.

    Parameters
    ----------
    text : iterable of str
        The tokens to stem.

    Returns
    -------
    list of str
        One stem per input token, in the same order.
    """
    # The result is built in a local list. A shared module-level accumulator
    # named `y` would stop working once that name is reused, and this notebook
    # later rebinds `y` to the label array.
    return [ps.stem(token) for token in text]

In [None]:
# Stem every token in each review's token list, then display the column.
data['review']=data['review'].apply(stem_words)
data["review"]

In [None]:
def join_words(list):
    """Concatenate the strings in `list` into one string separated by single spaces."""
    # NOTE: the parameter name shadows the built-in `list` inside this function;
    # it is kept because it is part of the function's signature.
    separator = " "
    return separator.join(list)

In [None]:
# Join each token list back into a single space-separated string, then display the column.
data['review']=data['review'].apply(join_words)
data["review"]

In [None]:
# First column of the frame (the review text) as a 2-D array with one column.
# NOTE(review): X is reassigned to the vectorized features further down, so
# this value is only used by the shape check in the next cell.
X =data.iloc[:,0:1].values

In [None]:
# Shape of the raw review array.
X.shape

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


In [None]:
# Bag-of-words features: count term occurrences per review, with the
# vocabulary capped at 800 terms via max_features.
cv=CountVectorizer(max_features =800)
# fit_transform returns a sparse matrix; toarray() converts it to a dense array.
X=cv.fit_transform(data["review"]).toarray()


In [None]:
# (number of reviews, number of vocabulary terms) of the dense count matrix.
X.shape

In [None]:
# Highest count of any single vocabulary term in the first review.
X[0].max()

In [None]:
# Target vector: the last column of the frame (sentiment) as an array.
y =data.iloc[:,-1].values

In [None]:
# Shape of the label array.
y.shape

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the reviews for evaluation. Fixing random_state makes the
# split, and therefore the reported scores, repeatable across runs.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.15,random_state=42)

In [None]:
# Shape of the training feature matrix.
X_train.shape

In [None]:
# Shape of the training label array.
y_train.shape

In [None]:
# Shape of the held-out feature matrix.
X_test.shape

In [None]:
# Shape of the held-out label array.
y_test.shape

In [None]:
# Three Naive Bayes variants, all with default settings, to compare on the same features.
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
model1 = GaussianNB()
model2 = MultinomialNB()
model3 = BernoulliNB()

In [None]:
# Fit each model on the training split; print() shows whatever fit() returns.
print(model1.fit(X_train, y_train))
print(model2.fit(X_train, y_train))
print(model3.fit(X_train, y_train))

In [None]:
# Predict labels for the held-out reviews with each model.
# NOTE(review): y_pred1..y_pred3 are not read by the scoring cell that follows,
# which calls score() on X_test directly.
y_pred1 =model1.predict(X_test)
y_pred2 =model2.predict(X_test)
y_pred3 =model3.predict(X_test)

In [None]:
# Report each model's score() on the held-out split.
print("Gaussian:", model1.score(X_test, y_test))
print("Multinomial:", model2.score(X_test, y_test))
print("Bernoulli:", model3.score(X_test, y_test))