In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('IMDB Dataset.csv')

In [3]:
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
data.iloc[3]['review']

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

## Data Preprocessing

In [5]:
data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [6]:
import re
clean = re.compile('<.*?>')
data['review'] = data['review'].apply(lambda x: re.sub(clean, '', x))

In [7]:
def cleanText(text):  
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    return cleaned_text

data['review'] = data['review'].apply(cleanText)

In [8]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)  

data['review'] = data['review'].apply(remove_stopwords)

In [9]:
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

def stem(text):
    stemmed_text = ' '.join([ps.stem(word) for word in text.split()])
    return stemmed_text

data['review'] = data['review'].apply(stem)

In [10]:
data['review']

0        one review mention watch 1 oz episod hook they...
1        a wonder littl product the film techniqu unass...
2        i thought wonder way spend time hot summer wee...
3        basic famili littl boy jake think zombi closet...
4        petter mattei love time money visual stun film...
                               ...                        
49995    i thought movi right good job it creativ origi...
49996    bad plot bad dialogu bad act idiot direct anno...
49997    i cathol taught parochi elementari school nun ...
49998    i go disagre previou comment side maltin one t...
49999    no one expect star trek movi high art fan expe...
Name: review, Length: 50000, dtype: object

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=1000)
vectors = cv.fit_transform(data['review']).toarray()

In [12]:
cv.get_feature_names_out()

array(['10', '20', '30', '50', '70', '80', '90', 'abil', 'abl', 'absolut',
       'accent', 'accept', 'across', 'act', 'action', 'actor', 'actress',
       'actual', 'ad', 'adapt', 'add', 'admit', 'adult', 'adventur',
       'after', 'age', 'ago', 'agre', 'air', 'alien', 'all', 'allow',
       'almost', 'alon', 'along', 'alreadi', 'also', 'although', 'alway',
       'amaz', 'america', 'american', 'among', 'amount', 'amus', 'an',
       'and', 'anim', 'ann', 'annoy', 'anoth', 'answer', 'anyon', 'anyth',
       'anyway', 'apart', 'appar', 'appeal', 'appear', 'appreci',
       'around', 'art', 'artist', 'as', 'ask', 'aspect', 'at',
       'atmospher', 'attack', 'attempt', 'attent', 'attract', 'audienc',
       'averag', 'avoid', 'aw', 'award', 'away', 'babi', 'back',
       'background', 'bad', 'badli', 'band', 'bare', 'base', 'basic',
       'battl', 'be', 'beat', 'beauti', 'becam', 'becom', 'begin',
       'behind', 'believ', 'best', 'better', 'beyond', 'big', 'bill',
       'bit', 'bla

In [13]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [14]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
sentiment = le.fit_transform(data['sentiment'])
sentiment

array([1, 1, 1, ..., 0, 0, 0])

In [15]:
data.drop(columns=['sentiment'], inplace=True)
data['sentiment'] = sentiment
data

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod hook they...,1
1,a wonder littl product the film techniqu unass...,1
2,i thought wonder way spend time hot summer wee...,1
3,basic famili littl boy jake think zombi closet...,0
4,petter mattei love time money visual stun film...,1
...,...,...
49995,i thought movi right good job it creativ origi...,1
49996,bad plot bad dialogu bad act idiot direct anno...,0
49997,i cathol taught parochi elementari school nun ...,0
49998,i go disagre previou comment side maltin one t...,0


In [16]:
y = data['sentiment']

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
vectors_train, vectors_test, y_train, y_test = train_test_split(vectors,y,test_size=0.2,random_state=42)

In [19]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

gaussian = GaussianNB()
multinomial = MultinomialNB()
bernouli = BernoulliNB()

gaussian.fit(vectors_train, y_train)
multinomial.fit(vectors_train, y_train)
bernouli.fit(vectors_train, y_train)

In [20]:
gaussian_pred = gaussian.predict(vectors_test)
multinomial_pred = multinomial.predict(vectors_test)
bernouli_pred = bernouli.predict(vectors_test)

In [21]:
from sklearn.metrics import accuracy_score

In [22]:
print("Gaussian: ", accuracy_score(y_test, gaussian_pred))
print("Multinomial: ", accuracy_score(y_test, multinomial_pred))
print("Bernouli: ", accuracy_score(y_test, bernouli_pred))

Gaussian:  0.788
Multinomial:  0.8306
Bernouli:  0.8338


In [23]:
bernouli_pred

array([0, 1, 0, ..., 1, 0, 0])