In [1]:
import numpy as np
import pandas as pd


In [2]:
df = pd.read_csv('imdb.csv')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
#one review
df['review'][23]

'First of all, let\'s get a few things straight here: a) I AM an anime fan- always has been as a matter of fact (I used to watch Speed Racer all the time in Preschool). b) I DO like several B-Movies because they\'re hilarious. c) I like the Godzilla movies- a lot.<br /><br />Moving on, when the movie first comes on, it seems like it\'s going to be your usual B-movie, down to the crappy FX, but all a sudden- BOOM! the anime comes on! This is when the movie goes WWWAAAAAYYYYY downhill.<br /><br />The animation is VERY bad & cheap, even worse than what I remember from SPEED RACER, for crissakes! In fact, it\'s so cheap, one of the few scenes from the movie I "vividly" remember is when a bunch of kids run out of a school... & it\'s the same kids over & over again! The FX are terrible, too; the dinosaurs look worse than Godzilla. In addition, the transition to live action to animation is unorganized, the dialogue & voices(especially the English dub that I viewed) was horrid & I was begging 

# TEXT CLEANING
1.sampling 10000 rows
2.remove html TAGS 
3.remove special characters
4.converting every thing to lower case
5.removing stop words
6.stemming


In [5]:
df = df.sample(10000)

In [6]:
df.shape

(10000, 2)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 9604 to 16155
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  object
dtypes: object(2)
memory usage: 234.4+ KB


In [8]:
df['sentiment'].replace({'positive': 1, 'negative': 0}, inplace=True)

In [9]:
df.head()

Unnamed: 0,review,sentiment
9604,Big (and we mean plus sized big) baddie Sebast...,0
33523,I saw this movie previewed before something el...,0
6478,"Paris, JE T'AIME is a wondrous cinematic homag...",1
15678,Mike Nichols in finest form. I was not a fan o...,1
15531,I saw this movie at the 2005 Toronto Internati...,1


# removing the html tags

In [10]:
import re

In [11]:
def clean_html(text):
    clean = re.compile('<.*?>')
    return re.sub(clean ,'',text)

In [12]:
df['review'] = df['review'].apply(clean_html)

In [13]:
def convert_lower(text):
    return text.lower()

In [14]:
df['review'] = df['review'].apply(convert_lower)

In [15]:
#function to remove special characters
def remove_special(text):
    x = []
    for i in text:
        if i.isalnum():
            x.append(i)
        else:
            x.append(' ')
    return ''.join(x)

In [16]:
df['review'] = df['review'].apply(remove_special)

In [17]:
#removing stop words
import nltk

In [18]:
from nltk.corpus import stopwords

In [19]:
df

Unnamed: 0,review,sentiment
9604,big and we mean plus sized big baddie sebast...,0
33523,i saw this movie previewed before something el...,0
6478,paris je t aime is a wondrous cinematic homag...,1
15678,mike nichols in finest form i was not a fan o...,1
15531,i saw this movie at the 2005 toronto internati...,1
...,...,...
14867,as an adventure mini series this is about as ...,1
13022,i used to watch this too at junior school in p...,1
24791,from the mind of harry alan towers comes anoth...,0
43552,i really like kinski he is a great actor i ve...,0


In [20]:
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

def remove_stopwords(text):
    x = []
    stop_words = set(stopwords.words('english'))
    
    for i in text.split():
        if i.lower() not in stop_words:  
            x.append(i)
    
    y = x[:]  # Create a copy of x
    x.clear()  # Clear x (not necessary, but keeping it as per your request)
    
    return y  # Returning y instead of a string


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
df

Unnamed: 0,review,sentiment
9604,big and we mean plus sized big baddie sebast...,0
33523,i saw this movie previewed before something el...,0
6478,paris je t aime is a wondrous cinematic homag...,1
15678,mike nichols in finest form i was not a fan o...,1
15531,i saw this movie at the 2005 toronto internati...,1
...,...,...
14867,as an adventure mini series this is about as ...,1
13022,i used to watch this too at junior school in p...,1
24791,from the mind of harry alan towers comes anoth...,0
43552,i really like kinski he is a great actor i ve...,0


In [26]:
from nltk.stem import PorterStemmer
import pandas as pd

ps = PorterStemmer()

def stem_words(text):
    return [ps.stem(word) for word in text.split()]  # Using list comprehension

def join_back(list_input):
    return ' '.join(list_input) 
# Apply stemming to the 'review' column
df['review'] = df['review'].apply(lambda x: join_back(stem_words(x)))

In [27]:
df['review']

9604     big mean plu size big baddi sebastian cabot tr...
33523    saw movi preview someth els rent back look dec...
6478     pari je aim wondrou cinemat homag citi light c...
15678    mike nichol finest form fan closer refresh see...
15531    saw movi 2005 toronto intern film festiv base ...
                               ...                        
14867    adventur mini seri good get view origin shown ...
13022    use watch junior school petersfield hampshir a...
24791    mind harri alan tower come anoth piec cinemat ...
43552    realli like kinski great actor seen movi heard...
16155    one last classic french new wave direct cineas...
Name: review, Length: 10000, dtype: object

In [28]:
X = df.iloc[: , 0:1].values

In [33]:
X.shape

(10000, 1)

In [131]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1100)

In [132]:
X = cv.fit_transform(df['review']).toarray()

In [133]:
X.shape

(10000, 1100)

In [134]:
X[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [135]:
X[0].mean()

0.14363636363636365

In [136]:
y = df.iloc[: , -1].values

In [137]:
y.shape

(10000,)

In [138]:
# X,y
# Training set
# Test Set(already know the result)

In [139]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size = 0.2)

In [140]:
X_train.shape

(8000, 1100)

In [141]:
X_test.shape

(2000, 1100)

In [142]:
y_train.shape

(8000,)

In [143]:
y_test.shape

(2000,)

In [144]:
from sklearn.naive_bayes import GaussianNB , MultinomialNB , BernoulliNB

In [145]:
c1f1 = GaussianNB()
c1f2 = MultinomialNB()
c1f3 = BernoulliNB()

In [146]:
 c1f1.fit(X_train , y_train)
 c1f2.fit(X_train , y_train)
 c1f3.fit(X_train , y_train)

In [151]:
y_pred1 = c1f1.predict(X_test)
y_pred2 = c1f2.predict(X_test)
y_pred3 = c1f3.predict(X_test)

In [152]:
y_pred1.shape

(2000,)

In [153]:
from sklearn.metrics import accuracy_score

In [154]:
print('Gaussian', accuracy_score(y_test, y_pred1))
print('Multinomial', accuracy_score(y_test, y_pred2))
print('Bernoulli', accuracy_score(y_test, y_pred3))


Gaussian 0.7945
Multinomial 0.846
Bernoulli 0.847
