In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
# Load the IMDB reviews and shuffle the rows.
# The shuffle is seeded: without a random_state the row order changes on every
# run, which makes the previews below and the contents of the later
# (positionally seeded) train/test split irreproducible.
df = pd.read_csv('IMDB Dataset.csv')
df = df.sample(50000, random_state=9)

In [3]:
df.head()

Unnamed: 0,review,sentiment
18256,"okay, my question; who's the idiot that wrote ...",negative
45850,World At War is perhaps the greatest documenta...,positive
36971,It was just a terrible movie. No one should wa...,negative
39930,I just saw this film at the phoenix film festi...,positive
9972,"After watching this movie once, it quickly bec...",positive


In [4]:
# df['review'][0]

Text Cleaning:
- Removing HTML tags
- Removing special characters
- Converting uppercase to lowercase
- Removing stop words (e.g., and, is, the)
- Stemming words (i.e., words like play, playing and played are grouped as one)

In [6]:
df.shape

(50000, 2)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50000 entries, 18256 to 38219
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB


In [8]:
# Encode the labels as integers: positive -> 1, negative -> 0.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on `df.sentiment`: the in-place form mutates an
# intermediate Series, which pandas warns about and which leaves `df`
# unchanged when Copy-on-Write is enabled.
# astype('int64') guarantees a numeric label column and raises if a value
# other than the two expected labels is still present.
df['sentiment'] = (
    df['sentiment']
    .replace({'positive': 1, 'negative': 0})
    .astype('int64')
)

In [9]:
df.head()

Unnamed: 0,review,sentiment
18256,"okay, my question; who's the idiot that wrote ...",0
45850,World At War is perhaps the greatest documenta...,1
36971,It was just a terrible movie. No one should wa...,0
39930,I just saw this film at the phoenix film festi...,1
9972,"After watching this movie once, it quickly bec...",1


In [10]:
# Function to clean html tags
import re
# Compiled once at definition time instead of on every call.
_HTML_TAG_RE = re.compile('<.*?>')


def clean_html(text):
    """Strip HTML tags from ``text``.

    Every tag is replaced by a single space rather than removed outright, so
    words separated only by markup (e.g. ``good<br />movie``) are not fused
    into one token. Extra whitespace is harmless because the text is split
    into tokens later.

    Parameters
    ----------
    text : str
        Raw review text that may contain HTML tags.

    Returns
    -------
    str
        ``text`` with each ``<...>`` span replaced by a space.
    """
    return _HTML_TAG_RE.sub(' ', text)
# Strip HTML tags from every review.
df['review'] = df['review'].apply(clean_html)

In [11]:
def remove_special(text):
    """Replace every non-alphanumeric character in ``text`` with a space.

    Alphanumeric characters (as defined by ``str.isalnum``) are kept as they
    are; everything else, including punctuation, underscores and whitespace,
    becomes a single space, so the result has the same length as the input.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # One join over a generator instead of growing a string one character at
    # a time inside a loop.
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)

# Blank out punctuation and other special characters in every review.
df['review'] = df['review'].apply(remove_special)

In [12]:
def to_lower(text):
    """Return ``text`` with all cased characters converted to lowercase."""
    lowered = text.lower()
    return lowered

# Lowercase every review.
df['review'] = df['review'].apply(to_lower)

In [13]:
# remove the stop words
import nltk
from nltk.corpus import stopwords

In [14]:
# Inspect one cleaned review, selected by its index label.
df.loc[49997, 'review']

'i am a catholic taught in parochial elementary schools by nuns  taught by jesuit priests in high school   college  i am still a practicing catholic but would not be considered a  good catholic  in the church s eyes because i don t believe certain things or act certain ways just because the church tells me to so back to the movie   its bad because two people are killed by this nun who is supposed to be a satire as the embodiment of a female religious figurehead  there is no comedy in that and the satire is not done well by the over acting of diane keaton  i never saw the play but if it was very different from this movies then it may be good at first i thought the gun might be a fake and the first shooting all a plan by the female lead of the four former students as an attempt to demonstrate sister mary s emotional and intellectual bigotry of faith  but it turns out the bullets were real and the story has tragedy   the tragedy of loss of life  besides the two former students   the lives

In [15]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

_STOP_WORDS = None  # English stop-word set, built on the first call


def remove_stopwords(text):
    """Tokenize ``text`` and drop English stop words.

    Parameters
    ----------
    text : str
        Cleaned, lowercased review text.

    Returns
    -------
    list of str
        The tokens produced by ``word_tokenize`` that are not in NLTK's
        English stop-word list, in their original order.
    """
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Load the stop-word list and build the lookup set only once;
        # doing this inside every call repeats the same work per review.
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return [token for token in word_tokenize(text) if token not in _STOP_WORDS]

df.review = df.review.apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\igmsp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\igmsp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [16]:
df.head()

Unnamed: 0,review,sentiment
18256,"[okay, question, idiot, wrote, movie, giving, ...",0
45850,"[world, war, perhaps, greatest, documentary, s...",1
36971,"[terrible, movie, one, waste, time, go, see, s...",0
39930,"[saw, film, phoenix, film, festival, today, lo...",1
9972,"[watching, movie, quickly, became, one, favori...",1


In [17]:
# Perform stemming
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [18]:
def stem_words(text):
    """Return the Porter stem of every token in ``text``.

    Parameters
    ----------
    text : iterable of str
        Tokens to stem (here: the token list of one review).

    Returns
    -------
    list of str
        A new list with ``ps.stem`` applied to each token, in order.
    """
    # A fresh local list is built on each call. No module-level scratch list
    # is used: the name `y` is rebound to the label array later in the
    # notebook, which would break any later call that appended to it.
    return [ps.stem(token) for token in text]

stem_words(['I','loved','loving','it'])

['i', 'love', 'love', 'it']

In [19]:
# Stem the tokens of every review.
df['review'] = df['review'].apply(stem_words)

In [20]:
# join back
def join_back(list_input):
    """Concatenate the strings in ``list_input`` into one space-separated string."""
    separator = " "
    return separator.join(list_input)

# Turn each review's token list back into a single string.
df['review'] = df['review'].apply(join_back)

In [21]:
df.review

18256    okay question idiot wrote movi give name dean ...
45850    world war perhap greatest documentari seri tim...
36971    terribl movi one wast time go see someth els m...
39930    saw film phoenix film festiv today love synops...
9972     watch movi quickli becam one favorit differ ev...
                               ...                        
26311    american movi war nazi simpli good refrain bec...
36925    okay sound one like tremend mental gimp sat fi...
15726    movi one longest movi watch experi life like d...
43933    huge jane austen fan order movi amazon uk coul...
38219    mani peopl know mexican cinema poor call golde...
Name: review, Length: 50000, dtype: object

In [378]:
from sklearn.feature_extraction.text import CountVectorizer
# max_features=6500 limits the vocabulary to the 6500 most frequent terms in
# the corpus. dtype=np.int32 halves the memory of the count matrix compared
# with the default int64, which matters once it is converted to a dense
# array of one row per review.
cv = CountVectorizer(max_features=6500, dtype=np.int32)

In [379]:
# NOTE(review): the vocabulary is fit on all reviews before the train/test
# split below, so test-set text influences which terms are kept — consider
# fitting on the training split only.
X = cv.fit_transform(df.review).toarray()
X.shape # A 2D array where each row represents a review and each column holds the count of one vocabulary word

(50000, 6500)

In [380]:
# X = pd.DataFrame(X)
# X.head()
# #converting datatype of each column to achieve memory efficiency
# def convert_int64_to_int32(df):
#     # Iterate through each column
#     for col in df.columns:
#         if df[col].dtype == 'int64':  # Check if the column's data type is int64
#             df[col] = df[col].astype('int32')  # Convert to int32
#     return df

# X = convert_int64_to_int32(X)

# X.dtypes

In [381]:
# Labels: the last column of the frame (the encoded sentiment) as a NumPy array.
y = df.iloc[:, -1].to_numpy()

In [382]:
# y = pd.DataFrame(y)
y.shape

(50000,)

In [383]:
# Hold out 20% of the reviews for testing; the split itself is seeded.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=9,
)

In [384]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

In [385]:
# One instance of each Naive Bayes variant, all with default settings.
clf1, clf2, clf3 = GaussianNB(), MultinomialNB(), BernoulliNB()

In [None]:
# Fit every Naive Bayes variant on the same training split.
for model in (clf1, clf2):
    model.fit(X_train, y_train)
# Kept as the final expression so the fitted estimator is still the cell output.
clf3.fit(X_train, y_train)

In [None]:
# Predict test-set labels with each fitted model, in the order clf1, clf2, clf3.
y_pred1, y_pred2, y_pred3 = [
    model.predict(X_test) for model in (clf1, clf2, clf3)
]

In [None]:
from sklearn.metrics import accuracy_score
# Report test-set accuracy for each model as a percentage.
# The third label is spelled "Bernoulli", matching the BernoulliNB estimator.
for label, predictions in (
    ("Gaussian", y_pred1),
    ("Multinomial", y_pred2),
    ("Bernoulli", y_pred3),
):
    print("{}: {}%".format(label, accuracy_score(y_test, predictions)*100))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

# Repeat the pipeline with a larger vocabulary.
# max_features=7000 limits the vocabulary to the 7000 most frequent terms in
# the corpus; dtype=np.int32 halves the memory of the dense count matrix
# compared with the default int64.
cv = CountVectorizer(max_features=7000, dtype=np.int32)
# X is a 2D array: one row per review, one column per vocabulary word count.
X = cv.fit_transform(df.review).toarray()
# Labels: the last column of the frame (the encoded sentiment).
y = df.iloc[:,-1].values
# splitting training and test set
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=9)
clf1 = GaussianNB()
clf2 = MultinomialNB()
clf3 = BernoulliNB()