In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Zeeshan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# printing the stopwords in English
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Data Pre-processing

In [4]:
# loading the dataset to a pandas DataFrame
# NOTE(review): the path below is a hard-coded absolute Windows path, so this cell
# only runs on a machine with the same drive/folder layout — consider reading from a
# configurable data directory relative to the notebook instead.
df = pd.read_csv(r'D:\ML PROJECTS\EDA and Feature Scaling\IMDB Dataset.csv')

In [5]:
# Work on a 10,000-row random subsample of the dataset.
# A fixed random_state makes the subsample (and therefore every downstream
# result) identical on each Restart & Run All.
df = df.sample(10000, random_state=2)

In [6]:
df.shape

(10000, 2)

In [7]:
# print the first 5 rows of the dataframe
df.head()

Unnamed: 0,review,sentiment
24367,"Carrot Top's ""Chairman of the Board"" and his A...",negative
46018,Now maybe it had something to do with the fact...,positive
23112,This 1939 film tried to capitalize on the much...,positive
26246,The pakage implies that Warren Beatty and Gold...,negative
17312,I am a music lover and was excited to see this...,negative


In [8]:
# counting the number of missing values in the dataset
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 24367 to 36414
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  object
dtypes: object(2)
memory usage: 234.4+ KB


As we can see, the reviews contain HTML markup and special characters, which are not helpful for the model,
so I have to clean the data with text pre-processing.

#text cleaning process

##remove html text and special characters 
#stemming
#converting everything to lower case
#removing stop words

In [10]:
# Encode the labels as integers: 'positive' -> 1 and 'negative' -> 0.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['sentiment']: that chained in-place form
# operates on a temporary object and is not guaranteed to update df
# (pandas warns about it, and with copy-on-write enabled it changes nothing).
df['sentiment'] = df['sentiment'].replace({'positive': 1, 'negative': 0})

In [11]:
df.head()

Unnamed: 0,review,sentiment
24367,"Carrot Top's ""Chairman of the Board"" and his A...",0
46018,Now maybe it had something to do with the fact...,1
23112,This 1939 film tried to capitalize on the much...,1
26246,The pakage implies that Warren Beatty and Gold...,0
17312,I am a music lover and was excited to see this...,0


In [12]:
import re
clean= re.compile('<.*?>')
re.sub(clean, '', df.iloc[2].review)

'This 1939 film tried to capitalize on the much better Michael Curtiz\'s film "Angels with Dirty Faces". As directed by Ray Enright, the only interesting thing is how tamed these kids were in comparison with what\'s going on with the youth in America\'s inner cities today.The film is only worth seeing because of the presence of Ann Sheridan and Ronald Reagan, who showed they were well paired together. The Dead End kids have larger parts as the plot concentrates on them rather than in the older folks.In a way it\'s curious how arson was used in the same way some scrupulous landlords did in later years right here in New York. It was the quickest way to turn a property around never considering the social problems it created. In today\'s climate with so many guns around there is a new reality. The young kids of the story seemed mere pranksters rather than criminals. How times change!'

`re` is Python's regular expression library; here it is used to remove all the HTML tags from the review attribute.

In [13]:
#now i will apply the function which will remove all the html tag from the review attribute

In [14]:
def clean_html(text):
    """Return ``text`` with every HTML tag removed.

    A tag is any span matched non-greedily by the pattern ``<.*?>``. Each match
    is replaced with the empty string, so the text on either side of a tag is
    joined directly (``"a<br />b"`` becomes ``"ab"``).

    Parameters
    ----------
    text : str
        Raw review text that may contain HTML markup.

    Returns
    -------
    str
        The text without the matched tags.
    """
    # Pass the pattern straight to re.sub (the re module caches compiled
    # patterns) instead of re-compiling and double-assigning it on every call.
    return re.sub(r'<.*?>', '', text)

In [15]:
df.head()

Unnamed: 0,review,sentiment
24367,"Carrot Top's ""Chairman of the Board"" and his A...",0
46018,Now maybe it had something to do with the fact...,1
23112,This 1939 film tried to capitalize on the much...,1
26246,The pakage implies that Warren Beatty and Gold...,0
17312,I am a music lover and was excited to see this...,0


In [16]:
df['review']=df['review'].apply(clean_html)

In [17]:
#converting everything to lower case
def convert_lower(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered


In [18]:
df['review']=df['review'].apply(convert_lower)

In [19]:
#function to remove special characters
def remove_special(text):
    """Replace every non-alphanumeric character in ``text`` with a space.

    Characters for which ``str.isalnum()`` is true are kept unchanged; every
    other character (punctuation, whitespace, symbols) becomes ``' '``, so the
    result always has the same length as the input.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # A single join avoids re-copying the growing result string on every
    # character, which is what repeated ``x = x + i`` concatenation does.
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)

In [20]:
df['review']=df['review'].apply(remove_special)

In [21]:
df.head()

Unnamed: 0,review,sentiment
24367,carrot top s chairman of the board and his a...,0
46018,now maybe it had something to do with the fact...,1
23112,this 1939 film tried to capitalize on the much...,1
26246,the pakage implies that warren beatty and gold...,0
17312,i am a music lover and was excited to see this...,0


Here I have replaced every special character in the review feature with a space.

In [22]:
#removing the stop words
# Stop-word sets already loaded, keyed by language, so the NLTK corpus is read
# at most once instead of once per token.
_STOPWORD_CACHE = {}


def remove_stopwords(text, stop_words=None):
    """Split ``text`` on whitespace and drop the tokens that are stop words.

    Parameters
    ----------
    text : str
        The text to filter.
    stop_words : container of str, optional
        Words to remove. When omitted, the NLTK English stop-word list
        (``stopwords.words('english')``) is used.

    Returns
    -------
    list of str
        The remaining tokens, in their original order. A new list is returned
        on every call.
    """
    if stop_words is None:
        if 'english' not in _STOPWORD_CACHE:
            # A frozenset gives constant-time membership tests; the original
            # code rebuilt and linearly scanned the list for every token.
            _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORD_CACHE['english']
    return [word for word in text.split() if word not in stop_words]

Here I take the review text and split it into a list of words; every word that is not in the stop-word list is appended to a list, and the kept words are returned as a separate list so that each review starts from an empty list.

In [23]:
df['review']=df['review'].apply(remove_stopwords)

In [24]:
df

Unnamed: 0,review,sentiment
24367,"[carrot, top, chairman, board, commercials, li...",0
46018,"[maybe, something, fact, saw, movie, low, poin...",1
23112,"[1939, film, tried, capitalize, much, better, ...",1
26246,"[pakage, implies, warren, beatty, goldie, hawn...",0
17312,"[music, lover, excited, see, movie, unfortunat...",0
...,...,...
13625,"[looking, forward, seeing, bruce, willis, espe...",0
25992,"[never, ever, see, watching, bad, scifi, movie...",0
20679,"[interested, animal, children, read, many, edg...",0
29596,"[cover, girl, best, musical, rita, hayworth, e...",1


In [25]:
#now i am going to perform stemming
ps = PorterStemmer()

Stemming:

Stemming is the process of reducing a word to its Root word

example: acting, acted, acts --> act (the stem need not be a dictionary word, e.g. movie --> movi, tried --> tri)

In [26]:
def stem_words(text, stemmer=None):
    """Return the stem of every token in ``text``.

    Parameters
    ----------
    text : iterable of str
        The tokens of one review.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer to apply. When omitted, the notebook-level ``ps`` instance is used.

    Returns
    -------
    list of str
        One stemmed token per input token, in the same order. A new list is
        returned on every call.
    """
    if stemmer is None:
        stemmer = ps
    # Build the result in a local list. The previous version appended to a
    # module-level ``y`` that the notebook later rebinds to the label array,
    # after which re-running the stemming cell fails.
    return [stemmer.stem(token) for token in text]

In [27]:
df['review']=df['review'].apply(stem_words)

In [28]:
df

Unnamed: 0,review,sentiment
24367,"[carrot, top, chairman, board, commerci, live,...",0
46018,"[mayb, someth, fact, saw, movi, low, point, li...",1
23112,"[1939, film, tri, capit, much, better, michael...",1
26246,"[pakag, impli, warren, beatti, goldi, hawn, pu...",0
17312,"[music, lover, excit, see, movi, unfortun, dis...",0
...,...,...
13625,"[look, forward, see, bruce, willi, especi, sin...",0
25992,"[never, ever, see, watch, bad, scifi, movi, ye...",0
20679,"[interest, anim, children, read, mani, edger, ...",0
29596,"[cover, girl, best, music, rita, hayworth, eve...",1


In [29]:
#join back
def join_back(list_input):
    """Concatenate the tokens of ``list_input`` into one string separated by single spaces."""
    separator = " "
    joined = separator.join(list_input)
    return joined

In [30]:
df['review']=df['review'].apply(join_back)

In [31]:
df

Unnamed: 0,review,sentiment
24367,carrot top chairman board commerci live proof ...,0
46018,mayb someth fact saw movi low point life reall...,1
23112,1939 film tri capit much better michael curtiz...,1
26246,pakag impli warren beatti goldi hawn pull huge...,0
17312,music lover excit see movi unfortun disappoint...,0
...,...,...
13625,look forward see bruce willi especi sinc remem...,0
25992,never ever see watch bad scifi movi year love ...,0
20679,interest anim children read mani edger rice bu...,0
29596,cover girl best music rita hayworth ever made ...,1


Now that I have applied the join_back function, each review is a single string again: lower-cased, stemmed, and free of special characters and HTML tags.

In [32]:
#now storing them in a variable
x= df.iloc[:,0:1].values

In [33]:
x.shape

(10000, 1)

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
cv= CountVectorizer()

In [35]:
# Build the bag-of-words count matrix (one row per review, one column per
# vocabulary term) and keep it in the sparse format CountVectorizer returns.
# Calling .toarray() would store every zero of the reviews x vocabulary matrix
# in memory; train_test_split and the naive Bayes models used later accept
# sparse input directly.
x = cv.fit_transform(df['review'])

In [36]:
x.shape

(10000, 36098)

In [37]:
x[0].max()
# highest number of times any single vocabulary term occurs in the first review (row 0 of the count matrix)

2

In [38]:
y= df.iloc[:,-1].values

In [39]:
y.shape

(10000,)

This is an array in which 1 stands for positive and 0 stands for negative.

Splitting the dataset to training & test data

In [40]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify=y, random_state=2)

MultinomialNB implements the naive Bayes algorithm for multinomially distributed data, and is one of the two classic naive Bayes variants used in text classification (where the data are typically represented as word vector counts).

Evaluation

In [41]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

In [42]:
ml=MultinomialNB()
bl=BernoulliNB()

In [43]:
ml.fit(x_train,y_train)

MultinomialNB()

In [44]:
bl.fit(x_train,y_train)

BernoulliNB()

In [45]:
y_pred1=ml.predict(x_test)

In [46]:
y_pred2=bl.predict(x_test)

accuracy score

In [47]:
accuracy_score(y_test, y_pred1)

0.8485

In [48]:
accuracy_score(y_test, y_pred2)

0.848

## Multinomial Evaluation Metrics

In [52]:
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test,y_pred1))

[[835 153]
 [150 862]]


In [53]:
print(classification_report(y_test,y_pred1))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85       988
           1       0.85      0.85      0.85      1012

    accuracy                           0.85      2000
   macro avg       0.85      0.85      0.85      2000
weighted avg       0.85      0.85      0.85      2000



## Bernoulli Evaluation Metrics

In [57]:
print(confusion_matrix(y_test,y_pred2))

[[846 142]
 [162 850]]


In [56]:
print(classification_report(y_test,y_pred2))

              precision    recall  f1-score   support

           0       0.84      0.86      0.85       988
           1       0.86      0.84      0.85      1012

    accuracy                           0.85      2000
   macro avg       0.85      0.85      0.85      2000
weighted avg       0.85      0.85      0.85      2000



In [None]:
#I have used the confusion matrix to check how accurately each model classifies the reviews