In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)



In [3]:
messages =  pd.read_csv('IMDB Dataset.csv')
# categorical to numerical value
messages['sentiment'].replace({'positive':1, 'negative':0}, inplace  = True)
messages.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [4]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

**Cleaning the dataset**

In [6]:
# clean html tags
def clean_html(text):
    clean = re.compile('<*?>')
    return re.sub(clean, '', text)

messages['review'] = messages['review'].apply(clean_html)

In [7]:
# convert to lower
def convert_lower(text):
    return text.lower()

messages['review'] = messages['review'].apply(convert_lower)


In [8]:
# remove special characters
def remove_special(text):
    x=''
    for i in text:
        if i.isalnum():
            x=x+i
        else:
            x=x+' '
    return x

messages['review'] = messages['review'].apply(remove_special)

In [9]:
# remove stopwords
def remove_stopwords(text):
    x=[]
    for i in text.split():
        if i not in stopwords.words('english'):
            x.append(i)
    y = x[:]
    x.clear()
    return y

messages['review'] = messages['review'].apply(remove_stopwords)
# corpus 

In [10]:
ps = PorterStemmer()
lem = WordNetLemmatizer()

x = []

def stemming(text):
    for i in text:
        x.append(ps.stem(i))
    y=x[:]
    x.clear()
    return y
    

messages['review'] = messages['review'].apply(stemming)
# messages['review'].head()

In [11]:
# join back the list

def join_back(text):
    return ' '.join(text)
    
messages['review'] = messages['review'].apply(join_back)
corpus = messages['review']
# messages['review'].head()

**Count vectorizer & TFID**

In [12]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [13]:
cv = CountVectorizer(ngram_range=(1,3))
tv = TfidfVectorizer(ngram_range=(1,3))
X1 = cv.fit_transform(corpus)
X2 = tv.fit_transform(corpus)

y = messages['sentiment']

**Splitting to train & test set**

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X1_train, X1_test, y_train, y_test = train_test_split(X1,y,test_size=.3,random_state=1)
X2_train, X2_test, y_train, y_test = train_test_split(X2,y,test_size=.3,random_state=1)


**Multinomial Naive Bayes for bag of words and tfidf features**

In [16]:
from sklearn.naive_bayes import MultinomialNB

In [17]:
model_bow = MultinomialNB().fit(X1_train, y_train)
model_tfid = MultinomialNB().fit(X2_train, y_train)

y_pred1 = model_bow.predict(X1_test)  #using bag of words
y_pred2 = model_tfid.predict(X2_test) #using tfid

In [18]:
from sklearn.metrics import confusion_matrix,accuracy_score

In [19]:
result_bow =confusion_matrix(y_test,y_pred1)
result_tfid =confusion_matrix(y_test,y_pred2)

print(result_bow)
print(result_tfid)

[[6799  722]
 [ 972 6507]]
[[6708  813]
 [ 845 6634]]


In [21]:
accuracy1 = accuracy_score(y_test,y_pred1)
accuracy2 = accuracy_score(y_test,y_pred2)

print(accuracy1)
print(accuracy2)

0.8870666666666667
0.8894666666666666


**Logistic regression model performane on test dataset**

In [22]:
from sklearn.linear_model import LogisticRegression

In [23]:
model_bow = LogisticRegression(max_iter=800).fit(X1_train, y_train)
model_tfid = LogisticRegression(max_iter=800).fit(X2_train, y_train)

y_pred1 = model_bow.predict(X1_test)  #using bag of words
y_pred2 = model_tfid.predict(X2_test) #using tfid

In [24]:
result_bow =confusion_matrix(y_test,y_pred1)
result_tfid =confusion_matrix(y_test,y_pred2)

print(result_bow)
print(result_tfid)

[[6656  865]
 [ 672 6807]]
[[6485 1036]
 [ 659 6820]]


In [25]:
accuracy1 = accuracy_score(y_test,y_pred1)
accuracy2 = accuracy_score(y_test,y_pred2)

print(accuracy1)
print(accuracy2)

0.8975333333333333
0.887
