In [39]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score
import pickle
import unicodedata


In [2]:
data = pd.read_csv('book_reviews.csv')
data.head()

Unnamed: 0,text,label
0,Fascinating view into the past: This book is f...,__label__2
1,A very amusing look at the past: I was hoping ...,__label__2
2,Didn't like...: I didn't like this version of ...,__label__1
3,"Two Divas,that should shut up!: Here are two s...",__label__1
4,Movie Version is Much better!: The movie versi...,__label__1


In [3]:
data.shape

(2808, 2)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2808 entries, 0 to 2807
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2808 non-null   object
 1   label   2808 non-null   object
dtypes: object(2)
memory usage: 44.0+ KB


In [5]:
data['label'].value_counts()

__label__1    1538
__label__2    1270
Name: label, dtype: int64

In [6]:
data['label'].replace('__label__1',0,inplace=True)
data['label'].replace('__label__2',1,inplace=True)

In [7]:
data.head()

Unnamed: 0,text,label
0,Fascinating view into the past: This book is f...,1
1,A very amusing look at the past: I was hoping ...,1
2,Didn't like...: I didn't like this version of ...,0
3,"Two Divas,that should shut up!: Here are two s...",0
4,Movie Version is Much better!: The movie versi...,0


In [12]:
data['text'][2]

'Didn\'t like...: I didn\'t like this version of the song, "when you believe," even though it was the original. I favored the orchestral rendition of this song which was beautifully arranged by Hans Zimmer.'

In [15]:
#remove htmls
def clean(text):
    cleaned = re.compile(r'<.*?')
    return re.sub(cleaned,'',text)

data.review = data.text.apply(clean)
data.text[0]

  data.review = data.text.apply(clean)


"Fascinating view into the past: This book is fascinating for anyone at all interested in New Zealand history or a general account of life in the past. It is surprisingly well written for what is just a series of letters, and gives a real insight into life as a pioneer in the 1860s, with all its highs and lows. You'll laugh at the author's first attempts at baking, and cry with her as she digs dead lambs from a snowdrift. It's a wonderful book and I heartily recommend it."

In [17]:
#remove special charcaters
def is_special(text):
    rem = ''
    for i in text:
        if i.isalnum():
            rem = rem + i
        else:
            rem = rem + ' '
    return rem

data.text = data.text.apply(is_special)
data.text[2]

'Didn t like     I didn t like this version of the song   when you believe   even though it was the original  I favored the orchestral rendition of this song which was beautifully arranged by Hans Zimmer '

In [19]:
#remove lowercase
def to_lower(text):
    return text.lower()

data.text = data.text.apply(to_lower)
data.text[2]

'didn t like     i didn t like this version of the song   when you believe   even though it was the original  i favored the orchestral rendition of this song which was beautifully arranged by hans zimmer '

In [75]:
def rem_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    return [w for w in words if w not in stop_words]

data.text = data.text.apply(rem_stopwords)
data.text[0]

['f',
 'c',
 'n',
 'n',
 'g',
 'v',
 'e',
 'w',
 'n',
 'h',
 'e',
 'p',
 'h',
 'b',
 'k',
 'f',
 'c',
 'n',
 'n',
 'g',
 'f',
 'r',
 'n',
 'n',
 'e',
 'l',
 'l',
 'n',
 'e',
 'r',
 'e',
 'e',
 'n',
 'n',
 'e',
 'w',
 'z',
 'e',
 'l',
 'n',
 'h',
 'r',
 'r',
 'g',
 'e',
 'n',
 'e',
 'r',
 'l',
 'c',
 'c',
 'u',
 'n',
 'f',
 'l',
 'f',
 'e',
 'n',
 'h',
 'e',
 'p',
 'u',
 'r',
 'p',
 'r',
 'n',
 'g',
 'l',
 'w',
 'e',
 'l',
 'l',
 'w',
 'r',
 'e',
 'n',
 'f',
 'r',
 'w',
 'h',
 'j',
 'u',
 'e',
 'r',
 'e',
 'f',
 'l',
 'e',
 'e',
 'r',
 'n',
 'g',
 'v',
 'e',
 'r',
 'e',
 'l',
 'n',
 'g',
 'h',
 'n',
 'l',
 'f',
 'e',
 'p',
 'n',
 'e',
 'e',
 'r',
 'n',
 'h',
 'e',
 '1',
 '8',
 '6',
 '0',
 'w',
 'h',
 'l',
 'l',
 'h',
 'g',
 'h',
 'n',
 'l',
 'w',
 'u',
 'l',
 'l',
 'l',
 'u',
 'g',
 'h',
 'h',
 'e',
 'u',
 'h',
 'r',
 'f',
 'r',
 'e',
 'p',
 'b',
 'k',
 'n',
 'g',
 'n',
 'c',
 'r',
 'w',
 'h',
 'h',
 'e',
 'r',
 'h',
 'e',
 'g',
 'e',
 'l',
 'b',
 'f',
 'r',
 'n',
 'w',
 'r',
 'f',
 'w'

In [76]:
#stem the words
def stem_txt(text):
    ss = SnowballStemmer('english')
    return " ".join([ss.stem(w) for w in text])

data.text = data.text.apply(stem_txt)
data.text[0]

'f c n n g v e w n h e p h b k f c n n g f r n n e l l n e r e e n n e w z e l n h r r g e n e r l c c u n f l f e n h e p u r p r n g l w e l l w r e n f r w h j u e r e f l e e r n g v e r e l n g h n l f e p n e e r n h e 1 8 6 0 w h l l h g h n l w u l l l u g h h e u h r f r e p b k n g n c r w h h e r h e g e l b f r n w r f w n e r f u l b k n h e r l r e c e n'

In [77]:
data.head()

Unnamed: 0,text,label
0,f c n n g v e w n h e p h b k f c n n g f r n ...,1
1,v e r u n g l k h e p w h p n g f r f r r e e ...,1
2,n l k e n l k e h v e r n f h e n g w h e n u ...,0
3,w v h h u l h u u p h e r e r e w n g b r w h ...,0
4,v e v e r n u c h b e e r h e v e v e r n f h ...,0


In [78]:
#create the model
#binary classifier
X = np.array(data.iloc[:,0].values)

In [79]:
y = np.array(data['label'].values)

In [80]:
cv = CountVectorizer(max_features = 1000,lowercase=False)

In [82]:
X = cv.fit_transform(data.review).toarray()

In [83]:
X.shape

(2808, 1000)

In [84]:
y.shape

(2808,)

In [85]:
print(X)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [86]:
trainx,testx,trainy,testy = train_test_split(X,y,test_size=0.2,random_state=0)


In [87]:
trainx.shape

(2246, 1000)

In [88]:
trainy.shape

(2246,)

In [89]:
testx.shape

(562, 1000)

In [90]:
testy.shape

(562,)

In [91]:
gnb = GaussianNB()

In [92]:
gnb.fit(trainx,trainy)

GaussianNB()

In [93]:
mnb = MultinomialNB(alpha=1.0,fit_prior=True)

In [94]:
mnb.fit(trainx,trainy)

MultinomialNB()

In [95]:
bnb = BernoulliNB(alpha=1.0,fit_prior=True)

In [96]:
bnb.fit(trainx,trainy)

BernoulliNB()

In [97]:
ypg = gnb.predict(testx)

In [98]:
ypm = mnb.predict(testx)

In [99]:
ypb = bnb.predict(testx)

In [100]:
print("Gaussian =",accuracy_score(testy,ypg))

Gaussian = 0.7544483985765125


In [101]:
print("Multinomial =",accuracy_score(testy,ypm))

Multinomial = 0.8078291814946619


In [102]:
print("Bernouli =",accuracy_score(testy,ypb))

Bernouli = 0.8202846975088968


In [104]:
pickle.dump(bnb,open('model1.pkl','wb'))

In [105]:
rev = "Fascinating view into the past: This book is fascinating for anyone at all interested in New Zealand history or a general account of life in the past. It is surprisingly well written for what is just a series of letters, and gives a real insight into life as a pioneer in the 1860s, with all its highs and lows. You'll laugh at the author's first attempts at baking, and cry with her as she digs dead lambs from a snowdrift. It's a wonderful book and I heartily recommend it."

In [106]:
f1 = clean(rev)

In [107]:
f2 = is_special(f1)

In [108]:
f3 = to_lower(f2)

In [109]:
f4 = rem_stopwords(f3)

In [110]:
f5 = stem_txt(f4)

In [111]:
bow, words = [], word_tokenize(f5)
for word in words:
    bow.append(words.count(word))
    
word_dict = cv.vocabulary_
pickle.dump(word_dict,open('bow.pkl','wb'))

In [112]:
inp = []
for i in word_dict:
    inp.append(f5.count(i[0]))
y_pred = bnb.predict(np.array(inp).reshape(1,1000))

In [113]:
print(y_pred)

[0]


In [None]:
#This book review is negative