In [1]:
import tensorflow as tf

In [2]:
import json

In [3]:
import sklearn # nlp , feature extraction

# making classes

In [4]:
class Sentiment:
    NEGATIVE="negative"
    POSITIVE="positive"
    NEUTRAL="neutral"
class Review:
    def __init__(self, text, score):
        self.text=text
        self.score=score 
        self.sentiment=self.get_Sentiment()
    def get_Sentiment(self):
        if self.score<=2:
            return Sentiment.NEGATIVE
        elif self.score==3:
            return Sentiment.NEUTRAL
        elif self.score>3:
            return Sentiment.POSITIVE

# loading the data

In [8]:
file_name="sentiment_project_data/Books_small_10000.json"
reviews=[]
with open(file_name) as f:
    for line in f:
        #print(line)
        review=json.loads(line)
#         print(review["reviewText"])
#         print(review["overall"])
        reviews.append(Review(review["reviewText"], review["overall"]))
#print(reviews[71][1])
reviews[150].sentiment    

'neutral'

In [9]:
len(reviews)

10000

# data prep

In [40]:
from sklearn.model_selection import train_test_split
training,test= train_test_split(reviews, test_size=0.33, random_state=0)

In [44]:
len(training)


6700

In [45]:
len(test)

3300

In [41]:
training[1000].text

'This was a free book and thought it would be cute to read well let me tell you that it is a good read Cora Seton knows how to set a story up as you read this you start to fall for these people and this place it would make for a great rainy day read can not wait to read more'

In [42]:
training[1000].score

5.0

In [46]:
training[1000].sentiment

'positive'

In [47]:
x_train=[x.text for x in training]
y_train=[x.sentiment for x in training]

x_train[10]
y_train[10]

'positive'

In [48]:
x_test=[x.text for x in test]
y_test=[x.sentiment for x in test]

In [49]:
x_test[1000]

'I think Grumpy cat is really cute, I enjoy reading and looking through it. I love cats. I highly recommend this book'

# cleaning the data

In [50]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [51]:
ps=PorterStemmer()
wl=WordNetLemmatizer()
def cleaning(data):
    refined=[]
    for i in range(len(data)):
        review=re.sub('[^a-zA-Z]'," ", data[i])
        review=review.lower()
        review=review.split()
        review=[wl.lemmatize(word) for word in review if not word in stopwords.words("english")]
        review=" ".join(review)
        refined.append(review)
    return refined
x_train=cleaning(x_train)
x_test=cleaning(x_test)



In [52]:
x_train[1000]


'free book thought would cute read well let tell good read cora seton know set story read start fall people place would make great rainy day read wait read'

In [53]:
x_test[1000]

'think grumpy cat really cute enjoy reading looking love cat highly recommend book'

# tfidf vectorization

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer=TfidfVectorizer(max_features=16000)
train_vectored=vectorizer.fit_transform(x_train)
test_vectored=vectorizer.fit_transform(x_test)


In [55]:
train_vectored, test_vectored

(<6700x16000 sparse matrix of type '<class 'numpy.float64'>'
 	with 268298 stored elements in Compressed Sparse Row format>,
 <3300x16000 sparse matrix of type '<class 'numpy.float64'>'
 	with 135131 stored elements in Compressed Sparse Row format>)

# classifiers...

# 1. Using svm (worked the best here)

In [56]:
from sklearn import svm
classifier_svm=svm.SVC(kernel='linear',gamma='auto')
classifier_svm.fit(train_vectored, y_train)



SVC(gamma='auto', kernel='linear')

In [57]:
x_train[307]
# print(len(x_train))

'book written question answer format depth author list reference web site contact number one would need follow advise give good job mentioning area consider looking retirement finance book read start seriously approach retirement age reading book understand better best time draw social security age give benefit section would really benefit one fifty sense showing area start investing thing consider retirement age finance concern lighting concept rule work read book bottom line book good one reference book shelf received book free blogging book review'

In [59]:
x_test[117]

'passion bright light glory compilation best sermon various speaker passion year story insight passionate believer like john piper beth moore francis chan even louie giglio fill page inspiring word challenging next generation rise real jesus isaiah set stage passion mission name renown jesus christ would desire soul going act desire read book could say people around oh gosh good read good chapter chapter continued blown away encouragement one page deep challenge next conviction one page grace next chapter deeply challenged twisted brain awesome knot thank john piper also really enjoyed diversity teacher grateful passion individual lord jesus christ book fantastic excellent read anyone looking grow faith challenged live theology emboldened weight gospel give passion light glory star received book free charge book look blogger exchange honest review'

In [61]:
final_decision=classifier_svm.predict(test_vectored)
final_decision

array(['positive', 'positive', 'positive', ..., 'positive', 'positive',
       'positive'], dtype='<U8')

In [62]:
final_decision[307]

'positive'

# let's make a confusion matrix for a better idea of the prediction

In [63]:
from sklearn.metrics import confusion_matrix
confusion=confusion_matrix(y_test, final_decision)
confusion

array([[   0,    0,  226],
       [   0,    0,  309],
       [   0,    0, 2765]], dtype=int64)

In [64]:
#to check the accuracy
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(final_decision,y_test)
accuracy*100

83.78787878787878

array([[   9,   16,  103],
       [  51,   74,  543],
       [ 148,  235, 2121]], dtype=int64)

68.33333333333333

# 2. using decision tree classifier

In [69]:
from sklearn.tree import DecisionTreeClassifier
classifier_dct=DecisionTreeClassifier()
classifier_dct.fit(train_vectored, y_train)

DecisionTreeClassifier()

In [70]:
valid_output=classifier_dct.predict(test_vectored)
valid_output

array(['neutral', 'neutral', 'positive', ..., 'neutral', 'neutral',
       'neutral'], dtype='<U8')

In [71]:
valid_output[307]

'neutral'

In [72]:
from sklearn.metrics import confusion_matrix
confusion=confusion_matrix(valid_output, y_test)
confusion

array([[  15,    6,  108],
       [ 182,  263, 2323],
       [  29,   40,  334]], dtype=int64)

In [73]:
#to check the accuracy
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(valid_output,y_test)
accuracy*100

18.545454545454547

 # 3. using naive bayes

In [75]:
from sklearn.naive_bayes import MultinomialNB
nm=MultinomialNB()
nm.fit(train_vectored, y_train)

MultinomialNB()

In [76]:
naive_output=nm.predict(test_vectored)
naive_output

array(['positive', 'positive', 'positive', ..., 'neutral', 'neutral',
       'positive'], dtype='<U8')

In [78]:
#confusion matrix
from sklearn.metrics import confusion_matrix
confusion=confusion_matrix(naive_output, y_test)
confusion

array([[   7,   10,   52],
       [  76,  111,  798],
       [ 143,  188, 1915]], dtype=int64)

In [84]:
#to check the accuracy
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(naive_output,y_test)
accuracy*100

61.60606060606061

In [85]:
#comparing accuracy
print(classifier_svm.score(test_vectored,y_test))
print(classifier_dct.score(test_vectored,y_test))
print(nm.score(test_vectored,y_test))


0.8378787878787879
0.18545454545454546
0.6160606060606061
