In [176]:
import json
import random

In [232]:
class Sentiment:
    Negative = 'negative'
    Positive = 'positive'
    Neutral = 'neutral'    
    
# self keyword is used for accesing the attributes and the methods of the class
class Review:
    def __init__(self,review,score):
        self.review = review
        self.score = score
        self.sentiment = self.get_sentiment() # hence self.get_sentiment
    
    def get_sentiment(self):
        if(self.score <=2):
            return(Sentiment.Negative)
        elif(self.score > 3):
            return(Sentiment.Positive)
        else:
            return(Sentiment.Neutral)
#######################################################################################################################
        
        
class ReviewContainer:
    def __init__(self,reviews):
        self.reviews = reviews
    
    
        
    def evenly_distribute(self):
        negative = list(filter(lambda x : x.sentiment == Sentiment.Negative,self.reviews))
        positive = list(filter(lambda x : x.sentiment == Sentiment.Positive,self.reviews))
        neutral = list(filter(lambda x : x.sentiment == Sentiment.Neutral,self.reviews))
        
        positive_shrunk = positive[:len(negative)]
        
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)
        
        #print(len(positive_shrunk))
        #print(len(negative))
        #print(len(neutral))
        
        
    def get_text(self):
        return [x.review for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]   
        
            

In [146]:
# .loads method takes file contents as string
reviews = []
with open('books_data.json') as f:
    for line in f:
        review = json.loads(line)
        #reviews.append((Review(['reviewText'],review['overall'])))
        reviews.append(Review(review['reviewText'] , review['overall'] ))

In [4]:
from sklearn.model_selection import train_test_split

In [233]:
train,test = train_test_split(reviews,test_size = 0.33, random_state = 42)

In [236]:
train_container = ReviewContainer(train)
test_container = ReviewContainer(test)

In [253]:
train_container.evenly_distribute()
test_container.evenly_distribute()

In [254]:
X_train = train_container.get_text()
X_test = test_container.get_text()
y_train = train_container.get_sentiment()
y_test = test_container.get_sentiment()

In [255]:
print(Y_train.count(Sentiment.Positive))
print(Y_train.count(Sentiment.Negative))

436
436


In [269]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# now the problem with CountVectorizer is that it weights each word in a sentence equally.
# For example. consider the text This book is so great : CountVectorizer weights This and great as 1 even though the 
# sentiment of the sentence depends on word great.

# A better alternative to CountVectorizer is TfidfVectorizer : Term Frequency Inverse Document Frequency

# RULE-1 Term frequency means that a word is very important if it appears a lot in a text.
# RULE-2 Inverse Document frequnecy means that a word is less important if it appears a lot in the document.
# For example: 1) This book is great , 2) This book was so bad.
# So great would be equally important as this, book, is according to RULE-1
# Words like this, book would be less important as they are appearing quite often in document. RULE 2


In [270]:
#vectorizer = CountVectorizer(binary = True)
vectorizer = TfidfVectorizer()

In [271]:
vectors = vectorizer.fit_transform(X_train)

In [272]:
vectors[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

### Linear SVM

In [273]:
from sklearn.svm import SVC

In [274]:
model = SVC(kernel = 'linear')

In [275]:
model.fit(vectors,y_train)

SVC(kernel='linear')

In [276]:
test_vectors = vectorizer.transform(X_test)

In [277]:
ans = model.predict(test_vectors)

In [278]:
ans

array(['negative', 'positive', 'negative', 'positive', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'positive', 'positive',
       'positive', 'negative', 'negative', 'positive', 'negative',
       'positive', 'positive', 'negative', 'positive', 'positive',
       'positive', 'positive', 'positive', 'negative', 'negative',
       'positive', 'negative', 'negative', 'negative', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'negative', 'negative', 'negative', 'positive', 'negative',
       'positive', 'negative', 'positive', 'negative', 'negative',
       'negative', 'positive', 'negative', 'positive', 'negative',
       'negative', 'negative', 'negative', 'negative', 'positive',
       'negative', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'negative', 'positive', 'positive',
       'negative', 'positive', 'negative', 'positive', 'positi

In [279]:
y_test

['positive',
 'negative',
 'positive',
 'positive',
 'negative',
 'positive',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'positive',
 'positive',
 'positive',
 'negative',
 'negative',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'negative',
 'negative',
 'negative',
 'positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'negative',
 'negative',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'negative',
 'negative',
 'positive',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'positive',
 'negative',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'negative',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'positive',

In [280]:
model.score(test_vectors,y_test) # score method takes in test data performs predicitions and automatically compares
# it with the labels

0.8076923076923077

In [281]:
from sklearn.metrics import classification_report

In [282]:
classification_report(y_test,ans,output_dict = True)

{'negative': {'precision': 0.8018867924528302,
  'recall': 0.8173076923076923,
  'f1-score': 0.8095238095238094,
  'support': 208},
 'positive': {'precision': 0.8137254901960784,
  'recall': 0.7980769230769231,
  'f1-score': 0.8058252427184466,
  'support': 208},
 'accuracy': 0.8076923076923077,
 'macro avg': {'precision': 0.8078061413244544,
  'recall': 0.8076923076923077,
  'f1-score': 0.807674526121128,
  'support': 416},
 'weighted avg': {'precision': 0.8078061413244544,
  'recall': 0.8076923076923077,
  'f1-score': 0.807674526121128,
  'support': 416}}

In [93]:
# from f1 scores it is clear that the model is pretty bad for predicting neutral and negative reviews but extremely good 
# for predicitng positive reviews

In [285]:
# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV
from sklearn import svm

In [288]:
classifier = GridSearchCV(svm.SVC(gamma='auto'), 
   {
    'C': [1,10,20],
    'kernel': ['rbf','linear']
    }, 
cv=5, return_train_score=False)
classifier.fit(vectors, y_train)
classifier.cv_results_

{'mean_fit_time': array([0.52137427, 0.47486005, 0.53047438, 0.49227152, 0.52942343,
        0.4993978 ]),
 'std_fit_time': array([0.02300979, 0.00781232, 0.00410569, 0.01969557, 0.00945832,
        0.00716577]),
 'mean_score_time': array([0.12374315, 0.11751261, 0.12502694, 0.11243086, 0.12703538,
        0.11474123]),
 'std_score_time': array([0.00841945, 0.00405813, 0.0027327 , 0.00323377, 0.00226709,
        0.00331019]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20, 'kernel': 'linear'}],


In [289]:
classifier.score(test_vectors,y_test)

0.8028846153846154

### Decision Trees

In [63]:
from sklearn.tree import DecisionTreeClassifier

In [64]:
model = DecisionTreeClassifier()

In [65]:
model.fit(vectors,y_train)

DecisionTreeClassifier()

In [66]:
model.predict(test_vectors)

array(['positive', 'positive', 'positive', ..., 'positive', 'positive',
       'positive'], dtype='<U8')

In [67]:
model.score(test_vectors,y_test)

0.776969696969697

### Naive Bayes

In [68]:
from sklearn.naive_bayes import GaussianNB

In [69]:
model = GaussianNB()

In [70]:
dum = vectors.toarray()

In [71]:
model.fit(dum,y_train)

GaussianNB()

In [72]:
model.predict(test_vectors.toarray())

array(['positive', 'neutral', 'negative', ..., 'negative', 'positive',
       'positive'], dtype='<U8')

In [74]:
model.score(test_vectors.toarray(),y_test)

0.6587878787878788

### Logistic Regression

In [75]:
from sklearn.linear_model import LogisticRegression

In [76]:
model = LogisticRegression()

In [77]:
model.fit(vectors,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [78]:
ans = model.predict(test_vectors)

In [79]:
ans

array(['positive', 'positive', 'positive', ..., 'positive', 'positive',
       'positive'], dtype='<U8')

In [80]:
test_vectors.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [81]:
model.score(test_vectors.toarray(),y_test)

0.8409090909090909

In [20]:
import pandas as pd

In [21]:
data = pd.DataFrame(reviews , columns = ['text','rating'])

In [22]:
data

Unnamed: 0,text,rating
0,"I bought both boxed sets, books 1-5. Really a...",5.0
1,I enjoyed this short book. But it was way way ...,3.0
2,I love Nicholas Sparks. I&#8217;ve read everyt...,4.0
3,I really enjoyed this adventure and look forwa...,4.0
4,It was a decent read.. typical story line. Not...,3.0
...,...,...
9995,The whole series was great! Melody is a fanta...,5.0
9996,I didn't thing that much of this book. I am a...,3.0
9997,It is an emotional TRIP to the past with Trip ...,5.0
9998,This definitely got under my veins whereby I h...,5.0
