# First: Predicting the opinion (sentiment) expressed in a review comment

### Loading data without using classes

In [43]:
import json

file_name = './Books_small_10000.json'

# The file is read as JSON Lines: every non-empty line is parsed as one JSON
# object describing one review.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding (which is not UTF-8 on every system).
reviews = []
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at end of file
        review = json.loads(line)
        # Keep only the two fields used below, as a (text, score) tuple.
        reviews.append((review['reviewText'], review['overall']))
reviews[4]

('It was a decent read.. typical story line. Nothing unsavory as so many are. Just a slice of life, plausible.',
 3.0)

### Organising code using classes

In [44]:
import random

class Sentiment:
    """String labels used as the classification targets."""
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'


class Review:
    """One review: its text, its numeric score and the sentiment derived from the score."""

    def __init__(self, text, score):
        self.score = score
        self.text = text
        self.sentiment = self.get_sentiment()  # label derived from the score

    def get_sentiment(self):
        """Return the Sentiment label for this review's score.

        score <= 2      -> NEGATIVE
        2 < score < 4   -> NEUTRAL
        score >= 4      -> POSITIVE

        For whole-number scores (1..5) this is the same mapping as before.
        Using a range for NEUTRAL instead of ``== 3`` keeps fractional scores
        such as 2.5 from falling through to POSITIVE.
        """
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score < 4:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE


class ReviewContainer:
    """Holds a list of Review objects and exposes them as parallel text/label lists."""

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the text of every review, in container order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the sentiment label of every review, in container order."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self, seed=None):
        """Keep an equal number of NEGATIVE and POSITIVE reviews and shuffle them.

        NEUTRAL reviews are dropped. Whichever of the two remaining classes is
        larger is truncated to the size of the smaller one, so the result is
        balanced regardless of which class dominates. ``self.reviews`` is
        replaced by a new list; the list passed to ``__init__`` is not modified.

        seed: optional seed for a private random generator, making the shuffled
            order reproducible. With the default ``None`` the module-level
            ``random`` state is used, exactly as before.
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]

        # Truncate BOTH classes to the smaller size; truncating only the
        # positives leaves the data unbalanced when negatives are the majority.
        per_class = min(len(negative), len(positive))
        self.reviews = negative[:per_class] + positive[:per_class]

        rng = random if seed is None else random.Random(seed)
        rng.shuffle(self.reviews)
        
        
#         print(negative[0].text)
#         print(len(negative))
#         print(len(positive))

### Loading data

In [45]:

file_name = './Books_small_10000.json'

# Read the file again, this time wrapping every record in a Review object so
# that the fields are reachable by name (reviews[6].text, reviews[6].score,
# reviews[6].sentiment) instead of by tuple position (reviews[6][1]).
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding.
reviews = []
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at end of file
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))  # append Review objects

reviews[6].sentiment

'NEGATIVE'

In [46]:
len(reviews)

10000

### Data preparation

In [47]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for testing; random_state is fixed at 42.
training, test = train_test_split(
    reviews,
    test_size=0.33,
    random_state=42,
)

# Wrap each part in a ReviewContainer to get the text/label list helpers.
train_container, test_container = ReviewContainer(training), ReviewContainer(test)

# cont.evenly_distribute()

# len(cont.reviews)

In [48]:
len(training)

6700

In [49]:
print(training[2].text)

One of Francine Rivers best series books!


In [50]:
print(training[0].sentiment)

POSITIVE


In [51]:
from sklearn.model_selection import train_test_split
training, test = train_test_split(reviews, test_size = 0.33, random_state =42)

# Build the containers from the split made in THIS cell. evenly_distribute()
# replaces a container's review list, so starting from fresh containers makes
# the cell give the same result every time it is re-run.
train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

# evenly_distribute() shuffles with the global `random` module; seed it so the
# order of the balanced samples is reproducible.
random.seed(42)

train_container.evenly_distribute()

train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()

test_x = test_container.get_text()
test_y = test_container.get_sentiment()

# Both counts should be equal after balancing.
print(train_y.count(Sentiment.NEGATIVE))
print(train_y.count(Sentiment.POSITIVE))

436
436


In [52]:
len(train_container.reviews)

872

#### Bag-of-words vectorization


In [64]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Fit the vectorizer on the training texts only. (Fitting and transforming the
# training texts could also be done in one call:
# train_x_vectors = vectorizer.fit_transform(train_x) -- the result is a huge matrix.)
vectorizer = TfidfVectorizer().fit(train_x)

# Turn both text lists into vectors with the same fitted vectorizer.
train_x_vectors = vectorizer.transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

# Show one training review next to its sparse TF-IDF row.
print(train_x[2], train_x_vectors[2], sep='\n')

# Note: with its default settings TfidfVectorizer does not drop common words
# such as "so", "this", "was", "is"; the IDF term only gives them a lower weight.
# Pass stop_words='english' to remove them outright.


I  like the couple in this story. I also really like the siblings in the story, I'm sure I want to know more about Jax. That said, I barely made it through. This book was 200 pages of bickering. I was invested in the couple and book 1 ends with a cliffhanger so of course I bought book 2. I have no plans to read 3.Both of these people have emotional intelligence issues. Seriously they act way younger than their age. He is the dominant rich man, but not really. She is the wonderful girl next door, but a nag. Blah.
  (0, 8881)	0.14099402382213577
  (0, 8781)	0.11467125845613337
  (0, 8760)	0.049115039148090844
  (0, 8627)	0.07896449958428199
  (0, 8608)	0.09061593543330497
  (0, 8589)	0.08893809387584807
  (0, 8052)	0.06819238770889387
  (0, 8005)	0.08106451053286465
  (0, 7976)	0.0703445486727966
  (0, 7956)	0.07351684604950251
  (0, 7954)	0.08994087061495956
  (0, 7935)	0.08555801779807404
  (0, 7929)	0.18818641202089786
  (0, 7925)	0.046647161805728546
  (0, 7919)	0.08704067953867456
 

### Classification

#### Linear SVM

In [65]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, trained on the TF-IDF vectors.
clf_svm = SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

# Predicted label for the first test review (its text is test_x[0]).
clf_svm.predict(test_x_vectors[0])


array(['NEGATIVE'], dtype='<U8')

#### Decision Tree

In [66]:
from sklearn.tree import DecisionTreeClassifier


# Tree construction involves randomness (e.g. tie-breaking between equally good
# splits), so fix random_state to get the same tree and score on every run.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_x_vectors, train_y)

# Predicted label for the first test review (its text is test_x[0]).
clf_dec.predict(test_x_vectors[0])


array(['POSITIVE'], dtype='<U8')

#### Naive Bayes

In [56]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes on the TF-IDF vectors. The variable keeps the name
# clf_gnb because the evaluation cell refers to it by that name.
clf_gnb = MultinomialNB().fit(train_x_vectors, train_y)

# Predicted label for the first test review.
first_test_vector = test_x_vectors[0]
clf_gnb.predict(first_test_vector)

array(['NEGATIVE'], dtype='<U8')

#### Logistic Regression

In [67]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, trained on the TF-IDF vectors.
clf_log = LogisticRegression().fit(train_x_vectors, train_y)

# Predicted label for the first test review.
clf_log.predict(test_x_vectors[:1])


array(['NEGATIVE'], dtype='<U8')

### Evaluation

In [68]:
# Mean accuracy on the test vectors, one line per classifier, in the order:
# SVM, decision tree, logistic regression, naive Bayes.
print(
    *(fitted.score(test_x_vectors, test_y)
      for fitted in (clf_svm, clf_dec, clf_log, clf_gnb)),
    sep='\n',
)

0.8076923076923077
0.6610576923076923
0.8052884615384616
0.7932692307692307


In [69]:
# F1 scores, one per label (the raw data has too many positive and too few negative and neutral reviews)
from sklearn.metrics import f1_score

f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels = [Sentiment.POSITIVE, Sentiment.NEGATIVE])
# The same call for the other classifiers (uncomment one to display it instead):
# f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels = [Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])
# f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels = [Sentiment.POSITIVE, Sentiment.NEGATIVE])
# f1_score(test_y, clf_gnb.predict(test_x_vectors), average=None, labels = [Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])

# Author's observation: on the unbalanced data all models did well on positive reviews
# but not on negative and neutral ones.
# Because the train and test sets are now evenly distributed, the score for the negative class is higher too.

array([0.80582524, 0.80952381])

In [70]:
train_y[0:5]

['NEGATIVE', 'POSITIVE', 'NEGATIVE', 'POSITIVE', 'NEGATIVE']

In [71]:
test_y.count(Sentiment.POSITIVE)

208

### We need to add another class that helps to evenly distribute positive and negative sentiments

In [72]:
# A few hand-written reviews as a quick sanity check of the trained SVM.
test_set = [
    'I enjoyed it, 5 stars',
    'bad book, so not but',
    'horrible, waste of time',
]

# Vectorize with the already-fitted vectorizer (transform only, no refit).
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

### Grid search

In [75]:
from sklearn.model_selection import GridSearchCV

# When you do not know which parameter values to choose for a classifier, use
# grid search: it tries every combination listed below (here with 5-fold
# cross-validation) and finds the best parameter values.
parameters = {
    'kernel': ('linear', 'rbf'),
    'C': (1, 4, 8, 16),
}

svc = SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': (1, 4, 8, 16), 'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [76]:
# Score the grid-searched model (`clf`), not the earlier hand-configured
# `clf_svm`, so this cell actually reports the result of the search.
print(clf.score(test_x_vectors, test_y))

0.8076923076923077


### Model Saving - pickle


#### Save Model

In [78]:
# Once a classifier is trained it can be saved, so it does not have to be trained again.
# Only `clf` is written here; the fitted `vectorizer` is not part of the file.
import pickle

model_path = './Sci-kit-learn-first-Project.pkl'
with open(model_path, 'wb') as f:
    pickle.dump(clf, f)

#### Load Model

In [80]:
# Read the pickled classifier back from disk.
# NOTE: unpickling can execute arbitrary code -- only load files you created yourself.
with open('./Sci-kit-learn-first-Project.pkl', mode='rb') as f:
    loaded_clf = pickle.load(f)

In [81]:
# Show the review text, then the label the reloaded model predicts for it.
# (`test_` was an undefined name and raised NameError; test_x[1] is the text
# that corresponds to test_x_vectors[1].)
print(test_x[1])
loaded_clf.predict(test_x_vectors[1])

array(['NEGATIVE'], dtype='<U8')