# First: Predicting sentiment from a review comment

### Loading data without using classes

In [2]:
import json

file_name = './Books_small_10000.json'

# Every line of the file is parsed as its own JSON document; only the review
# text and the overall rating are kept, as a (text, rating) tuple.
reviews = []
with open(file_name) as f:
    reviews.extend(
        (record['reviewText'], record['overall'])
        for record in map(json.loads, f)
    )
reviews[4]

('It was a decent read.. typical story line. Nothing unsavory as so many are. Just a slice of life, plausible.',
 3.0)

### Organising code using classes

In [92]:
import random

class Sentiment:
    """String labels for the three sentiment classes a review can receive."""
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'


class Review:
    """A single review: its text, its numeric score and the derived sentiment.

    Args:
        text: The review text.
        score: The numeric rating given with the review.
    """

    def __init__(self, text, score):
        self.score = score
        self.text = text
        self.sentiment = self.get_sentiment()  # label derived from the score

    def get_sentiment(self):
        """Map the score to a Sentiment label.

        Returns:
            Sentiment.NEGATIVE for scores <= 2, Sentiment.POSITIVE for
            scores >= 4, and Sentiment.NEUTRAL for everything in between.
        """
        if self.score <= 2:
            return Sentiment.NEGATIVE
        # Test the positive range explicitly so that a fractional score such
        # as 2.5 or 3.5 is treated as neutral instead of falling through to
        # POSITIVE.
        if self.score >= 4:
            return Sentiment.POSITIVE
        return Sentiment.NEUTRAL


class ReviewContainer:
    """Holds a list of Review objects and exposes them as parallel lists."""

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the text of every review, in container order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the sentiment label of every review, in container order."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self, seed=None):
        """Rebalance the container to equal numbers of NEGATIVE and POSITIVE reviews.

        Neutral reviews are dropped. Whichever of the two remaining classes is
        larger is truncated to the size of the smaller one, and the result is
        shuffled. ``self.reviews`` is replaced by the new list; the list that
        was passed to the constructor is not modified.

        Args:
            seed: Optional seed for a private random generator, making the
                shuffle reproducible. With the default ``None`` the module
                level ``random`` state is used.
        """
        negative = [x for x in self.reviews if x.sentiment == Sentiment.NEGATIVE]
        positive = [x for x in self.reviews if x.sentiment == Sentiment.POSITIVE]

        # Truncate both sides: cutting only the positives leaves the set
        # unbalanced whenever positives are the minority class.
        size = min(len(negative), len(positive))
        self.reviews = negative[:size] + positive[:size]

        if seed is None:
            random.shuffle(self.reviews)
        else:
            random.Random(seed).shuffle(self.reviews)
        
        
#         print(negative[0].text)
#         print(len(negative))
#         print(len(positive))

### Loading data

In [51]:

file_name = './Books_small_10000.json'

# Build one Review object per JSON line instead of a bare (text, score) tuple.
with open(file_name) as f:
    reviews = [
        Review(record['reviewText'], record['overall'])
        for record in map(json.loads, f)
    ]

# With Review objects the fields are reached by name (.text, .score,
# .sentiment) rather than by tuple position such as reviews[6][1].
reviews[6].sentiment

'NEGATIVE'

In [5]:
# Total number of entries in the `reviews` list.
len(reviews)

10000

### Data preparation

In [93]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for testing; the fixed random_state keeps the
# split identical between runs.
training, test = train_test_split(reviews, test_size=0.33, random_state=42)

train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

# Balance the training container only. This previously called `cont`, a name
# that is never defined, so the cell raised NameError on a fresh kernel.
train_container.evenly_distribute()

len(train_container.reviews)

872

In [53]:
# Size of the training split produced by train_test_split.
len(training)

6700

In [54]:
# Text of the third review in the training split.
print(training[2].text)

One of Francine Rivers best series books!


In [55]:
# Sentiment label of the first review in the training split.
print(training[0].sentiment)

POSITIVE


In [101]:
from sklearn.model_selection import train_test_split
training, test = train_test_split(reviews, test_size=0.33, random_state=42)

# Rebuild the containers from this split so the cell does not rely on
# containers left over from an earlier cell. Previously the split was
# recomputed here but its result was never used.
train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

# Only the training data is balanced; the test set is left as split.
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_x = test_container.get_text()
test_y = test_container.get_sentiment()

# The NEGATIVE count used to be computed and thrown away, because only a
# cell's last expression is displayed. Use it as a balance check instead.
assert train_y.count(Sentiment.NEGATIVE) == train_y.count(Sentiment.POSITIVE), \
    "training set is not balanced"
train_y.count(Sentiment.POSITIVE)

436

#### Bag-of-words vectorization


In [102]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

# fit_transform learns the vocabulary from the training texts and vectorizes
# them in a single call (equivalent to fit() followed by transform()); the
# result is a very large sparse matrix.
train_x_vectors = vectorizer.fit_transform(train_x)

# The test texts are only transformed, using the vocabulary learned above.
test_x_vectors = vectorizer.transform(test_x)

print(train_x[2])
print(train_x_vectors[2])






I loved this book. Schubert writes amazing characters that show depth and compassion. Definitely earned it's place in my 'best of' list of books
  (0, 392)	1
  (0, 423)	1
  (0, 862)	1
  (0, 991)	1
  (0, 996)	1
  (0, 1362)	1
  (0, 1594)	1
  (0, 2073)	1
  (0, 2134)	1
  (0, 2528)	1
  (0, 4034)	1
  (0, 4277)	1
  (0, 4711)	1
  (0, 4784)	1
  (0, 5260)	1
  (0, 5478)	2
  (0, 5900)	1
  (0, 6907)	1
  (0, 7121)	1
  (0, 7925)	1
  (0, 7976)	1
  (0, 8840)	1


### Classification

#### Linear SVM

In [103]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier trained on the bag-of-words vectors.
clf_svm = SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

# Predicted sentiment for the first test review. The stray `clf_svm.predict`
# and `test_x[2]` expressions were removed: they had no effect and pointed at
# a different review than the one being predicted.
clf_svm.predict(test_x_vectors[0])


array(['POSITIVE'], dtype='<U8')

#### Decision Tree

In [104]:
from sklearn.tree import DecisionTreeClassifier


# Seed the tree so the fitted model, and the score reported for it, is
# reproducible; 42 matches the random_state used for the train/test split.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_x_vectors, train_y)

# Predicted sentiment for the first test review. The stray `test_x[2]`
# expression was removed: it had no effect and pointed at a different review.
clf_dec.predict(test_x_vectors[0])


array(['POSITIVE'], dtype='<U8')

#### Naive Bayes

In [105]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()

# GaussianNB requires dense input, while CountVectorizer produces scipy sparse
# matrices; that is why fitting failed before. Convert to dense arrays first.
clf_gnb.fit(train_x_vectors.toarray(), train_y)

# Predicted sentiment for the first test review.
clf_gnb.predict(test_x_vectors[0].toarray())

#### Logistic Regression

In [106]:
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training are chained.
clf_log = LogisticRegression().fit(train_x_vectors, train_y)

# Predicted sentiment for the first test review.
clf_log.predict(test_x_vectors[0])


array(['POSITIVE'], dtype='<U8')

### Evaluation

In [107]:
# Mean accuracy of the linear SVM on the test vectors.
clf_svm.score(test_x_vectors, test_y) 

0.7124242424242424

In [108]:
# Mean accuracy of the decision tree on the test vectors.
clf_dec.score(test_x_vectors, test_y)

0.6175757575757576

In [109]:
# Mean accuracy of the logistic regression model on the test vectors.
clf_log.score(test_x_vectors, test_y)

0.7448484848484849

In [113]:
# F1 Scores
from sklearn.metrics import f1_score

# Per-class F1 (average=None), returned in the order given by `labels`.
# A comma was missing between the first two labels, which made this cell a
# SyntaxError.
f1_score(
    test_y,
    clf_svm.predict(test_x_vectors),
    average=None,
    labels=[Sentiment.NEUTRAL, Sentiment.POSITIVE, Sentiment.NEGATIVE],
)
# f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels = [Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])
# f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels = [Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])

# All models do well on positive reviews but poorly on negative and neutral ones.

array([0.        , 0.85363477, 0.28146853])

In [111]:
# First five training labels.
train_y[0:5]

['POSITIVE', 'NEGATIVE', 'POSITIVE', 'NEGATIVE', 'POSITIVE']

In [112]:
# Number of NEGATIVE labels in the training set.
train_y.count(Sentiment.NEGATIVE)

436