## DATA CLASS

In [9]:
import random

class Sentiment:
    """Namespace holding the string labels used for review sentiment."""

    POSITIVE = 'POSITIVE'
    NEUTRAL = 'NEUTRAL'
    NEGATIVE = 'NEGATIVE'

class Review:
    """A single review: its text, its numeric score and the sentiment derived from the score."""

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # Derived once, at construction time, from the score stored above.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map the score to a label: <= 2 is negative, exactly 3 is neutral, anything else positive."""
        if self.score == 3:
            return Sentiment.NEUTRAL
        return Sentiment.NEGATIVE if self.score <= 2 else Sentiment.POSITIVE
        
class ReviewContainer:
    """Wraps a list of reviews and exposes their texts and labels as parallel lists."""

    def __init__(self, reviews):
        # Items are read through their `.text` and `.sentiment` attributes.
        self.reviews = reviews

    def get_text(self):
        """Return the text of every review, in the container's current order."""
        return [review.text for review in self.reviews]

    def get_sentiment(self):
        """Return the sentiment label of every review, in the container's current order."""
        return [review.sentiment for review in self.reviews]

    def evenly_distribute(self):
        """Rebalance ``self.reviews`` to equal numbers of NEGATIVE and POSITIVE reviews.

        Reviews with any other label (e.g. NEUTRAL) are dropped. Both classes are
        cut down to the size of the smaller one, then the combined list is
        shuffled with the module-level ``random`` generator. ``self.reviews`` is
        rebound to a new list; the list originally passed in is not modified.
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]
        # Truncate BOTH classes: cutting only the positives leaves the data
        # unbalanced whenever the negatives are the majority.
        keep = min(len(negative), len(positive))
        self.reviews = negative[:keep] + positive[:keep]
        random.shuffle(self.reviews)
           

### Load Data

In [10]:
import json

# NOTE(review): absolute, machine-specific path - point this at the local copy of the dataset.
file_name = 'C:/Users/Bartosz Baszniak/Desktop/Python/Data/books_small_10000.json'

# The file is parsed one line at a time: every line is a JSON object that
# provides the 'reviewText' and 'overall' keys.
reviews = []
# Decode explicitly as UTF-8 so parsing does not depend on the platform's default encoding.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

# Cell output: sentiment of one sample review as a quick sanity check.
reviews[5].sentiment

'POSITIVE'

### Prep Data

In [101]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for testing; random_state pins the split itself.
training, test = train_test_split(reviews, test_size=0.33, random_state=42)

# NOTE(review): evenly_distribute() shuffles with the module-level `random`
# generator, which is not seeded in this notebook, so the order of the
# balanced reviews can differ between runs.
train_container, test_container = ReviewContainer(training), ReviewContainer(test)
for container in (train_container, test_container):
    container.evenly_distribute()


In [102]:
# Split each container into parallel lists of texts (x) and sentiment labels (y).
train_x, train_y = train_container.get_text(), train_container.get_sentiment()
test_x, test_y = test_container.get_text(), test_container.get_sentiment()

# Cell output: number of NEGATIVE reviews in the training labels.
train_y.count(Sentiment.NEGATIVE)


436

### Bag of words

In [113]:
# Bag of words: build a matrix with one column per word and one row per review text.
# A binary bag of words would store 1 when the word appears in the text; the TfidfVectorizer used below stores TF-IDF weights instead.

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the test texts.
vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)




### Classification

#### Linear svm

In [127]:
from sklearn.svm import SVC

# Support-vector classifier with a linear kernel, trained on the training vectors
# (fit() returns the estimator itself, so the call can be chained).
clf_svm = SVC(kernel='linear').fit(train_x_vectors, train_y)

# Cell output: predicted label for the first test review.
clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### Decision Tree

In [115]:
from sklearn.tree import DecisionTreeClassifier

# Tree fitting in scikit-learn is randomized unless random_state is fixed;
# pin it so the scores reported below are reproducible across runs.
clf_tree = DecisionTreeClassifier(random_state=42)
clf_tree.fit(train_x_vectors, train_y)

# Cell output: predicted label for the first test review.
clf_tree.predict(test_x_vectors[0])


array(['POSITIVE'], dtype='<U8')

#### Naive bayes

In [116]:
from sklearn.naive_bayes import GaussianNB

# GaussianNB needs dense input. Use .toarray() (plain ndarray) rather than
# .todense() (numpy.matrix), which recent scikit-learn releases reject.
clf_NB = GaussianNB()
clf_NB.fit(train_x_vectors.toarray(), train_y)

# Cell output: predicted label for the first test review (kept 2-D: one row).
clf_NB.predict(test_x_vectors[0].toarray())


array(['POSITIVE'], dtype='<U8')

#### Logistic regression

In [117]:
from sklearn.linear_model import LogisticRegression


# Logistic regression with default settings, trained on the same vectors
# (fit() returns the estimator itself, so the call can be chained).
clf_log = LogisticRegression().fit(train_x_vectors, train_y)

# Cell output: predicted label for the first test review.
clf_log.predict(test_x_vectors[0])



array(['POSITIVE'], dtype='<U8')

### Evaluation 

#### Mean Accuracy 

In [118]:
# Score of each classifier on the test vectors, in the order: SVM, tree, naive Bayes, logistic regression.
print(clf_svm.score(test_x_vectors, test_y))
print(clf_tree.score(test_x_vectors, test_y))
# GaussianNB needs dense input; .toarray() yields an ndarray, whereas
# .todense() yields a numpy.matrix that recent scikit-learn releases reject.
print(clf_NB.score(test_x_vectors.toarray(), test_y))
print(clf_log.score(test_x_vectors, test_y))

0.8076923076923077
0.6466346153846154
0.6610576923076923
0.8028846153846154


##### Note
Accuracy is the fraction of labels that are predicted correctly.
For classification, the F1 score is a more important metric for evaluating the model.


#### F1 score

In [119]:
from sklearn.metrics import f1_score

# Per-class F1 (average=None), reported in the order POSITIVE, NEGATIVE,
# first for the SVM and then for the decision tree.
f1_labels = [Sentiment.POSITIVE, Sentiment.NEGATIVE]
for model in (clf_svm, clf_tree):
    print(f1_score(test_y, model.predict(test_x_vectors), average=None, labels=f1_labels))


[0.80582524 0.80952381]
[0.6405868  0.65248227]


In [120]:
# The model is very poor for Negative and Neutral ---> reason: little data for
# these two groups, the large majority of reviews is positive.
# NOTE(review): this remark appears to predate the evenly_distribute() balancing;
# the counts printed below show the class sizes actually used for training.

for label in (Sentiment.POSITIVE, Sentiment.NEGATIVE, Sentiment.NEUTRAL):
    print(train_y.count(label))

436
436
0


In [121]:
# A few hand-written phrases, vectorized with the already-fitted vectorizer,
# to sanity-check the linear SVM.
test_set = vectorizer.transform(['Wouldn\'t say worth to buy', 'not worth', 'great story'])

print(clf_svm.predict(test_set))

['NEGATIVE' 'NEGATIVE' 'POSITIVE']


### Tuning our model (with grid search)

In [129]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm


# Search grid: every listed kernel is combined with every listed value of C.
parameters = dict(kernel=('linear', 'rbf'), C=(1, 4, 8, 16, 32))

svc = svm.SVC()
# Grid search over `parameters` with 5-fold cross-validation.
clf = GridSearchCV(estimator=svc, param_grid=parameters, cv=5)
clf.fit(train_x_vectors, train_y)







GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [130]:
# Score of the fitted grid search on the held-out test vectors.
print(clf.score(test_x_vectors, test_y))

0.8076923076923077


### Saving the Model

In [132]:
import pickle

# Serialize the fitted grid-search object into the current working directory.
with open('./sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

### Load saved model

In [133]:
# Restore the classifier written by the save cell.
# NOTE: pickle.load can execute arbitrary code - only unpickle files you trust.
with open('./sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

In [135]:
# Show the first test review next to the label the reloaded model predicts for it.
print(test_x[0])

# Cell output: prediction for the vector of that same review.
loaded_clf.predict(test_x_vectors[0])

Really enjoyed this story and it was very well done as I could not put it down so anxious to see what unfolded next. Loved that part of the story took place in Nogales, AZ where I used to live. Also interesting to read about the border crossing  issues.


array(['POSITIVE'], dtype='<U8')