#### Utility Classes

In [1]:
import random

class Sentiment:
    """Namespace for the two sentiment labels used throughout the notebook.

    The labels are plain strings, so they compare equal to their literal
    text and can be used directly as class names.
    """

    POSITIVE = 'POSITIVE'
    NEGATIVE = 'NEGATIVE'

class Review:
    """One review: its text together with a numeric score."""

    def __init__(self, text, score):
        self.text, self.score = text, score

    def get_sentiment(self):
        """Translate the score into a sentiment label.

        Scores strictly greater than 3 are labelled positive; every other
        score, including exactly 3, is labelled negative.
        """
        is_positive = self.score > 3
        return Sentiment.POSITIVE if is_positive else Sentiment.NEGATIVE

class ReviewContainer:
    """Wraps a list of reviews and exposes them as parallel text/label lists."""

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the text of every review, in container order."""
        return [review.text for review in self.reviews]

    def get_sentiments(self):
        """Return the sentiment label of every review, in container order."""
        return [review.get_sentiment() for review in self.reviews]

    def evenly_distribute(self, seed=None):
        """Rebalance to equal NEGATIVE and POSITIVE counts, then shuffle.

        Whichever class is larger is truncated to the size of the smaller
        one (keeping the earliest reviews of that class), so the result is
        balanced regardless of which class started out as the majority.
        Reviews whose label is neither NEGATIVE nor POSITIVE are dropped.
        ``self.reviews`` is replaced by a new list; the list originally
        passed in is not mutated.

        Args:
            seed: Optional seed for the shuffle. When ``None`` (default) the
                module-level ``random`` generator is used; otherwise a
                private ``random.Random(seed)`` gives a repeatable order.
        """
        negatives, positives = [], []
        # Single pass: each review's sentiment is computed only once.
        for review in self.reviews:
            label = review.get_sentiment()
            if label == Sentiment.NEGATIVE:
                negatives.append(review)
            elif label == Sentiment.POSITIVE:
                positives.append(review)

        # Truncate BOTH classes to the minority size; truncating only the
        # positives leaves the set unbalanced when negatives are the majority.
        keep = min(len(negatives), len(positives))
        self.reviews = negatives[:keep] + positives[:keep]

        shuffler = random if seed is None else random.Random(seed)
        shuffler.shuffle(self.reviews)

#### Load data

In [2]:
import json

# Location of the raw review dump, relative to the notebook's working directory.
file_name = './data/reviews.json'

# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding. errors='ignore' is kept from the original cell, so undecodable
# bytes are still dropped rather than aborting the load.
with open(file_name, encoding='utf-8', errors='ignore') as f:
    reviews = json.load(f)

# Each record is read through its 'review' and 'rating' keys; the rating is
# coerced to float before being wrapped in a Review.
reviewsObjects = [
    Review(record['review'], float(record['rating']))
    for record in reviews
]

print('Sample size:', len(reviewsObjects))

Sample size: 1001


#### Data prep

In [3]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split below, and the shuffles done through the
# module-level `random` generator, give the same result on every
# Restart & Run All.
RANDOM_STATE = 42
random.seed(RANDOM_STATE)

# Hold out 20% of the reviews for testing.
training, test = train_test_split(reviewsObjects, test_size=0.2, random_state=RANDOM_STATE)

training_cont = ReviewContainer(training)
test_cont = ReviewContainer(test)

# Rebalance each split in place (this also shuffles the reviews).
training_cont.evenly_distribute()
test_cont.evenly_distribute()

# Parallel lists: X_* holds review texts, y_* the matching sentiment labels.
X_train = training_cont.get_text()
y_train = training_cont.get_sentiments()

X_test = test_cont.get_text()
y_test = test_cont.get_sentiments()

print('Training size:', len(X_train))
print('Test size:', len(X_test))

Training size: 800
Test size: 200


#### Bag of words vectorization

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# TfidfVectorizer is the active choice. The imported CountVectorizer is the
# alternative that was tried here: vectorizer = CountVectorizer()
vectorizer = TfidfVectorizer()
# Fit on the training texts only; the test texts are transformed with the
# already-fitted vectorizer so both matrices share the same columns.
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

print('Training vectors shape:', X_train_vectors.shape)

Training vectors shape: (800, 3047)


#### Classification

##### Linear SVM

In [5]:
from sklearn import svm
# Support-vector classifier with a linear kernel, fitted on the training
# vectors and their sentiment labels.
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(X_train_vectors, y_train)

SVC(kernel='linear')

##### Decision tree

In [6]:
from sklearn.tree import DecisionTreeClassifier

# random_state is fixed so that refitting on the same data yields the same
# tree; without it the fitted tree can differ from run to run.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(X_train_vectors, y_train)
# Spot check: predicted label for the first test document.
clf_dec.predict(X_test_vectors[0])

array(['NEGATIVE'], dtype='<U8')

#### Evaluation

In [7]:
from sklearn.metrics import f1_score

# Accuracy of each classifier on the test vectors. Both values are printed
# so the two models can actually be compared.
svm_score = clf_svm.score(X_test_vectors, y_test)
dec_score = clf_dec.score(X_test_vectors, y_test)
print('SVM accuracy:', svm_score)
print('Decision tree accuracy:', dec_score)

# Per-class F1 for the SVM (average=None returns one score per label, in the
# order given by `labels`: NEGATIVE first, then POSITIVE).
f1_score(y_test, clf_svm.predict(X_test_vectors), average=None, labels=[Sentiment.NEGATIVE, Sentiment.POSITIVE])

array([0.81188119, 0.80808081])

#### Model tuning using Grid Search

In [8]:
from sklearn.model_selection import GridSearchCV

# Search grid: two kernels crossed with five values of C (10 combinations).
parameters = {'kernel':('linear', 'rbf'), 'C': (1,4,8,16,32)}

# Base estimator whose hyper-parameters are searched over.
svc = svm.SVC()
# cv=5: each combination is evaluated with 5-fold cross-validation on the
# training vectors.
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(X_train_vectors, y_train)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [9]:
# Score the tuned model once and reuse the value; the original cell
# evaluated clf.score twice and discarded the first result.
tuned_accuracy = clf.score(X_test_vectors, y_test)

print('Accuracy:', tuned_accuracy)

Accuracy: 0.81


#### Saving the model

In [10]:
import os
import pickle

# Create the target directory up front so the write below does not fail on a
# fresh checkout where ./saved_models does not exist yet.
os.makedirs('./saved_models', exist_ok=True)

# NOTE: only the fitted classifier is pickled. Predicting on new text also
# requires the fitted `vectorizer`, which this cell does not persist.
with open('./saved_models/sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

##### Load model

In [19]:
# Read the pickled classifier back from disk.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles that were produced by a trusted source.
with open('./saved_models/sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

def classify(text):
    """Predict the sentiment label for one piece of text.

    The text is wrapped in a one-element list, vectorized with the
    notebook-level ``vectorizer``, and passed to ``loaded_clf.predict``;
    the prediction is returned exactly as the classifier produced it.
    """
    return loaded_clf.predict(vectorizer.transform([text]))

# Quick manual check: run classify() on a short phrase and print the result.
print('Classify Tagalog: ', classify('ang ganda mo'))

Classify Tagalog:  ['POSITIVE']
