# Real_world_project

## Class of Data

In [1]:
import random

class Sentiment:
    NEGATIVE = 'NEGATIVE'
    POSITIVE = "POSITIVE"
    NEUTRAL = "NEUTRAL"


class Review:
    def __init__(self,text,score):
        self.text = text 
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score >= 4:
            return Sentiment.POSITIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else: # Score of 2 or 1
            return Sentiment.NEGATIVE
        
class ReviewContainer:
    def __init__(self,Review):
        self.Review = Review
    
    def get_text(self):
        return [x.text for x in self.Review]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.Review]
    
    def evenlyDistribute(self):
        
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE,self.Review))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE,self.Review))
        positive = positive[:len(negative)]
        
        self.Review = positive + negative
        random.shuffle(self.Review)
        

## Load Data

In [2]:
import json
file_name = 'books_small_10000.json'

reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))


## Prep Data 

In [3]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size = 0.33, random_state = 42)

training_container = ReviewContainer(training)
test_container = ReviewContainer(test)

In [4]:
training_container.evenlyDistribute()
train_x = training_container.get_text()
train_y = training_container.get_sentiment()

test_container.evenlyDistribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))

436
436


### Bag of words vectorization

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer()
train_x_vec = vectorizer.fit_transform(train_x)
test_x_vec = vectorizer.transform(test_x)
#vectorizer.fit(train_x)
#train_x_vec = vectorizer.transform(train_x)

print(train_x_vec[0].toarray())

[[0. 0. 0. ... 0. 0. 0.]]


## Classification

### Linear SVM

In [6]:
from sklearn import svm

clf_svm = svm.SVC(kernel = 'linear')

clf_svm.fit(train_x_vec, train_y)

print(test_x[7])

clf_svm.predict(test_x_vec[7])

You can find all this info out in one well written magazine article found on LinkedIn. Do a search instead.


array(['NEGATIVE'], dtype='<U8')

### Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()

clf_dec.fit(train_x_vec,train_y)

clf_dec.predict(test_x_vec[5])

array(['POSITIVE'], dtype='<U8')

### Naive Bayes

In [8]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()

clf_gnb.fit(train_x_vec.toarray(),train_y)

clf_gnb.predict(test_x_vec[5].toarray())

array(['POSITIVE'], dtype='<U8')

### Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression


clf_log = LogisticRegression()

clf_log.fit(train_x_vec,train_y)

clf_log.predict(test_x_vec[5])

array(['POSITIVE'], dtype='<U8')

### Evalution

In [10]:
#Mean Accuracy
print(clf_svm.score(test_x_vec,test_y))
print(clf_dec.score(test_x_vec,test_y))
print(clf_gnb.score(test_x_vec.todense(),test_y))
print(clf_log.score(test_x_vec,test_y))

0.8076923076923077
0.6466346153846154
0.6610576923076923
0.8052884615384616


In [11]:
# F1 scores

from sklearn.metrics import f1_score 

print(f1_score(test_y,clf_svm.predict(test_x_vec),labels=[Sentiment.NEGATIVE,Sentiment.POSITIVE],average = None))
print(f1_score(test_y,clf_dec.predict(test_x_vec),labels=[Sentiment.NEGATIVE,Sentiment.POSITIVE],average = None))
print(f1_score(test_y,clf_gnb.predict(test_x_vec.toarray()),labels=[Sentiment.NEGATIVE,Sentiment.POSITIVE],average = None))
print(f1_score(test_y,clf_svm.predict(test_x_vec),labels=[Sentiment.NEGATIVE,Sentiment.POSITIVE],average = None))

[0.80952381 0.80582524]
[0.64916468 0.6440678 ]
[0.66508314 0.65693431]
[0.80952381 0.80582524]


In [12]:
test_y.count(Sentiment.POSITIVE)

208

In [13]:
test_set = ['This book is good','I really like this book good','I hate it']
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'POSITIVE', 'NEGATIVE'], dtype='<U8')

In [14]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel':('linear','rfb'), 'C':(1,4,8,16,32)}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters,cv = 5)

clf.fit(train_x_vec,train_y)

Traceback (most recent call last):
  File "c:\users\fidan_pc\appdata\local\programs\python\python38\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\fidan_pc\appdata\local\programs\python\python38\lib\site-packages\sklearn\svm\_base.py", line 217, in fit
    fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
  File "c:\users\fidan_pc\appdata\local\programs\python\python38\lib\site-packages\sklearn\svm\_base.py", line 285, in _sparse_fit
    kernel_type = self._sparse_kernels.index(kernel)
ValueError: 'rfb' is not in list

Traceback (most recent call last):
  File "c:\users\fidan_pc\appdata\local\programs\python\python38\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\fidan_pc\appdata\local\programs\python\python38\lib\site-packages\sklearn\svm\_base.py", line 217, i

Traceback (most recent call last):
  File "c:\users\fidan_pc\appdata\local\programs\python\python38\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\fidan_pc\appdata\local\programs\python\python38\lib\site-packages\sklearn\svm\_base.py", line 217, in fit
    fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
  File "c:\users\fidan_pc\appdata\local\programs\python\python38\lib\site-packages\sklearn\svm\_base.py", line 285, in _sparse_fit
    kernel_type = self._sparse_kernels.index(kernel)
ValueError: 'rfb' is not in list

Traceback (most recent call last):
  File "c:\users\fidan_pc\appdata\local\programs\python\python38\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\fidan_pc\appdata\local\programs\python\python38\lib\site-packages\sklearn\svm\_base.py", line 217, i

Traceback (most recent call last):
  File "c:\users\fidan_pc\appdata\local\programs\python\python38\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\fidan_pc\appdata\local\programs\python\python38\lib\site-packages\sklearn\svm\_base.py", line 217, in fit
    fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
  File "c:\users\fidan_pc\appdata\local\programs\python\python38\lib\site-packages\sklearn\svm\_base.py", line 285, in _sparse_fit
    kernel_type = self._sparse_kernels.index(kernel)
ValueError: 'rfb' is not in list

Traceback (most recent call last):
  File "c:\users\fidan_pc\appdata\local\programs\python\python38\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\fidan_pc\appdata\local\programs\python\python38\lib\site-packages\sklearn\svm\_base.py", line 217, i

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rfb')})

In [15]:
print(clf.score(test_x_vec,test_y))

0.8076923076923077


### Saving Model

In [16]:
import pickle

with open('sentiment_classifier.pkl','wb') as f:
    pickle.dump(clf,f)
    

### Load Model

In [17]:
with open('sentiment_classifier.pkl','rb') as f:
    loaded_clf = pickle.load(f)

In [31]:
loaded_clf.predict(test_x_vec[0])

array(['NEGATIVE'], dtype='<U8')