### Loading data without using classes

In [7]:
import json

file_name = './Books_small_10000.json'

# The file is read line by line, one JSON object per line. Only the review
# text and its star rating are kept, as a (text, score) tuple.
reviews = []
# Explicit encoding: without it open() uses the platform default, which is not
# UTF-8 on every system. (Assumes the dataset is UTF-8 encoded - TODO confirm.)
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at end of file)
        review = json.loads(line)
        reviews.append((review['reviewText'], review['overall']))
reviews[4]

('It was a decent read.. typical story line. Nothing unsavory as so many are. Just a slice of life, plausible.',
 3.0)

### Organising code using classes

In [3]:
class Review:
    """A single review: its text, its numeric score and a derived sentiment label.

    Attributes:
        text: the review text as passed to the constructor.
        score: the numeric rating as passed to the constructor.
        sentiment: 'Negative', 'Neutral' or 'Positive', computed once from
            ``score`` when the object is created.
    """

    def __init__(self, text, score):
        # score has to be stored before the sentiment is derived from it
        self.score = score
        self.text = text
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map the score to a sentiment label.

        Returns:
            'Negative' for a score of 2 or less, 'Neutral' for a score of
            exactly 3, and 'Positive' for every other value (4 or 5 for
            whole-star ratings).
        """
        rating = self.score
        if rating <= 2:
            return 'Negative'
        if rating == 3:
            return 'Neutral'
        return 'Positive'
        

### Loading data

In [16]:

file_name = './Books_small_10000.json'

reviews = []
# Explicit encoding: without it open() uses the platform default, which is not
# UTF-8 on every system. (Assumes the dataset is UTF-8 encoded - TODO confirm.)
with open(file_name, encoding='utf-8') as f:
    for line in f:
        review = json.loads(line)
        # store Review objects instead of bare (text, score) tuples
        reviews.append(Review(review['reviewText'], review['overall']))

# With plain tuples the score would be reviews[6][1]; with the class the fields
# are reachable by name: reviews[6].text, reviews[6].score, reviews[6].sentiment.
# Only the last expression of a cell is displayed, so just the score is shown.
reviews[6].score

2.0

In [17]:
# Sanity check: how many reviews were loaded
len(reviews)

10000

### Data preparation

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for testing; the fixed seed makes the split repeatable.
training, test = train_test_split(
    reviews,
    test_size=0.33,
    random_state=42,
)

In [21]:
# Size of the training split
len(training)

6700

In [24]:
# Peek at the text of one training review
print(training[2].text)

One of Francine Rivers best series books!


In [25]:
# Peek at the sentiment label of one training review
print(training[0].sentiment)

Positive


In [26]:
# `training` and `test` come from the train_test_split cell earlier in this
# section. Repeating the import and the split here only duplicated that cell
# (same arguments, same seed, same result) and risked the two copies drifting apart.

# Separate the model input (review text) from the label (sentiment).
train_x = [x.text for x in training]
train_y = [x.sentiment for x in training]

test_x = [x.text for x in test]
test_y = [x.sentiment for x in test]

print(train_y[2])
print(train_x[2])

Positive
One of Francine Rivers best series books!


#### Bag-of-words vectorization


In [32]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

# Learn the vocabulary from the training text only, then encode both splits
# with that same vocabulary. For the training data the two steps below could
# also be written as one call: vectorizer.fit_transform(train_x).
vectorizer.fit(train_x)

train_x_vectors = vectorizer.transform(train_x)  # large sparse matrix of word counts
test_x_vectors = vectorizer.transform(test_x)

print(train_x[2])
# Was `train_x_vector` (no trailing "s"), a name that is never assigned in this
# notebook and therefore raises NameError on a fresh kernel.
print(train_x_vectors[2])






One of Francine Rivers best series books!
  (0, 16595)	1
  (0, 16686)	1
  (0, 2711)	1
  (0, 9642)	1
  (0, 20146)	1
  (0, 21105)	1
  (0, 3161)	1


### Classification

#### Linear SVM

In [36]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, trained on the word-count vectors.
clf_svm = SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

# Predict the sentiment of the first test review. The bare `clf_svm.predict`
# and `test_x[2]` expressions that used to sit here had no effect, since a cell
# displays only its last expression.
clf_svm.predict(test_x_vectors[0])


array(['Positive'], dtype='<U8')

#### Decision Tree

In [46]:
from sklearn.tree import DecisionTreeClassifier

# Fixed seed: scikit-learn permutes the features randomly while building the
# tree, so without random_state the fitted tree can differ from run to run.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_x_vectors, train_y)

# Predict the sentiment of the first test review
clf_dec.predict(test_x_vectors[0])


array(['Positive'], dtype='<U8')

#### Naive Bayes

In [55]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()
# GaussianNB needs dense arrays, while CountVectorizer produces scipy sparse
# matrices; passing the sparse matrix directly is why this cell used to fail.
# NOTE: .toarray() materialises the full documents x vocabulary matrix, which
# can take a lot of memory for a large vocabulary.
clf_gnb.fit(train_x_vectors.toarray(), train_y)

# Predict the sentiment of the first test review (densified the same way)
clf_gnb.predict(test_x_vectors[0].toarray())

#### Logistic Regression

In [56]:
from sklearn.linear_model import LogisticRegression

# The default iteration limit was reached before the solver converged
# ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so allow more iterations.
# Raise max_iter further if the warning still appears.
clf_log = LogisticRegression(max_iter=1000)
clf_log.fit(train_x_vectors, train_y)

# Predict the sentiment of the first test review
clf_log.predict(test_x_vectors[0])


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


array(['Positive'], dtype='<U8')

### Evaluation