# Exercise 4

In [4]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
import numpy as np
import pandas as pd

# Load the reviews into a DataFrame. 'reviews.json' is a relative path, so it
# is resolved against the notebook's current working directory.
dataset = pd.read_json('reviews.json')


In [6]:
# Preview the first rows to check what was loaded.
dataset.head()

Unnamed: 0,review,rating
0,sir okay armygreen shorts nice,5
1,di pareha yong mga size nila may sobrang liit ...,5
2,super worth it ang ganda Sombra grabi order na...,5
3,ganda po salamat,5
4,maayos pagkadeliver maganda den sya,5


In [7]:

# Collapse the star ratings into a binary sentiment label:
#   ratings above 0 and below 4  -> 0 (negative)
#   ratings from 4 up to below 6 -> 1 (positive)
#
# The untouched star values are copied to a `stars` column the first time this
# cell runs, and the label is always derived from that column. Thresholding
# `rating` in place is not re-runnable: on a second execution the positive
# label 1 falls inside the "negative" range and every review ends up as 0.
if 'stars' not in dataset.columns:
    dataset['stars'] = dataset['rating']

stars = dataset['stars']
is_negative = (stars > 0) & (stars < 4)
is_positive = (stars >= 4) & (stars < 6)

# Start from the original values so anything outside the two ranges is left
# exactly as it was loaded.
dataset['rating'] = stars
dataset.loc[is_negative, 'rating'] = 0
dataset.loc[is_positive, 'rating'] = 1

# Show only the columns used by the rest of the notebook.
dataset[['review', 'rating']].head()

Unnamed: 0,review,rating
0,sir okay armygreen shorts nice,1
1,di pareha yong mga size nila may sobrang liit ...,1
2,super worth it ang ganda Sombra grabi order na...,1
3,ganda po salamat,1
4,maayos pagkadeliver maganda den sya,1


## 1. Tokenize the data

In [8]:
# Pull the text and label columns out of the frame as plain Python lists.
review = dataset['review'].to_list()
rating = dataset['rating'].to_list()

# The first 80% of the rows (rounded down, in file order) are used for
# training and the remainder for testing.
training_size = int(len(review) * 0.8)

training_review, testing_review = review[:training_size], review[training_size:]
training_rating, testing_rating = rating[:training_size], rating[training_size:]

# Convert the labels to NumPy arrays for use with the network later.
training_rating_final = np.array(training_rating)
testing_rating_final = np.array(testing_rating)

In [9]:
# answer here

# Text-preprocessing hyperparameters shared by the tokenizer and padding cells.
vocab_size = 1000       # passed to the tokenizer as `num_words`
embedding_dim = 16
max_length = 100        # every review is padded/truncated to this many tokens
trunc_type = 'post'     # cut over-long reviews at the end
padding_type = 'post'   # pad short reviews at the end
# Placeholder that stands in for words the tokenizer does not know. It must be
# a visible, non-empty string: an empty token shows up in `word_index` as a
# blank key, which is easy to miss and awkward to export as a vocabulary entry.
oov_tok = "<OOV>"

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer


# Learn the vocabulary from the training reviews only, so the test reviews
# stay unseen; unknown words are encoded with the `oov_tok` placeholder.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(training_review)

# word -> integer id lookup learned by the tokenizer
word_index = tokenizer.word_index


## 2. Sequence the data

In [11]:
# answer here
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Encode each training review as a list of integer word ids using the fitted tokenizer.
sequences = tokenizer.texts_to_sequences(training_review)



## 3. Pad the data

In [12]:
# answer here
# One set of options for both splits, so training and test reviews are padded
# and truncated in exactly the same way.
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

padded = pad_sequences(sequences, **pad_options)

# The test reviews are encoded with the tokenizer fitted on the training data.
testing_sequences = tokenizer.texts_to_sequences(testing_review)
testing_padded = pad_sequences(testing_sequences, **pad_options)

## 4. Train a sentiment model

In [13]:
# answer here

# Seed Python, NumPy and TensorFlow before any weights are created so that a
# Restart & Run All starts from the same initialisation and sees the same
# batch order (some GPU kernels may still be nondeterministic).
tf.keras.utils.set_random_seed(42)

model = tf.keras.Sequential([
    # Maps each token id to a 200-dimensional vector.
    # NOTE(review): the width is the literal 200; the `embedding_dim` constant
    # defined with the other hyperparameters is not used here.
    tf.keras.layers.Embedding(vocab_size + 1, 200, input_length=max_length),
    # Concatenate the per-token vectors into one long feature vector.
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(96, activation='relu'),
    # Single sigmoid unit: output is the predicted probability of label 1.
    tf.keras.layers.Dense(1, activation='sigmoid')
    ])

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 200)          200200    
                                                                 
 flatten (Flatten)           (None, 20000)             0         
                                                                 
 dense (Dense)               (None, 96)                1920096   
                                                                 
 dense_1 (Dense)             (None, 1)                 97        
                                                                 
Total params: 2120393 (8.09 MB)
Trainable params: 2120393 (8.09 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [14]:
# Train on the padded training reviews and report loss/accuracy on the held-out
# test split after every epoch.
num_epochs = 25
model.fit(
    x=padded,
    y=training_rating_final,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_rating_final),
)


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.src.callbacks.History at 0x79c1aa7396c0>

## Get files for visualizing the network

In [15]:
# answer here

# Pull the learned embedding matrix out of the first layer of the model.
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size + 1, 200), as configured on the Embedding layer


(1001, 200)


## 5. Predict sentiment with new reviews

In [16]:
# answer here

# Hand-written reviews used to sanity-check the trained model.
fake_reviews = ['Hindi pareho yung kulay',
                'Kulang yung pinadala scam',
                'Maganda yung material hindi manipis',
                'Okay lang pwede na para sa presyo',
                'Sana may freebie',
                'Mali yung size',
                'May sira yung yung damit',
                'Wag na kayo bibili dito panget',
                'Sobrang ganda nagustuhan ng anak ko bibili ako ulit sa susunod',
                'Pang reregalo ko sana kaso di umabot ang bagal ng shipping']

print(fake_reviews)

# Encode the new reviews with the tokenizer fitted on the training data and
# apply the same padding AND truncation settings as the training set. Leaving
# `truncating` out falls back to the library default ('pre'), which would cut
# long reviews at the opposite end from what the model was trained on.
sample_sequences = tokenizer.texts_to_sequences(fake_reviews)
fakes_padded = pad_sequences(sample_sequences, maxlen=max_length,
                             padding=padding_type, truncating=trunc_type)

print('\nHOT OFF THE PRESS! HERE ARE SOME NEWLY MINTED, ABSOLUTELY GENUINE REVIEWS!\n')

classes = model.predict(fakes_padded)

# The closer the class is to 1, the more positive the review is deemed to be
for fake_review, score in zip(fake_reviews, classes):
    print(fake_review)
    print(score)
    print('\n')

['Hindi pareho yung kulay', 'Kulang yung pinadala scam', 'Maganda yung material hindi manipis', 'Okay lang pwede na para sa presyo', 'Sana may freebie', 'Mali yung size', 'May sira yung yung damit', 'Wag na kayo bibili dito panget', 'Sobrang ganda nagustuhan ng anak ko bibili ako ulit sa susunod', 'Pang reregalo ko sana kaso di umabot ang bagal ng shipping']

HOT OFF THE PRESS! HERE ARE SOME NEWLY MINTED, ABSOLUTELY GENUINE REVIEWS!

Hindi pareho yung kulay
[0.01144111]


Kulang yung pinadala scam
[0.00041333]


Maganda yung material hindi manipis
[0.95828843]


Okay lang pwede na para sa presyo
[0.99830616]


Sana may freebie
[0.20738398]


Mali yung size
[0.00914251]


May sira yung yung damit
[0.00146853]


Wag na kayo bibili dito panget
[0.11030847]


Sobrang ganda nagustuhan ng anak ko bibili ako ulit sa susunod
[0.9998029]


Pang reregalo ko sana kaso di umabot ang bagal ng shipping
[0.21365428]


