# Exercise 6

In [127]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [128]:
import numpy as np
import pandas as pd

path = "https://github.com/robitussin/CCDEPLRL_EXERCISES/blob/9b8ac1c5683abecc144f0af47eb7cda0688e12b7/dataset/reviews.json?raw=true"

dataset = pd.read_json(path)

In [129]:
dataset.head()

Unnamed: 0,review,rating
0,sir okay armygreen shorts nice,5
1,di pareha yong mga size nila may sobrang liit ...,5
2,super worth it ang ganda Sombra grabi order na...,5
3,ganda po salamat,5
4,maayos pagkadeliver maganda den sya,5


In [135]:
sentences = dataset['review'].tolist()
labels = dataset['rating'].tolist()

binary_labels = [1 if r >= 4 else 0 for r in labels]

# Separate out the sentences and labels into training and test sets
training_size = int(len(sentences) * 0.8)

training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = binary_labels[0:training_size]
testing_labels = binary_labels[training_size:]

training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

## 1. Tokenize the data

In [136]:
# prompt: tokenize the data that is given

import numpy as np
vocab_size = 5000
embedding_dim = 32
max_length = 50
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

## 2. Sequence the data

In [137]:
word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(training_sentences)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)


## 3. Pad the data

In [138]:
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Convert lists to numpy arrays to work with TensorFlow
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

print(f"Training sequences shape: {training_padded.shape}")
print(f"Testing sequences shape: {testing_padded.shape}")


Training sequences shape: (800, 50)
Testing sequences shape: (201, 50)


## 4. Train a sentiment model

In [139]:
# prompt: train a sentiment model using the data above

# Build the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

model.summary()

num_epochs = 30

history = model.fit(training_padded, training_labels_final, epochs=num_epochs, validation_data=(testing_padded, testing_labels_final))



Epoch 1/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 28ms/step - accuracy: 0.5834 - loss: 0.6906 - val_accuracy: 0.2139 - val_loss: 0.7322
Epoch 2/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5806 - loss: 0.6831 - val_accuracy: 0.2139 - val_loss: 0.7633
Epoch 3/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5798 - loss: 0.6782 - val_accuracy: 0.2139 - val_loss: 0.7813
Epoch 4/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5903 - loss: 0.6671 - val_accuracy: 0.2139 - val_loss: 0.7597
Epoch 5/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5807 - loss: 0.6656 - val_accuracy: 0.2139 - val_loss: 0.7625
Epoch 6/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5866 - loss: 0.6515 - val_accuracy: 0.2388 - val_loss: 0.7456
Epoch 7/30
[1m25/25[0m [32m━━━━━━━━━

## Get files for visualing the network

In [140]:
# First get the weights of the embedding layer
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

(5000, 32)


In [141]:
# prompt: Get files for visualizing the network

import io

# Generate the embedding file
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

# Get the weights of the embedding layer
weights = model.layers[0].get_weights()[0]

# Iterate through the vocabulary and write to the files
for word, index in tokenizer.word_index.items():
  if index != 0: # skip the OOV token (if it exists and is index 0)
    vec = weights[index]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")

out_v.close()
out_m.close()

In [146]:
# Download the files
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Predict sentiment with new reviews


In [145]:
# Use the model to predict a review
fake_reviews = ['di gumagana yung product bobo pa nag benta', 'gumagana worth the price', 'maganda pero mabilis masira','sakto lang','maganda ang product pero panget ang seller']

print(fake_reviews)

# Create the sequences
padding_type='post'
sample_sequences = tokenizer.texts_to_sequences(fake_reviews)
fakes_padded = pad_sequences(sample_sequences, padding=padding_type, maxlen=max_length)

print('\nHOT OFF THE PRESS! HERE ARE SOME NEWLY MINTED, ABSOLUTELY GENUINE REVIEWS!\n')

classes = model.predict(fakes_padded)

# The closer the class is to 1, the more positive the review is deemed to be
for x in range(len(fake_reviews)):
  print(fake_reviews[x])
  print(classes[x])
  print('\n')

# Try adding reviews of your own
# Add some negative words (such as "not") to the good reviews and see what happens
# For example:
# they gave us free chocolate cake and did not charge us

['di gumagana yung product bobo pa nag benta', 'gumagana worth the price', 'maganda pero mabilis masira', 'sakto lang', 'maganda ang product pero panget ang seller']

HOT OFF THE PRESS! HERE ARE SOME NEWLY MINTED, ABSOLUTELY GENUINE REVIEWS!

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 199ms/step
di gumagana yung product bobo pa nag benta
[0.33303186]


gumagana worth the price
[0.7535046]


maganda pero mabilis masira
[0.44283268]


sakto lang
[0.56635827]


maganda ang product pero panget ang seller
[0.6212973]


