<a href="https://colab.research.google.com/github/FranciscoBPereira/MEI-AID/blob/main/MEI_AID_ex1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Setup, Version check and Common imports

# Python ≥3.10 is required
import sys
assert sys.version_info >= (3, 10), "This notebook requires Python 3.10 or newer"


# TensorFlow ≥2.10 is required
import tensorflow as tf
# Compare (major, minor) numerically: as plain strings "2.9" >= "2.10" is True,
# so a string comparison would let older releases through.
tf_major_minor = tuple(int(part) for part in tf.__version__.split(".")[:2])
assert tf_major_minor >= (2, 10), f"This notebook requires TensorFlow 2.10 or newer (found {tf.__version__})"

# Common imports
import numpy as np
import os

from tensorflow import keras
from tensorflow.keras import layers

# Fix NumPy's global random seed so the notebook's output is stable across runs
np.random.seed(42)

import matplotlib.pyplot as plt

# Default text sizes applied to every figure drawn afterwards
plt.rcParams.update({
    'font.size': 14,
    'axes.labelsize': 14,
    'axes.titlesize': 14,
    'legend.fontsize': 14,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
})

In [None]:
# IMDB sentiment dataset bundled with Keras: https://keras.io/api/datasets/imdb/
# The reviews come preprocessed (already encoded as word ids) and ready to use

tf.random.set_seed(42)

max_features = 10000    # Vocabulary cap: only the max_features most common words are kept
common_words = 10       # Number of top-ranked (most common) words to skip

# load_data() returns the train and test sets, each as a (reviews, labels) pair.
(x_train, y_train), (x_test, y_test) = keras.datasets.imdb.load_data(
    num_words=max_features,
    skip_top=common_words,
)

# Dict mapping each word to its index in the IMDB dataset (used later for decoding).
word_index = keras.datasets.imdb.get_word_index()

In [None]:
# Inspect one training review: first as encoded word ids, then decoded to text
# Labels: 0(Bad), 1(Good)

# Index of the review to inspect
review = 0

encoded_review = x_train[review]
tam = len(encoded_review)

print("Review Length: ", tam)
print(encoded_review)
print('Label ', y_train[review])


# Reverse lookup (id -> word). The +3 offset leaves ids 0, 1 and 2 free for the
# special tokens, which are written last so they take precedence.
id_to_word = {index + 3: word for word, index in word_index.items()}
id_to_word.update(enumerate(("<pad>", "<sos>", "<unk>")))
" ".join(id_to_word[word_id] for word_id in encoded_review)

In [None]:
# Bring every review to the same fixed length to enhance efficiency.
# With "pre" truncation (the default, spelled out below) the beginning of a long
# review is dropped, so predictions rely only on its last maxlen words;
# shorter reviews are padded at the front.
# https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/sequence/pad_sequences

maxlen = 100

x_trainP = keras.preprocessing.sequence.pad_sequences(
    x_train, maxlen=maxlen, padding="pre", truncating="pre"
)
x_testP = keras.preprocessing.sequence.pad_sequences(
    x_test, maxlen=maxlen, padding="pre", truncating="pre"
)

In [None]:
# Inspect one review again, this time after padding/truncation

# Index of the review to inspect
review = 0

padded_review = x_trainP[review]
tam = len(padded_review)

print('Length ', tam)
print('Label ', y_train[review])

# Reverse lookup (id -> word): dataset words are shifted by 3, and the three
# special tokens occupy ids 0-2 (merged last so they take precedence).
special_tokens = {0: "<pad>", 1: "<sos>", 2: "<unk>"}
id_to_word = {**{index + 3: word for word, index in word_index.items()}, **special_tokens}
" ".join(id_to_word[word_id] for word_id in padded_review)


In [None]:
### MODEL A

# A straightforward feedforward neural network, fed directly with the integer
# word ids of each padded review (no embedding layer).

# Start from a clean Keras state and fixed seeds so the run is repeatable
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

modelA = keras.Sequential([
    # Declare the input the way the padded data is actually shaped: one vector
    # of maxlen word ids per review. This replaces Flatten(input_shape=[maxlen, 1]),
    # which announced an extra trailing axis and relied on the layer-level
    # `input_shape` argument instead of an explicit Input, unlike models B and C.
    keras.Input(shape=[maxlen]),
    layers.Flatten(),   # no-op on an already flat input; kept so the layer stack is unchanged
    # NOTE(review): the hidden Dense layers have no activation, so the stack is
    # linear up to the final sigmoid -- confirm this baseline is intentional.
    layers.Dense(20),
    layers.Dense(20),
    layers.Dense(1, activation="sigmoid")   # probability of the positive class
])

In [None]:
# Print modelA's layer-by-layer summary
modelA.summary()

In [None]:
# Compile model A (Adam optimizer, binary cross-entropy loss, accuracy metric)
# and train it for 10 epochs, reserving 20% of the training data for validation

modelA.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

historyA = modelA.fit(
    x_trainP,
    y_train,
    epochs=10,
    validation_split=0.2,
)

In [None]:
# ModelA performance on the test set
# (uses the padded test reviews; the model was compiled with loss + accuracy)

modelA.evaluate(x_testP, y_test)

In [None]:
# Plot the evolution of the accuracy metrics for model A

import pandas as pd

# One row per training epoch; keep only the training and validation accuracy
accuracy_curves = pd.DataFrame(historyA.history, columns=['accuracy', 'val_accuracy'])

# Label the figure so it can be read on its own
ax = accuracy_curves.plot(figsize=(8, 5))
ax.set_xlabel("Epoch")
ax.set_ylabel("Accuracy")
ax.set_title("Model A: training vs. validation accuracy")
ax.grid(True)
plt.show()

In [None]:
### MODEL B

# Feedforward network preceded by a raw (untrained) embedding layer: each word
# id is mapped to a vector that is learned together with the rest of the model
# https://www.tensorflow.org/tutorials/text/word_embeddings
# https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding

# Start from a clean Keras state and fixed seeds so the run is repeatable
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

# Embedding dimension
output_emb = 20

modelB = keras.Sequential([
    keras.Input(shape=[maxlen]),
    layers.Embedding(input_dim=max_features, output_dim=output_emb),
    layers.Flatten(),   # concatenate the per-word vectors into one feature vector
    layers.Dense(units=20),
    layers.Dense(units=20),
    layers.Dense(units=1, activation="sigmoid"),
])

In [None]:
# Print modelB's layer-by-layer summary
modelB.summary()

In [None]:
# Compile model B (Adam optimizer, binary cross-entropy loss, accuracy metric)
# and train it for 10 epochs, reserving 20% of the training data for validation

modelB.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

historyB = modelB.fit(
    x_trainP,
    y_train,
    epochs=10,
    validation_split=0.2,
)

In [None]:
# ModelB performance on the test set
# (uses the padded test reviews; the model was compiled with loss + accuracy)

modelB.evaluate(x_testP, y_test)

In [None]:
# Plot the evolution of the accuracy metrics for model B

import pandas as pd

# One row per training epoch; keep only the training and validation accuracy
accuracy_curves = pd.DataFrame(historyB.history, columns=['accuracy', 'val_accuracy'])

# Label the figure so it can be read on its own
ax = accuracy_curves.plot(figsize=(8, 5))
ax.set_xlabel("Epoch")
ax.set_ylabel("Accuracy")
ax.set_title("Model B: training vs. validation accuracy")
ax.grid(True)
plt.show()

In [None]:
### MODEL C

# Replace the feedforward architecture by a recurrent neural network:
# an embedding layer followed by two stacked SimpleRNN layers.
# NOTE(review): this cell was previously described as using LSTM cells, but the
# layers below are SimpleRNN. If LSTM cells are intended, swap layers.SimpleRNN
# for layers.LSTM (same arguments).

# Start from a clean Keras state and fixed seeds so the run is repeatable
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)


# Embedding dimension
output_emb = 20

modelC = keras.Sequential([
    keras.Input(shape=[maxlen]),
    layers.Embedding(max_features,  output_emb),
    # return_sequences=True hands the full sequence of outputs to the next
    # recurrent layer instead of only the last step
    layers.SimpleRNN(20, return_sequences=True),
    layers.SimpleRNN(20),
    layers.Dense(1, activation="sigmoid")
])


In [None]:
# Print modelC's layer-by-layer summary
modelC.summary()

In [None]:
# Compile model C (Adam optimizer, binary cross-entropy loss, accuracy metric)
# and train it for 10 epochs, reserving 20% of the training data for validation

modelC.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

historyC = modelC.fit(
    x_trainP,
    y_train,
    epochs=10,
    validation_split=0.2,
)

In [None]:
# ModelC performance on the test set
# (uses the padded test reviews; the model was compiled with loss + accuracy)

modelC.evaluate(x_testP, y_test)

In [None]:
# Plot the evolution of the accuracy metrics for model C

import pandas as pd

# One row per training epoch; keep only the training and validation accuracy
accuracy_curves = pd.DataFrame(historyC.history, columns=['accuracy', 'val_accuracy'])

# Label the figure so it can be read on its own
ax = accuracy_curves.plot(figsize=(8, 5))
ax.set_xlabel("Epoch")
ax.set_ylabel("Accuracy")
ax.set_title("Model C: training vs. validation accuracy")
ax.grid(True)
plt.show()