# Practice 2.2 (Recurrent Neural Networks)

Authors:

1. Ovidio Manteiga Moar
1. Carlos Villar Martínez

In [1]:
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
#import matplotlib.pyplot as plt

In [5]:
def readData(fname):
    """Read a labelled text file and return its texts, labels and a length bound.

    Each non-empty line must have the form ``<label> <text>``, where the label
    is ``__label__1`` (mapped to class 0) or ``__label__2`` (mapped to class 1).
    Reading stops at the first empty line, which is normally the one produced
    by the trailing newline at the end of the file.

    Args:
        fname: Path of the UTF-8 encoded file to read.

    Returns:
        A tuple ``(textData, textLabel, seqLength)`` where
        ``textData`` is the list of line texts with the label prefix removed,
        ``textLabel`` is the list of integer classes (0 or 1), and
        ``seqLength`` is ``int(mean + 2 * std)`` of the number of
        space-separated tokens per parsed line (label token included),
        or 0 when no line was parsed.

    Raises:
        ValueError: If a line does not start with one of the two known labels.
    """
    labelToClass = {"__label__1": 0, "__label__2": 1}

    # The context manager closes the file; no explicit close() is needed.
    with open(fname, 'r', encoding="utf-8") as f:
        lines = f.read().split("\n")

    textData = []
    textLabel = []
    # Collect lengths only for parsed lines so that unused slots (e.g. the
    # trailing empty line) do not drag the mean/std towards zero.
    lineLength = []

    for i, aLine in enumerate(lines):
        if not aLine:
            break
        # partition() removes exactly the label and one separating space.
        # str.lstrip("__label__1 ") would instead strip any leading run of
        # those *characters*, eating the start of texts such as "able ...".
        label, _, text = aLine.partition(" ")
        if label not in labelToClass:
            # Raise instead of exit(): exit() would shut down a notebook kernel.
            raise ValueError(
                "Error in readData: line {} has an unknown label: {!r}".format(i, aLine)
            )
        textLabel.append(labelToClass[label])
        textData.append(text)
        lineLength.append(len(aLine.split(" ")))

    if not lineLength:
        # Nothing parsed: avoid mean/std of an empty sequence.
        return textData, textLabel, 0

    return textData, textLabel, int(np.average(lineLength) + 2 * np.std(lineLength))

In [3]:
def transformData(x_train, y_train, x_test, y_test, maxFeatures, seqLength):
    """Turn raw texts into integer token sequences and labels into tensors.

    A TextVectorization layer is fitted on the training texts only and then
    applied to both the training and the test texts.

    Args:
        x_train: Training texts.
        y_train: Training labels.
        x_test: Test texts.
        y_test: Test labels.
        maxFeatures: Vocabulary size (passed as ``max_tokens``).
        seqLength: Length of every output sequence (passed as
            ``output_sequence_length``); shorter texts are padded with 0.

    Returns:
        ``(x_train_int, y_train, x_test_int, y_test)``: the vectorized
        training texts, training labels as a tensor, the vectorized test
        texts and test labels as a tensor.
    """
    vectorizer = layers.experimental.preprocessing.TextVectorization(
        max_tokens=maxFeatures,
        standardize='lower_and_strip_punctuation',
        split='whitespace',
        output_mode='int',
        output_sequence_length=seqLength,
    )
    # The vocabulary is learned from the training texts only.
    vectorizer.adapt(x_train)

    train_ids = vectorizer(x_train)
    test_ids = vectorizer(x_test)
    train_labels = tf.convert_to_tensor(y_train)
    test_labels = tf.convert_to_tensor(y_test)

    return train_ids, train_labels, test_ids, test_labels

In [100]:
# Hyperparameters
maxFeatures = 1000   # vocabulary size handed to transformData
embedding_dim = 64   # output dimension of the Embedding layer in the model cell

# Load the raw texts and labels. The sequence length is derived from the
# training file only; the value computed for the test file is not used.
x_train, y_train, seqLength = readData("./amazon/train_small.txt")
x_test, y_test, tmp = readData("./amazon/test_small.txt")

# Vectorize the texts and convert the labels to tensors.
x_train_int, y_train, x_test_int, y_test = transformData(
    x_train, y_train, x_test, y_test,
    maxFeatures=maxFeatures, seqLength=seqLength,
)


In [115]:
# Binary text classifier: token ids -> embeddings -> GRU -> dense head.

# `shape` expects a tuple. `(seqLength)` is just an int in parentheses, so the
# trailing comma is required to build the one-element tuple.
input_shape = (seqLength,)
inputs = keras.Input(shape=input_shape)
# Map each token id to an `embedding_dim`-dimensional vector; mask_zero=True
# marks id 0 as padding for layers that honour masks.
x = layers.Embedding(input_dim=maxFeatures, output_dim=embedding_dim,
    input_length=seqLength, mask_zero=True)(inputs)
# return_sequences=True keeps the GRU output of every timestep, so Flatten
# below produces seqLength * 128 features per example.
x = layers.GRU(128, activation='tanh', return_sequences=True)(x)
x = layers.Flatten()(x)
x = layers.Dense(64, activation='relu')(x)
# Single sigmoid unit for the two-class output.
outputs = layers.Dense(1, activation='sigmoid')(x)
model = keras.Model(inputs, outputs)
model.summary()

Model: "model_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_29 (InputLayer)       [(None, 166)]             0         
                                                                 
 embedding_27 (Embedding)    (None, 166, 64)           64000     
                                                                 
 gru_15 (GRU)                (None, 166, 128)          74496     
                                                                 
 flatten_1 (Flatten)         (None, 21248)             0         
                                                                 
 dense_21 (Dense)            (None, 64)                1359936   
                                                                 
 dense_22 (Dense)            (None, 1)                 65        
                                                                 
Total params: 1,498,497
Trainable params: 1,498,497
Non-trainable params: 0

In [116]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    optimizer="adam",
    metrics=["accuracy"],
)

# Save the model to disk during training.
callbacks = [keras.callbacks.ModelCheckpoint("jena_gru_amazon.keras")]

# Train for 20 epochs; the test split is used as the validation data.
history = model.fit(
    x_train_int,
    y_train,
    validation_data=(x_test_int, y_test),
    batch_size=256,
    epochs=20,
    callbacks=callbacks,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [112]:
# Report the best validation accuracy reached in any epoch, as a percentage.
# The validation data passed to fit() is the test split, hence the label.
val_accuracy_per_epoch = history.history['val_accuracy']
max_val_accuracy = max(val_accuracy_per_epoch)
report = "MAX TEST ACC = {mva:.2f}%".format(mva=max_val_accuracy*100)
print(report)

MAX TEST ACC = 87.52%
