[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/BecomeAllan/RNN/blob/main/Lab.ipynb)

# Laboratório

## Implementações com RNNs

### Bibliotecas

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

import numpy as np
import os
import time
import re

!pip install unidecode
from unidecode import unidecode

tf.config.list_physical_devices('GPU')

## Tratamento de dados

In [None]:
# Download the dataset files with gdown; each argument is passed as a file id (--id).
# NOTE(review): presumably these are the two CSVs read by the next cell
# ("portuguese-poems.csv" and "Ethereum Historical Data.csv") — confirm which id is which.
!gdown --id 15FhPHu7Hx6ul_k-EEBZwzpUWznK0gBR3

!gdown --id 1Eq9oi3_1PuSZ5hoZS5M1A5pTMSyy3K_a


In [None]:
# Load both CSV files and discard every row that has a missing value.
# df1 is read from "portuguese-poems.csv", df2 from "Ethereum Historical Data.csv".
df1 = pd.read_csv("portuguese-poems.csv", encoding='UTF-8').dropna()
df2 = pd.read_csv("Ethereum Historical Data.csv", encoding='UTF-8').dropna()

df1.head()

In [None]:
# Raw titles (before normalisation) joined into one corpus string; used by the
# character-level generative model.
Generative_inputs = '\n\n'.join(df1['Title'])

# Normalise both text columns: transliterate to ASCII (unidecode) and lower-case.
df1['Title'] = df1['Title'].apply(unidecode).str.lower()
df1['Content'] = df1['Content'].apply(unidecode).str.lower()

# Start / end-of-sequence markers.
# The tokenizer applied to these columns is character-level, so a marker must
# be exactly ONE character to count as one token. Multi-character markers such
# as "<BOS> " shift the decoder input by six tokens and break the
# "target[t] is the token after input[t]" alignment that teacher forcing needs.
# Control characters are used so the markers do not collide with poem text.
BOS = "\x02"
EOS = "\x03"

# Seq2seq training table:
#   decoder input  = BOS + content
#   decoder target = content + EOS      (input shifted left by one token)
# The 'Econder_inputs' key spelling is kept because other cells look it up.
# The index is reset so positional and label-based access agree after dropna().
data = pd.DataFrame({
    'Econder_inputs': df1['Title'],
    'Decoder_inputs': BOS + df1['Content'],
    'Decoder_targets': df1['Content'] + EOS,
}).reset_index(drop=True)



In [None]:
# Fit one character-level tokenizer on all three text columns so the encoder
# and the decoder share a single vocabulary.
tokenizer = Tokenizer(char_level=True, lower=False)

tokenizer.fit_on_texts(data['Econder_inputs'])
tokenizer.fit_on_texts(data['Decoder_targets'])
tokenizer.fit_on_texts(data['Decoder_inputs'])

# char -> integer id. Ids start at 1; 0 is never assigned by the tokenizer.
dictionary = tokenizer.word_index

# Round-trip demo on the first title. Positional access (.iloc) is used because
# rows were dropped earlier, so the label 0 is not guaranteed to exist.
sample_title = data['Econder_inputs'].iloc[0]
# A plain str is iterated character by character, so `k` is a list of
# one-element id lists and `k_text` a list of single characters.
k = tokenizer.texts_to_sequences(sample_title)
k_text = tokenizer.sequences_to_texts(k)

print(dictionary)
print(f'O input: {sample_title}')
print(f'O Token do input: {k}')
print(f'O decode do Token do input: {k_text}')


# Replace every text column by its sequence of token ids.
# NOTE: this overwrites the text in place, so `data` has to be rebuilt before
# this cell is run a second time.
data['Econder_inputs'] = tokenizer.texts_to_sequences(data['Econder_inputs'])

data['Decoder_inputs'] = tokenizer.texts_to_sequences(data['Decoder_inputs'])

data['Decoder_targets'] = tokenizer.texts_to_sequences(data['Decoder_targets'])

# Seq-Seq

In [None]:
# Second character-level tokenizer (created here, not fitted in this cell).
tokenizer1 = Tokenizer(char_level=True, lower=False)

# Sorted list of the distinct characters in the generative corpus.
vocab = sorted(set(Generative_inputs))

# Lookup tables in both directions: character -> id and id -> character.
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

In [None]:
def vectorize_string(string):
  """Return a NumPy array holding the `char2idx` id of each character of `string`.

  Raises KeyError for a character that is not a key of `char2idx`.
  """
  return np.array(list(map(char2idx.__getitem__, string)))

def get_batch(vectorized_songs, seq_length, batch_size):
  """Sample a random batch of (input, target) windows from an encoded corpus.

  Args:
    vectorized_songs: array of encoded tokens; windows are cut along axis 0.
    seq_length: number of tokens in every window.
    batch_size: number of windows to draw (starts are sampled with replacement).

  Returns:
    (x_batch, y_batch), each reshaped to [batch_size, seq_length]. y_batch holds
    the same windows as x_batch moved one position forward in the corpus.
  """
  last_index = vectorized_songs.shape[0] - 1
  # Window starts are drawn uniformly from range(last_index - seq_length).
  starts = np.random.choice(last_index - seq_length, batch_size)

  inputs, targets = [], []
  for start in starts:
    stop = start + seq_length
    inputs.append(vectorized_songs[start:stop])
    # Target window: same length, shifted by one token.
    targets.append(vectorized_songs[start + 1:stop + 1])

  batch_shape = [batch_size, seq_length]
  return np.reshape(inputs, batch_shape), np.reshape(targets, batch_shape)



# Draw one batch from the encoded corpus: 1000 windows of 30 characters each.
X, Y = get_batch(
    vectorized_songs=vectorize_string(Generative_inputs),
    seq_length=30,
    batch_size=1000,
)

In [None]:
# Character-level language model: for every position of an input window,
# score each vocabulary entry as the next character.
Model = keras.Sequential([
    # Variable-length sequences of character ids. The batch dimension is left
    # free so fit() (default batch size) and direct calls both match the model.
    keras.Input(shape=(None,)),
    layers.Embedding(len(vocab), 250),
    # One output per time step, so predictions line up with Y, which holds one
    # target id per position of each window.
    layers.LSTM(100, return_sequences=True),
    # No activation: the layer emits unnormalised scores (logits).
    layers.Dense(len(vocab)),
])

Model.compile(
    # NOTE(review): 1e-1 is 100x Adam's default learning rate (1e-3); lower it
    # if the loss does not decrease.
    optimizer=keras.optimizers.Adam(1e-1),
    # Targets are integer ids and the model outputs logits.
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.sparse_categorical_accuracy],
)

# Inputs and targets must have the same (batch, seq_length) layout.
assert X.shape == Y.shape

Model.fit(x=X, y=Y, epochs=100)

Model(X)


# Seq2value

In [None]:
df2.head()

# Same layer class, taken from the already-imported `layers` module: the
# `keras.layers.normalization` import path is not available in recent Keras
# releases. The module-level name is kept for any code that still refers to it.
BatchNormalization = layers.BatchNormalization

# Two stacked LSTMs followed by a small dense head that outputs one value.
Model = keras.Sequential()

# Sequences of any length with 3 features per time step.
Model.add(keras.Input(shape=(None, 3)))
Model.add(layers.LSTM(64, return_sequences=True))
Model.add(layers.LSTM(64))
Model.add(layers.Dense(10, activation=keras.activations.elu))
Model.add(BatchNormalization())
Model.add(layers.Dense(1, activation='linear'))

Model.summary()

df2.columns

# Features: Open / Low / High. Target: Price.
X_train, X_test, Y_train, Y_test = train_test_split(
    df2[["Open", "Low", 'High']].to_numpy(),
    df2[['Price']].to_numpy(),
    test_size=0.2,
)

X_train[1]

# The LSTM needs 3-D input (batch, time, features) but X_train is 2-D
# (rows, 3), so each row is fed as a sequence of length 1.
# NOTE(review): assumes the three columns hold numeric values — TODO confirm;
# the float32 cast raises if they were parsed as non-numeric strings.
Model(X_train[:, np.newaxis, :].astype("float32"))

# Encoder-Decoder (Seq2Seq)

In [None]:
MAX_LENGTH_TITLE = 100
MAX_LENGTH_POEM = 100

data_Econder_inputs = pad_sequences(data['Econder_inputs'], maxlen=MAX_LENGTH_TITLE, padding='post', truncating='post')

data_Decoder_inputs = pad_sequences(data['Decoder_inputs'], maxlen=MAX_LENGTH_POEM, padding='post', truncating='post')

data_Decoder_targets = pad_sequences(data['Decoder_targets'], maxlen=MAX_LENGTH_POEM, padding='post', truncating='post')


Encoder_Train, Encoder_Test, _, _ = train_test_split(data_Econder_inputs, data_Decoder_inputs, test_size = 0.2, random_state=5)
Decoder_Train, Decoder_Test, Target_Train, Target_Test  =  train_test_split(data_Decoder_inputs, data_Decoder_targets, test_size = 0.2, random_state=5)

Target_Train = keras.utils.to_categorical(Target_Train, num_classes=len(dictionary))
Target_Test = keras.utils.to_categorical(Target_Test, num_classes=len(dictionary))


In [None]:
# Length of the vocabulary in chars
vocab_size = len(dictionary)

# The embedding dimension
embedding_dim = 76

# Number of RNN units
rnn_units = 50


## Encoder
encode_input = keras.Input(shape=(None,), name="title")
encode_features = layers.Embedding(vocab_size, embedding_dim)(encode_input) 
encoder = layers.LSTM(rnn_units, return_state=True, name = 'encode')
encode_output, state_h, state_c = encoder(encode_features)

# Estado da celula 
encoder_state = [state_h, state_c]



#layers.CuDNNLSTM

## Decoder
decode_input = keras.Input(shape=(None,), name="content")
decode_features = layers.Embedding(vocab_size, embedding_dim)(decode_input)
decode = layers.LSTM(rnn_units, return_state=True, return_sequences=True, name = 'decode')
decode_out, _, _ = decode(decode_features, initial_state = encoder_state)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(decode_out)

# Estado da celula 


model = keras.Model([encode_input, decode_input], decoder_outputs)
model.summary()

#keras.backend.clear_session()

#keras.utils.plot_model(model, "multi_input_and_output_model.png", show_shapes=True)

model.compile(
    optimizer=keras.optimizers.Adam(1e-1),
    loss = "categorical_crossentropy",
    metrics = keras.metrics.categorical_accuracy
)

#class CustomCallback(keras.callbacks.Callback):
#  def on_epoch_end(self, epoch, logs=None):
#    if 0 == epoch%1: 
#        print(f'Época: {epoch} \n output:{logs["predict"]}')
        

model.fit([Encoder_Train, Decoder_Train], Target_Train,
          batch_size = 3000,
          epochs=2, verbose=2)

# Per-step token probabilities for the first two test samples. The decoder is
# fed the reference decoder inputs, so this is a teacher-forced prediction.
pred = model.predict([Encoder_Test[0:2], Decoder_Test[0:2]])

# Most likely token id at every time step of the first sample.
preds = np.argmax(pred[0], axis=1)

# Decode ids back to characters one id at a time; ids the tokenizer does not
# know (such as the padding id 0) decode to an empty string.
enc = tokenizer.sequences_to_texts([[idx] for idx in Encoder_Test[0].tolist()])
# Decode the predicted ids (`preds`), not the raw probability tensor (`pred`).
res = tokenizer.sequences_to_texts([[idx] for idx in preds.tolist()])

print(f'Encode: {"".join(enc)}')
print("".join(res))
