In [4]:
import zipfile
import os

caminho_zip = './dataset.zip'
diretorio_destino = './'

# Certifique-se de que o diretório de destino exista ou crie-o
if not os.path.exists(diretorio_destino):
    os.makedirs(diretorio_destino)

# Descompactar o arquivo .zip
with zipfile.ZipFile(caminho_zip, 'r') as zip_ref:
    zip_ref.extractall(diretorio_destino)

print(f'A pasta {caminho_zip} foi descompactada em {diretorio_destino}.')

A pasta ./dataset.zip foi descompactada em ./.


## Uma Rede Codificador-Decodificador para Tradução Neural de Máquina


In [5]:
!pip install pathlib



In [6]:
import os
import numpy as np
from pathlib import Path
import tensorflow as tf
import numpy as np
from pathlib import Path
import tensorflow as tf

In [7]:
import pandas as pd

df = pd.read_csv('eng-por.txt', sep='\t')
df.columns = ['Ingles', 'Portugues', 'descatar']
df = df.drop(columns=['descatar'])
df.columns = ['', '']
df = df.reset_index(drop=True)
df.to_csv('dataset.txt', index=False, sep='\t')

In [8]:
path = str(os.getcwd())

text = (Path(path) / "dataset.txt").read_text()

pairs = [line.split("\t") for line in text.splitlines()]

np.random.shuffle(pairs)

sentences_en, sentences_es = zip(*pairs) # separates the pairs into 2 lists


for i in range(3):
    print(sentences_en[i], "=>", sentences_es[i])

The station was deserted. => A estação estava deserta.
We are in the era of atomic energy. => Estamos na era da energia nuclear.
Can I borrow your car? => Posso pegar emprestado o teu carro?


In [9]:
vocab_size = 1000
max_length = 50

text_vec_layer_en = tf.keras.layers.TextVectorization(
 vocab_size, output_sequence_length=max_length)

text_vec_layer_es = tf.keras.layers.TextVectorization(
 vocab_size, output_sequence_length=max_length)

In [10]:
text_vec_layer_en.adapt(sentences_en)
text_vec_layer_es.adapt([f"startofseq {s} endofseq" for s in sentences_es])
text_vec_layer_en.get_vocabulary()[:10]
text_vec_layer_es.get_vocabulary()[:10]

['', '[UNK]', 'startofseq', 'endofseq', 'tom', 'que', 'o', 'não', 'eu', 'de']

In [11]:
X_train = tf.constant(sentences_en[:100_000])
X_valid = tf.constant(sentences_en[100_000:])

X_train_dec = tf.constant([f"startofseq {s}" for s in sentences_es[:100_000]])
X_valid_dec = tf.constant([f"startofseq {s}" for s in sentences_es[100_000:]])

Y_train = text_vec_layer_es([f"{s} endofseq" for s in sentences_es[:100_000]])
Y_valid = text_vec_layer_es([f"{s} endofseq" for s in sentences_es[100_000:]])


In [12]:
encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)


In [13]:
embed_size = 128

encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_es(decoder_inputs)

encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,
 mask_zero=True)
decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,
 mask_zero=True)

In [14]:
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

In [15]:
encoder = tf.keras.layers.LSTM(512, return_state=True)
encoder_outputs, *encoder_state = encoder(encoder_embeddings)

decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)

In [27]:
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],
 outputs=[Y_proba])

model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
 metrics=["accuracy"])

model.fit((X_train, X_train_dec), Y_train, epochs=10,
 validation_data=((X_valid, X_valid_dec), Y_valid))

model.save("modelo_tradutor", save_format='tf')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
def translate(sentence_en):
     translation = ""
     for word_idx in range(max_length):
         X = np.array([sentence_en]) # encoder input
         X_dec = np.array(["startofseq " + translation]) # decoder input
         y_proba = model.predict((X, X_dec))[0, word_idx] # last token's probas
         predicted_word_id = np.argmax(y_proba)
         predicted_word = text_vec_layer_es.get_vocabulary()[predicted_word_id]
         if predicted_word == "endofseq":
             break
         translation += " " + predicted_word
     return translation.strip()

In [29]:
translate("I like soccer")




'gosto de futebol'

In [51]:
vetor = ['I like to eat', 'They are going', 'My children is good', 'I dont like my family', 'I dont know read']

In [49]:
for frase in vetor:
    print(translate(frase))

gosto de comer
elas estão indo
meus filhos são bons
eu não gosto da minha família
eu não sei ler


In [31]:
modelo_pt = tf.keras.models.load_model("modelo_tradutor")