# Machine Translation con Redes Transformer

Tomado de [Traductor con redes Transformer](https://github.com/codificandobits/Traductor_con_redes_Transformer)



In [31]:
pip install keras-transformer



In [32]:
import numpy as np
from keras_transformer import get_model, decode
from pickle import load
from google.colab import drive
np.random.seed(0)

In [33]:
# Leer set de entrenamiento
drive.mount('/content/drive')
filename = '/content/drive/My Drive/videos/2020-07-06/english-spanish.pkl'

dataset = load(open(filename, 'rb'))
print(dataset[120000,0])
print(dataset[120000,1])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
tom is a new yorker but he doesnt have a new york accent
tom es neoyorquino pero no tiene acento de nueva york


In [34]:
# Crear "tokens"
source_tokens = []
for sentence in dataset[:,0]:
  source_tokens.append(sentence.split(' '))
print(source_tokens[120000])

target_tokens = []
for sentence in dataset[:,1]:
  target_tokens.append(sentence.split(' '))
print(target_tokens[120000])

['tom', 'is', 'a', 'new', 'yorker', 'but', 'he', 'doesnt', 'have', 'a', 'new', 'york', 'accent']
['tom', 'es', 'neoyorquino', 'pero', 'no', 'tiene', 'acento', 'de', 'nueva', 'york']


In [35]:
def build_token_dict(token_list):
  token_dict = {
      '<PAD>': 0,
      '<START>': 1,
      '<END>': 2
  }
  for tokens in token_list:
    for token in tokens:
      if token not in token_dict:
        token_dict[token] = len(token_dict)
  return token_dict

In [36]:
source_token_dict = build_token_dict(source_tokens)
target_token_dict = build_token_dict(target_tokens)
target_token_dict_inv = {v:k for k,v in target_token_dict.items()}

print(source_token_dict)
print(target_token_dict)
print(target_token_dict_inv)

{'<PAD>': 0, '<START>': 1, '<END>': 2, 've': 3, 'vete': 4, 'vaya': 5, 'vayase': 6, 'hola': 7, 'corre': 8, 'corran': 9, 'corra': 10, 'corred': 11, 'quien': 12, 'orale': 13, 'fuego': 14, 'incendio': 15, 'disparad': 16, 'ayuda': 17, 'socorro': 18, 'auxilio': 19, 'salta': 20, 'salte': 21, 'parad': 22, 'para': 23, 'pare': 24, 'espera': 25, 'esperen': 26, 'continua': 27, 'continue': 28, 'date': 29, 'prisa': 30, 'daos': 31, 'dese': 32, 'me': 33, 'oculte': 34, 'escondi': 35, 'ocultaba': 36, 'escondia': 37, 'corri': 38, 'corria': 39, 'lo': 40, 'intento': 41, 'he': 42, 'ganado': 43, 'oh': 44, 'no': 45, 'tomatelo': 46, 'con': 47, 'soda': 48, 'disparen': 49, 'dispara': 50, 'dispare': 51, 'sonrie': 52, 'al': 53, 'ataque': 54, 'atacad': 55, 'ataquen': 56, 'ataca': 57, 'levanta': 58, 'ahora': 59, 'mismo': 60, 'id': 61, 'vayan': 62, 'ya': 63, 'tengo': 64, 'pillas': 65, 'entendiste': 66, 'el': 67, 'corrio': 68, 'metete': 69, 'adentro': 70, 'abrazame': 71, 'preocupo': 72, 'cai': 73, 'hui': 74, 'escape':

In [37]:
# Agregar start, end y pad a cada frase del set de entrenamiento
encoder_tokens = [['<START>'] + tokens + ['<END>'] for tokens in source_tokens]
decoder_tokens = [['<START>'] + tokens + ['<END>'] for tokens in target_tokens]
output_tokens = [tokens + ['<END>'] for tokens in target_tokens]

source_max_len = max(map(len, encoder_tokens))
target_max_len = max(map(len, decoder_tokens))

encoder_tokens = [tokens + ['<PAD>']*(source_max_len-len(tokens)) for tokens in encoder_tokens]
decoder_tokens = [tokens + ['<PAD>']*(target_max_len-len(tokens)) for tokens in decoder_tokens]
output_tokens = [tokens + ['<PAD>']*(target_max_len-len(tokens)) for tokens in output_tokens ]

In [38]:
print(encoder_tokens[120000])

['<START>', 'tom', 'is', 'a', 'new', 'yorker', 'but', 'he', 'doesnt', 'have', 'a', 'new', 'york', 'accent', '<END>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']


In [39]:
encoder_input = [list(map(lambda x: source_token_dict[x], tokens)) for tokens in encoder_tokens]
decoder_input = [list(map(lambda x: target_token_dict[x], tokens)) for tokens in decoder_tokens]
output_decoded = [list(map(lambda x: [target_token_dict[x]], tokens)) for tokens in output_tokens]

print(encoder_input[120000])

[1, 56, 258, 120, 197, 12666, 2914, 32, 1577, 140, 120, 197, 5385, 4287, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [40]:
# Crear la red transformer
model = get_model(
    token_num = max(len(source_token_dict),len(target_token_dict)),
    embed_dim = 32,
    encoder_num = 2,
    decoder_num = 2,
    head_num = 4,
    hidden_dim = 128,
    dropout_rate = 0.05,
    use_same_embed = False,
)
model.compile('adam', 'sparse_categorical_crossentropy')
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Encoder-Input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Encoder-Token-Embedding (Embedd [(None, None, 32), ( 808608      Encoder-Input[0][0]              
__________________________________________________________________________________________________
Encoder-Embedding (TrigPosEmbed (None, None, 32)     0           Encoder-Token-Embedding[0][0]    
__________________________________________________________________________________________________
Encoder-1-MultiHeadSelfAttentio (None, None, 32)     4224        Encoder-Embedding[0][0]          
____________________________________________________________________________________________

In [41]:
# Entrenamiento
x = [np.array(encoder_input), np.array(decoder_input)]
y = np.array(output_decoded)

# model.fit(x,y, epochs=15, batch_size=32)

filename = '/content/drive/My Drive/videos/2020-07-06/translator.h5'
model.load_weights(filename)

In [46]:
def translate(sentence):
  sentence_tokens = [tokens + ['<END>', '<PAD>'] for tokens in [sentence.split(' ')]]
  tr_input = [list(map(lambda x: source_token_dict[x], tokens)) for tokens in sentence_tokens][0]
  decoded = decode(
      model, 
      tr_input, 
      start_token = target_token_dict['<START>'],
      end_token = target_token_dict['<END>'],
      pad_token = target_token_dict['<PAD>']
  )

  print('Frase original: {}'.format(sentence))
  print('Traducción: {}'.format(' '.join(map(lambda x: target_token_dict_inv[x], decoded[1:-1]))))

In [56]:
translate('the day is warm and sunny')

Frase original: the day is warm and sunny
Traducción: el dia esta calentito y sol
