In [None]:
pip install keras-transformer



In [None]:
import numpy as np
from keras_transformer import get_model, decode
from pickle import load
from google.colab import drive
# Seed NumPy's global random generator so NumPy-based randomness repeats across runs.
# NOTE(review): only NumPy is seeded here; confirm whether the Keras backend needs its
# own seed for the training below to be reproducible.
np.random.seed(0)

In [None]:
# Mount Google Drive so the pickled sentence pairs can be read from the Colab VM.
drive.mount('/content/drive')
filename = '/content/drive/My Drive/transformers/english-spanish.pkl'

# NOTE(security): unpickling can execute arbitrary code; only load files you created or trust.
# The context manager closes the file handle as soon as the data has been read.
with open(filename, 'rb') as dataset_file:
  dataset = load(dataset_file)

# Show one sentence pair: column 0 is the source sentence, column 1 its translation.
print(dataset[11000,0])
print(dataset[11000,1])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
i bumped my knee
me di un golpe en la rodilla


In [None]:
# Build word-level tokens by splitting every sentence on single spaces.
# Column 0 of the dataset feeds the source side, column 1 the target side.
source_tokens = [text.split(' ') for text in dataset[:,0]]
print(source_tokens[11000])

target_tokens = [text.split(' ') for text in dataset[:,1]]
print(target_tokens[11000])

['i', 'bumped', 'my', 'knee']
['me', 'di', 'un', 'golpe', 'en', 'la', 'rodilla']


In [None]:
def build_token_dict(token_list):
  """Map every distinct token to an integer id.

  Ids 0, 1 and 2 are reserved for '<PAD>', '<START>' and '<END>'. All other
  tokens receive consecutive ids in order of first appearance.

  Args:
    token_list: Iterable of token sequences (one sequence per sentence).

  Returns:
    dict mapping token -> integer id.
  """
  vocab = {'<PAD>': 0, '<START>': 1, '<END>': 2}
  for sentence in token_list:
    for word in sentence:
      # setdefault leaves known words untouched; for a new word the current
      # dictionary size is the next free id.
      vocab.setdefault(word, len(vocab))
  return vocab

In [None]:
# Give every distinct word an integer id, with one vocabulary per language.
source_token_dict = build_token_dict(source_tokens)
target_token_dict = build_token_dict(target_tokens)

# Reverse lookup for the target language: integer id -> word.
target_token_dict_inv = {index: word for word, index in target_token_dict.items()}

print(source_token_dict, target_token_dict, target_token_dict_inv, sep='\n')

In [None]:
# Frame every training sentence with the special markers:
#   encoder input : <START> w1 ... wn <END>
#   decoder input : <START> w1 ... wn <END>
#   decoder target: w1 ... wn <END>
encoder_tokens = [['<START>', *words, '<END>'] for words in source_tokens]
decoder_tokens = [['<START>', *words, '<END>'] for words in target_tokens]
output_tokens = [[*words, '<END>'] for words in target_tokens]

# Length of the longest framed sentence on each side.
source_max_len = max(len(seq) for seq in encoder_tokens)
target_max_len = max(len(seq) for seq in decoder_tokens)

# Right-pad every shorter sentence with <PAD> so all sequences on a side share one length.
encoder_tokens = [seq + (source_max_len - len(seq)) * ['<PAD>'] for seq in encoder_tokens]
decoder_tokens = [seq + (target_max_len - len(seq)) * ['<PAD>'] for seq in decoder_tokens]
output_tokens = [seq + (target_max_len - len(seq)) * ['<PAD>'] for seq in output_tokens]

In [None]:
# Sanity check: show one framed and padded encoder sequence
# (<START>, the words, <END>, then <PAD> filler up to source_max_len).
print(encoder_tokens[11000])

['<START>', 'i', 'bumped', 'my', 'knee', '<END>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']


In [None]:
# Replace every token with its integer id from the matching vocabulary.
encoder_input = [[source_token_dict[tok] for tok in seq] for seq in encoder_tokens]
decoder_input = [[target_token_dict[tok] for tok in seq] for seq in decoder_tokens]
# Each target id is wrapped in its own one-element list, giving the labels a trailing axis of size 1.
output_decoded = [[[target_token_dict[tok]] for tok in seq] for seq in output_tokens]

print(encoder_input[11000])

[1, 16, 2488, 484, 2489, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
# Build the transformer network.
model = get_model(
    # Vocabulary size: the larger of the English and Spanish dictionaries.
    token_num=max(len(source_token_dict), len(target_token_dict)),
    # Number of elements in each input embedding vector.
    embed_dim=32,
    # Number of encoders (6 recommended).
    encoder_num=2,
    # Number of decoders (6 recommended).
    decoder_num=2,
    # Number of attention blocks/heads (8 recommended).
    head_num=4,
    # Number of neurons in the hidden layer.
    hidden_dim=128,
    dropout_rate=0.05,
    # During training the English and Spanish sentences are represented differently
    # (the embedding is not shared between the two sides).
    use_same_embed=False,
)

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "functional_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Encoder-Input (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
Encoder-Token-Embedding (Embedd [(None, None, 32), ( 808608      Encoder-Input[0][0]              
__________________________________________________________________________________________________
Encoder-Embedding (TrigPosEmbed (None, None, 32)     0           Encoder-Token-Embedding[0][0]    
__________________________________________________________________________________________________
Encoder-1-MultiHeadSelfAttentio (None, None, 32)     4224        Encoder-Embedding[0][0]          
_______________________________________________________________________________________

In [None]:
# Train the model.
# Inputs are the encoder and decoder id sequences; labels are the target id sequences.
x = [np.array(sequences) for sequences in (encoder_input, decoder_input)]
y = np.array(output_decoded)

model.fit(x=x, y=y, epochs=15, batch_size=32)

In [None]:
def translate(sentence):
  """Translate a source sentence with the trained model and print the result.

  Args:
    sentence: Source sentence as one string of space-separated words. Every
      word must exist in `source_token_dict`, either exactly as written or
      after lower-casing.

  Returns:
    None. The original sentence and its translation are printed.

  Raises:
    KeyError: If a word is not part of the source vocabulary.
  """
  # Frame the sentence the same way as the training encoder inputs
  # (<START> ... <END>), followed by a single <PAD>.
  sentence_tokens = ['<START>'] + sentence.split(' ') + ['<END>', '<PAD>']

  # Convert the tokens to their numeric ids.
  tr_input = []
  for token in sentence_tokens:
    if token not in source_token_dict:
      # Fall back to the lower-cased word before giving up, so capitalised
      # input can still match a lower-case vocabulary entry.
      lowered = token.lower()
      if lowered not in source_token_dict:
        raise KeyError('Token {!r} is not in the source vocabulary'.format(token))
      token = lowered
    tr_input.append(source_token_dict[token])

  decoded = decode(
      model,
      tr_input,
      start_token = target_token_dict['<START>'], # token that opens a sentence
      end_token = target_token_dict['<END>'], # token that closes a sentence
      pad_token = target_token_dict['<PAD>'] # padding token
  )

  print('Frase original: {}'.format(sentence))
  # Drop the first and last decoded ids (the start/end markers) and map the
  # remaining ids back to words. Uses the `decoded` result, not the `decode` function.
  print('Traducción: {}'.format(' '.join(target_token_dict_inv[token_id] for token_id in decoded[1:-1])))

In [None]:
# Try the trained model on a short sample sentence.
# NOTE(review): translate looks words up in source_token_dict, and the dataset sample
# printed earlier is all lower-case, so the capitalised 'Red' may not be in the
# vocabulary as written — confirm.
translate('Red car')