В проекте реализовано обучение трансформера для машинного перевода на языковой паре en-de и оценка качества получившейся модели.


In [None]:
!pip install tokenizers

Collecting tokenizers
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 7.7 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.12.1


In [None]:
import tensorflow as tf
from tokenizers import BertWordPieceTokenizer

from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace
from tokenizers import normalizers
from tokenizers.normalizers import Lowercase
from tokenizers.trainers import WordPieceTrainer
from tokenizers import decoders

import os
import re
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from string import punctuation
from collections import Counter
from IPython.display import Image
from IPython.core.display import HTML 
import matplotlib.pyplot as plt
%matplotlib inline

Загружаем данные:

In [None]:
!wget https://data.statmt.org/opus-100-corpus/v1.0/supervised/de-en/opus.de-en-train.de
!wget https://data.statmt.org/opus-100-corpus/v1.0/supervised/de-en/opus.de-en-train.en
!wget https://data.statmt.org/opus-100-corpus/v1.0/supervised/de-en/opus.de-en-test.de
!wget https://data.statmt.org/opus-100-corpus/v1.0/supervised/de-en/opus.de-en-test.en

--2022-05-22 21:13:40--  https://data.statmt.org/opus-100-corpus/v1.0/supervised/de-en/opus.de-en-train.de
Resolving data.statmt.org (data.statmt.org)... 129.215.197.184
Connecting to data.statmt.org (data.statmt.org)|129.215.197.184|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 75998572 (72M)
Saving to: ‘opus.de-en-train.de.1’


2022-05-22 21:13:45 (16.6 MB/s) - ‘opus.de-en-train.de.1’ saved [75998572/75998572]

--2022-05-22 21:13:45--  https://data.statmt.org/opus-100-corpus/v1.0/supervised/de-en/opus.de-en-train.en
Resolving data.statmt.org (data.statmt.org)... 129.215.197.184
Connecting to data.statmt.org (data.statmt.org)|129.215.197.184|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 70247384 (67M)
Saving to: ‘opus.de-en-train.en.1’


2022-05-22 21:13:50 (16.2 MB/s) - ‘opus.de-en-train.en.1’ saved [70247384/70247384]

--2022-05-22 21:13:50--  https://data.statmt.org/opus-100-corpus/v1.0/supervised/de-en/opus.de-en-test.de
Reso

In [None]:
# Read the parallel training corpus: one lower-cased sentence per line.
# The files are opened explicitly as UTF-8 (so decoding does not depend on the
# platform default) and are closed deterministically by the context managers.
with open('opus.de-en-train.en', encoding='utf-8') as en_file:
    en_sents = en_file.read().lower().splitlines()
with open('opus.de-en-train.de', encoding='utf-8') as de_file:
    de_sents = de_file.read().lower().splitlines()

Воспользуемся токенизатором.

In [None]:
def _train_wordpiece_tokenizer(corpus_path):
    """Train a WordPiece tokenizer on one corpus file.

    Returns the ``(tokenizer, trainer)`` pair so both remain available.
    """
    # WordPiece segments text into sub-word pieces rather than whole words.
    tokenizer = Tokenizer(WordPiece())
    # Normalisation step stored on the tokenizer: lower-case the input.
    tokenizer.normalizer = normalizers.Sequence([Lowercase()])
    # Pre-tokenisation step stored on the tokenizer: split on whitespace.
    tokenizer.pre_tokenizer = Whitespace()

    trainer = WordPieceTrainer(
        vocab_size=30000,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]"],
    )
    tokenizer.train(files=[corpus_path], trainer=trainer)
    return tokenizer, trainer


# Both languages use the same vocabulary size here, but the vocabularies are
# independent and could just as well have different sizes.
tokenizer_en, trainer_en = _train_wordpiece_tokenizer("opus.de-en-train.en")
tokenizer_de, trainer_de = _train_wordpiece_tokenizer("opus.de-en-train.de")

In [None]:
# Uncomment this cell after training the tokenizers to save them to disk,
# then comment it out again so that re-running the notebook does not
# overwrite the saved tokenizers.
# tokenizer_en.save('tokenizer_en')
# tokenizer_de.save('tokenizer_de')

In [None]:
# Load the tokenizers from the files 'tokenizer_en' and 'tokenizer_de'
# (these files must already exist, i.e. the tokenizers were saved earlier).
tokenizer_en = Tokenizer.from_file("tokenizer_en")
tokenizer_de = Tokenizer.from_file("tokenizer_de")

In [None]:
def encode(text, tokenizer, target=False):
    """Convert *text* to token ids framed by the [CLS] and [SEP] ids.

    Args:
        text: The sentence to encode.
        tokenizer: Object providing ``token_to_id(token)`` and
            ``encode(text).ids``.
        target: Accepted for call-site compatibility; it is not used.

    Returns:
        A list made of the '[CLS]' id, the ids of *text*, and the '[SEP]' id.
    """
    start_id = tokenizer.token_to_id('[CLS]')
    body_ids = tokenizer.encode(text).ids
    end_id = tokenizer.token_to_id('[SEP]')
    return [start_id] + body_ids + [end_id]

In [None]:
# Turn every sentence into a list of token ids wrapped in [CLS] ... [SEP].
# No padding is applied in this cell.
X_en = [encode(sentence, tokenizer_en) for sentence in en_sents]
X_de = [encode(sentence, tokenizer_de, target=True) for sentence in de_sents]

In [None]:
# Average encoded length per language (the [CLS]/[SEP] ids are counted too).
# Despite the variable names these are means, not maxima.
max_len_en = np.mean(list(map(len, X_en)))
max_len_de = np.mean(list(map(len, X_de)))
max_len_en, max_len_de

(18.466743, 18.535871)

In [None]:
# Cap the sequence lengths at 20 (English) and 22 (German). The two values
# differ on purpose, to show that seq2seq does not need equal lengths.
max_len_en = 20
max_len_de = 22

In [None]:
# The padding index taken from the tokenizer must be the same value that is
# passed as `value` to pad_sequences; keep the two in sync.
PAD_IDX = tokenizer_de.token_to_id('[PAD]')
PAD_IDX

3

In [None]:
# Display the English tokenizer's [PAD] id so it can be compared with PAD_IDX.
tokenizer_en.token_to_id('[PAD]')

3

In [None]:
# NOTE(review): pad_sequences is called with its default truncation mode, so
# sequences longer than `maxlen` are cut rather than dropped -- confirm which
# end is cut and that this is the intended behaviour.

# Encoder input. Pad with the id of the tokenizer's [PAD] token, not with
# zero. The English text is fed whole and needs no special handling, since
# every text in this corpus is paired with English.
X_en = tf.keras.preprocessing.sequence.pad_sequences(
    X_en, maxlen=max_len_en, padding='post',
    value=tokenizer_en.token_to_id('[PAD]'))

# The German side is fed to the transformer twice: once as the training
# target and once as the decoder input. Both use the German tokenizer's
# [PAD] id (previously the target was padded with the English one).
de_pad_id = tokenizer_de.token_to_id('[PAD]')

# Target: every sequence without its first token ([CLS]).
X_de_out = tf.keras.preprocessing.sequence.pad_sequences(
    [x[1:] for x in X_de], maxlen=max_len_de - 1, padding='post',
    value=de_pad_id)

# Decoder input: every sequence without its last token, so position i of the
# decoder input lines up with token i + 1 in the target.
X_de_dec = tf.keras.preprocessing.sequence.pad_sequences(
    [x[:-1] for x in X_de], maxlen=max_len_de - 1, padding='post',
    value=de_pad_id)

In [None]:
# One million examples; display the padded array shapes.
X_en.shape, X_de_out.shape

((1000000, 20), (1000000, 21))

In [None]:
# Hold out 5% of the examples for validation. The three arrays are passed to
# a single train_test_split call so they are split together.
(X_en_train, X_en_valid,
 X_de_dec_train, X_de_dec_valid,
 X_de_out_train, X_de_out_valid) = train_test_split(
    X_en, X_de_dec, X_de_out, test_size=0.05)

Перейдём к модели.

In [None]:
def scaled_dot_product_attention(query, key, value, mask):
    """Return softmax(Q K^T / sqrt(d_k) + mask * -1e9) V.

    Args:
        query, key, value: Tensors whose last axis is the feature axis;
            ``key`` is transposed on its last two axes for the product.
        mask: ``None`` or a tensor broadcastable to the score tensor. Entries
            equal to 1 get ``-1e9`` added to their score, so the softmax
            effectively ignores those positions.

    Returns:
        The attention-weighted combination of ``value``.
    """
    key_dim = tf.cast(tf.shape(key)[-1], tf.float32)
    scores = tf.matmul(query, key, transpose_b=True) / tf.math.sqrt(key_dim)

    if mask is not None:
        scores += (mask * -1e9)

    weights = tf.nn.softmax(scores, axis=-1)
    return tf.matmul(weights, value)

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention built on ``scaled_dot_product_attention``.

    The layer is called with a dict holding the keys 'query', 'key', 'value'
    and 'mask'. ``d_model`` must be divisible by ``num_heads``; each head
    works on ``d_model // num_heads`` features.
    """

    def __init__(self, d_model, num_heads, name="multi_head_attention"):
        super().__init__(name=name)
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        # Number of features handled by a single head.
        self.depth = d_model // self.num_heads

        # Input projections for queries, keys and values.
        self.query_dense = tf.keras.layers.Dense(units=d_model)
        self.key_dense = tf.keras.layers.Dense(units=d_model)
        self.value_dense = tf.keras.layers.Dense(units=d_model)

        # Output projection applied after the heads are merged again.
        self.dense = tf.keras.layers.Dense(units=d_model)

    def split_heads(self, inputs, batch_size):
        """Reshape to (batch, -1, heads, depth) and move heads to axis 1."""
        per_head = tf.reshape(
            inputs, shape=(batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, inputs):
        """Project the inputs, attend per head, merge heads and project."""
        query = inputs['query']
        key = inputs['key']
        value = inputs['value']
        mask = inputs['mask']
        batch_size = tf.shape(query)[0]

        # Linear projections first, then the split into heads.
        projected_q = self.query_dense(query)
        projected_k = self.key_dense(key)
        projected_v = self.value_dense(value)

        heads_q = self.split_heads(projected_q, batch_size)
        heads_k = self.split_heads(projected_k, batch_size)
        heads_v = self.split_heads(projected_v, batch_size)

        attended = scaled_dot_product_attention(heads_q, heads_k, heads_v, mask)

        # Undo the head split: heads go back next to depth, then are merged.
        attended = tf.transpose(attended, perm=[0, 2, 1, 3])
        merged = tf.reshape(attended, (batch_size, -1, self.d_model))

        return self.dense(merged)

In [None]:
def create_padding_mask(x):
    """Return a float mask that is 1.0 where ``x`` equals ``PAD_IDX``.

    Two singleton axes are inserted before the last axis, so a 2-D input of
    shape (batch, seq) yields a mask of shape (batch, 1, 1, seq).
    """
    is_pad = tf.math.equal(x, PAD_IDX)
    as_float = tf.cast(is_pad, tf.float32)
    return as_float[:, tf.newaxis, tf.newaxis, :]

In [None]:
def create_look_ahead_mask(x):
    """Combine a "no peeking at later positions" mask with the padding mask.

    A (seq, seq) matrix with ones strictly above the diagonal marks the later
    positions; the element-wise maximum with ``create_padding_mask(x)`` also
    marks the padded positions.
    """
    length = tf.shape(x)[1]
    lower_triangle = tf.linalg.band_part(tf.ones((length, length)), -1, 0)
    future_positions = 1 - lower_triangle
    return tf.maximum(future_positions, create_padding_mask(x))

In [None]:
class PositionalEncoding(tf.keras.layers.Layer):
    """Adds a fixed sinusoidal position table to its input.

    The table for ``position`` positions and ``d_model`` features is computed
    once in the constructor and sliced to the input length on every call.
    """

    def __init__(self, position, d_model):
        super().__init__()
        self.pos_encoding = self.positional_encoding(position, d_model)

    def get_angles(self, position, i, d_model):
        """Return ``position / 10000 ** (2 * (i // 2) / d_model)``."""
        exponent = (2 * (i // 2)) / tf.cast(d_model, tf.float32)
        rates = 1 / tf.pow(10000, exponent)
        return position * rates

    def positional_encoding(self, position, d_model):
        """Build the (1, position, d_model) encoding table.

        Sines of the even-indexed angle columns and cosines of the odd-indexed
        ones are concatenated along the feature axis (not interleaved).
        """
        positions = tf.range(position, dtype=tf.float32)[:, tf.newaxis]
        feature_index = tf.range(d_model, dtype=tf.float32)[tf.newaxis, :]
        angle_rads = self.get_angles(
            position=positions, i=feature_index, d_model=d_model)

        sin_part = tf.math.sin(angle_rads[:, 0::2])
        cos_part = tf.math.cos(angle_rads[:, 1::2])

        table = tf.concat([sin_part, cos_part], axis=-1)
        # Leading axis of size 1 so the table broadcasts over the batch.
        table = table[tf.newaxis, ...]
        return tf.cast(table, tf.float32)

    def call(self, inputs):
        """Add the first ``seq_len`` rows of the table to ``inputs``."""
        seq_len = tf.shape(inputs)[1]
        return inputs + self.pos_encoding[:, :seq_len, :]

In [None]:
def encoder_layer(units, d_model, num_heads, dropout, name="encoder_layer"):
    """Build one encoder block as a Keras functional model.

    The block is self-attention followed by a two-layer feed-forward network;
    each sub-layer is followed by dropout, a residual connection and layer
    normalisation.

    Args:
        units: Width of the hidden (ReLU) feed-forward layer.
        d_model: Feature size of the block's input and output.
        num_heads: Number of attention heads.
        dropout: Dropout rate applied after each sub-layer.
        name: Name of the returned model.

    Returns:
        A ``tf.keras.Model`` taking ``[inputs, padding_mask]``.
    """
    layer_input = tf.keras.Input(shape=(None, d_model), name="inputs")
    pad_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

    # Sub-layer 1: self-attention (query, key and value are the same tensor).
    attn_out = MultiHeadAttention(
        d_model, num_heads, name="attention")({
            'query': layer_input,
            'key': layer_input,
            'value': layer_input,
            'mask': pad_mask
        })
    attn_out = tf.keras.layers.Dropout(rate=dropout)(attn_out)
    attn_out = tf.keras.layers.LayerNormalization(
        epsilon=1e-6)(layer_input + attn_out)

    # Sub-layer 2: position-wise feed-forward network.
    ffn_out = tf.keras.layers.Dense(units=units, activation='relu')(attn_out)
    ffn_out = tf.keras.layers.Dense(units=d_model)(ffn_out)
    ffn_out = tf.keras.layers.Dropout(rate=dropout)(ffn_out)
    ffn_out = tf.keras.layers.LayerNormalization(
        epsilon=1e-6)(attn_out + ffn_out)

    return tf.keras.Model(
        inputs=[layer_input, pad_mask], outputs=ffn_out, name=name)

In [None]:
def encoder(vocab_size,
            num_layers,
            units,
            d_model,
            num_heads,
            dropout,
            max_len,
            name="encoder"):
    """Build the encoder stack as a Keras functional model.

    Token ids are embedded, scaled by sqrt(d_model), given positional
    encodings and dropout, and then passed through ``num_layers`` encoder
    blocks named ``encoder_layer_<i>``.

    Returns:
        A ``tf.keras.Model`` taking ``[inputs, padding_mask]``.
    """
    token_ids = tf.keras.Input(shape=(None,), name="inputs")
    pad_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

    hidden = tf.keras.layers.Embedding(vocab_size, d_model)(token_ids)
    hidden *= tf.math.sqrt(tf.cast(d_model, tf.float32))
    hidden = PositionalEncoding(max_len, d_model)(hidden)
    hidden = tf.keras.layers.Dropout(rate=dropout)(hidden)

    for layer_index in range(num_layers):
        block = encoder_layer(
            units=units,
            d_model=d_model,
            num_heads=num_heads,
            dropout=dropout,
            name=f"encoder_layer_{layer_index}",
        )
        hidden = block([hidden, pad_mask])

    return tf.keras.Model(
        inputs=[token_ids, pad_mask], outputs=hidden, name=name)

In [None]:
def decoder_layer(units, d_model, num_heads, dropout, name="decoder_layer"):
    """Build one decoder block as a Keras functional model.

    The block chains masked self-attention, attention over the encoder
    outputs and a two-layer feed-forward network. Each sub-layer has a
    residual connection and layer normalisation. Dropout is applied after
    the second attention and after the feed-forward network, but not after
    the self-attention.

    Returns:
        A ``tf.keras.Model`` taking
        ``[inputs, encoder_outputs, look_ahead_mask, padding_mask]``.
    """
    layer_input = tf.keras.Input(shape=(None, d_model), name="inputs")
    encoder_states = tf.keras.Input(
        shape=(None, d_model), name="encoder_outputs")
    causal_mask = tf.keras.Input(
        shape=(1, None, None), name="look_ahead_mask")
    pad_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')

    # Sub-layer 1: self-attention restricted by the look-ahead mask.
    self_attn = MultiHeadAttention(
        d_model, num_heads, name="attention_1")(inputs={
            'query': layer_input,
            'key': layer_input,
            'value': layer_input,
            'mask': causal_mask
        })
    self_attn = tf.keras.layers.LayerNormalization(
        epsilon=1e-6)(self_attn + layer_input)

    # Sub-layer 2: queries come from the decoder, keys/values from the encoder.
    cross_attn = MultiHeadAttention(
        d_model, num_heads, name="attention_2")(inputs={
            'query': self_attn,
            'key': encoder_states,
            'value': encoder_states,
            'mask': pad_mask
        })
    cross_attn = tf.keras.layers.Dropout(rate=dropout)(cross_attn)
    cross_attn = tf.keras.layers.LayerNormalization(
        epsilon=1e-6)(cross_attn + self_attn)

    # Sub-layer 3: position-wise feed-forward network.
    ffn_out = tf.keras.layers.Dense(units=units, activation='relu')(cross_attn)
    ffn_out = tf.keras.layers.Dense(units=d_model)(ffn_out)
    ffn_out = tf.keras.layers.Dropout(rate=dropout)(ffn_out)
    ffn_out = tf.keras.layers.LayerNormalization(
        epsilon=1e-6)(ffn_out + cross_attn)

    return tf.keras.Model(
        inputs=[layer_input, encoder_states, causal_mask, pad_mask],
        outputs=ffn_out,
        name=name)

In [None]:
def decoder(vocab_size,
            num_layers,
            units,
            d_model,
            num_heads,
            dropout,
            max_len,
            name='decoder'):
    """Build the decoder stack as a Keras functional model.

    Token ids are embedded, scaled by sqrt(d_model), given positional
    encodings and dropout, and then passed through ``num_layers`` decoder
    blocks named ``decoder_layer_<i>``. Every block receives the same
    encoder outputs and masks.

    Returns:
        A ``tf.keras.Model`` taking
        ``[inputs, encoder_outputs, look_ahead_mask, padding_mask]``.
    """
    token_ids = tf.keras.Input(shape=(None,), name='inputs')
    encoder_states = tf.keras.Input(
        shape=(None, d_model), name='encoder_outputs')
    causal_mask = tf.keras.Input(
        shape=(1, None, None), name='look_ahead_mask')
    pad_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')

    hidden = tf.keras.layers.Embedding(vocab_size, d_model)(token_ids)
    hidden *= tf.math.sqrt(tf.cast(d_model, tf.float32))
    hidden = PositionalEncoding(max_len, d_model)(hidden)
    hidden = tf.keras.layers.Dropout(rate=dropout)(hidden)

    for layer_index in range(num_layers):
        block = decoder_layer(
            units=units,
            d_model=d_model,
            num_heads=num_heads,
            dropout=dropout,
            name=f'decoder_layer_{layer_index}',
        )
        hidden = block(
            inputs=[hidden, encoder_states, causal_mask, pad_mask])

    return tf.keras.Model(
        inputs=[token_ids, encoder_states, causal_mask, pad_mask],
        outputs=hidden,
        name=name)

In [None]:
def transformer(vocab_size,
                num_layers,
                units,
                d_model,
                num_heads,
                dropout,
                max_len,
                name="transformer"):
    """Assemble the full encoder-decoder model.

    Args:
        vocab_size: Pair ``(source_vocab_size, target_vocab_size)``.
        num_layers: Number of blocks in both the encoder and the decoder.
        units: Hidden width of the feed-forward sub-layers.
        d_model: Feature size used throughout the model.
        num_heads: Number of attention heads.
        dropout: Dropout rate.
        max_len: Pair ``(source_max_len, target_max_len)`` used to size the
            positional-encoding tables.
        name: Name of the returned model.

    Returns:
        A ``tf.keras.Model`` mapping ``[inputs, dec_inputs]`` to logits over
        the target vocabulary (no softmax is applied).
    """
    source_ids = tf.keras.Input(shape=(None,), name="inputs")
    target_ids = tf.keras.Input(shape=(None,), name="dec_inputs")

    # Padding mask over the source tokens, used inside the encoder.
    enc_padding_mask = tf.keras.layers.Lambda(
        create_padding_mask, output_shape=(1, 1, None),
        name='enc_padding_mask')(source_ids)

    # Look-ahead + padding mask over the decoder's own input.
    look_ahead_mask = tf.keras.layers.Lambda(
        create_look_ahead_mask,
        output_shape=(1, None, None),
        name='look_ahead_mask')(target_ids)

    # Also computed from the source tokens: it masks the encoder outputs in
    # the decoder's second attention.
    dec_padding_mask = tf.keras.layers.Lambda(
        create_padding_mask, output_shape=(1, 1, None),
        name='dec_padding_mask')(source_ids)

    source_vocab, target_vocab = vocab_size[0], vocab_size[1]

    encoder_model = encoder(
        vocab_size=source_vocab,
        num_layers=num_layers,
        units=units,
        d_model=d_model,
        num_heads=num_heads,
        dropout=dropout,
        max_len=max_len[0],
    )
    encoder_states = encoder_model(inputs=[source_ids, enc_padding_mask])

    decoder_model = decoder(
        vocab_size=target_vocab,
        num_layers=num_layers,
        units=units,
        d_model=d_model,
        num_heads=num_heads,
        dropout=dropout,
        max_len=max_len[1],
    )
    decoder_states = decoder_model(
        inputs=[target_ids, encoder_states, look_ahead_mask, dec_padding_mask])

    logits = tf.keras.layers.Dense(
        units=target_vocab, name="outputs")(decoder_states)

    return tf.keras.Model(
        inputs=[source_ids, target_ids], outputs=logits, name=name)

In [None]:
# This is a classification task, so the standard loss is categorical
# cross-entropy; the targets are vectors of class indices, hence the sparse
# variant. reduction='none' keeps the per-token values so padding can be
# masked out below.
L = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')


def loss_function(y_true, y_pred):
    """Mean cross-entropy over the non-padding target tokens.

    Args:
        y_true: Target token ids; positions equal to ``PAD_IDX`` are ignored.
        y_pred: Logits over the target vocabulary for every position.

    Returns:
        A scalar: the summed loss of the real tokens divided by their count
        (0 if there are no real tokens).
    """
    per_token_loss = L(y_true, y_pred)

    mask = tf.cast(tf.not_equal(y_true, PAD_IDX), per_token_loss.dtype)
    masked_loss = tf.multiply(per_token_loss, mask)

    # Normalise by the number of real tokens. Averaging over every position
    # (as before) made the value depend on how much padding a batch holds.
    return tf.math.divide_no_nan(
        tf.reduce_sum(masked_loss), tf.reduce_sum(mask))

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule with a warm-up phase followed by decay.

    The rate is ``rsqrt(d_model) * min(rsqrt(step), step * warmup_steps**-1.5)``:
    it grows while ``step`` is below ``warmup_steps`` and decays afterwards.

    Args:
        d_model: Model feature size used to scale the rate.
        warmup_steps: Step count up to which the rate keeps increasing.
    """

    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()

        # Plain constructor arguments, kept for get_config().
        self._config = {"d_model": d_model, "warmup_steps": warmup_steps}

        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        # The step may arrive as an integer counter; rsqrt needs a float.
        step = tf.cast(step, tf.float32)

        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialised."""
        return dict(self._config)

In [None]:
# Define the model hyperparameters.
tf.keras.backend.clear_session()

# small model
NUM_LAYERS = 2 # number of blocks in the encoder and in the decoder
D_MODEL = 256 # feature size of the embeddings and of the attention projections;
# inside MultiHeadAttention it is split evenly across the heads
NUM_HEADS = 8 # number of attention heads (D_MODEL must be divisible by it)
UNITS = 512 # hidden width of the feed-forward sub-layers
DROPOUT = 0.1


# average model
# NUM_LAYERS = 6
# D_MODEL = 512
# NUM_HEADS = 8
# UNITS = 2048
# DROPOUT = 0.1

# Build the model.
# The strategy object and its scope are there for the multi-GPU case;
# the model and the optimizer are created inside the scope.
mirrored_strategy = tf.distribute.MirroredStrategy()
with mirrored_strategy.scope():
    model = transformer(
        vocab_size=(tokenizer_en.get_vocab_size(),tokenizer_de.get_vocab_size()),
        num_layers=NUM_LAYERS,
        units=UNITS,
        d_model=D_MODEL,
        num_heads=NUM_HEADS,
        dropout=DROPOUT,
        max_len=[max_len_en, max_len_de])

    # The warm-up schedule is currently disabled; a fixed rate of 0.001 is used.
    # learning_rate = CustomSchedule(D_MODEL)

    optimizer = tf.keras.optimizers.Adam(
        0.001, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

    def accuracy(y_true, y_pred):
        # Per-position accuracy of the arg-max prediction.
        # NOTE(review): no padding mask is applied here, unlike in
        # loss_function -- padded positions appear to count as well; confirm.
        return tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)


    model.compile(optimizer=optimizer, loss=loss_function, metrics=[accuracy])
    # Save only the weights, and only when val_loss improves, checking once per epoch.
    checkpoint = tf.keras.callbacks.ModelCheckpoint('model_ende',
                                                monitor='val_loss',
                                                verbose=1,
                                            save_weights_only=True,
                                            save_best_only=True,
                                            mode='min',
                                            save_freq='epoch')

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


In [None]:
# Train on (English ids, shifted German ids) -> German targets, evaluating on
# the validation split after every epoch; the checkpoint callback stores the
# best weights.
model.fit(
    x=(X_en_train, X_de_dec_train),
    y=X_de_out_train,
    batch_size=200,
    epochs=100,
    validation_data=((X_en_valid, X_de_dec_valid), X_de_out_valid),
    callbacks=[checkpoint],
)

Epoch 1/100
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
Epoch 1: val_loss improved from inf to 1.94233, saving model to model_ruen
Epoch 2/100
 140/4750 [..............................] - ETA: 13:15 - los

KeyboardInterrupt: ignored

In [None]:
def translate(text):
    """Greedily translate an English sentence into German.

    The sentence is lower-cased, encoded and padded to ``max_len_en``. The
    decoder then starts from the German '[CLS]' id and, one token at a time,
    appends the arg-max prediction for the last position until '[SEP]' is
    predicted or the decoder input has reached ``max_len_de`` tokens.

    Args:
        text: The English source sentence.

    Returns:
        The generated token ids (without the leading '[CLS]') decoded by the
        German tokenizer.
    """
    source_ids = encode(text.lower(), tokenizer_en)
    # Pad with the tokenizer's [PAD] id rather than a hard-coded number.
    source_ids = tf.keras.preprocessing.sequence.pad_sequences(
        [source_ids], maxlen=max_len_en, padding='post',
        value=tokenizer_en.token_to_id('[PAD]'))

    sep_id = tokenizer_de.token_to_id('[SEP]')
    output_ids = [tokenizer_de.token_to_id('[CLS]')]

    # The decoder's positional table has max_len_de rows, so the decoder
    # input must never be longer than that. The previous bound allowed one
    # extra token, which made the model call fail on long outputs.
    while len(output_ids) < max_len_de:
        pred = model(
            (source_ids, tf.cast([output_ids], tf.int32)), training=False)
        # Greedy choice for the last generated position only.
        next_id = int(pred.numpy()[0, -1].argmax())
        if next_id == sep_id:
            break
        output_ids.append(next_id)

    return tokenizer_de.decode(output_ids[1:])

Посчитаем BLEU.

In [None]:
# Number of test sentences.
# NOTE: en_sents_test is assigned in the next cell, so that cell has to be
# run before this one.
len(en_sents_test)

2000

In [None]:
import nltk

# Load the test set: one lower-cased sentence per line. The files are opened
# explicitly as UTF-8 and closed by the context managers.
with open('opus.de-en-test.en', encoding='utf-8') as en_test_file:
    en_sents_test = en_test_file.read().lower().splitlines()
with open('opus.de-en-test.de', encoding='utf-8') as de_test_file:
    de_sents_test = de_test_file.read().lower().splitlines()

translations = []

# Translate every English test sentence, printing the index every 100
# sentences as a progress indicator.
for i, source_sentence in enumerate(en_sents_test):
    if i % 100 == 0:
        print(i)
    translations.append(translate(source_sentence))

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900


In [None]:
# First ten English source sentences of the test set.
en_sents_test[:10]

['04:26:35',
 'prähistorische archäologie im dritten reich".',
 "by clicking on 'save profile', you the user agree to these terms and conditions.",
 'i wanted to show you something first.',
 'you have suffered because of shinkichi.',
 'moodle:bg-bab: calendar: day view: friday, 25 august 1989',
 "i mean, most people, they see another person walking down the street with that big heavy bag they're carrying, and they just walk on by.",
 '(iii) the degree of substitution of different factors in pro duction.',
 'content: 32 s., 17 abb., 14 taf.',
 "so i have to withdraw as bobby's attorney."]

In [None]:
# First ten German reference sentences of the test set.
de_sents_test[:10]

['04:26:35',
 'prähistorische archäologie im dritten reich".',
 'die nutzungsbedingungen werden durch das klicken des nutzers auf "profil speichern" vereinbart.',
 'ich wollte dir erst noch etwas zeigen.',
 'du musstest wegen shinkichi leiden.',
 'moodle:bg-bab: kalender: tagesansicht: freitag, 25. august 1989',
 'ich meine, die meisten leuten sehen eine andere person die straße entlang laufen und diesen großen, schweren koffer tragen und sie laufen einfach vorbei.',
 "regressivitätsverhältnis ( ' )",
 'inhalt: 88 s., 14 abb., 6 taf., 1 beil.',
 'aus diesem grund muss ich das mandat für bobby niederlegen.']

In [None]:
# First ten model translations, for comparison with the references.
translations[:10]

['04 : 26 : 35',
 'pru ##to ##mu ##ti ##on arch ##ik ##t ##ten im empir ##e ##um ##um .',
 "durch klicken sie auf ' speichern , können sie die benutzer zu diesen bedingungen und bedingungen zustimmen .",
 'ich wollte euch etwas zeigen .',
 'sie haben wegen shin ##ki ##chi .',
 'moodle : bg - bab : kalender : tagesansicht : freitag , 25 . august 1989',
 'die straße mit der schweren tü ##te , und sie können nur durch ##ziehen .',
 'iii ) die grad der un ##passen ##de faktoren in pro you - erfahrung .',
 'inhalt : 32 s ., 17 abb ., 14 taf .',
 'also muss ich das als bobby ##s anwalt zurück ##nehmen .']

In [None]:
bleus = []

# Score every translation against its gold German sentence.
# Two fixes compared with the earlier version of this cell:
#   * the append now happens inside the loop (previously only the last
#     sentence was scored, so the "average" was a single value);
#   * the gold sentence is the reference and the model output is the
#     hypothesis (they were swapped).
# NOTE(review): the translations still contain '##' word-piece markers (see
# the sample output), and they are tokenised again here -- confirm that this
# yields the same pieces as the reference tokenisation.
for translation, gold_sentence in zip(translations, de_sents_test):
    reference = tokenizer_de.encode(gold_sentence).tokens
    hypothesis = tokenizer_de.encode(translation).tokens

    bleus.append(
        nltk.translate.bleu_score.sentence_bleu([reference], hypothesis))

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [None]:
# Mean of the collected sentence-level BLEU scores, as a percentage.
(sum(bleus)/len(bleus))*100

58.202727065518864