# Attention is All You Need

In [2]:
import subprocess
import sys


IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    subprocess.run("pip install datasets nltk gensim einops evaluate unidecode", shell=True)
    subprocess.run("python -m nltk.downloader punkt", shell=True)

In [3]:
import math
import os
import pickle

import torch
import nltk
import einops
import evaluate

from unidecode import unidecode
from datasets import load_dataset, load_from_disk

nltk.download('wordnet')
nltk.download('omw-1.4')

bleu = evaluate.load("bleu")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

cuda


# Data Preprocessing

In [7]:
path_to_dataset_dir = "/content"

In [None]:
wmt14 = load_dataset("wmt14", "de-en")

Downloading builder script:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.11k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/8.72k [00:00<?, ?B/s]

Downloading and preparing dataset wmt14/de-en (download: 1.58 GiB, generated: 1.27 GiB, post-processed: Unknown size, total: 2.85 GiB) to /root/.cache/huggingface/datasets/wmt14/de-en/1.0.0/6aa64c5c4f2c1c217718c6d6266aad92d1229e761c57379c53752b8c0e55c93b...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/658M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/919M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/80.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/38.7M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split:   0%|          | 0/4508785 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3003 [00:00<?, ? examples/s]

Dataset wmt14 downloaded and prepared to /root/.cache/huggingface/datasets/wmt14/de-en/1.0.0/6aa64c5c4f2c1c217718c6d6266aad92d1229e761c57379c53752b8c0e55c93b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Let's leave only first 500000 from the train dataset for computational simplicity:

In [None]:
wmt14["train"] = wmt14["train"].select(range(500000))

In [None]:
wmt14

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 500000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3003
    })
})

Create tokenizers:

In [17]:
tokenizer = nltk.WordPunctTokenizer()
lemmatizer = nltk.WordNetLemmatizer()

def tokenize_pipeline(sentence):
    sentence = sentence.lower()
    tokens = tokenizer.tokenize(sentence)
    return [lemmatizer.lemmatize(token) for token in tokens]
    
def tokenize_sentence(example):
  return {"en_tokens": tokenize_pipeline(example["translation"]["en"]), "de_tokens": tokenize_pipeline(unidecode(example["translation"]["de"]))}

In [None]:
wmt14 = wmt14.map(tokenize_sentence)

  0%|          | 0/500000 [00:00<?, ?ex/s]

  0%|          | 0/3000 [00:00<?, ?ex/s]

  0%|          | 0/3003 [00:00<?, ?ex/s]

Filter dataset by sequence length:

In [None]:
wmt14 = wmt14.filter(lambda example: len(example['de_tokens']) <= 254 and len(example["en_tokens"]) <= 254)

  0%|          | 0/500 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [None]:
wmt14

DatasetDict({
    train: Dataset({
        features: ['translation', 'en_tokens', 'de_tokens'],
        num_rows: 499997
    })
    validation: Dataset({
        features: ['translation', 'en_tokens', 'de_tokens'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['translation', 'en_tokens', 'de_tokens'],
        num_rows: 3003
    })
})

In [None]:
wmt14.save_to_disk(os.path.join(path_to_dataset_dir, "wmt14_500k"))

Flattening the indices:   0%|          | 0/500 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/3 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/4 [00:00<?, ?ba/s]

In [10]:
# wmt14 = load_from_disk(os.path.join(path_to_dataset_dir, "wmt14_500k"))

Build the vocabulary:

In [11]:
PAD_token = 0
BOS_token = 1
EOS_token = 2
UNK_token = 3

class Dictionary:
    def __init__(self, lang, freq_threshold=5):
        self.lang = lang
        self.freq_threshold = freq_threshold
        self.word2index = {"<PAD>": 0, "<BOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.word2count = {}
        self.index2word = {0: "<PAD>", 1: "<BOS>", 2: "<EOS>", 3: "<UNK>"}
        self.n_words = 4

    def add_sentence(self, sentence):
        for word in sentence:
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2count:
            self.word2count[word] = 1
        else:
            self.word2count[word] += 1
        
        if self.word2count[word] == self.freq_threshold:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.n_words += 1
    
    def __len__(self):
      return self.n_words

In [None]:
en_dictionary = Dictionary("en", freq_threshold=10)
de_dictionary = Dictionary("de", freq_threshold=20)

def create_dicts(example):
  en_dictionary.add_sentence(example["en_tokens"])
  de_dictionary.add_sentence(example["de_tokens"])

wmt14["train"].map(create_dicts)

print(f"Num of words in EN dict: {len(en_dictionary)}")
print(f"Num of words in DE dict: {len(de_dictionary)}")

  0%|          | 0/499997 [00:00<?, ?ex/s]

Num of words in EN dict: 15546
Num of words in DE dict: 20217


In [12]:
def save_object(obj, filename):
    with open(filename, "wb") as outp:  
        pickle.dump(obj, outp)

def read_object(filename):
  with open(filename, "rb") as inp:
    return pickle.load(inp)

In [None]:
save_object(en_dictionary, os.path.join(path_to_dataset_dir, "en_dictionary.pkl"))
save_object(de_dictionary, os.path.join(path_to_dataset_dir, "de_dictionary.pkl"))

In [14]:
# en_dictionary = read_object("/content/en_dictionary.pkl")
# de_dictionary = read_object("/content/de_dictionary.pkl")

# print(f"Num of words in EN dict: {len(en_dictionary)}")
# print(f"Num of words in DE dict: {len(de_dictionary)}")

Create Dataset:

In [15]:
class TranslationDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, dataset, en_dictionary=en_dictionary, de_dictionary=de_dictionary):
        self.tokenizer = tokenizer
        self.en_dictionary = en_dictionary
        self.de_dictionary = de_dictionary
        
        def convert_words_to_ids(example):
            return {"en_ids": [self.en_dictionary.word2index[token] if token in self.en_dictionary.word2index else UNK_token for token in example["en_tokens"]],
                    "de_ids": [self.de_dictionary.word2index[token] if token in self.de_dictionary.word2index else UNK_token for token in example["de_tokens"]]}
        
        dataset = dataset.map(convert_words_to_ids)
        self.dataset = dataset
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, index):
        example = self.dataset[index]
        return torch.tensor(example["de_ids"]), torch.tensor(example["en_ids"])

In [18]:
train_dataset = TranslationDataset(tokenize_pipeline, wmt14["train"])
valid_dataset = TranslationDataset(tokenize_pipeline, wmt14["validation"])
test_dataset = TranslationDataset(tokenize_pipeline, wmt14["test"])

  0%|          | 0/499997 [00:00<?, ?ex/s]

  0%|          | 0/3000 [00:00<?, ?ex/s]

  0%|          | 0/3003 [00:00<?, ?ex/s]

Define DataLoaders:

In [19]:
def collate_fn(batch):
  # sort the batch w.r.t source length in decreasing order
  data_batch = sorted(batch, key=lambda x: - len(x[0]))

  de_batch, en_batch = [], []

  for (de_item, en_item) in data_batch:
    de_batch.append(torch.cat([torch.tensor([BOS_token]), de_item, torch.tensor([EOS_token])], dim=0))
    en_batch.append(torch.cat([torch.tensor([BOS_token]), en_item, torch.tensor([EOS_token])], dim=0))
  
  de_batch = torch.nn.utils.rnn.pad_sequence(de_batch, padding_value=PAD_token, batch_first=True)
  en_batch = torch.nn.utils.rnn.pad_sequence(en_batch, padding_value=PAD_token, batch_first=True)
  
  return de_batch, en_batch

In [20]:
batch_size = 16

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [None]:
# src, trg = next(iter(train_dataloader))
# src.shape, trg.shape

# Model

In [21]:
def attention(K, V, Q, num_heads, mask=None):
    batch_size, hidden_dim = Q.size(0), Q.size(2)
    key_len, value_len, query_len = K.size(1), V.size(1), Q.size(1) 
    
    K = K.reshape(batch_size, key_len, num_heads, -1) # (batch_size, key_len, num_heads, head_dim)
    V = V.reshape(batch_size, value_len, num_heads, -1) # (batch_size, value_len, num_heads, head_dim)
    Q = Q.reshape(batch_size, query_len, num_heads, -1) # (batch_size, query_len, num_heads, head_dim)
    
    energy = torch.einsum('bqhd,bkhd->bhqk', [Q, K]) # (batch_size, num_heads, query_len, key_len)
    
    if mask is not None:
      energy = energy.masked_fill(mask == 0, -torch.inf)

    attention = torch.softmax(energy / math.sqrt(hidden_dim // num_heads), dim=3)
    result_headed = torch.einsum('bhql,blhd->bqhd', [attention, V]) # (batch_size, query_len, num_heads, head_dim)
    return result_headed.reshape(batch_size, query_len, hidden_dim)


class AttentionModule(torch.nn.Module):
    def __init__(self, hidden_dim: int, num_heads: int):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_heads = num_heads

        self.k_linear = torch.nn.Linear(hidden_dim, hidden_dim)
        self.v_linear = torch.nn.Linear(hidden_dim, hidden_dim)
        self.q_linear = torch.nn.Linear(hidden_dim, hidden_dim)

        self.out_linear = torch.nn.Linear(hidden_dim, hidden_dim)
    
    def forward(self, keys, values, query, mask):
        K = self.k_linear(keys)
        V = self.v_linear(values)
        Q = self.q_linear(query)
        attention_output = attention(K, V, Q, self.num_heads, mask)
        return self.out_linear(attention_output) + query


class MLP(torch.nn.Module):
    def __init__(self, hidden_dim: int):
        super().__init__()
        
        self.linear_0 = torch.nn.Linear(hidden_dim, 4 * hidden_dim)
        self.linear_1 = torch.nn.Linear(4 * hidden_dim, hidden_dim)
    
    def forward(self, hidden_state):
        return self.linear_1(torch.relu(self.linear_0(hidden_state))) + hidden_state

In [22]:
class EncoderTransformerLayer(torch.nn.Module):
    def __init__(self, hidden_dim: int, num_heads: int, dropout: float = 0.1):
        super().__init__()
        
        self.attention = AttentionModule(hidden_dim, num_heads)
        self.mlp = MLP(hidden_dim)

        self.norm = torch.nn.LayerNorm(hidden_dim)
        self.dropout = torch.nn.Dropout(dropout)
        
    def forward(self, value, key, query, mask):
        attn_output = self.dropout(self.norm(self.attention(value, key, query, mask)))
        mlp_output = self.dropout(self.norm(self.mlp(attn_output)))
        return mlp_output

In [23]:
class Encoder(torch.nn.Module):
    def __init__(self, de_dictionary_size: int, hidden_dim: int, num_layers: int, num_heads: int, dropout: float = 0.1, max_seq_len: int = 256):
        super().__init__()
        
        self.word_embedding = torch.nn.Embedding(de_dictionary_size, hidden_dim)
        self.pos_embedding = torch.nn.Embedding(max_seq_len, hidden_dim)
        self.layers = torch.nn.ModuleList(
            [
                EncoderTransformerLayer(
                    hidden_dim,
                    num_heads,
                    dropout
                )
                for _ in range(num_layers)
            ]
        )

        self.dropout = torch.nn.Dropout(dropout)
        
    def forward(self, inputs, mask):
        batch_size, seq_len = inputs.shape
        positions = torch.arange(0, seq_len).expand(batch_size, seq_len).to(device)
        hidden_dim = self.dropout(self.word_embedding(inputs) + self.pos_embedding(positions))

        for layer in self.layers:
          hidden_dim = layer(hidden_dim, hidden_dim, hidden_dim, mask)
        
        return hidden_dim

In [24]:
class DecoderTransformerLayer(torch.nn.Module):
    def __init__(self, hidden_dim: int, num_heads: int, dropout: float = 0.1):
        super().__init__() 

        self.self_attention = AttentionModule(hidden_dim, num_heads) # аттенш на то, что происходит в переводе
        self.out_attention = EncoderTransformerLayer(hidden_dim, num_heads) # аттенш на то, что происходит в оригинале

        self.norm = torch.nn.LayerNorm(hidden_dim)
        self.dropout = torch.nn.Dropout(dropout)
    
    def forward(self, hidden_state, encoder_layer_output, src_mask, trg_mask):
        self_attn_output = self.dropout(self.norm(self.self_attention(hidden_state, hidden_state, hidden_state, trg_mask)))
        output = self.out_attention(encoder_layer_output, encoder_layer_output, self_attn_output, src_mask)
        return output

In [25]:
class Decoder(torch.nn.Module):
    def __init__(self, en_dictionary_size: int, hidden_dim: int, num_layers: int, num_heads: int, dropout: float = 0.1, max_seq_len: int = 256):
        super().__init__()
        
        self.word_embedding = torch.nn.Embedding(en_dictionary_size, hidden_dim)
        self.pos_embedding = torch.nn.Embedding(max_seq_len, hidden_dim)
        self.layers = torch.nn.ModuleList(
            [
                DecoderTransformerLayer(hidden_dim, num_heads)
                for _ in range(num_layers)
            ]
        )
        
        self.lm_head = torch.nn.Linear(hidden_dim, en_dictionary_size)
        self.dropout = torch.nn.Dropout(dropout)
        
    def forward(self, inputs, encoder_output, src_mask, trg_mask):
        batch_size, seq_len = inputs.shape
        positions = torch.arange(0, seq_len).expand(batch_size, seq_len).to(device)
        inputs = self.dropout(self.word_embedding(inputs) + self.pos_embedding(positions))
        
        for layer in self.layers:
          inputs = layer(inputs, encoder_output, src_mask, trg_mask)
        
        return self.lm_head(inputs)

In [26]:
class TranslationModel(torch.nn.Module):
    def __init__(self, de_dictionary_size: int, en_dictionary_size: int, hidden_dim: int = 512, num_layers: int = 6, num_heads: int = 8, dropout: float = 0.1):
        super().__init__()
        
        self.encoder = Encoder(de_dictionary_size, hidden_dim, num_layers, num_heads, dropout)
        self.decoder = Decoder(en_dictionary_size, hidden_dim, num_layers, num_heads, dropout)
        
    def make_src_mask(self, src):
      src_pad_mask = (src != PAD_token).unsqueeze(1).unsqueeze(2)
      return src_pad_mask

    def make_trg_mask(self, trg):
      trg_pad_mask = (trg != PAD_token).unsqueeze(1).unsqueeze(2)

      batch_size, trg_len = trg.shape
      trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
            batch_size, 1, trg_len, trg_len
        ).bool().to(device)
      trg_mask = trg_pad_mask & trg_sub_mask
      return trg_mask
    
    def forward(self, inputs):
        src_ids, trg_ids = inputs
        src_mask = self.make_src_mask(src_ids).to(device)
        trg_mask = self.make_trg_mask(trg_ids).to(device)
        encoder_output = self.encoder(src_ids, src_mask)
        decoder_output = self.decoder(trg_ids, encoder_output, src_mask, trg_mask)
        return decoder_output

In [None]:
# src = torch.tensor([[1, 5, 6, 4, 3, 8, 5, 2, 0]], device=device)
# trg = torch.tensor([[1]], device=device)
# print(f"src shape: {src.shape}, trg shape: {trg.shape}")

# src_vocab_size = 20
# trg_vocab_size = 20

# toy_model = TranslationModel(src_vocab_size, trg_vocab_size).to(device)
# out = toy_model((src, trg))
# print(out.shape)

# Training

In [27]:
model = TranslationModel(len(de_dictionary), len(en_dictionary)).to(device)
num_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Model has {num_parameters} trainable parameters")

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = torch.nn.CrossEntropyLoss(ignore_index=PAD_token)

Model has 70674106 trainable parameters


In [30]:
def train_epoch(model, optimizer, criterion, dataloader):
  model.train()

  running_loss = 0.0

  for src, trg in dataloader:

    src = src.to(device)
    trg = trg.to(device) # (batch_size, seq_len)

    optimizer.zero_grad()

    output = model((src, trg[:, :-1])) # (batch_size, seq_len, en_vocab_len)

    trg = trg[:, 1:].contiguous().view(-1) # (batch_size * (seq_len - 1))
    en_vocab_len = output.size(-1)
    output = output.view(-1, en_vocab_len) # (batch_size * (seq_len - 1), en_vocab_len)
    
    loss = criterion(output, trg)
    loss.backward()
    optimizer.step()

    running_loss += loss.item()
  
  epoch_loss = running_loss / len(dataloader.dataset)

  return epoch_loss

def eval_epoch(model, criterion, dataloader):
  model.eval()

  running_loss = 0.0

  with torch.no_grad():
    for src, trg in dataloader:
      src = src.to(device)
      trg = trg.to(device)

      output = model((src, trg[:, :-1]))

      trg = trg[:, 1:].contiguous().view(-1) 
      en_vocab_len = output.size(-1)
      output = output.view(-1, en_vocab_len) 
      
      loss = criterion(output, trg)

      running_loss += loss.item()
  
  epoch_loss = running_loss / len(dataloader.dataset)

  return epoch_loss

In [28]:
model_checkpoint = "model.pt"

In [None]:
num_epochs = 10

best_valid_loss = torch.inf

for epoch in range(num_epochs):
  train_loss = train_epoch(model, optimizer, criterion, train_dataloader)

  valid_loss = eval_epoch(model, criterion, valid_dataloader)

  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), os.path.join(path_to_dataset_dir, model_checkpoint))
    print(f"Model is saved to {os.path.join(path_to_dataset_dir, model_checkpoint)}")

  print(f"Epoch #{epoch + 1}:")
  print(f"Training Loss: {train_loss}")
  print(f"Evaluation Loss: {valid_loss}")
  print()

Model is saved to /kaggle/working/model.pt
Epoch #1:
Training Loss: 0.23895120679426546
Evaluation Loss: 0.2453935843706131

Model is saved to /kaggle/working/model.pt
Epoch #2:
Training Loss: 0.17692864883786136
Evaluation Loss: 0.2179473903576533

Model is saved to /kaggle/working/model.pt
Epoch #3:
Training Loss: 0.15656631406147575
Evaluation Loss: 0.20528336278597514

Model is saved to /kaggle/working/model.pt
Epoch #4:
Training Loss: 0.14537321306412085
Evaluation Loss: 0.19557876733938853

Model is saved to /kaggle/working/model.pt
Epoch #5:
Training Loss: 0.1379451284970821
Evaluation Loss: 0.19105472354094188

Model is saved to /kaggle/working/model.pt
Epoch #6:
Training Loss: 0.13259402738617818
Evaluation Loss: 0.1870951584180196

Model is saved to /kaggle/working/model.pt
Epoch #7:
Training Loss: 0.1284219163497316
Evaluation Loss: 0.18381192831198376

Model is saved to /kaggle/working/model.pt
Epoch #8:
Training Loss: 0.12504319114016071
Evaluation Loss: 0.1815273265043894

In [31]:
model.load_state_dict(torch.load(os.path.join(path_to_dataset_dir, model_checkpoint)))
test_loss = eval_epoch(model, criterion, test_dataloader)
print(f"Test Loss: {test_loss}")

Test Loss: 0.19202897697935253


# Inference

In [32]:
def decode(src, model, max_len=100):
  model.eval()
  src = tokenize_pipeline(unidecode(src))
  src = ["<BOS>"] + src + ["<EOS>"]
  src_tensor = torch.tensor([de_dictionary.word2index[token] if token in de_dictionary.word2index else UNK_token for token in src]).unsqueeze(0).to(device)
  src_mask = model.make_src_mask(src_tensor).to(device)

  with torch.no_grad():
    encoded_src = model.encoder(src_tensor, src_mask)
      
  trg_ids = [1] # BOS token
  trg_tokens = []
  while len(trg_tokens) <= max_len:
    trg_tensor = torch.tensor(trg_ids).unsqueeze(0).to(device)
    trg_mask = model.make_trg_mask(trg_tensor)
        
    with torch.no_grad():
      predictions = model.decoder(trg_tensor, encoded_src, src_mask, trg_mask)
      last_pred_id = predictions[:, -1, :].argmax(-1).item()
        
      if last_pred_id == EOS_token:
        break
        
      trg_ids.append(last_pred_id)
      trg_tokens.append(en_dictionary.index2word[last_pred_id])
      
  return " ".join(trg_tokens)

In [None]:
src = "Guten Morgen!"
trg = decode(src, model)
print(trg)

good morning !


# Result
BLEU score on test set:

In [None]:
references = [[" ".join(reference)] for reference in wmt14["test"]["en_tokens"]]
predictions = [decode(example["de"], model) for example in wmt14["test"]["translation"]]
test_bleu = bleu.compute(predictions=predictions, references=references)
print(test_bleu)

{'bleu': 0.08580581099395075, 'precisions': [0.3068673290473407, 0.12017219061380535, 0.05509228604535673, 0.026682184116679834], 'brevity_penalty': 1.0, 'length_ratio': 1.466634284326471, 'translation_length': 102660, 'reference_length': 69997}


In [36]:
for example in wmt14["test"]["translation"][:10]:
  input = example["de"]
  prediction = decode(input, model)
  ground_truth = example["en"]

  print(f'{"Input:":15s}: {input}')
  print(f'{"Prediction":15s}: {prediction}')
  print(f'{"Ground truth":15s}: {ground_truth}')
  print()

Input:         : Gutach: Noch mehr Sicherheit für Fußgänger
Prediction     : <UNK> : more safety for pedestrian
Ground truth   : Gutach: Increased safety for pedestrians

Input:         : Sie stehen keine 100 Meter voneinander entfernt: Am Dienstag ist in Gutach die neue B 33-Fußgängerampel am Dorfparkplatz in Betrieb genommen worden - in Sichtweite der älteren Rathausampel.
Prediction     : they are not 100 metre apart : on tuesday , the new 1 33 - <UNK> <UNK> wa taken on the front floor - wa <UNK> in the old <UNK> .
Ground truth   : They are not even 100 metres apart: On Tuesday, the new B 33 pedestrian lights in Dorfparkplatz in Gutach became operational - within view of the existing Town Hall traffic lights.

Input:         : Zwei Anlagen so nah beieinander: Absicht oder Schildbürgerstreich?
Prediction     : two plant are so close to the same level : intention or <UNK> ?
Ground truth   : Two sets of lights so close to one another: intentional or just a silly error?

Input:         