In [2]:
import json
import pickle
import logging
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from helper.character_encoder import DictionaryCharacterEncoder
from helper.prediction import predict_sequence

import tensorflow as tf

tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
# Input file paths
jrc_file = '/thesis/data/jrc_person_pairs.csv'
wikidata_file = '/thesis/data/wikidata_person_pairs.csv'

model_serialization_path = '/thesis/models/transformer_e1'

## Preprocessing

In [4]:
max_sample_seq_length = 30  # 40
num_samples = 500000
epochs = 5

random_state = 1010
validation_split = 0.25

In [5]:
pairs_df = pd.read_csv(jrc_file, sep='|', encoding='utf-8')[['input', 'target']]
pairs_df = pairs_df[(pairs_df['input'].str.len() <= max_sample_seq_length) & (pairs_df['target'].str.len() <= max_sample_seq_length)]
print('Number of JRC pairs', len(pairs_df))
pairs_df2 = pd.read_csv(wikidata_file, sep='|', encoding='utf-8')[['input', 'target']]
pairs_df2 = pairs_df2[(pairs_df2['input'].str.len() <= max_sample_seq_length) & (pairs_df2['target'].str.len() <= max_sample_seq_length)]
print('Number of WikiData pairs', len(pairs_df2), '\n')

pairs_df = pd.concat([pairs_df, pairs_df2]).sample(frac=1, random_state=random_state)
pairs_df

Number of JRC pairs 131636
Number of WikiData pairs 434230 



Unnamed: 0,input,target
272102,katharine mccook knox,katherine mccook knox
431611,meri aroni,mary aroni
61867,alejandro foxley,alejandre foxley
424660,niche perez,limber perez
244085,jindrich wankel,heinrich wankel
...,...,...
440185,vitaly lisakovich,vital' lisakovic
122954,adam vojtech,adama vojtecha
27211,david petraeus,david petreaeus
164783,ethel standiford-mehling,ethel standiford-mehlingan


In [6]:
# Sample training data and retrieve the vectorized representation
train_smpl = pairs_df.sample(num_samples, random_state=random_state)
val_smpl = train_smpl.sample(frac=validation_split, random_state=random_state)
train_smpl = train_smpl.drop(val_smpl.index)

dce = DictionaryCharacterEncoder(max_seq_length=max_sample_seq_length+2)

train_input = train_smpl['input'].tolist()
train_target = train_smpl['target'].tolist()

val_input = val_smpl['input'].tolist()
val_target = val_smpl['target'].tolist()

train_input_ids = dce.to_ids(train_input, insert_markers=True)
train_target_ids = dce.to_ids(train_target, insert_markers=True)

val_input_ids = dce.to_ids(val_input, insert_markers=True)
val_target_ids = dce.to_ids(val_target, insert_markers=True)

In [7]:
train_input_tensors = tf.ragged.constant(train_input_ids)
train_target_tensors = tf.ragged.constant(train_target_ids)

val_input_tensors = tf.ragged.constant(val_input_ids)
val_target_tensors = tf.ragged.constant(val_target_ids)

train_dataset = tf.data.Dataset.from_tensor_slices((train_input_tensors, train_target_tensors))
val_dataset = tf.data.Dataset.from_tensor_slices((val_input_tensors, val_target_tensors))

2023-05-02 10:13:53.019421: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-02 10:13:53.019701: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-02 10:13:53.019868: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-02 10:13:53.647353: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-02 10:13:53.647576: I tensorflow/compile

In [8]:
def prepare_batch_char(input, target):
    input = input[:, :dce.max_seq_length]
    input = input.to_tensor()  # Convert to 0-padded dense Tensor

    target = target[:, :(dce.max_seq_length+1)]
    target_inputs = target[:, :-1].to_tensor()  # Drop the [END] tokens
    target_labels = target[:, 1:].to_tensor()   # Drop the [START] tokens

    return (input, target_inputs), target_labels

In [9]:
BUFFER_SIZE = 20000
BATCH_SIZE = 64

In [10]:
def make_batches_char(ds):
  return (
      ds
      .shuffle(BUFFER_SIZE)
      .batch(BATCH_SIZE)
      .map(prepare_batch_char, tf.data.AUTOTUNE)
      .prefetch(buffer_size=tf.data.AUTOTUNE))

## Test the Dataset 

In [11]:
train_dataset = train_dataset.prefetch(BUFFER_SIZE)
val_dataset = val_dataset.prefetch(BATCH_SIZE)

In [12]:
# Create training and validation set batches.
train_batches = make_batches_char(train_dataset)
val_batches = make_batches_char(val_dataset)

### The embedding and positional encoding layer

In [13]:
def positional_encoding(length, depth):
  depth = depth/2

  positions = np.arange(length)[:, np.newaxis]     # (seq, 1)
  depths = np.arange(depth)[np.newaxis, :]/depth   # (1, depth)
  
  angle_rates = 1 / (10000**depths)         # (1, depth)
  angle_rads = positions * angle_rates      # (pos, depth)

  pos_encoding = np.concatenate(
      [np.sin(angle_rads), np.cos(angle_rads)],
      axis=-1) 

  return tf.cast(pos_encoding, dtype=tf.float32)

In [14]:
class PositionalEmbedding(tf.keras.layers.Layer):
  def __init__(self, vocab_size, d_model):
    super().__init__()
    self.d_model = d_model
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True) 
    self.pos_encoding = positional_encoding(length=2048, depth=d_model)

  def compute_mask(self, *args, **kwargs):
    return self.embedding.compute_mask(*args, **kwargs)

  def call(self, x):
    length = tf.shape(x)[1]
    x = self.embedding(x)
    # This factor sets the relative scale of the embedding and positonal_encoding.
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x + self.pos_encoding[tf.newaxis, :length, :]
    return x


### The base attention layer

In [15]:
class BaseAttention(tf.keras.layers.Layer):
  def __init__(self, **kwargs):
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()

### The cross attention layer

In [16]:
class CrossAttention(BaseAttention):
  def call(self, x, context):
    attn_output, attn_scores = self.mha(
        query=x,
        key=context,
        value=context,
        return_attention_scores=True)
   
    # Cache the attention scores for plotting later.
    self.last_attn_scores = attn_scores

    x = self.add([x, attn_output])
    x = self.layernorm(x)

    return x

### The global self attention layer

In [17]:
class GlobalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

### The causal self attention layer

In [18]:
class CausalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x,
        use_causal_mask = True)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

### The feed forward network

In [19]:
class FeedForward(tf.keras.layers.Layer):
  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()
    self.seq = tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),
      tf.keras.layers.Dense(d_model),
      tf.keras.layers.Dropout(dropout_rate)
    ])
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    x = self.add([x, self.seq(x)])
    x = self.layer_norm(x) 
    return x

### The encoder layer

In [20]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
    super().__init__()

    self.self_attention = GlobalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x):
    x = self.self_attention(x)
    x = self.ffn(x)
    return x

### The encoder

In [21]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads,
               dff, vocab_size, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(
        vocab_size=vocab_size, d_model=d_model)

    self.enc_layers = [
        EncoderLayer(d_model=d_model,
                     num_heads=num_heads,
                     dff=dff,
                     dropout_rate=dropout_rate)
        for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    # `x` is token-IDs shape: (batch, seq_len)
    x = self.pos_embedding(x)  # Shape `(batch_size, seq_len, d_model)`.
    
    # Add dropout.
    x = self.dropout(x)

    for i in range(self.num_layers):
      x = self.enc_layers[i](x)

    return x  # Shape `(batch_size, seq_len, d_model)`.

### The decoder layer

In [22]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self,
               *,
               d_model,
               num_heads,
               dff,
               dropout_rate=0.1):
    super(DecoderLayer, self).__init__()

    self.causal_self_attention = CausalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)
    
    self.cross_attention = CrossAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x, context):
    x = self.causal_self_attention(x=x)
    x = self.cross_attention(x=x, context=context)

    # Cache the last attention scores for plotting later
    self.last_attn_scores = self.cross_attention.last_attn_scores

    x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
    return x

### The decoder

In [23]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size,
               dropout_rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
                                             d_model=d_model)
    self.dropout = tf.keras.layers.Dropout(dropout_rate)
    self.dec_layers = [
        DecoderLayer(d_model=d_model, num_heads=num_heads,
                     dff=dff, dropout_rate=dropout_rate)
        for _ in range(num_layers)]

    self.last_attn_scores = None

  def call(self, x, context):
    # `x` is token-IDs shape (batch, target_seq_len)
    x = self.pos_embedding(x)  # (batch_size, target_seq_len, d_model)

    x = self.dropout(x)

    for i in range(self.num_layers):
      x  = self.dec_layers[i](x, context)

    self.last_attn_scores = self.dec_layers[-1].last_attn_scores

    # The shape of x is (batch_size, target_seq_len, d_model).
    return x

## The Transformer

In [24]:
class Transformer(tf.keras.Model):
  def __init__(self, *, num_enc_layers, num_dec_layers, d_model, num_heads, dff,
               input_vocab_size, target_vocab_size, dropout_rate=0.1):
    super().__init__()
    self.encoder = Encoder(num_layers=num_enc_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=input_vocab_size,
                           dropout_rate=dropout_rate)

    self.decoder = Decoder(num_layers=num_dec_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=target_vocab_size,
                           dropout_rate=dropout_rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inputs):
    # To use a Keras model with `.fit` you must pass all your inputs in the
    # first argument.
    context, x  = inputs

    context = self.encoder(context)  # (batch_size, context_len, d_model)

    x = self.decoder(x, context)  # (batch_size, target_len, d_model)

    # Final linear layer output.
    logits = self.final_layer(x)  # (batch_size, target_len, target_vocab_size)

    try:
      # Drop the keras mask, so it doesn't scale the losses/metrics.
      # b/250038731
      del logits._keras_mask
    except AttributeError:
      pass

    # Return the final output and the attention weights.
    return logits

### Hyperparameters

To keep this example small and relatively fast, the number of layers (`num_layers`), the dimensionality of the embeddings (`d_model`), and the internal dimensionality of the `FeedForward` layer (`dff`) have been reduced.

The base model described in the original Transformer paper used `num_layers=6`, `d_model=512`, and `dff=2048`.

The number of self-attention heads remains the same (`num_heads=8`).


In [25]:
num_enc_layers = 2
num_dec_layers = 4
d_model = 32
dff = 512
num_heads = 8  # 16
dropout_rate = 0.1

In [26]:
transformer = Transformer(
    num_enc_layers=num_enc_layers,
    num_dec_layers=num_dec_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=len(dce.charset),
    target_vocab_size=len(dce.charset),
    dropout_rate=dropout_rate)

## Training

In [27]:
def masked_loss(label, pred):
  mask = label != 0
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
  loss = loss_object(label, pred)

  mask = tf.cast(mask, dtype=loss.dtype)
  loss *= mask

  loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
  return loss


def masked_accuracy(label, pred):
  pred = tf.argmax(pred, axis=2)
  label = tf.cast(label, pred.dtype)
  match = label == pred

  mask = label != 0

  match = match & mask

  match = tf.cast(match, dtype=tf.float32)
  mask = tf.cast(mask, dtype=tf.float32)
  return tf.reduce_sum(match)/tf.reduce_sum(mask)

In [28]:
transformer.compile(
    loss=masked_loss,
    optimizer=tf.keras.optimizers.Adam(),
    metrics=[masked_accuracy])

# transformer.compile(
#    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
#    optimizer=tf.keras.optimizers.Adam(),
#    metrics=["accuracy"])

# loss='categorical_crossentropy', metrics=["accuracy"]

In [29]:
history = transformer.fit(train_batches,
                epochs=epochs,
                validation_data=val_batches)

Epoch 1/5


2023-05-02 10:14:05.436027: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype variant and shape [337347]
	 [[{{node Placeholder/_1}}]]
2023-05-02 10:14:05.436249: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype variant and shape [337347]
	 [[{{node Placeholder/_0}}]]
2023-05-02 10:14:13.082852: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-05-02 10:14:13.264351: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8600
2023-



2023-05-02 10:17:10.152149: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype variant and shape [125000]
	 [[{{node Placeholder/_1}}]]
2023-05-02 10:17:10.152421: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype variant and shape [125000]
	 [[{{node Placeholder/_1}}]]


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [30]:
transformer.save_weights(f'{model_serialization_path}/weights')
with open(model_serialization_path + '/train_history.p', 'wb') as file_pi:
    pickle.dump(history.history, file_pi)

train_config = {
    'batch_size': BATCH_SIZE,
    'epochs': epochs,
    'num_samples': num_samples,
    'random_state': random_state,
    'validation_split': validation_split,
    'encoder_layers': num_enc_layers,
    'decoder_layers': num_dec_layers,
    'attention_heads': num_heads,
    'dropout': dropout_rate,
    'd_model': d_model
}

# dff = 512

with open(model_serialization_path + '/config.p', 'wb') as file_pi:
    pickle.dump(train_config, file_pi)

## Run inference

In [31]:
class Translator(tf.Module):
  def __init__(self, transformer):
    self.transformer = transformer

  def __call__(self, sentence):
    sentence = tf.convert_to_tensor(dce.to_ids([sentence], insert_markers=True))

    encoder_input = sentence

    # As the output language is English, initialize the output with the
    # English `[START]` token.
    start_end = tf.convert_to_tensor([dce.char_index['\t'], dce.char_index['\t']], dtype=tf.int64)
    start = start_end[0][tf.newaxis]
    end = start_end[1][tf.newaxis]

    # `tf.TensorArray` is required here (instead of a Python list), so that the
    # dynamic-loop can be traced by `tf.function`.
    output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
    output_array = output_array.write(0, start)

    for i in tf.range(max_sample_seq_length):
      output = tf.transpose(output_array.stack())
      predictions = self.transformer([encoder_input, output], training=False)

      # Select the last token from the `seq_len` dimension.
      predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.

      predicted_id = tf.argmax(predictions, axis=-1)

      # Concatenate the `predicted_id` to the output which is given to the
      # decoder as its input.
      output_array = output_array.write(i+1, predicted_id[0])

      if predicted_id == end:
        break

    output = tf.transpose(output_array.stack())
    # The output shape is `(1, tokens)`.
    # text = tokenizers.en.detokenize(output)[0]  # Shape: `()`.
    # tokens = tokenizers.en.lookup(output)[0]

    return output
    # return text, tokens

In [32]:
translator = Translator(transformer)

In [33]:
names = [
    'samuel meyer',
    'dmitry medvedev',
    'paulo ricardo',
    'zouheir al qaissi',
    'tarek al bichri',
    'thorsten brotzmann'
]

for name in names:
    print(name)
    output = translator(name)
    for pred_id in output.numpy()[0]:
        print(dce.inverse_char_index[pred_id] if pred_id != 1 else ' ', end='')
    print()

samuel meyer
	esamuel meyer                 
dmitry medvedev
	imitri medvedev               
paulo ricardo
	olicardo paulo                
zouheir al qaissi
	al qaissi                     
tarek al bichri
	al bichri                     
thorsten brotzmann
	olthon brotzmann              


In [34]:
transformer = Transformer(
    num_enc_layers=num_enc_layers,
    num_dec_layers=num_dec_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=len(dce.charset),
    target_vocab_size=len(dce.charset),
    dropout_rate=dropout_rate)

transformer.load_weights(f'{model_serialization_path}/weights')


translator = Translator(transformer)

names = [
    'samuel meyer',
    'dmitry medvedev',
    'paulo ricardo',
    'zouheir al qaissi',
    'tarek al bichri',
    'thorsten brotzmann'
]

for name in names:
    print(name)
    output = translator(name)
    for pred_id in output.numpy()[0]:
        print(dce.inverse_char_index[pred_id] if pred_id != 1 else ' ', end='')
    print()

samuel meyer
	esamuel meyer                 
dmitry medvedev
	imitri medvedev               
paulo ricardo
	olicardo paulo                
zouheir al qaissi
	al qaissi                     
tarek al bichri
	al bichri                     
thorsten brotzmann
	olthon brotzmann              
