In [None]:
import sys

# Make the project root importable (presumably where the `helper` and
# `implementation` packages imported below live — confirm).
# Guarded so that re-running this cell does not append duplicate entries.
if '/thesis' not in sys.path:
    sys.path.append('/thesis')

Mounted at /content/drive


In [None]:
import json
import pickle
# import logging
# import time

import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt

from helper.character_encoder import DictionaryCharacterEncoder
from helper.prediction import predict_sequence

from implementation.seq2seq.transformer.utils import masked_loss, masked_accuracy
from implementation.seq2seq.transformer.keras_nlp import prepare_batches
from implementation.seq2seq.transformer.keras_nlp import construct_model_w_teacher_forcing

import tensorflow as tf
import keras_nlp

tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [None]:
# Input file paths
# Both files are read as pipe-separated CSV and must provide the columns
# 'input' and 'target'.
jrc_file = '/thesis/data/jrc_person_pairs.csv'
wikidata_file = '/thesis/data/wikidata_person_pairs.csv'

# Output directory: receives the weight checkpoints, the saved model, the
# pickled training history and the pickled training configuration.
model_serialization_path = '/thesis/models/new'

## Loading & Preprocessing

In [None]:
# Preprocessing
# Pairs whose input or target is longer than MAX_SEQ_LENGTH characters are
# filtered out when the CSV files are loaded. The trailing "40" comments in
# this cell are kept from the original — presumably alternative settings.
MAX_SEQ_LENGTH = 30  # 40
NUM_SAMPLES = 550000  # forwarded to prepare_batches
VALIDATION_SPLIT = 0.25  # forwarded to prepare_batches
RANDOM_STATE = 1010  # seeds the DataFrame shuffle; also forwarded to prepare_batches
BATCH_SIZE = 64  # forwarded to prepare_batches

# Model
# All values below are forwarded to construct_model_w_teacher_forcing.
EMBEDDING_DIM = 64
INTERMEDIATE_DIM = 512
NUM_ENCODER_HEADS = 8
NUM_DECODER_HEADS = 8
NUM_ENCODER_LAYERS = 2
NUM_DECODER_LAYERS = 4

DROPOUT = 0.2

# Training
EPOCHS = 12 # 40
# A weights checkpoint is written every CHECKPOINT_FREQ epochs
# (save_freq = steps_per_epoch * CHECKPOINT_FREQ batches).
CHECKPOINT_FREQ = 2

In [None]:
def load_pairs(csv_path, max_len):
    """Read one pipe-separated pairs file and drop over-long rows.

    Args:
        csv_path: Path of a '|'-separated, UTF-8 encoded CSV file that has
            the columns 'input' and 'target'.
        max_len: Maximum number of characters allowed in both columns.

    Returns:
        A DataFrame with only the columns 'input' and 'target', restricted to
        rows where both strings are at most `max_len` characters long.
    """
    pairs = pd.read_csv(csv_path, sep='|', encoding='utf-8')[['input', 'target']]
    short_enough = (
        (pairs['input'].str.len() <= max_len)
        & (pairs['target'].str.len() <= max_len)
    )
    return pairs[short_enough]


pairs_df = load_pairs(jrc_file, MAX_SEQ_LENGTH)
print('Number of JRC pairs:', len(pairs_df))
pairs_df2 = load_pairs(wikidata_file, MAX_SEQ_LENGTH)
print('Number of Wikidata pairs:', len(pairs_df2), '\n')

# Merge both sources and shuffle all rows; the fixed random_state keeps the
# shuffle reproducible across runs.
pairs_df = pd.concat([pairs_df, pairs_df2]).sample(frac=1, random_state=RANDOM_STATE)
pairs_df

Number of JRC pairs: 131636
Number of Wikidata pairs: 434230 



Unnamed: 0,input,target
272102,katharine mccook knox,katherine mccook knox
431611,meri aroni,mary aroni
61867,alejandro foxley,alejandre foxley
424660,niche perez,limber perez
244085,jindrich wankel,heinrich wankel
...,...,...
440185,vitaly lisakovich,vital' lisakovic
122954,adam vojtech,adama vojtecha
27211,david petraeus,david petreaeus
164783,ethel standiford-mehling,ethel standiford-mehlingan


In [None]:
# Character-level encoder for the names.
# NOTE(review): the +2 presumably reserves room for the start/end markers that
# are inserted around each name (cf. insert_markers=True in the prediction
# cell) — confirm against DictionaryCharacterEncoder.
dce = DictionaryCharacterEncoder(max_seq_length=MAX_SEQ_LENGTH+2)
# Build the training and validation batch datasets from the shuffled pairs.
train_batches, val_batches = prepare_batches(pairs_df, dce, NUM_SAMPLES, VALIDATION_SPLIT, BATCH_SIZE, RANDOM_STATE)

In [None]:
# Peek at a single training batch to sanity-check the tensor shapes:
# element 0 of the features is the encoder input, element 1 the decoder input.
for features, labels in train_batches.take(1):
    print(f'Encoder inputs shape: {features[0].shape}')
    print(f'Decoder inputs shape: {features[1].shape}')
    print(f'Targets shape: {labels.shape}')

Encoder inputs shape: (64, 30)
Decoder inputs shape: (64, 31)
Targets shape: (64, 31)


## Model

In [None]:
# Build the encoder/decoder transformer from the hyperparameters defined in
# the configuration cell; the vocabulary size is taken from the encoder's
# character set.
# NOTE(review): max_seq_length is MAX_SEQ_LENGTH here, while the character
# encoder was created with MAX_SEQ_LENGTH+2 and the recorded model summary
# shows an input length of 32 — presumably the helper adds the two marker
# positions itself; confirm.
transformer = construct_model_w_teacher_forcing(
    num_encoder_layers=NUM_ENCODER_LAYERS, 
    num_decoder_layers=NUM_DECODER_LAYERS, 
    unique_tokens=len(dce.charset),
    max_seq_length=MAX_SEQ_LENGTH,
    embedding_dim=EMBEDDING_DIM,
    intermediate_dim=INTERMEDIATE_DIM,
    encoder_heads=NUM_ENCODER_HEADS,
    decoder_heads=NUM_DECODER_HEADS,
    dropout=DROPOUT
)

In [None]:
# Show the layer-by-layer overview of the freshly built model.
transformer.summary()

# Train with Adam (default settings), using the masked loss and masked
# accuracy helpers imported from implementation.seq2seq.transformer.utils.
transformer.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=masked_loss,
    metrics=[masked_accuracy],
)

Model: "transformer_w_tf"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, 32)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 32, 64)       2176        ['encoder_inputs[0][0]']         
                                                                                                  
 position_embedding (PositionEm  (None, 32, 64)      2048        ['embedding[0][0]']              
 bedding)                                                                                         
                                                                                                  
 tf.__operators__.add (TFOpLamb  (None, 32, 64)      0           ['embedding[0][0]'

## Training

In [None]:
# Number of batches in one epoch; used below to express the checkpoint
# interval in batches.
steps_per_epoch = len(train_batches)

# The '{epoch:03d}' part is a plain string (not an f-string) on purpose, so the
# placeholder survives and the callback can fill in the epoch number.
checkpoint_path = f'{model_serialization_path}/checkpoints/' + 'weights-{epoch:03d}'
# Save the weights only, once every CHECKPOINT_FREQ epochs (the integer
# save_freq is expressed in batches).
# NOTE(review): save_best_only is not set, so `monitor` presumably has no
# influence on which checkpoints get written — confirm against the Keras docs.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
                checkpoint_path, 
                monitor='val_masked_accuracy', 
                save_weights_only=True,
                save_freq=int(steps_per_epoch * CHECKPOINT_FREQ), 
                verbose=1)

In [None]:
# Train for EPOCHS epochs, validating on val_batches and writing periodic
# weight checkpoints; the returned History object is pickled further below.
history = transformer.fit(train_batches, epochs=EPOCHS, validation_data=val_batches, callbacks=[checkpoint])

Epoch 1/12
Epoch 2/12
Epoch 2: saving model to /content/drive/MyDrive/HDa/Masterarbeit/Models/transf_mixed_e4/checkpoints/weights-002
Epoch 3/12
Epoch 4/12
Epoch 4: saving model to /content/drive/MyDrive/HDa/Masterarbeit/Models/transf_mixed_e4/checkpoints/weights-004
Epoch 5/12
Epoch 6/12
Epoch 6: saving model to /content/drive/MyDrive/HDa/Masterarbeit/Models/transf_mixed_e4/checkpoints/weights-006
Epoch 7/12
Epoch 8/12
Epoch 8: saving model to /content/drive/MyDrive/HDa/Masterarbeit/Models/transf_mixed_e4/checkpoints/weights-008
Epoch 9/12
Epoch 10/12
Epoch 10: saving model to /content/drive/MyDrive/HDa/Masterarbeit/Models/transf_mixed_e4/checkpoints/weights-010
Epoch 11/12
Epoch 12/12
Epoch 12: saving model to /content/drive/MyDrive/HDa/Masterarbeit/Models/transf_mixed_e4/checkpoints/weights-012


In [None]:
# Persist the trained model under model_serialization_path; it is reloaded
# from the same location before prediction.
transformer.save(f'{model_serialization_path}/')



In [None]:
# Hyperparameters of this run, stored next to the model so the training setup
# can be looked up later.
train_config = {
    'batch_size': BATCH_SIZE,
    'epochs': EPOCHS,
    'num_samples': NUM_SAMPLES,
    'max_seq_length': MAX_SEQ_LENGTH,
    'random_state': RANDOM_STATE,
    'validation_split': VALIDATION_SPLIT,
    'encoder_layers': NUM_ENCODER_LAYERS,
    'decoder_layers': NUM_DECODER_LAYERS,
    'encoder_heads': NUM_ENCODER_HEADS,
    'decoder_heads': NUM_DECODER_HEADS,
    'dropout': DROPOUT,
    'embedding_dim': EMBEDDING_DIM,
    'intermediate_dim': INTERMEDIATE_DIM
}

# Pickle the per-epoch training history first, then the configuration, each
# into its own file inside the model directory.
artifacts = (
    ('/train_history.p', history.history),
    ('/config.p', train_config),
)
for suffix, payload in artifacts:
    with open(model_serialization_path + suffix, 'wb') as sink:
        pickle.dump(payload, sink)

In [None]:
# Reload the saved model from disk, replacing the in-memory `transformer`.
# The custom loss and metric functions are passed via custom_objects so Keras
# can resolve them by name while deserializing.
transformer = tf.keras.models.load_model(f'{model_serialization_path}/', custom_objects={'masked_loss': masked_loss, 'masked_accuracy': masked_accuracy})

## Prediction

In [None]:
def decode_sequences(input_sentence, p=0.1):
    """Generate the model's output string for a single input name.

    The input is converted to character ids (with markers inserted), padded
    or truncated to MAX_SEQ_LENGTH + 2 positions and used as the encoder
    input of the global `transformer`. The decoder side is generated token by
    token with top-p search, starting from the tab start marker and stopping
    at the newline end marker or once MAX_SEQ_LENGTH tokens are reached.

    Relies on the notebook globals `dce`, `transformer` and `MAX_SEQ_LENGTH`.
    Top-p search samples from the model output, so repeated calls may return
    different strings.

    Args:
        input_sentence: The name to convert, as a plain string.
        p: Threshold handed to `keras_nlp.utils.top_p_search`. Defaults to
            0.1, the value that used to be hard-coded here.

    Returns:
        The generated string with surrounding whitespace (which includes the
        tab / newline markers) stripped.
    """
    # Tokenize the encoder input and bring it to the fixed encoder length.
    encoder_input_tokens = tf.keras.utils.pad_sequences(
        dce.to_ids([input_sentence], insert_markers=True),
        padding='post',
        maxlen=MAX_SEQ_LENGTH + 2,
    )

    def token_probability_fn(decoder_input_tokens):
        """Return the model output for the last decoder position only."""
        return transformer([encoder_input_tokens, decoder_input_tokens])[:, -1, :]

    # Generation starts from a 1x1 prompt that holds just the start marker.
    prompt = tf.fill((1, 1), dce.char_index['\t'])
    generated_tokens = keras_nlp.utils.top_p_search(
        token_probability_fn,
        prompt,
        p=p,
        max_length=MAX_SEQ_LENGTH,
        end_token_id=dce.char_index['\n'],
    )

    # Map the ids of the single generated sequence back to characters.
    generated_sentence = ''.join(
        dce.inverse_char_index[token] for token in generated_tokens.numpy()[0]
    )
    return generated_sentence.strip()

# A handful of sample names for a qualitative look at the model's output.
names = [
    'samuel meyer',
    'dmitry medvedev',
    'paulo ricardo',
    'zouheir al qaissi',
    'tarek al bichri',
    'thorsten brotzmann'
]

# Print every input name followed by the generated variant and a blank line.
for source_name in names:
    generated_name = decode_sequences(source_name)
    print(source_name)
    print(generated_name)
    print()


samuel meyer
samuel meyerova

dmitry medvedev
dmitrij medvedev

paulo ricardo
paulo ricardova

zouheir al qaissi
zuheir al qaissi

tarek al bichri
tarek al bichry

thorsten brotzmann
thorsten brotzman

