In [10]:
# Add the project directory to the import path; the `helper.*` and
# `implementation.*` imports in the next cell presumably resolve from here.
import sys
sys.path.append('/thesis')

In [11]:
import json
import pickle
# import logging
# import time

import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt

from helper.character_encoder import DictionaryCharacterEncoder
from helper.prediction import predict_sequence

from implementation.seq2seq.transformer.utils import masked_loss, masked_accuracy
from implementation.seq2seq.transformer.keras_nlp import prepare_batches
from implementation.seq2seq.transformer.keras_nlp import construct_model_w_teacher_forcing

import tensorflow as tf
import keras_nlp

# Show the GPUs TensorFlow can see (an empty list means none are visible).
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [12]:
# Input file paths
# jrc_file = '/thesis/data/jrc_person_pairs.csv'
# wikidata_file = '/thesis/data/wikidata_person_to_en_norm.csv'
# Active dataset: a '|'-separated CSV from which the 'input' and 'target'
# columns are read in the loading cell.
wikidata_file = '/thesis/data/experiments/org_train_val_e1.csv'

# Output directory: receives the checkpoints, the saved model, the pickled
# training history and the pickled training config.
model_serialization_path = '/thesis/models/e1_mixed_org'

## Loading & Preprocessing

In [13]:
# Preprocessing
MAX_SEQ_LENGTH = 60        # longest 'input'/'target' string (in characters) kept by the length filter
NUM_SAMPLES = 500000 # 350000
VALIDATION_SPLIT = 0.25    # passed to prepare_batches; presumably the validation fraction of NUM_SAMPLES
RANDOM_STATE = 1010        # seed handed to prepare_batches
BATCH_SIZE = 128

# Model
EMBEDDING_DIM = 128
INTERMEDIATE_DIM = 1024
NUM_ENCODER_HEADS = 4
NUM_DECODER_HEADS = 4
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 6

DROPOUT = 0.2

# Training
EPOCHS = 60                # upper bound; the EarlyStopping callback may end training sooner
CHECKPOINT_FREQ = 10       # number of epochs between weight checkpoints

In [14]:
# Earlier variant that mixed the JRC and Wikidata person pairs (kept for reference):
# pairs_df = pd.read_csv(jrc_file, sep='|', encoding='utf-8')[['input', 'target']]
# pairs_df = pairs_df[(pairs_df['input'].str.len() <= MAX_SEQ_LENGTH) & (pairs_df['target'].str.len() <= MAX_SEQ_LENGTH)]
# print('Number of JRC pairs:', len(pairs_df))
# pairs_df2 = pd.read_csv(wikidata_file, sep='|', encoding='utf-8')[['input', 'target']]
# pairs_df2 = pairs_df2[(pairs_df2['input'].str.len() <= MAX_SEQ_LENGTH) & (pairs_df2['target'].str.len() <= MAX_SEQ_LENGTH)]
# print('Number of Wikidata pairs:', len(pairs_df2), '\n')
# pairs_df = pd.concat([pairs_df, pairs_df2]).sample(frac=1, random_state=RANDOM_STATE)

# Read only the two columns the model needs.
pairs_df = pd.read_csv(wikidata_file, sep='|', encoding='utf-8')[['input', 'target']]

# Keep a pair only when both sides fit into MAX_SEQ_LENGTH characters.
input_fits = pairs_df['input'].str.len() <= MAX_SEQ_LENGTH
target_fits = pairs_df['target'].str.len() <= MAX_SEQ_LENGTH
pairs_df = pairs_df[input_fits & target_fits]
pairs_df

Unnamed: 0,input,target
0,sabmiller,sabmiller plc
1,sabmiller plc,sabmiller
2,international astronomical union,den internasjonale astronomiske union
3,international astronomical union,union astronomique internationale
4,international astronomical union,internationale astronomische union
...,...,...
523583,mundial 2010,mistrovstvi sveta ve fotbale 2010
523584,sveuciliste u sydneyu,universidade de sydney
523585,palestiense staot,palastinensische autonomiebehorde
523586,europaischem stabilitatsmechanismus,europaischen stabilitatsmechanismus


In [15]:
# Character-level encoder. The "+2" presumably reserves room for the start/end
# markers that `to_ids(..., insert_markers=True)` adds -- TODO confirm in
# helper.character_encoder.
dce = DictionaryCharacterEncoder(max_seq_length=MAX_SEQ_LENGTH+2, charset='extended')
# Build the batched training and validation datasets from the filtered pairs.
# Each training batch yields ((encoder_inputs, decoder_inputs), targets), as
# the shape-inspection cell below shows.
train_batches, val_batches = prepare_batches(pairs_df, dce, NUM_SAMPLES, VALIDATION_SPLIT, BATCH_SIZE, RANDOM_STATE)

2023-08-31 15:37:57.769877: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-31 15:37:57.770226: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-31 15:37:57.770457: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-31 15:37:58.310386: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-31 15:37:58.310676: I tensorflow/compile

In [16]:
# Peek at a single training batch to sanity-check the tensor shapes.
for inputs, targets in train_batches.take(1):
    encoder_batch, decoder_batch = inputs[0], inputs[1]
    print(f'Encoder inputs shape: {encoder_batch.shape}')
    print(f'Decoder inputs shape: {decoder_batch.shape}')
    print(f'Targets shape: {targets.shape}')

Encoder inputs shape: (128, 57)
Decoder inputs shape: (128, 54)
Targets shape: (128, 54)


2023-08-31 15:38:16.452986: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype variant and shape [375000]
	 [[{{node Placeholder/_1}}]]
2023-08-31 15:38:16.453210: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype variant and shape [375000]
	 [[{{node Placeholder/_1}}]]


## Model

In [17]:
# Build the encoder-decoder transformer (trained with teacher forcing, per the
# builder's name). The vocabulary size is taken from the encoder's charset.
# NOTE(review): MAX_SEQ_LENGTH (60) is passed here while the character encoder
# uses MAX_SEQ_LENGTH+2; the model summary reports an encoder input length of
# 62, so the builder appears to add the two marker positions itself -- confirm
# in implementation.seq2seq.transformer.keras_nlp.
transformer = construct_model_w_teacher_forcing(
    num_encoder_layers=NUM_ENCODER_LAYERS, 
    num_decoder_layers=NUM_DECODER_LAYERS, 
    unique_tokens=len(dce.charset),
    max_seq_length=MAX_SEQ_LENGTH,
    embedding_dim=EMBEDDING_DIM,
    intermediate_dim=INTERMEDIATE_DIM,
    encoder_heads=NUM_ENCODER_HEADS,
    decoder_heads=NUM_DECODER_HEADS,
    dropout=DROPOUT
)

In [18]:
# Print the layer table first, then attach the training configuration.
transformer.summary()

# Previous, unmasked setup (kept for reference):
# transformer.compile("rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Adam with the Keras default hyperparameters, combined with the project's
# masked loss and masked accuracy.
adam_optimizer = tf.keras.optimizers.Adam()
transformer.compile(
    optimizer=adam_optimizer,
    loss=masked_loss,
    metrics=[masked_accuracy],
)

Model: "transformer_w_tf"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, 62)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 62, 128)      5888        ['encoder_inputs[0][0]']         
                                                                                                  
 position_embedding (PositionEm  (None, 62, 128)     7936        ['embedding[0][0]']              
 bedding)                                                                                         
                                                                                                  
 tf.__operators__.add (TFOpLamb  (None, 62, 128)     0           ['embedding[0][0]'

## Training

In [19]:
# Number of batches in one pass over the training data.
steps_per_epoch = len(train_batches)

# '{epoch:03d}' is kept outside the f-string on purpose: Keras fills it in
# when a checkpoint is written (e.g. 'weights-010').
checkpoint_path = f'{model_serialization_path}/checkpoints/' + 'weights-{epoch:03d}'

# An integer save_freq counts batches, so this saves every CHECKPOINT_FREQ epochs.
batches_between_checkpoints = int(steps_per_epoch * CHECKPOINT_FREQ)
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_path,
    save_freq=batches_between_checkpoints,
    save_weights_only=True,
    monitor='val_masked_accuracy',
    verbose=1,
)

# Stop once validation accuracy has not improved by at least `min_delta` for
# `patience` consecutive epochs, and roll back to the best weights seen.
es_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_masked_accuracy',
    mode='max',
    min_delta=0.0015,
    patience=3,
    restore_best_weights=True,
)

In [20]:
# Train for at most EPOCHS epochs; early stopping may end the run sooner.
training_callbacks = [checkpoint, es_callback]
history = transformer.fit(
    train_batches,
    validation_data=val_batches,
    epochs=EPOCHS,
    callbacks=training_callbacks,
)

Epoch 1/60


2023-08-31 15:38:18.253654: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype variant and shape [375000]
	 [[{{node Placeholder/_1}}]]
2023-08-31 15:38:18.253869: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype variant and shape [375000]
	 [[{{node Placeholder/_0}}]]
2023-08-31 15:38:28.571841: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-08-31 15:38:28.803396: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x8c2ba00 initialized for platfor



2023-08-31 15:47:03.501534: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype variant and shape [125000]
	 [[{{node Placeholder/_0}}]]
2023-08-31 15:47:03.501799: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype variant and shape [125000]
	 [[{{node Placeholder/_0}}]]


Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 10: saving model to /thesis/models/e1_mixed_org/checkpoints/weights-010
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 20: saving model to /thesis/models/e1_mixed_org/checkpoints/weights-020
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 30: saving model to /thesis/models/e1_mixed_org/checkpoints/weights-030
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60


In [21]:
# Persist the full trained model (not just the weights) to the output directory.
transformer.save(f'{model_serialization_path}/')

2023-08-31 20:57:13.084750: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,?,128]
	 [[{{node inputs}}]]
2023-08-31 20:57:13.095902: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'decoder_sequence' with dtype float and shape [?,?,128]
	 [[{{node decoder_sequence}}]]
2023-08-31 20:57:13.107214: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'decoder_sequence' with dtype float and shape [?,?,128]
	 

INFO:tensorflow:Assets written to: /thesis/models/e1_mixed_org/assets


INFO:tensorflow:Assets written to: /thesis/models/e1_mixed_org/assets


In [22]:
# Snapshot of the settings this run was launched with. 'epochs' records the
# configured maximum (EPOCHS), not the number of epochs actually trained.
train_config = {
    'batch_size': BATCH_SIZE,
    'epochs': EPOCHS,
    'num_samples': NUM_SAMPLES,
    'max_seq_length': MAX_SEQ_LENGTH,
    'random_state': RANDOM_STATE,
    'validation_split': VALIDATION_SPLIT,
    'encoder_layers': NUM_ENCODER_LAYERS,
    'decoder_layers': NUM_DECODER_LAYERS,
    'encoder_heads': NUM_ENCODER_HEADS,
    'decoder_heads': NUM_DECODER_HEADS,
    'dropout': DROPOUT,
    'embedding_dim': EMBEDDING_DIM,
    'intermediate_dim': INTERMEDIATE_DIM
}

# Pickle the per-epoch metric history and the config next to the saved model.
artifacts = (
    ('/train_history.p', history.history),
    ('/config.p', train_config),
)
for file_name, payload in artifacts:
    with open(model_serialization_path + file_name, 'wb') as file_pi:
        pickle.dump(payload, file_pi)

In [23]:
# Reload the model that was just saved, replacing the in-memory `transformer`.
# The custom loss and metric are passed via `custom_objects` so Keras can
# resolve them by name during deserialization.
transformer = tf.keras.models.load_model(f'{model_serialization_path}/', custom_objects={'masked_loss': masked_loss, 'masked_accuracy': masked_accuracy})

## Prediction

In [24]:
def decode_sequences(input_sentence, p=0.1, max_length=None, seed=None):
    """Generate one output string for ``input_sentence`` using top-p sampling.

    The sentence is encoded with the notebook-level character encoder ``dce``
    and fed to the notebook-level ``transformer``. Decoding starts from the
    start marker ``'\\t'`` and stops at the end marker ``'\\n'`` or after
    ``max_length`` tokens.

    Args:
        input_sentence: A single source string (not a batch).
        p: Cumulative-probability threshold for top-p (nucleus) sampling.
            Defaults to 0.1, the value that was previously hard-coded.
        max_length: Maximum number of generated tokens. ``None`` (default)
            uses ``MAX_SEQ_LENGTH``, as before.
        seed: Optional seed forwarded to the sampler for reproducible output.
            ``None`` (default) keeps the previous unseeded behaviour.

    Returns:
        The decoded string with leading/trailing whitespace removed, which
        also removes the ``'\\t'`` / ``'\\n'`` markers at the edges.
    """
    if max_length is None:
        max_length = MAX_SEQ_LENGTH

    # Tokenize the encoder input and right-pad it to the encoder length.
    # NOTE(review): pad_sequences truncates from the front by default
    # (truncating='pre'), so an over-long input would lose its leading
    # tokens -- confirm callers never exceed MAX_SEQ_LENGTH characters.
    encoder_input_tokens = tf.keras.utils.pad_sequences(
        dce.to_ids([input_sentence], insert_markers=True),
        padding='post',
        maxlen=MAX_SEQ_LENGTH+2
    )

    # Gives the sampler the model output at the last decoder position, i.e.
    # the distribution over the next token given everything decoded so far.
    def token_probability_fn(decoder_input_tokens):
        return transformer([encoder_input_tokens, decoder_input_tokens])[:, -1, :]

    # A batch of one sequence holding only the start marker.
    prompt = tf.fill((1, 1), dce.char_index['\t'])
    generated_tokens = keras_nlp.utils.top_p_search(
        token_probability_fn,
        prompt,
        p=p,
        max_length=max_length,
        seed=seed,
        end_token_id=dce.char_index['\n'],
    )
    generated_sentence = ''.join(
        dce.inverse_char_index[tkn] for tkn in generated_tokens.numpy()[0]
    )
    return generated_sentence.strip()

# Hand-picked inputs for a quick qualitative check of the trained model.
names = [
    'samuel meyer',
    'dmitry medvedev',
    'paulo ricardo',
    'zouheir al qaissi',
    'tarek al bichri',
    'thorsten brotzmann',
]

# `map` is lazy, so each name is decoded right before it is printed.
for name, translated in zip(names, map(decode_sequences, names)):
    print(name)
    print(translated)
    print()


samuel meyer
samuel meyer group

dmitry medvedev
dmitry medvedev

paulo ricardo
paulo ricardo

zouheir al qaissi
al qaissi

tarek al bichri
tarek al bichri

thorsten brotzmann
thorsten brotzmann

