In [None]:
import os
import pandas as pd

# set warnings
import warnings
warnings.simplefilter(action='ignore', category=Warning)

# import modules and classes
from XREPORT.commons.utils.preprocessing.tokenizers import BERTokenizer
from XREPORT.commons.utils.dataloader.generators import build_tensor_dataset
from XREPORT.commons.utils.dataloader.serializer import ModelSerializer
from XREPORT.commons.utils.validation import ModelValidation
from XREPORT.commons.constants import CONFIG, DATA_PATH

# Load data and model

In [None]:
# load a pretrained model (with its parameters) through the serializer,
# remember the folder it was loaded from and print the nested model summary
modelserializer = ModelSerializer()
model, parameters = modelserializer.load_pretrained_model()
model_folder = modelserializer.loaded_model_folder
model.summary(expand_nested=True)

# read the train and validation splits from their semicolon-separated csv files;
# both files share the same parsing options
csv_options = dict(encoding='utf-8', sep=';', low_memory=False)
train_file_path = os.path.join(DATA_PATH, 'XREP_train.csv')
val_file_path = os.path.join(DATA_PATH, 'XREP_validation.csv')
train_data = pd.read_csv(train_file_path, **csv_options)
validation_data = pd.read_csv(val_file_path, **csv_options)

## Create generator and datasets

In [None]:
# build the tokenizer wrapper from both splits and keep a handle on the
# underlying tokenizer object
tokenization = BERTokenizer(train_data, validation_data)
tokenizer = tokenization.tokenizer

# vocabulary size is the length of the tokenizer vocabulary plus one
# NOTE(review): the extra slot is presumably reserved for padding - confirm
vocab_size = 1 + len(tokenizer.vocab)

# turn each dataframe split into the dataset object used for model evaluation
train_dataset = build_tensor_dataset(train_data)
validation_dataset = build_tensor_dataset(validation_data)

# Model evaluation

### 2.1 Evaluation of loss and metrics

In [None]:
# validation helper bound to the loaded model (not used further in this cell)
validator = ModelValidation(model)

# make sure the model folder has an 'evaluation' subfolder for evaluation data;
# exist_ok=True makes this idempotent, so the cell can be re-run safely
model_eval_path = os.path.join(model_folder, 'evaluation')
os.makedirs(model_eval_path, exist_ok=True)

# evaluate the model on both the train and validation datasets
# NOTE(review): if build_tensor_dataset returns already-batched tf.data datasets,
# the Keras docs say batch_size should not be passed to evaluate - confirm
eval_batch_size = CONFIG["evaluation"]["BATCH_SIZE"]
train_eval = model.evaluate(train_dataset, batch_size=eval_batch_size, verbose=1)
validation_eval = model.evaluate(validation_dataset, batch_size=eval_batch_size, verbose=1)

# report the loss (index 0) and the first metric (index 1) for each split;
# the second split is the validation data loaded from XREP_validation.csv
print('\nTrain dataset:')
print(f'Loss: {train_eval[0]}')
print(f'Metric: {train_eval[1]}')
print('\nValidation dataset:')
print(f'Loss: {validation_eval[0]}')
print(f'Metric: {validation_eval[1]}')