### Pre-training of the color representation model
- One training run (within 100 epochs) takes 5-10 minutes on a GPU (Tesla T4 * 1)
  (reset batch-size=2048 in model_config)
- For a quick start, run the Pretraining section with the pre-created color corpus files
- To create the color corpus for training, read the multipalette metadata and represent the colors with bins
    - In this work, we quantize CIELab color data into 16 bins

In [2]:
import os
import math
import tensorflow as tf
import pandas as pd
import ast
from collections import defaultdict  # For word frequency

import sys
sys.path.append('../src/colorbert')

import color_bert_model as Model
from input_data_generator import DataGenerator
from model_config import Config

from datetime import datetime


2022-09-16 16:46:59.900327: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Create color corpus for training

In [3]:
# Color-binning settings come from the shared model Config.
# Per the original note: bin_range = 16 <16 bins>, vocabulary size: max 4096.
bin_range, representation = Config['bin_range'], Config['representation']
# Metadata columns holding the Lab color lists; get_color_hist joins them in this order.
column_names = [
    'image_colors_lab',
    'svg_colors_lab',
    'text_colors_lab',
]

def get_color_list_bins(data, column_names, bin_size=None):
    """Quantize the colors stored in the given columns of one row into bin tokens.

    Each selected cell must hold the string form of a list of colors with at
    least three components (it is parsed with ``ast.literal_eval``). Every
    color becomes one ``'<c0>_<c1>_<c2>'`` token, where each component is
    ``floor(component / bin_size)``.

    Args:
        data: Row-like object indexable by column name (e.g. a pandas Series).
        column_names: Columns to read, in order. Cells for which
            ``pd.notna`` is false (NaN/None) are skipped.
        bin_size: Width of one bin. When omitted, the notebook-level
            ``bin_range`` is used, so existing two-argument calls behave
            exactly as before.

    Returns:
        All tokens joined by single spaces, or ``''`` if no colors were found.
    """
    if bin_size is None:
        bin_size = bin_range  # notebook-level setting read from Config

    tokens = []
    for column in column_names:
        cell = data[column]
        if pd.notna(cell):
            for color in ast.literal_eval(cell):
                tokens.append(
                    f'{math.floor(color[0] / bin_size)}'
                    f'_{math.floor(color[1] / bin_size)}'
                    f'_{math.floor(color[2] / bin_size)}'
                )
    # Join once instead of growing a string inside the nested loop.
    return ' '.join(tokens)

def get_color_metadata(data, representation):
    """Overwrite each raw color column of ``data`` with its binned token string.

    Iterates over the notebook-level ``column_names``. Each listed column is
    replaced, row by row, with the result of ``get_color_list_bins`` applied
    to that single column. ``representation`` is accepted but not used here.

    Returns:
        The same DataFrame object, modified in place.
    """
    for name in column_names:
        def bin_single_column(row, _name=name):
            # One row in, one space-separated token string out.
            return get_color_list_bins(row, [_name])

        data[name] = data.apply(bin_single_column, axis=1)

    return data

def get_color_hist(data, column_names):
    """Build one corpus line from the first three listed columns of a row.

    The three values are rendered as text and separated by ``' ; '``.
    Fewer than three column names raises ``IndexError``.
    """
    first, second, third = (data[column_names[pos]] for pos in range(3))
    return '{} ; {} ; {}'.format(first, second, third)

def create_colordata(file_path, representation):
    """Load a color-metadata CSV and attach a ``color_hist`` corpus column.

    The CSV is read with pandas and its index reset. The color columns are
    then binned via ``get_color_metadata``, and each row's three binned
    columns are combined by ``get_color_hist`` into ``color_hist``.

    Args:
        file_path: Path of the CSV file to read.
        representation: Passed through to ``get_color_metadata``.

    Returns:
        The DataFrame with binned color columns plus ``color_hist``.
    """
    frame = pd.read_csv(file_path).reset_index(drop=True)
    binned = get_color_metadata(frame, representation)

    def to_corpus_line(row):
        return get_color_hist(row, column_names)

    binned['color_hist'] = binned.apply(to_corpus_line, axis=1)
    return binned

In [4]:
# Write the binned corpus file for every split; the vocabulary file is derived
# from the training split only.
dataTypes = ['train', 'val', 'test']
data_path = '../data/training_data'
for dataType in dataTypes:
    source_csv = f'{data_path}/metadata_colors/crello_colors_{dataType}_sklearn_lab.csv'
    corpus_txt = f'{data_path}/data_bert/data_color/color_corpus_lab_bins_16_{dataType}_sklearn.txt'

    metadata = create_colordata(source_csv, representation)
    metadata['color_hist'].to_csv(corpus_txt, header=None, index=None, sep=' ')
    if dataType != 'train':
        continue

    # Count every token obtained by splitting the training lines on single spaces.
    # NOTE(review): a row with an empty color group contains consecutive spaces,
    # so split(' ') also yields '' tokens that stay in the vocabulary - confirm
    # this is what the downstream tokenizer expects.
    sentences = [row.split(' ') for row in metadata['color_hist']]
    color_freq = defaultdict(int)
    for sent in sentences:
        for token in sent:
            color_freq[token] += 1
    # ';' only separates the three color groups, so it is dropped from the counts.
    color_freq.pop(';')

    colors = list(color_freq)
    print(f'vocabulary size: {len(colors)}')
    vocab_txt = f'{data_path}/data_bert/data_color/color_vocab_lab_bins_16_{dataType}_sklearn.txt'
    with open(vocab_txt, 'w') as f:
        # Stored as a Python-style list literal: ['tok1','tok2',...]
        f.write("[" + ",".join("'%s'" % color for color in colors) + "]")

vocabulary size: 796


### Pretraining

In [5]:
def calculate_pretrain_task_accuracy(mlm_predict, batch_mlm_mask, origin_x):
    """Top-1 accuracy of the MLM predictions, measured only where the mask is 1.

    Args:
        mlm_predict: Model scores; the argmax over the last axis is taken as
            the predicted id.
        batch_mlm_mask: Mask tensor; positions equal to 1 after an int32 cast
            are the ones scored.
        origin_x: Original ids that the predictions are compared against.

    Returns:
        The accuracy over the selected positions as a NumPy scalar.
    """
    mask_as_int = tf.cast(batch_mlm_mask, dtype=tf.int32)
    masked_positions = tf.where(mask_as_int == 1)

    top1_ids = tf.math.argmax(mlm_predict, axis=-1)  # top1
    predicted_ids = tf.gather_nd(top1_ids, masked_positions)
    target_ids = tf.gather_nd(origin_x, masked_positions)

    # A fresh metric object per call, so nothing accumulates across batches.
    metric = tf.keras.metrics.Accuracy()
    metric.update_state(predicted_ids, target_ids)
    return metric.result().numpy()

In [6]:
# pretrain
# Setup for the pre-training run: device check, model/optimizer/loss, data
# generators for train/val/test, checkpointing and early-stopping state.

# training on CPU
physical_devices = tf.config.experimental.list_physical_devices('CPU')
assert len(physical_devices) > 0, "Not enough CPU hardware devices available"

# training on GPU
# physical_devices = tf.config.experimental.list_physical_devices('GPU')
# assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
# tf.config.experimental.set_memory_growth(physical_devices[0], True)

# Model, optimizer, loss and the training data generator are all built from Config.
model = Model.Bert(Config)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-4)
loss_fn = Model.BERT_Loss()
dataset = DataGenerator(Config)
# Only the model is tracked by the checkpoint (the optimizer is not).
checkpoint = tf.train.Checkpoint(model=model)
# Restore from the most recent checkpoint found in Config['saved_weight'], if any.
checkpoint.restore(tf.train.latest_checkpoint(Config['saved_weight']))
manager = tf.train.CheckpointManager(checkpoint, directory=Config['saved_weight'], max_to_keep=5)
# One log directory per calendar day.
log_dir = os.path.join(Config['log_dir'], datetime.now().strftime("%Y-%m-%d"))
# NOTE(review): `writer` is created here but nothing in this cell writes summaries to it.
writer = tf.summary.create_file_writer(log_dir)


# create the data for validation and test
# Each split gets its own copy of Config with only 'corpus_file_path' replaced
# (presumably Config is a dict, so .copy() is shallow - confirm).
PROJECT_PATH = Config['project_path']
Config_val = Config.copy()
Config_val['corpus_file_path'] = os.path.join(PROJECT_PATH, f'data_color/color_corpus_lab_bins_16_val_sklearn.txt')
dataset_val = DataGenerator(Config_val)

Config_test = Config.copy()
Config_test['corpus_file_path'] = os.path.join(PROJECT_PATH, f'data_color/color_corpus_lab_bins_16_test_sklearn.txt')
dataset_test = DataGenerator(Config_test)

# Early-stopping state: stop after `patience` consecutive epochs without a new
# best validation loss; `best` starts at infinity so the first epoch always counts.
patience = 30
best = math.inf
wait = 0

# Train / validate for EPOCH epochs with early stopping, then evaluate on the
# test split. The outer loop runs a single repetition; `n` is only referenced
# by the commented-out model.save line at the bottom.
for n in range(1):
    EPOCH = 2 # 100 is enough
    for epoch in range(EPOCH):
    #     print(f'dataset length: {len(dataset)}')
        # ---- training pass: one optimizer step per batch ----
        for step in range(len(dataset)):
            batch_x, batch_mlm_mask, batch_mcc_mask, origin_x, batch_segment, batch_padding_mask = dataset[step]
            with tf.GradientTape() as t:
                mlm_predict, sequence_output = model((batch_x, batch_mlm_mask, batch_segment), training=True)

                mlm_loss = loss_fn((mlm_predict, batch_mlm_mask, origin_x))
                mlm_loss = tf.reduce_mean(mlm_loss)

                # The MLM loss is the only term, so `loss` and `mlm_loss` are the same value.
                loss = mlm_loss

            gradients = t.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))

            # get acc of random mask
            mlm_acc = calculate_pretrain_task_accuracy(mlm_predict, batch_mlm_mask, origin_x)

            # Report once per epoch, using the metrics of the last batch only
            # (`epoch % 1 == 0` is always true).
            if step == len(dataset) - 1 and epoch % 1 == 0:
                print(
                    'Epoch {}, step {}, loss {:.4f}, mlm_loss {:.4f}, mlm_acc {:.4f}'.format(
                        epoch, step, loss.numpy(),
                        mlm_loss.numpy(),
                        mlm_acc,
                        ))

        # ---- validation pass: forward only, no gradient updates ----
        for val_step in range(len(dataset_val)):
            val_batch_x, val_batch_mlm_mask, val_batch_mcc_mask, val_origin_x, val_batch_segment, val_batch_padding_mask = dataset_val[val_step]
            val_mlm_predict, val_sequence_output = model((val_batch_x, val_batch_mlm_mask, val_batch_segment), training=False)

            val_mlm_loss = loss_fn((val_mlm_predict, val_batch_mlm_mask, val_origin_x))
            val_mlm_loss = tf.reduce_mean(val_mlm_loss)

            # get acc of random mask
            val_mlm_acc = calculate_pretrain_task_accuracy(val_mlm_predict, val_batch_mlm_mask, val_origin_x)

            val_loss = val_mlm_loss

            # Printed values are those of the last validation batch, not an average.
            if val_step == len(dataset_val) - 1 and epoch % 1 == 0:
                print(
                    'Val: Epoch {}, step {}, loss {:.4f}, mlm_loss {:.4f}, mlm_acc {:.4f}'.format(
                        epoch, val_step, val_loss.numpy(),
                        val_mlm_loss.numpy(),
                        val_mlm_acc,
                        ))

        # A checkpoint is written after every epoch, whether or not validation improved.
        path = manager.save(checkpoint_number=epoch)

        # early stopping
        # NOTE(review): `val_loss` here is the loss of the LAST validation batch
        # only, so the stopping decision is based on a single batch rather than
        # the whole validation set - confirm this is intended.
        wait += 1
        if val_loss < best:
            best = val_loss
            wait = 0
        if wait >= patience:
            break

    # ---- test pass ----
    # NOTE(review): `dataset_test` was already built from `Config_test`, a copy
    # of Config, before this assignment; whether setting mask_rate on `Config`
    # here changes the test batches depends on DataGenerator internals - confirm.
    Config['mask_rate'] = 0
    for test_step in range(len(dataset_test)):
        test_batch_x, test_batch_mlm_mask, test_batch_mcc_mask, test_origin_x, test_batch_segment, test_batch_padding_mask = dataset_test[test_step]
        test_mlm_predict, test_sequence_output = model((test_batch_x, test_batch_mlm_mask, test_batch_segment), training=False)

        test_mlm_loss = loss_fn((test_mlm_predict, test_batch_mlm_mask, test_origin_x))
        test_mlm_loss = tf.reduce_mean(test_mlm_loss)

        # get acc of random mask
        test_mlm_acc = calculate_pretrain_task_accuracy(test_mlm_predict, test_batch_mlm_mask, test_origin_x)

        test_loss = test_mlm_loss

        # As above, only the last test batch is reported; `epoch` is the last
        # epoch index reached by the training loop.
        if test_step == len(dataset_test) - 1:
            print(
                'Test: Epoch {}, step {}, loss {:.4f}, mlm_loss {:.4f}, mlm_acc {:.4f}'.format(
                    epoch, test_step, test_loss.numpy(),
                    test_mlm_loss.numpy(),
                    test_mlm_acc,
                    ))

    # model.save(f'../data/trained_models/bert_{representation}_{Config['mask_rate']}_{Config['mask_token_rate']}_{n}')

2022-09-16 16:47:15.773648: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 0, step 292, loss 0.7378, mlm_loss 0.7378, mlm_acc 0.1364
Val: Epoch 0, step 35, loss 0.7569, mlm_loss 0.7569, mlm_acc 0.2273
Epoch 1, step 292, loss 0.8263, mlm_loss 0.8263, mlm_acc 0.1481
Val: Epoch 1, step 35, loss 0.8318, mlm_loss 0.8318, mlm_acc 0.0870
Test: Epoch 1, step 34, loss 0.8415, mlm_loss 0.8415, mlm_acc 0.0800
