### Prepare the color_text dataset for multimodal color masked model input
- Represent color with bins (bin_range = 16 → 16 bins per channel, vocabulary of at most 16^3 = 4096 tokens; bin_range = 32 → 8 bins per channel, vocabulary of at most 8^3 = 512 tokens)
    - Format: color palette for image (max 5 colors)
- Represent text with embeddings from a pre-trained text encoder (CLIP)

In [1]:
import pandas as pd
from collections import defaultdict  # For word frequency
import math
import random
import ast
from datetime import datetime
from collections import Counter
import numpy as np
import os

import sys
sys.path.append('../src')

from color_palette_completion.utils.text_emb_creator import save_text_embedding_clip
from color_palette_completion.text_color_model.model_config import Config

# Color representation settings, read from the shared model configuration.
representation = Config['representation']
bin_range = Config['bin_range']  # divisor applied to each color channel when binning
# Maximum sequence length passed to the embedding builder, keyed by text type.
max_text_seq_length = {
    'text_contents': Config['Max_Text_Contents_Length'],
    'image_labels': Config['Max_Image_Labels_Length']
}

clusterMode = 'lab_' # training data created by lab color space or rgb
# Suffixes and split names used to build input and output file names.
kmeansType = '_sklearn'
langType = '_en'
dataTypes = ['train', 'val', 'test']
textTypes = ['text_contents', 'image_labels']

# Input (raw) and output (color corpus / text) locations.
rawdata_path = '../data/colors'
color_data_path = '../data/t2p/color'
text_data_path = '../data/t2p/text'

text_model = '_clip'
emb_file = 'emb_clip_imagemust_seq'  # sub-directory of text_data_path for embeddings

# Make sure every output directory exists. exist_ok=True avoids the
# check-then-create race of a separate os.path.exists() test and keeps
# re-running this cell harmless.
for out_dir in (color_data_path, text_data_path, f'{text_data_path}/{emb_file}'):
    os.makedirs(out_dir, exist_ok=True)


2023-10-23 02:23:16.562164: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.11.0


In [3]:
def get_color_list_bins(data, column_names):
    """Build a space-separated string of binned color tokens for one record.

    For each column in ``column_names`` whose value in ``data`` is not
    missing, the value is parsed with ``ast.literal_eval`` into a sequence of
    colors. Every color contributes one token ``"a_b_c"``, where the parts
    are its first three channel values divided by the module-level
    ``bin_range`` and floored.

    Args:
        data: Record indexed by column name (for example a DataFrame row).
        column_names: Columns to read, in the order their tokens are emitted.

    Returns:
        The tokens joined by single spaces, or an empty string when no
        column yields a color.
    """
    tokens = []
    for name in column_names:
        if pd.isna(data[name]):
            # Missing palette for this column: it contributes no tokens.
            continue
        for color in ast.literal_eval(data[name]):
            binned = [math.floor(color[idx] / bin_range) for idx in range(3)]
            tokens.append(f'{binned[0]}_{binned[1]}_{binned[2]}')
    return ' '.join(tokens)

In [4]:
# Palette columns that get converted to binned color tokens.
column_names = ['image_colors_lab_reorder', 'svg_colors_lab_reorder', 'text_colors_lab_reorder']

def get_color_metadata(data, representation):
    """Replace each palette column of ``data`` with its binned token string.

    Every column listed in the module-level ``column_names`` is overwritten,
    row by row, with the output of ``get_color_list_bins`` for that single
    column. The frame is modified in place.

    Args:
        data: DataFrame containing the columns named in ``column_names``.
        representation: Accepted for interface compatibility; not used here.

    Returns:
        The same DataFrame object that was passed in.
    """
    for name in column_names:
        binned = data.apply(lambda row: get_color_list_bins(row, [name]), axis=1)
        data[name] = binned

    return data

def get_color_hist(data, column_names):
    """Join the first three listed columns of a record with `` ; `` separators.

    Args:
        data: Record indexed by column name (for example a DataFrame row).
        column_names: Sequence of at least three column names; only the first
            three are used.

    Returns:
        A string of the form ``"<first> ; <second> ; <third>"``.
    """
    first, second, third = (data[column_names[pos]] for pos in range(3))
    return f'{first} ; {second} ; {third}'

def create_colordata(file_path, representation):
    """Load a palette CSV and add a ``color_hist`` column of color tokens.

    The CSV is read with pandas, its index is reset, the palette columns are
    converted to binned token strings by ``get_color_metadata``, and the
    three token strings of each row are combined by ``get_color_hist``.

    Args:
        file_path: Path of the CSV file to read.
        representation: Forwarded to ``get_color_metadata``.

    Returns:
        The loaded DataFrame with binned palette columns and an added
        ``color_hist`` column.
    """
    frame = pd.read_csv(file_path).reset_index(drop=True)
    frame = get_color_metadata(frame, representation)
    frame['color_hist'] = frame.apply(
        lambda row: get_color_hist(row, column_names),
        axis=1,
    )
    return frame

#### Create color corpus and text data: train/val/test

In [5]:
# For every split: build the color corpus file and the raw text files.
# The color vocabulary is derived from the training split only.
for dataType in dataTypes:
    print(dataType)
    metadata = create_colordata(f'{rawdata_path}/data_colors_labels/crello_labels_palette_{dataType}{kmeansType}_hfreqlabels_imagemust{langType}.csv', representation)
    print(metadata.shape)

    # create color data: one color_hist string per row
    corpus_path = f'{color_data_path}/color_corpus_{representation}_{dataType}{kmeansType}.txt'
    metadata['color_hist'].to_csv(corpus_path, header=None, index=None, sep=' ')
    # create color vocab from train data
    if dataType == 'train':
        # NOTE(review): this re-read of the corpus file is not used anywhere
        # in this cell; kept only so the name stays defined for later use.
        metadata_color_hist = pd.read_csv(corpus_path, header=None)

        # create sentences and count every space-separated token
        # NOTE(review): an empty palette section leaves two adjacent spaces
        # in color_hist, so split(' ') yields '' tokens that are counted and
        # written to the vocab like any other token - confirm this is intended.
        sentences = [row.split(' ') for row in metadata['color_hist']]
        color_freq = Counter(token for sent in sentences for token in sent)
        # ';' is only the section separator, not a color. The default avoids
        # a KeyError when the training split has no rows.
        color_freq.pop(';', None)
        print(f'color freq size: {len(color_freq)}')
        colors = list(color_freq)  # tokens in first-seen order
        # colors.remove('\n')
        print(f'color vocab size: {len(colors)}')
        vocab_path = f'{color_data_path}/color_vocab_{representation}_{dataType}{kmeansType}.txt'
        with open(vocab_path, 'w') as f:
            # Written as a list literal of single-quoted tokens: ['a','b',...]
            f.write('[' + ','.join("'%s'" % color for color in colors) + ']')

    # create text data: one file per text type and split
    for textType in textTypes:
        metadata[textType].to_csv(f'{text_data_path}/{textType}_imagemust_{dataType}{langType}.txt', header=None, index=None, sep=' ')


train


  This is separate from the ipykernel package so we can avoid doing imports until


(14020, 1567)
color freq size: 758
color vocab size: 758
val


  This is separate from the ipykernel package so we can avoid doing imports until


(1704, 729)
test


  This is separate from the ipykernel package so we can avoid doing imports until


(1712, 659)


#### Create text embedding for text data

In [22]:
# create text embedding and save
def make_and_parse_text(file_path):
    """Yield the lines of a UTF-8 text file, one at a time.

    The whole file is read into memory and closed before the first line is
    yielded. Lines keep their trailing newline.

    Args:
        file_path: Path of the text file to read.

    Yields:
        Each line of the file, in order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        lines = list(handle)
    yield from lines

def make_text_data(file_path, dataType, textType):
    """Parse a text file into phrase lists and build their CLIP embeddings.

    Each line is stripped of surrounding ``[``, ``]``, ``"`` and newline
    characters and split on every comma, so a comma inside a phrase also
    starts a new phrase. In each phrase, single quotes are removed and
    literal ``\\n`` sequences are replaced by ``". "``. The parsed lists are
    passed to ``save_text_embedding_clip`` together with the embedding
    directory and the maximum sequence length configured for ``textType``.

    Args:
        file_path: Text file with one record per line.
        dataType: Split name, used in the progress messages and forwarded to
            the embedding builder.
        textType: Key into ``max_text_seq_length``; also forwarded to the
            embedding builder.

    Returns:
        None. Start and finish timestamps are printed.
    """
    text_input = [
        # separate each phrase as 1 text content;
        # use '.' between sentences of different lines
        [phrase.replace('\'', '').replace('\\n', '. ') for phrase in line.strip('[]"\n').split(',')]
        for line in make_and_parse_text(file_path)
    ]

    data_path = f"{text_data_path}/{emb_file}"
    seq_length = max_text_seq_length[textType]
    print(f'start build {dataType} text contents embedding: {datetime.now().strftime("%Y-%m-%d-%H:%M:%S")}')
    save_text_embedding_clip(text_input, data_path, textType, dataType, seq_length)
    print(f'finish build {dataType} text contents embedding: {datetime.now().strftime("%Y-%m-%d-%H:%M:%S")}')


if __name__ == '__main__':
    # Build the sequence text embeddings: all splits for 'text_contents'
    # first, then all splits for 'image_labels'.
    for text_kind in ('text_contents', 'image_labels'):
        for dataType in dataTypes:
            file_path = f'{text_data_path}/{text_kind}_imagemust_{dataType}{langType}.txt'
            make_text_data(file_path, dataType, text_kind) # for seq text embedding building
    

In [14]:
# Sanity check: load the saved test-split image-label embeddings and show
# the shape of the resulting array.
emb_check_path = f'{text_data_path}/{emb_file}/image_labels_hfreq_emb{text_model}_test{langType}.txt'
text_input_emb_ = np.loadtxt(emb_check_path, dtype=float)
text_input_emb_.shape

(17120, 512)