### Prepare the color_text dataset for multimodal color masked model input
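- Represent each color as a binned token (bin_range = 16 → 16 bins per channel, vocabulary of at most 16³ = 4096 tokens; bin_range = 32 → 8 bins per channel, vocabulary of at most 8³ = 512 tokens)
    - Format: one color palette per image (max 5 colors)
- Represent text with embeddings from a pre-trained language model (CLIP)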

In [6]:
import pandas as pd
from collections import defaultdict  # For word frequency
import math
import random
import ast
from datetime import datetime
from collections import Counter
import numpy as np

import sys
sys.path.append('../src')

from full_palette_generation.utils.text_emb_creator import save_text_embedding_clip

# --- Color binning -----------------------------------------------------------
# Width of one bin; every color channel is floor-divided by this value.
bin_range = 16
# Tag embedded in the names of the generated color corpus / vocab files.
representation = 'lab_bins_16'
clusterMode = 'lab_' # training data created by lab color space

# --- Dataset splits ----------------------------------------------------------
dataTypes = ['train', 'val', 'test']

# --- Directories -------------------------------------------------------------
# Input CSVs (palette_and_text_<split>.csv) are read from here.
rawdata_path = '../data/data_colors'
# Color corpus and color vocabulary files are written here.
color_data_path = '../data/data_t2p/Data_color'
# Raw text inputs are written here; embeddings go to a sub-directory of it.
text_data_path = '../data/data_t2p/Data_text'

# --- Text embedding ----------------------------------------------------------
text_model = '_clip'
# Sub-directory of text_data_path that holds the saved text embeddings.
emb_file = 'emb_clip'

In [7]:
def get_color_list_bins(data, column_names, bin_size=None):
    """Encode the palettes stored in ``column_names`` as space-separated bin tokens.

    Every non-missing cell is parsed with ``ast.literal_eval`` and iterated as
    a sequence of colors. The first three components of each color are
    floor-divided by the bin width and written as ``'<c0>_<c1>_<c2>'``.

    Args:
        data: Row-like object indexable by column name (e.g. a DataFrame row).
        column_names: Columns to read, processed in the given order.
        bin_size: Width of one bin. When omitted, the module-level
            ``bin_range`` is used, which keeps existing two-argument calls
            working unchanged.

    Returns:
        The tokens of all colors joined by single spaces, or ``''`` when
        every cell is missing or holds an empty palette.
    """
    width = bin_range if bin_size is None else bin_size

    tokens = []
    for column in column_names:
        cell = data[column]
        if not pd.notna(cell):
            # Missing palette: contributes no tokens.
            continue
        for color in ast.literal_eval(cell):
            tokens.append(
                f'{math.floor(color[0] / width)}'
                f'_{math.floor(color[1] / width)}'
                f'_{math.floor(color[2] / width)}'
            )
    # Tokens are never empty, so a plain join matches "space between tokens".
    return ' '.join(tokens)

In [8]:
# Columns whose palettes are converted to bin tokens.
column_names = ['palette_lab_reorder']


def get_color_metadata(data, representation):
    """Replace each palette column with its space-separated bin-token string.

    For every name in the module-level ``column_names``, the column is
    overwritten row by row with the result of ``get_color_list_bins``.
    ``data`` is modified in place and also returned. ``representation`` is
    accepted but not used here.
    """
    for name in column_names:
        binned = data.apply(lambda row: get_color_list_bins(row, [name]), axis=1)
        data[name] = binned
    return data

def get_color_hist(data, column_names):
    """Return the value of the first listed column of ``data`` formatted as a string.

    Only ``column_names[0]`` is read; any further names are ignored.
    """
    first_column = column_names[0]
    return f'{data[first_column]}'

def create_colordata(file_path, representation):
    """Load a palette CSV and add a ``color_hist`` column of bin tokens.

    The CSV at ``file_path`` is read, its index is reset, the palette columns
    are converted by ``get_color_metadata``, and ``color_hist`` is filled per
    row by ``get_color_hist`` using the module-level ``column_names``.

    Returns:
        The resulting DataFrame.
    """
    frame = pd.read_csv(file_path).reset_index(drop=True)
    metadata = get_color_metadata(frame, representation)

    hist_per_row = metadata.apply(lambda row: get_color_hist(row, column_names), axis=1)
    metadata['color_hist'] = hist_per_row
    return metadata

#### Create color corpus and text data: train/val/test

In [9]:
# For every split: write the color corpus and the raw text inputs; for the
# training split additionally derive and write the color vocabulary.
for dataType in dataTypes:
    metadata = create_colordata(f'{rawdata_path}/palette_and_text_{dataType}.csv', representation)
    print(metadata.shape)

    # create color data (one palette per line)
    # NOTE(review): with sep=' ' pandas presumably wraps multi-token rows in
    # double quotes (default minimal quoting) - confirm that readers of this
    # file expect that.
    metadata['color_hist'].to_csv(f'{color_data_path}/color_corpus_{representation}_{dataType}.txt', header=None, index=None, sep=' ')

    # create color vocab from train data
    if dataType == 'train':
        # Tokenize the in-memory column directly; the corpus file that was
        # just written does not need to be read back for this.
        sentences = [row.split(' ') for row in metadata['color_hist']]
        # Counter keeps first-seen order, so the vocab order is the order in
        # which tokens first appear in the training corpus.
        color_freq = Counter(token for sent in sentences for token in sent)
        print(f'color freq size: {len(color_freq)}')
        colors = list(color_freq)
        print(f'color vocab size: {len(colors)}')

        # Written as a Python-style list literal of single-quoted tokens,
        # comma-separated with no spaces: ['a','b',...]
        with open(f'{color_data_path}/color_vocab_{representation}_{dataType}.txt', 'w', encoding='utf-8') as f:
            f.write('[' + ','.join(f"'{color}'" for color in colors) + ']')

    # create text data
    metadata['text_input'].to_csv(f'{text_data_path}/text_input_{dataType}.txt', header=None, index=None, sep=' ')


(8147, 5)
color freq size: 813
color vocab size: 813
(1018, 5)
(1018, 5)


#### Create text embedding for text data

In [10]:
# create text embedding and save
def make_and_parse_text(file_path):
    """Yield the lines of a UTF-8 text file one by one, line endings included.

    This is a generator: nothing is opened until iteration starts. The file
    is then read completely and closed before the first line is yielded.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        lines = handle.readlines()
    yield from lines

def make_text_data(file_path, dataType):
    """Parse a text-input file into phrase lists and save their embeddings.

    Each line has leading/trailing ``[``, ``]``, ``"`` and newline characters
    stripped and is split on commas, so every comma-separated piece becomes
    one text content. Within each piece, single quotes are removed and a
    literal backslash-n sequence is replaced by ``'. '``.

    The parsed phrases are handed to ``save_text_embedding_clip`` together
    with the embedding directory ``<text_data_path>/<emb_file>``; start and
    finish timestamps are printed around that call. Returns None.
    """
    text_input = [
        # use '.' between sentences of different lines
        [piece.replace('\'', '').replace('\\n', '. ') for piece in line.strip('[]"\n').split(',')]
        for line in make_and_parse_text(file_path)
    ]

    data_path = f"{text_data_path}/{emb_file}"

    started_at = datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
    print(f'start build {dataType} text contents embedding: {started_at}')
    save_text_embedding_clip(text_input, data_path, 'text_input', dataType)
    finished_at = datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
    print(f'finish build {dataType} text contents embedding: {finished_at}')


if __name__ == '__main__':
    # Build and save the text embeddings for every split in dataTypes, reading
    # the text_input_<split>.txt files from text_data_path.
    for dataType in dataTypes:
        file_path = f'{text_data_path}/text_input_{dataType}.txt'
        make_text_data(file_path, dataType) # for seq text embedding building
    

start build train text contents embedding: 2023-09-04-10:16:34
100->200->300->400->500->600->700->800->900->1000->1100->1200->1300->1400->1500->1600->1700->1800->1900->2000->2100->2200->2300->2400->2500->2600->2700->2800->2900->3000->3100->3200->3300->3400->3500->3600->3700->3800->3900->4000->4100->4200->4300->4400->4500->4600->4700->4800->4900->5000->5100->5200->5300->5400->5500->5600->5700->5800->5900->6000->6100->6200->6300->6400->6500->6600->6700->6800->6900->7000->7100->7200->7300->7400->7500->7600->7700->7800->7900->8000->8100->finish build train text contents embedding: 2023-09-04-10:17:52
start build val text contents embedding: 2023-09-04-10:17:52
100->200->300->400->500->600->700->800->900->1000->finish build val text contents embedding: 2023-09-04-10:18:04
start build test text contents embedding: 2023-09-04-10:18:04
100->200->300->400->500->600->700->800->900->1000->finish build test text contents embedding: 2023-09-04-10:18:15


In [11]:
# check the created embedding file: load the test-split embeddings back and
# display the array shape.
# The directory is built from text_data_path / emb_file, the same location
# that make_text_data passes to the embedding writer, instead of repeating the
# literal path. NOTE(review): the file name itself is presumably the one
# produced by save_text_embedding_clip - confirm against that helper.
text_input_emb_ = np.loadtxt(f'{text_data_path}/{emb_file}/text_input_emb_clip_test.txt', dtype=float)
text_input_emb_.shape

(1018, 512)