In [9]:
import json
import operator
import os
import tarfile
from functools import reduce

import numpy as np
import polars as pl
from PIL import Image
from tqdm import tqdm

# Widen polars' string display limit to 200 characters so long text columns
# are not cut off too early when frames are rendered in cell outputs.
pl.Config.set_fmt_str_lengths(200)

polars.config.Config

In [16]:
# Root directory of the dataset. Keep the trailing slash: cells below build
# file paths by plain string concatenation (e.g. PATH + 'dataset_gold.parquet').
PATH = '/mnt/sd1tb/tinydiffusion/dataset_v1/'

In [55]:
def build_real_images_consolidated_data(path):
    """Merge the per-shard metadata of the real images into a single frame.

    Every ``*.parquet`` file located directly inside ``<path>/imgs`` is read,
    and a ``path`` column is added that points at the image file of each row:
    ``<shard>/<key>.jpg``, where ``<shard>`` is the parquet file path without
    its extension (with ``dataset_v0`` rewritten to ``dataset_v1``).

    Args:
        path: Dataset root directory, with or without a trailing separator.

    Returns:
        pl.DataFrame: All shard frames concatenated, restricted to the rows
        whose ``status`` column equals ``'success'``.

    Raises:
        ValueError: If ``<path>/imgs`` contains no ``.parquet`` file.
    """
    path_external_imgs = os.path.join(path, 'imgs')

    # Sort so the row order of the result does not depend on the arbitrary
    # order in which the filesystem lists directory entries.
    parquet_names = sorted(
        name for name in os.listdir(path_external_imgs)
        if name.endswith('.parquet')
    )
    if not parquet_names:
        raise ValueError(
            f'no .parquet metadata files found in {path_external_imgs!r}'
        )

    dataframes = []
    for filename in parquet_names:
        parquet_path = os.path.join(path_external_imgs, filename)
        # splitext only strips the final extension; splitting on every '.'
        # would truncate the path at the first dot of any parent directory.
        shard_root, _ = os.path.splitext(parquet_path)
        path_shard = shard_root.replace('dataset_v0', 'dataset_v1')
        # Joining with '' appends exactly one trailing separator.
        shard_prefix = os.path.join(path_shard, '')

        # Build the image path with a vectorized expression instead of
        # calling a Python lambda once per row.
        df = pl.read_parquet(parquet_path).with_columns(
            pl.concat_str(
                [pl.lit(shard_prefix), pl.col('key').cast(pl.Utf8), pl.lit('.jpg')]
            ).alias('path')
        )
        dataframes.append(df)

    return pl.concat(dataframes).filter(pl.col('status') == 'success')

def build_synthetic_images_consolidated_data(path):
    """Index the synthetic images stored under ``<path>/diffusiondb``.

    Each sub-directory ``<part>`` is expected to hold a ``<part>.json`` file
    mapping an image file name to a metadata dict; the ``'p'`` entry of that
    dict is used as the caption.

    Args:
        path: Dataset root directory, with or without a trailing separator.

    Returns:
        pl.DataFrame: One row per distinct caption with the columns ``id``
        (key of the JSON mapping), ``path`` (``<path>/diffusiondb/<part>/<id>``)
        and ``caption``. When a caption occurs several times, the first
        occurrence (parts visited in sorted order) is the one kept.
    """
    path_data = os.path.join(path, 'diffusiondb')

    # Only directories are parts; ignore stray files and visit the parts in a
    # stable order so the de-duplication below is reproducible.
    parts = sorted(
        entry for entry in os.listdir(path_data)
        if os.path.isdir(os.path.join(path_data, entry))
    )

    list_data = []
    for part in tqdm(parts):
        part_dir = os.path.join(path_data, part)
        # The context manager closes the handle; JSON is read as UTF-8
        # regardless of the platform's default locale encoding.
        with open(os.path.join(part_dir, f'{part}.json'), encoding='utf-8') as fh:
            data = json.load(fh)
        for record, meta in data.items():
            list_data.append({
                'id': record,
                'path': os.path.join(part_dir, record),
                'caption': meta['p']
            })

    # keep='first' + maintain_order make the surviving row of every duplicated
    # caption, and the row order, deterministic between runs.
    df = pl.from_dicts(list_data).unique(
        'caption', keep='first', maintain_order=True
    )
    return df

In [56]:
# Only the synthetic (diffusiondb) index is rebuilt in this run; the
# real-image consolidation call is left disabled below.
# df_real_images = build_real_images_consolidated_data(PATH)
df_synthetic_images = build_synthetic_images_consolidated_data(PATH)

100%|██████████| 1000/1000 [00:00<00:00, 1308.70it/s]


In [69]:
# Load the consolidated dataset from disk and preview its first rows.
# NOTE(review): the step that writes dataset_gold.parquet is not visible in
# this notebook section — confirm where it is produced.
df = pl.read_parquet(PATH + 'dataset_gold.parquet')
df.head(3)

id,path,caption,source
str,str,str,str
"""001593561.jpg""","""/mnt/sd1tb/tinydiffusion/dataset_v1/imgs/00114/001140009.jpg""","""a serene and picturesque forest scene with a stream of water flowing through it. The water appears to be murky, adding to the natural atmosphere of the scene. There are several trees surrounding the s…","""real"""
"""003427400.jpg""","""/mnt/sd1tb/tinydiffusion/dataset_v1/imgs/00114/001140006.jpg""","""a man dressed in royal attire, standing in a room with a red carpet. He is wearing a crown and holding a scepter, giving the impression of a king or a high-ranking official. The man is also wearing a …","""real"""
"""004964454.jpg""","""/mnt/sd1tb/tinydiffusion/dataset_v1/imgs/00114/001140032.jpg""","""a black t-shirt with a skull graphic on it. The skull is surrounded by three roses, each with a different color. The t-shirt is designed to be worn by a man, and the image is a close-up of the shirt. …","""real"""


In [80]:
# Train/validation split: every non-real (synthetic) row goes to train, and a
# fixed-size random sample of the real rows is held out for validation.
VAL_SIZE = 1024  # number of real images held out for validation
SEED = 42        # fixed seed so Restart & Run All reproduces the same split

# Tag every row whose 'source' is not "real" as "train"; real rows stay null.
# NOTE(review): real rows keep set = null in both df_train and df_val —
# confirm whether a later step is expected to fill it in.
df = df.with_columns(
    pl.when(pl.col("source") != "real")
    .then(pl.lit("train"))
    .otherwise(pl.lit(None))
    .alias("set")
)

df_real = df.filter(pl.col('source') == 'real')
# With VAL_SIZE or fewer real rows the slicing below would silently leave the
# training set without any real image.
assert len(df_real) > VAL_SIZE, (
    f'need more than {VAL_SIZE} real rows to split, got {len(df_real)}'
)

# A seeded, local generator instead of the global np.random state.
rng = np.random.default_rng(SEED)
idxs = rng.permutation(len(df_real))
tr_idxs, val_idxs = idxs[:-VAL_SIZE], idxs[-VAL_SIZE:]
df_train = pl.concat([
    df_real[tr_idxs],
    df.filter(pl.col('set') == 'train')
])
df_val = df_real[val_idxs]

# Every row must land in exactly one split (this also fails if 'source' is
# null for some row, since such rows belong to neither part).
assert len(df_train) + len(df_val) == len(df)

array([ 421723,  221712,  605468, ..., 1055353,  282139, 1005798])