In [1]:
import datasets
from optimization.optimization import optimize_svg_content as opt


# Tunables for the per-letter sampling below.
SAMPLE_SIZE = 3000   # rows drawn per letter before SVG optimization
SAMPLE_SEED = 42     # fixed so the random sample is the same after a kernel restart
NUM_PROC = 16        # worker processes used for the optimization pass

ds = datasets.load_dataset("starvector/svg-fonts-simple")

# One filtered view of the train split per lowercase letter, keyed by that letter.
# A row belongs to a letter when its 'Filename' ends with '<letter>_lower'.
ds_letters = {}

for item in range(ord('a'), ord('z')+1):
    letter = chr(item)
    ds_letters[letter] = ds['train'].filter(lambda x: x['Filename'].endswith(f'{letter}_lower'))


def _add_svg_length(example):
    """Return a ``len`` column holding the character count of the row's ``Svg`` string."""
    return {'len': len(example['Svg'])}


def _add_optimized_svg(example):
    """Return a ``svg_optimized`` column holding ``opt(...)['optimized']`` for the row's ``Svg``.

    The value is stored exactly as the optimizer returns it; no validation happens here.
    """
    return {'svg_optimized': opt(example['Svg'], quiet=True)['optimized']}


def _sample_and_optimize(letter_ds, n_samples=SAMPLE_SIZE, seed=SAMPLE_SEED, num_proc=NUM_PROC):
    """Annotate, sample and optimize one letter's subset.

    Args:
        letter_ds: dataset holding the rows of a single letter.
        n_samples: number of rows to keep after shuffling.
        seed: seed passed to ``shuffle`` so the sample is reproducible.
        num_proc: number of processes for the optimization ``map``.

    Returns:
        The sampled dataset with the extra ``len`` and ``svg_optimized`` columns.
    """
    return (
        letter_ds
        .map(_add_svg_length)
        .shuffle(seed=seed)
        .take(n_samples)
        .map(_add_optimized_svg, num_proc=num_proc)
    )


ds_letters_a = _sample_and_optimize(ds_letters['a'])
ds_letters_b = _sample_and_optimize(ds_letters['b'])

  from .autonotebook import tqdm as notebook_tqdm
Map (num_proc=16): 100%|██████████| 3000/3000 [01:25<00:00, 34.98 examples/s]
Map (num_proc=16): 100%|██████████| 3000/3000 [01:26<00:00, 34.71 examples/s]


In [59]:
def _constant_letter(value):
    """Build a ``map`` function that sets the ``letter`` column to ``value`` on every row."""
    def _assign(_example):
        return {'letter': value}
    return _assign


# Tag each subset with the letter it was filtered for, so the label survives concatenation.
ds_letters_a = ds_letters_a.map(_constant_letter('a'))
ds_letters_b = ds_letters_b.map(_constant_letter('b'))


Map: 100%|██████████| 2985/2985 [00:00<00:00, 13209.94 examples/s]
Map: 100%|██████████| 2986/2986 [00:00<00:00, 11187.94 examples/s]


In [60]:
# Keep only rows that have an optimized SVG; rows whose 'svg_optimized' is None are dropped.
# `is not None` is the identity test for None and cannot be affected by a custom __ne__.
ds_letters_a = ds_letters_a.filter(lambda x: x['svg_optimized'] is not None)
ds_letters_b = ds_letters_b.filter(lambda x: x['svg_optimized'] is not None)


Filter: 100%|██████████| 2985/2985 [00:00<00:00, 129013.62 examples/s]
Filter: 100%|██████████| 2986/2986 [00:00<00:00, 95431.14 examples/s]


In [61]:
# Merge the two letter subsets into one dataset and mix the rows.
# The shuffle is seeded so the row order is reproducible.
ds_letters_ab = datasets.concatenate_datasets([ds_letters_a, ds_letters_b])
ds_letters_ab = ds_letters_ab.shuffle(seed=42)

# Keep only the columns that are kept from here on and expose the file name as the row id.
ds_letters_ab = ds_letters_ab.select_columns(['Filename','svg_optimized', 'letter'])
ds_letters_ab = ds_letters_ab.rename_column('Filename', 'id')

In [68]:
import pandas as pd

# Remove rows whose optimized SVG is identical to an earlier row (the first occurrence is kept).
# `to_pandas()` converts the dataset directly instead of iterating it row by row, and
# `preserve_index=False` stops the post-filter DataFrame index from becoming an extra column.
deduplicated_df = ds_letters_ab.to_pandas().drop_duplicates(subset=['svg_optimized'])
ds_letters_ab = datasets.Dataset.from_pandas(deduplicated_df, preserve_index=False)
ds_letters_ab = ds_letters_ab.select_columns(['id', 'svg_optimized', 'letter'])

In [72]:
from datasets import ClassLabel

ds_small = ds_letters_ab

# Split configuration.
TEST_SIZE = 2000    # rows held out for the test split
TRAIN_SIZE = 3000   # rows kept for training; the remainder becomes the validation split
SPLIT_SEED = 42     # fixed seed so both splits are reproducible

# 1) Encode the string 'letter' column as a ClassLabel feature.
#    The class names are the sorted unique values of the column.
ds_small = ds_small.class_encode_column('letter')
label_names = ds_small.features['letter'].names

# 2) Fail early with a readable message if too few rows remain for a non-empty validation split.
assert len(ds_small) > TEST_SIZE + TRAIN_SIZE, (
    f"need more than {TEST_SIZE + TRAIN_SIZE} rows for train/val/test, got {len(ds_small)}"
)

# 3) stratify_by_column needs a ClassLabel column, which 'letter' now is.
#    First carve out the test split ...
holdout_split = ds_small.train_test_split(
    test_size=TEST_SIZE,
    stratify_by_column='letter',
    seed=SPLIT_SEED
)
ds_small_test = holdout_split['test']

#    ... then divide the remaining rows into train and validation.
train_val_split = holdout_split['train'].train_test_split(
    train_size=TRAIN_SIZE,
    stratify_by_column='letter',
    seed=SPLIT_SEED
)
ds_small_train = train_val_split['train']
ds_small_val = train_val_split['test']

ds_dict = datasets.DatasetDict({
    'train': ds_small_train,
    'val'  : ds_small_val,
    'test' : ds_small_test
})

print(ds_dict)

Map: 100%|██████████| 5946/5946 [00:00<00:00, 27561.17 examples/s]
Casting the dataset: 100%|██████████| 5946/5946 [00:00<00:00, 2267418.09 examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'svg_optimized', 'letter'],
        num_rows: 3000
    })
    val: Dataset({
        features: ['id', 'svg_optimized', 'letter'],
        num_rows: 946
    })
    test: Dataset({
        features: ['id', 'svg_optimized', 'letter'],
        num_rows: 2000
    })
})





In [73]:
# Upload the DatasetDict to the Hugging Face Hub repository under its own configuration name.
HUB_REPO_ID = 'VectorGraphics/svg-super-glue'
HUB_CONFIG_NAME = 'ab-test'
ds_dict.push_to_hub(repo_id=HUB_REPO_ID, config_name=HUB_CONFIG_NAME)

Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 43.47ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.31s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 39.05ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.28s/it]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 38.88ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.53s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/VectorGraphics/svg-super-glue/commit/ff051bd408f5f65bb5564323f295b3d72192b4ed', commit_message='Upload dataset', commit_description='', oid='ff051bd408f5f65bb5564323f295b3d72192b4ed', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/VectorGraphics/svg-super-glue', endpoint='https://huggingface.co', repo_type='dataset', repo_id='VectorGraphics/svg-super-glue'), pr_revision=None, pr_num=None)