In [2]:
import datasets

# Load only the 'creativecommons' split of the nyuuzyou/svgfind dataset.
ds = datasets.load_dataset(
    'nyuuzyou/svgfind',
    split='creativecommons',
)

Downloading data: 100%|██████████| 37/37 [07:57<00:00, 12.91s/files]
Generating creativecommons split: 3645444 examples [01:29, 40659.45 examples/s]
Generating publicdomain split: 10366 examples [00:00, 37729.43 examples/s]


In [9]:
import pandas as pd

def get_tag_stats(data, tag_col="tags", top_n=10):
    """
    Summarise how tags are distributed across a collection of records.

    Parameters
    ----------
    data : list[dict] | pd.DataFrame
        Records containing a column named ``tag_col``. Anything that is not
        already a DataFrame is passed to ``pd.DataFrame``.
    tag_col : str
        Name of the column holding each row's collection of tags.
    top_n : int
        How many of the most frequent tags to return.

    Returns
    -------
    dict
        ``rows``                : number of rows in the data
        ``unique_tags``         : number of distinct tags
        ``mean_tags_per_row``   : mean number of tags per row
        ``median_tags_per_row`` : median number of tags per row
        ``max_tags_per_row``    : largest number of tags on a single row
        ``top_tags``            : pd.Series with the counts of the ``top_n``
                                  most frequent tags

    Rows whose tag value has no length (for example ``None`` or NaN) are
    counted as having zero tags instead of raising ``TypeError``.
    """
    # 1) Ensure we have a DataFrame
    df = data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)
    tags = df[tag_col]

    # 2) Explode so each tag is its own row, then count occurrences.
    #    value_counts() skips the NaN entries produced by empty/missing rows.
    tag_freq = tags.explode().value_counts()

    # 3) Per-row tag counts; values without a length (None/NaN) count as 0
    per_row = tags.apply(lambda value: len(value) if hasattr(value, "__len__") else 0)

    # 4) Assemble stats
    stats = {
        "rows"               : len(df),
        "unique_tags"        : tag_freq.size,
        "mean_tags_per_row"  : per_row.mean(),
        "median_tags_per_row": per_row.median(),
        "max_tags_per_row"   : per_row.max(),
        "top_tags"           : tag_freq.head(top_n)
    }
    return stats

# --- Example usage ---
# Seeded shuffle so the 100k-row sample (and therefore the printed tag
# ranking) is the same on every run.
stats = get_tag_stats(ds.shuffle(seed=42).take(100000), tag_col="tags", top_n=50)
print(stats["top_tags"])

tags
security         1417
data             1400
online           1364
time             1198
storage          1148
phone            1068
computer         1043
love             1028
Arrow            1024
business         1019
email             934
superhero         902
music             851
internet          822
communication     764
construction      729
software          718
digital           707
photography       703
travel            702
writing           701
health            694
food              675
chat              667
paper             654
messaging         652
Camera            646
space             632
astronomy         597
shopping          590
design            586
social media      581
DC                578
money             571
audio             530
light             527
sun               510
sweet             499
clothing          490
text              487
art               475
home              471
House             464
Doctor            458
safe              441
photo

In [28]:
from functools import partial
# --- settings ----------------------------------------------------------
# Tags that serve as class labels; kept in a set for O(1) membership tests.
chosen = {
    'phone',
    'love',
    'food',
    'sun',
    'photography',
}

def has_only_one(example, focus, pool):
    """
    Tell whether ``focus`` is the only tag from ``pool`` present on an example.

    Parameters
    ----------
    example : dict
        Record with a ``'tags'`` entry holding a collection of tags.
    focus : str
        The tag that must be present.
    pool : set
        The tags that compete with each other; at most one distinct tag
        from this pool may appear on the example.

    Returns
    -------
    bool
        True  -> exactly one distinct tag from ``pool`` occurs in
                 ``example['tags']`` and ``focus`` is among the tags.
        False -> ``focus`` is missing, a second distinct tag from ``pool``
                 was found, or the example has no tags (``None``).

    A tag that is repeated (e.g. ``['phone', 'phone']``) is counted once, so
    duplicates do not disqualify an otherwise valid example.
    """
    tags = example["tags"]
    if tags is None:
        return False

    seen = set()                      # distinct pool tags found so far
    for tag in tags:
        if tag in pool:
            seen.add(tag)
            if len(seen) > 1:         # a second, different pool tag -> overlap
                return False
    return len(seen) == 1 and focus in tags

# --- filter without overlaps --------------------------------------------
# One filtered dataset per chosen tag: each keeps only the rows for which
# has_only_one() holds with that tag as the focus.
datasets_for_class = dict(
    (label, ds.filter(partial(has_only_one, focus=label, pool=chosen), num_proc=4))
    for label in chosen
)


Filter (num_proc=4): 100%|██████████| 3645444/3645444 [00:15<00:00, 242245.49 examples/s]
Filter (num_proc=4): 100%|██████████| 3645444/3645444 [00:15<00:00, 242152.91 examples/s]
Filter (num_proc=4): 100%|██████████| 3645444/3645444 [00:15<00:00, 240371.53 examples/s]
Filter (num_proc=4): 100%|██████████| 3645444/3645444 [00:15<00:00, 239062.08 examples/s]
Filter (num_proc=4): 100%|██████████| 3645444/3645444 [00:15<00:00, 238944.13 examples/s]


In [33]:
# Add a 'class' column to every per-class subset, filled with that subset's label.
# Keys are snapshotted first; only the values of the dict are replaced.
for one_class in tuple(datasets_for_class):
    datasets_for_class[one_class] = datasets_for_class[one_class].map(
        lambda x: {'class': one_class}
    )

Map: 100%|██████████| 26358/26358 [00:03<00:00, 7799.08 examples/s] 
Map: 100%|██████████| 23438/23438 [00:03<00:00, 7306.89 examples/s] 
Map: 100%|██████████| 16551/16551 [00:02<00:00, 7258.31 examples/s] 
Map: 100%|██████████| 19191/19191 [00:02<00:00, 8395.10 examples/s] 
Map: 100%|██████████| 21462/21462 [00:02<00:00, 8297.99 examples/s] 


In [39]:
# Show one of the labelled per-class datasets.
# NOTE(review): `one_class` is not assigned in this cell; presumably it is the
# variable left bound by the earlier `for one_class in datasets_for_class` loop — confirm.
datasets_for_class[one_class]

Dataset({
    features: ['id', 'title', 'data_pack', 'tags', 'license', 'license_owner', 'download_url', 'svg_content', 'class'],
    num_rows: 21462
})

In [38]:
import pandas as pd

# Stack the per-class subsets into a single Dataset.
ds_chosen = datasets.concatenate_datasets(list(datasets_for_class.values()))
# Convert with the Dataset's own to_pandas() rather than pd.DataFrame(ds_chosen),
# which has to walk the dataset example by example.
# NOTE(review): nested columns such as 'tags' may come back as NumPy arrays
# instead of Python lists; only 'svg_content' and 'class' are read afterwards.
df_classes = ds_chosen.to_pandas()

In [61]:
# Draw 5000 rows, keeping only the SVG markup and its label.
# random_state is fixed so the same rows are sampled on every run.
df_classes_small = df_classes[['svg_content', 'class']].sample(5000, random_state=42)
df_classes_small.head()

Unnamed: 0,svg_content,class
44829,"<svg fill=""#000"" width=""800"" height=""800"" view...",food
87792,"<svg fill=""#000"" width=""800"" height=""800"" view...",phone
64049,"<svg fill=""#000"" width=""800"" height=""800"" view...",sun
21560,"<svg fill=""#000"" width=""800"" height=""800"" view...",love
50986,"<svg fill=""#000"" width=""800"" height=""800"" view...",sun


In [62]:
# Turn the sampled frame back into a Dataset. The '__index_level_0__' column
# (presumably the pandas index carried over by from_pandas) is renamed to 'id',
# and the columns are put in a fixed order.
ds_small = (
    datasets.Dataset.from_pandas(df_classes_small)
    .rename_column('__index_level_0__', 'id')
    .select_columns(['id', 'svg_content', 'class'])
)

In [76]:
from datasets import ClassLabel

# 1) collect the distinct class names in sorted order
label_names = sorted({*ds_small['class']})

# 2) build the ClassLabel feature describing those names
class_feature = ClassLabel(names=label_names, num_classes=len(label_names))

# 3) convert the column:
#    map() replaces each class name with its integer id, then cast_column()
#    switches the column type to the ClassLabel feature
name2id = {name: idx for idx, name in enumerate(label_names)}
ds_small = (
    ds_small
    .map(lambda row: {'class': name2id[row['class']]})
    .cast_column('class', class_feature)
)

# 4) with a ClassLabel column, stratify_by_column can be used:
#    first hold out 500 rows for test ...
ds_small_train, ds_small_test = ds_small.train_test_split(
    stratify_by_column='class',
    test_size=500,
    seed=42,                           # fixed seed -> reproducible split
).values()

#    ... then hold out another 500 rows of the remainder for validation
ds_small_train, ds_small_val = ds_small_train.train_test_split(
    stratify_by_column='class',
    test_size=500,
    seed=42,
).values()

# Bundle the three splits under their final names.
ds_dict = datasets.DatasetDict(
    train=ds_small_train,
    val=ds_small_val,
    test=ds_small_test,
)

print(ds_dict)

Map: 100%|██████████| 5000/5000 [00:00<00:00, 21715.04 examples/s]
Casting the dataset: 100%|██████████| 5000/5000 [00:00<00:00, 308767.96 examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'svg_content', 'class'],
        num_rows: 4000
    })
    val: Dataset({
        features: ['id', 'svg_content', 'class'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'svg_content', 'class'],
        num_rows: 500
    })
})





In [82]:
# Upload all splits to the Hub repository under the
# 'multi-class-classification' configuration.
ds_dict.push_to_hub(
    'VectorGraphics/svg-super-glue',
    config_name='multi-class-classification',
)

Creating parquet from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 25.97ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.74s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 45.38ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.44s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 31.43ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.45s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/VectorGraphics/svg-super-glue/commit/2f14baa7c7dfba2e67e9fd06110dcede7acac23c', commit_message='Upload dataset', commit_description='', oid='2f14baa7c7dfba2e67e9fd06110dcede7acac23c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/VectorGraphics/svg-super-glue', endpoint='https://huggingface.co', repo_type='dataset', repo_id='VectorGraphics/svg-super-glue'), pr_revision=None, pr_num=None)

In [None]:
# Display the DatasetDict holding the 'train', 'val' and 'test' splits.
ds_dict