In [None]:
import os
import pandas as pd
from datasets import load_dataset
from PIL import Image

from tqdm import tqdm
from io import BytesIO
import requests
from concurrent.futures import ThreadPoolExecutor

In [7]:
# Sampling parameters, named so they can be tuned in one place.
DATASET_NAME = "RobinWZQ/improved_aesthetics_6.5plus"
SAMPLE_SIZE = 10000
SHUFFLE_SEED = 1

dataset = load_dataset(DATASET_NAME, split="train")

# Shuffle with a fixed seed so the same subset is drawn on every run, and cap
# the sample at the dataset length so `select` can never index past the end.
dataset = dataset.shuffle(seed=SHUFFLE_SEED).select(
    range(min(SAMPLE_SIZE, len(dataset)))
)

In [8]:
# Keep only the image URL and its aesthetic score, discard rows that have no
# score, and renumber the remaining rows from zero.
df = (
    pd.DataFrame(
        {
            "url": dataset["URL"],
            "aesthetic_score": dataset["AESTHETIC_SCORE"],
        }
    )
    .dropna(subset=["aesthetic_score"])
    .reset_index(drop=True)
)

print("Количество изображений:", df.shape[0])
df.head()

Количество изображений: 10000


Unnamed: 0,url,aesthetic_score
0,https://cdn.artfunnels.com/a0077501-5213-4368-...,6.516629
1,https://f.vividscreen.info/soft/a0cb03910a6629...,6.621181
2,https://pix.avax.news/avaxnews/7c/10/0005107c_...,6.59829
3,https://c1.35photo.pro/photos_temp/sizes/353/1...,6.668599
4,https://www.createcg.net/wp-content/uploads/20...,6.635511


In [12]:
# Sanity check: the dataset name advertises scores of 6.5 and above, so the
# smallest score in the sample should not fall below that.
df.aesthetic_score.min()

np.float64(6.50000524520874)

In [16]:
# Create the default download folder up front; download_image also creates
# its target directory on demand, so this only matters for the default case.
os.makedirs("images", exist_ok=True)


def download_image(url, idx, out_dir="images", timeout=5):
    """Download one image, re-encode it as an RGB JPEG and return its path.

    The function is best-effort: any failure (network error, non-2xx status,
    undecodable payload, disk error) yields ``None`` instead of raising, so a
    single bad URL cannot abort a bulk download.

    Args:
        url: Address of the image to fetch.
        idx: Identifier used to build the file name ``img_{idx}.jpg``.
        out_dir: Destination directory; created if it does not exist.
        timeout: Forwarded to ``requests.get`` as its ``timeout`` argument.

    Returns:
        The path of the saved JPEG, or ``None`` if the image could not be
        downloaded, decoded or written.
    """
    path = os.path.join(out_dir, f"img_{idx}.jpg")
    # Encode into a sibling temp file first so that a failure half-way through
    # never truncates or replaces whatever is stored under the final name.
    tmp_path = path + ".part"
    try:
        os.makedirs(out_dir, exist_ok=True)
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        # The format is passed explicitly, so the ".part" suffix does not
        # affect how the file is encoded.
        with Image.open(BytesIO(r.content)) as img:
            img.convert("RGB").save(tmp_path, format="JPEG", quality=90)
        # Move the finished file into place in a single step.
        os.replace(tmp_path, path)
        return path
    except Exception:
        # Deliberately broad: the caller treats None as "skip this URL".
        # Clean up the temp file if one was left behind.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        return None



In [19]:
def download_wrapper(args):
    """Adapt ``download_image`` to a single ``(index, url)`` pair.

    ``enumerate`` yields the index first, while ``download_image`` takes the
    URL first, so the pair is unpacked and passed in swapped order.
    """
    position, link = args
    return download_image(link, position)

# Fetch every URL concurrently. The executor's map yields results in input
# order, so paths[i] belongs to urls[i] even if downloads finish out of order.
urls = df["url"].tolist()

with ThreadPoolExecutor(max_workers=32) as pool:
    paths = list(
        tqdm(
            pool.map(download_wrapper, enumerate(urls)),
            total=len(urls),
        )
    )

# Attach the local path to each row, then keep only the rows whose download
# succeeded (failed downloads come back as None).
df = (
    df.assign(path=paths)
    .dropna(subset=["path"])
    .reset_index(drop=True)
)

print("Успешно скачано изображений:", len(df))
df.head()


100%|██████████| 10000/10000 [26:17<00:00,  6.34it/s]

Успешно скачано изображений: 6716





Unnamed: 0,url,aesthetic_score,path
0,https://cdn.artfunnels.com/a0077501-5213-4368-...,6.516629,images\img_0.jpg
1,https://pix.avax.news/avaxnews/7c/10/0005107c_...,6.59829,images\img_2.jpg
2,https://c1.35photo.pro/photos_temp/sizes/353/1...,6.668599,images\img_3.jpg
3,https://images.robertharding.com/preview/RF/RH...,6.564098,images\img_5.jpg
4,https://live.staticflickr.com/65535/4140238487...,6.87373,images\img_6.jpg
