In [33]:
# Streaming the dataset is another option, but too slow in my experience.

#https://huggingface.co/datasets/laion/laion400m
#https://github.com/rom1504/img2dataset/issues/449
#https://www.kaggle.com/datasets/romainbeaumont/laion400m
#https://github.com/rom1504/img2dataset
#if streaming, use high performance dns resolver: https://github.com/rom1504/img2dataset#setting-up-a-high-performance-dns-resolver

#how to do text search through hf dataset: https://huggingface.co/docs/dataset-viewer/en/search

from pathlib import Path
import re
import subprocess
import argparse
import numpy as np
import pandas as pd

from img2dataset import download

# for my setup, at least, pyarrow needs to be installed from conda-forge rather than pip due to some discrepancy in the versions
# and compatibility with HuggingFace.
# To keep things within a uv environment, I copied the relevant files from a conda environment into uv's .venv/bin. 
#try:
from datasets import load_dataset
import datasets
#except ModuleNotFoundError:
    #subprocess.run("conda install -c conda-forge -y  datasets pyarrow libparquet", shell=True)
    #from datasets import load_dataset

#from huggingface_hub import HfFolder, whoami

# Module-level generator with a fixed seed; get_n draws its random row indices from it.
rng = np.random.default_rng(seed=1234)

# uncomment below to sign in to HuggingFace
# login can be done with huggingface_hub.login() in Python or huggingface-cli login in CLI
#token = HfFolder.get_token()

#if token is None:
#    try:
#        subprocess.run("huggingface-cli login", shell=True)
#    except Exception as e:
#        print(f"Unable to call 'huggingface-cli login' in shell: {e}.")
#else:
#    try:
#        user_info = whoami()
#        print(f"Logged into HuggingFace as: {user_info['name']}")
#    except Exception as e:
#        print(f"Unable to identify user from HuggingFace token: {e}")
#
#assert HfFolder.get_token() is not None, "No authentication token for HuggingFace found, aborting."

In [41]:
def filter_function(to_search, search_terms) -> bool:
    """Return True if any search term occurs in ``to_search`` as a whole word.

    Matching is case-insensitive and also accepts the simple plural form
    (term followed by "s"). A term counts as a whole word when it is
    preceded by the start of the text or whitespace, and followed by the end
    of the text, whitespace, or one of ``. , : ;``.

    Args:
        to_search: Text to scan (e.g. a caption). ``None`` or an empty string
            never matches.
        search_terms: Iterable of terms to look for. Empty terms are skipped
            because they would match almost any text.

    Returns:
        True as soon as one term matches, otherwise False.
    """
    if not to_search:
        return False
    text = to_search.lower()
    for term in search_terms:
        if not term:
            continue
        # re.escape keeps terms such as "c++" or "a.b" literal. The lookahead
        # checks the terminator without consuming it.
        pattern = rf"(?:^|\s){re.escape(term.lower())}s?(?=$|[\s.,:;])"
        if re.search(pattern, text):
            return True
    return False
        

def search_dataset(dataset: datasets.Dataset, search_terms, column="caption", start_from=0, up_to=10000) -> datasets.Dataset:
    """Filter a contiguous window of ``dataset`` down to rows matching the search terms.

    Args:
        dataset: Dataset to search.
        search_terms: A single term or a list of terms; each row's ``column``
            value is tested with ``filter_function``.
        column: Name of the text column to search.
        start_from: Index of the first row in the window.
        up_to: Number of rows to scan, counted from ``start_from``. May also
            be "full", "half", "quarter" or "tenth" to scan that fraction of
            the dataset's row count. The window is clipped to the end of the
            dataset.

    Returns:
        A new dataset holding only the rows of the window whose ``column``
        text matches.

    Raises:
        ValueError: If ``up_to`` is a string other than the keywords above.
    """
    # TODO: randomize access to dataset
    if isinstance(search_terms, str):
        search_terms = [search_terms]

    divisors = {"full": 1, "half": 2, "quarter": 4, "tenth": 10}
    if isinstance(up_to, str):
        if up_to not in divisors:
            raise ValueError(
                f"up_to must be an int or one of {sorted(divisors)}, got {up_to!r}"
            )
        up_to = dataset.num_rows // divisors[up_to]

    stop = min(start_from + up_to, dataset.num_rows)
    print(f"Filtering from row {start_from} to row {stop} of the dataset")
    # Start the window at start_from (previously it always began at row 0).
    # Passing a range instead of a materialised list avoids building millions
    # of Python ints and lets select() treat the window as a contiguous slice.
    window = dataset.select(range(start_from, stop))
    return window.filter(lambda row: filter_function(row[column], search_terms))

def get_n(dataset: datasets.Dataset, n: int, exclusions=None) -> "tuple[datasets.Dataset, np.ndarray]":
    """Draw ``n`` distinct random rows from ``dataset``, optionally avoiding some indices.

    Args:
        dataset: Dataset to sample from.
        n: Number of rows to draw, without replacement.
        exclusions: Optional array-like of row indices that must not be
            drawn, e.g. the index array returned by an earlier call.

    Returns:
        A tuple ``(subset, indices)``: the selected rows as a dataset, and
        the NumPy array of row indices that were drawn.

    Raises:
        ValueError: Raised by ``rng.choice`` when ``n`` is larger than the
            number of rows that are eligible to be drawn.
    """
    # `if exclusions:` is ambiguous for arrays with more than one element and
    # False for array([0]); test for None / emptiness explicitly instead.
    excluded = None if exclusions is None else np.asarray(exclusions).ravel()
    if excluded is None or excluded.size == 0:
        rand_indices = rng.choice(dataset.num_rows, size=n, replace=False)
    else:
        # Sample directly from the allowed indices. This always terminates and
        # fails loudly when fewer than n rows remain, unlike re-drawing until
        # no collision is left. Note: materialises one int per dataset row.
        candidates = np.setdiff1d(np.arange(dataset.num_rows), excluded)
        rand_indices = rng.choice(candidates, size=n, replace=False)
    return dataset.select(rand_indices), rand_indices

def img_from_dataset(
    dataset: datasets.Dataset,
    dest_path: str,
    processes_count: int = 8,
    thread_count: int = 16,
    enable_wandb: bool = True,
    number_sample_per_shard: int = 1000,
):
    """Write ``dataset`` to parquet and download the images it references with img2dataset.

    The dataset is saved to ``<dest_path>/selection/01.parquet`` and that file
    is passed to ``img2dataset.download``, which is told to read URLs from the
    "url" column and captions from the "caption" column and to write its
    results as files under ``<dest_path>/output`` without resizing.

    Args:
        dataset: Rows to download; must provide "url" and "caption" columns.
        dest_path: Directory to work in. Created, including missing parent
            directories, if it does not exist.
        processes_count: Forwarded to ``download``.
        thread_count: Forwarded to ``download``.
        enable_wandb: Forwarded to ``download``. When enabled, wandb prints
            run URLs into the cell output; check them for credentials before
            sharing the notebook.
        number_sample_per_shard: Forwarded to ``download``.
    """
    path = Path(dest_path)
    # parents=True so a nested destination such as "runs/2025/cars" works.
    path.mkdir(parents=True, exist_ok=True)
    selection_dir = path / "selection"
    selection_dir.mkdir(exist_ok=True)

    # img2dataset takes its URL list from a file, so persist the selection first.
    metadata_path = selection_dir / "01.parquet"
    dataset.to_parquet(metadata_path)

    output_path = path / "output"
    download(
        processes_count=processes_count,
        thread_count=thread_count,
        url_list=str(metadata_path),
        resize_mode="no",
        output_folder=str(output_path),
        output_format="files",
        input_format="parquet",
        url_col="url",
        caption_col="caption",
        enable_wandb=enable_wandb,
        number_sample_per_shard=number_sample_per_shard,
        distributor="multiprocessing"
    )

In [9]:
# Load every part*.parquet shard found under ./laion400m as a single dataset.
path = Path("./laion400m")
# glob() yields files in filesystem order; sort so the row order (and with it
# the row window searched and the seeded sampling) is the same on every run.
data_files = sorted(str(file) for file in path.glob("part*.parquet"))
assert data_files, f"No part*.parquet files found in {path.resolve()}"
dataset = load_dataset("parquet", data_files=data_files)

Resolving data files:   0%|          | 0/128 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/128 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/128 [00:00<?, ?it/s]

In [10]:
# Search the captions in the first tenth of the train split for the term "car".
search_results = search_dataset(dataset["train"], "car", up_to="tenth")

Filtering from row 0 to row 36102061 of the dataset


Filter:   0%|          | 0/36102061 [00:00<?, ? examples/s]

In [11]:
# get_n returns a (subset, indices) tuple: 100 randomly drawn matches and the row indices that were drawn.
selection = get_n(search_results, 100)

In [42]:
# selection[0] is the sampled dataset; its parquet file and the downloaded images are written under ./testing_dir.
img_from_dataset(selection[0], "./testing_dir")

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Starting the downloading of this file
Sharding file number 1 of 1 called /home/andrew/Documents/raw_images_image_priv/testing_dir/selection/01.parquet


0it [00:00, ?it/s]

File sharded in 1 shards
Downloading starting now, check your bandwidth speed (with bwm-ng)your cpu (with htop), and your disk usage (with iotop)!


wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: acmayo to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
wandb: Tracking run with wandb version 0.19.8
wandb: Run data is saved locally in /home/andrew/Documents/raw_images_image_priv/wandb/run-20250331_180234-gy41xlen
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run winter-gorge-5
wandb: ⭐️ View project at https://wandb.ai/acmayo/img2dataset?apiKey=<REDACTED>
wandb: 🚀 View run at https://wandb.ai/acmayo/img2dataset/runs/gy41xlen?apiKey=<REDACTED>
1it [00:06,  6.10s/it]
wandb:                                                                                
wandb: 
wandb: Run history:
wandb:               total/count ▁
wandb:  total/failed_to_download ▁
wandb:    total/failed_to_resize ▁
wandb:         total/img_per_sec ▁
wandb:             total/succe

worker  - success: 0.810 - failed to download: 0.180 - failed to resize: 0.010 - images per sec: 21 - count: 100
total   - success: 0.810 - failed to download: 0.180 - failed to resize: 0.010 - images per sec: 21 - count: 100
