In [1]:
import os
from datasets import load_dataset
from huggingface_hub import login

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Local image folder that is handed to the "imagefolder" loader (machine-specific path).
img_dir = r"E:\canada_full_city"

# Hugging Face login and repo.
# The token is read from the HF_WRITE environment variable so it never lives in
# the notebook. Without this check, a missing variable would pass token=None to
# login(), which falls back to an interactive prompt and stalls a "Run All".
hf_token = os.getenv("HF_WRITE")
if not hf_token:
    raise RuntimeError(
        "Environment variable HF_WRITE is not set; "
        "it must contain a Hugging Face access token."
    )
login(token=hf_token)

# Hub repository the dataset is pushed to at the end of the notebook.
dataset_repo_id = "SABR22/Canadian-streetview-cities"

In [3]:
# Announce the step, then build the dataset from the image folder on disk.
print("Loading dataset")
dataset = load_dataset(
    path="imagefolder",  # generic image-folder builder
    data_dir=img_dir,
)

Loading dataset


Downloading data: 100%|██████████| 150000/150000 [03:05<00:00, 809.26files/s]
Computing checksums: 100%|██████████| 150000/150000 [02:56<00:00, 850.35it/s]
Generating train split: 150000 examples [00:08, 17347.09 examples/s]


In [4]:
# Hold out 10% of the examples as a test split. The split is stratified on
# "label" and uses a fixed seed so it is reproducible.
print("Creating train/test split")
train_test_dataset = dataset["train"].train_test_split(
    test_size=0.1,
    seed=42,
    stratify_by_column="label",
)

Creating train/test split


In [9]:
# Add a human-readable "city" field next to the integer "label".
def add_city(row):
    """Return `row` with a "city" entry holding the string name of its label.

    The name is resolved through the "label" feature of the train split via
    its `int2str` method.
    """
    label_feature = train_test_dataset["train"].features["label"]
    city_name = label_feature.int2str(row["label"])
    row["city"] = city_name
    return row


# Apply the function to every split of the DatasetDict.
train_test_dataset = train_test_dataset.map(add_city)

Map: 100%|██████████| 135000/135000 [00:15<00:00, 8704.33 examples/s]
Map: 100%|██████████| 15000/15000 [00:02<00:00, 5249.13 examples/s]


In [10]:
# Show the splits, their features and row counts (header line first).
print("Dataset structure:", train_test_dataset, sep="\n")

Dataset structure:
DatasetDict({
    train: Dataset({
        features: ['image', 'label', 'city'],
        num_rows: 135000
    })
    test: Dataset({
        features: ['image', 'label', 'city'],
        num_rows: 15000
    })
})


In [None]:
# Upload every split of the DatasetDict to the Hub repository.
print(f"Pushing dataset to {dataset_repo_id}")
train_test_dataset.push_to_hub(repo_id=dataset_repo_id)

# Confirmation message once the upload call has returned.
print("Pushed dataset to:", dataset_repo_id)

Pushing dataset to SABR22/Canadian-streetview-cities


Map: 100%|██████████| 5000/5000 [00:11<00:00, 428.14 examples/s]shards/s]
Creating parquet from Arrow format: 100%|██████████| 5/5 [00:01<00:00,  2.74ba/s]
Processing Files (1 / 1): 100%|██████████|  485MB /  485MB,  341kB/s  
New Data Upload: 100%|██████████|  485MB /  485MB,  341kB/s  
Map: 100%|██████████| 5000/5000 [00:11<00:00, 444.36 examples/s]14, 512.87s/ shards]
Creating parquet from Arrow format: 100%|██████████| 5/5 [00:01<00:00,  2.81ba/s]
Processing Files (1 / 1): 100%|██████████|  486MB /  486MB,  190kB/s  
New Data Upload: 100%|██████████|  486MB /  486MB,  190kB/s  
Map: 100%|██████████| 5000/5000 [00:11<00:00, 441.92 examples/s]22, 492.89s/ shards]
Creating parquet from Arrow format: 100%|██████████| 5/5 [00:01<00:00,  2.89ba/s]
Processing Files (1 / 1): 100%|██████████|  487MB /  487MB,  147kB/s  
New Data Upload: 100%|██████████|  487MB /  487MB,  147kB/s  
Map: 100%|██████████| 5000/5000 [00:12<00:00, 393.56 examples/s]18, 468.25s/ shards]
Creating parquet from Arro

Pushed dataset to: SABR22/Canadian-streetview-cities


: 