In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/download/src/utils.py
/kaggle/input/download/src/example.ipynb
/kaggle/input/ml-challenge/sample_test.csv
/kaggle/input/ml-challenge/sample_test_out.csv
/kaggle/input/ml-challenge/train.csv
/kaggle/input/ml-challenge/test.csv


In [2]:
!rm -rf /kaggle/working/*


In [3]:
import os
from pathlib import Path
import pandas as pd
import multiprocessing
from functools import partial
from tqdm import tqdm
import urllib
import zipfile

# -------------------------------
# IMAGE DOWNLOAD FUNCTION
# -------------------------------
def download_image_with_id(row, savefolder):
    """Download image and save with sample_id as filename"""
    sample_id, image_link = row
    if not isinstance(image_link, str) or not image_link.strip():
        return f"Invalid link: {sample_id}"
    
    try:
        extension = Path(image_link).suffix
        if not extension:
            extension = '.jpg'
        
        save_path = Path(savefolder) / f"{sample_id}{extension}"
        if save_path.exists():
            return f"Exists: {sample_id}"
        
        try:
            urllib.request.urlretrieve(image_link, save_path)
            return f"Downloaded: {sample_id}"
        except Exception as ex:
            return f"Failed: {sample_id} - {ex}"
    
    except Exception as ex:
        return f"Error: {sample_id} - {ex}"

# -------------------------------
# DOWNLOAD IMAGES IN BATCH
# -------------------------------
def download_images_in_batches(df, base_folder, dataset_name, batch_size=5000, num_workers=50):
    base_folder = Path(base_folder)
    dataset_folder = base_folder / dataset_name
    dataset_folder.mkdir(parents=True, exist_ok=True)

    total = len(df)
    all_results = []

    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)
        batch_df = df.iloc[start:end]
        batch_folder = dataset_folder / f"batch_{start}_{end-1}"
        batch_folder.mkdir(parents=True, exist_ok=True)

        print(f"\nDownloading {dataset_name} images: batch {start}-{end-1} ...")
        image_data = list(zip(batch_df['sample_id'], batch_df['image_link']))
        download_partial = partial(download_image_with_id, savefolder=batch_folder)

        results = []
        with multiprocessing.Pool(num_workers) as pool:
            for res in tqdm(pool.imap(download_partial, image_data), total=len(image_data)):
                results.append(res)
            pool.close()
            pool.join()
        all_results.extend(results)

        # Zip the batch
        zip_path = dataset_folder / f"{dataset_name}_batch_{start}_{end-1}.zip"
        print(f"Zipping batch {start}-{end-1} into {zip_path} ...")
        with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
            for file in batch_folder.iterdir():
                zipf.write(file, arcname=file.name)

        # Clean up batch folder
        for file in batch_folder.iterdir():
            file.unlink()
        batch_folder.rmdir()
        print(f"Batch {start}-{end-1} zipped and cleaned up.")

    return all_results

# -------------------------------
# MAIN DOWNLOAD FUNCTION (TEST ONLY)
# -------------------------------
def download_test_only(dataset_folder='/kaggle/input/ml-challenge', output_folder='/kaggle/working/images'):
    dataset_folder = Path(dataset_folder)
    output_folder = Path(output_folder)
    results = {}

    def load_csv(name):
        try:
            df = pd.read_csv(dataset_folder / f"{name}.csv")
            print(f"Loaded {name} dataset: {len(df)} samples")
            return df
        except FileNotFoundError:
            print(f"{name}.csv not found, skipping...")
            return None

    test_df = load_csv('test')

    # Test only
    if test_df is not None:
        results['test'] = download_images_in_batches(
            test_df, output_folder, 'test', batch_size=5000, num_workers=50
        )

    return results

# -------------------------------
# CHECK DOWNLOAD STATUS
# -------------------------------
def check_download_status(folder_path):
    folder_path = Path(folder_path)
    if folder_path.exists():
        zip_files = list(folder_path.glob("*.zip"))
        print(f"Found {len(zip_files)} zip files in {folder_path}")
        return len(zip_files)
    else:
        print(f"Folder {folder_path} does not exist")
        return 0

# -------------------------------
# MAIN SCRIPT
# -------------------------------
if __name__ == "__main__":
    print("Starting download of TEST dataset with multiprocessing and batch zipping...")

    results = download_test_only()

    print("\n" + "="*50)
    print("DOWNLOAD STATUS CHECK")
    print("="*50)

    check_download_status('/kaggle/working/images/test')

    print("Test dataset download process completed!")


Starting download of TEST dataset with multiprocessing and batch zipping...
Loaded test dataset: 75000 samples

Downloading test images: batch 0-4999 ...


100%|██████████| 5000/5000 [00:21<00:00, 236.51it/s]


Zipping batch 0-4999 into /kaggle/working/images/test/test_batch_0_4999.zip ...
Batch 0-4999 zipped and cleaned up.

Downloading test images: batch 5000-9999 ...


100%|██████████| 5000/5000 [00:22<00:00, 219.21it/s]


Zipping batch 5000-9999 into /kaggle/working/images/test/test_batch_5000_9999.zip ...
Batch 5000-9999 zipped and cleaned up.

Downloading test images: batch 10000-14999 ...


100%|██████████| 5000/5000 [00:22<00:00, 226.64it/s]


Zipping batch 10000-14999 into /kaggle/working/images/test/test_batch_10000_14999.zip ...
Batch 10000-14999 zipped and cleaned up.

Downloading test images: batch 15000-19999 ...


100%|██████████| 5000/5000 [00:24<00:00, 207.60it/s]


Zipping batch 15000-19999 into /kaggle/working/images/test/test_batch_15000_19999.zip ...
Batch 15000-19999 zipped and cleaned up.

Downloading test images: batch 20000-24999 ...


100%|██████████| 5000/5000 [00:20<00:00, 243.76it/s]


Zipping batch 20000-24999 into /kaggle/working/images/test/test_batch_20000_24999.zip ...
Batch 20000-24999 zipped and cleaned up.

Downloading test images: batch 25000-29999 ...


100%|██████████| 5000/5000 [00:20<00:00, 243.06it/s]


Zipping batch 25000-29999 into /kaggle/working/images/test/test_batch_25000_29999.zip ...
Batch 25000-29999 zipped and cleaned up.

Downloading test images: batch 30000-34999 ...


100%|██████████| 5000/5000 [00:19<00:00, 253.44it/s]


Zipping batch 30000-34999 into /kaggle/working/images/test/test_batch_30000_34999.zip ...
Batch 30000-34999 zipped and cleaned up.

Downloading test images: batch 35000-39999 ...


100%|██████████| 5000/5000 [00:21<00:00, 235.27it/s]


Zipping batch 35000-39999 into /kaggle/working/images/test/test_batch_35000_39999.zip ...
Batch 35000-39999 zipped and cleaned up.

Downloading test images: batch 40000-44999 ...


100%|██████████| 5000/5000 [00:34<00:00, 145.34it/s]


Zipping batch 40000-44999 into /kaggle/working/images/test/test_batch_40000_44999.zip ...
Batch 40000-44999 zipped and cleaned up.

Downloading test images: batch 45000-49999 ...


100%|██████████| 5000/5000 [00:23<00:00, 214.18it/s]


Zipping batch 45000-49999 into /kaggle/working/images/test/test_batch_45000_49999.zip ...
Batch 45000-49999 zipped and cleaned up.

Downloading test images: batch 50000-54999 ...


100%|██████████| 5000/5000 [00:21<00:00, 231.82it/s]


Zipping batch 50000-54999 into /kaggle/working/images/test/test_batch_50000_54999.zip ...
Batch 50000-54999 zipped and cleaned up.

Downloading test images: batch 55000-59999 ...


100%|██████████| 5000/5000 [00:21<00:00, 228.58it/s]


Zipping batch 55000-59999 into /kaggle/working/images/test/test_batch_55000_59999.zip ...
Batch 55000-59999 zipped and cleaned up.

Downloading test images: batch 60000-64999 ...


100%|██████████| 5000/5000 [00:21<00:00, 236.27it/s]


Zipping batch 60000-64999 into /kaggle/working/images/test/test_batch_60000_64999.zip ...
Batch 60000-64999 zipped and cleaned up.

Downloading test images: batch 65000-69999 ...


100%|██████████| 5000/5000 [00:20<00:00, 244.17it/s]


Zipping batch 65000-69999 into /kaggle/working/images/test/test_batch_65000_69999.zip ...
Batch 65000-69999 zipped and cleaned up.

Downloading test images: batch 70000-74999 ...


100%|██████████| 5000/5000 [00:21<00:00, 236.87it/s]


Zipping batch 70000-74999 into /kaggle/working/images/test/test_batch_70000_74999.zip ...
Batch 70000-74999 zipped and cleaned up.

DOWNLOAD STATUS CHECK
Found 15 zip files in /kaggle/working/images/test
Test dataset download process completed!


In [4]:
!pip install kaggle --upgrade




In [5]:
!mkdir -p ~/.kaggle
!cp /kaggle/input/token/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [19]:
import json

metadata = {
    "title": "My Merged Image Dataset",
    "id": "artorias12/ml-challenge-test-2025",
    "licenses": [{"name": "CC0-1.0"}]
}
with open('dataset-metadata.json', 'w') as f:
    json.dump(metadata, f)


In [20]:
!mv /kaggle/working/images/test/*.zip /kaggle/working/


mv: cannot stat '/kaggle/working/images/test/*.zip': No such file or directory


In [21]:
!kaggle datasets create -p /kaggle/working


Starting upload for file test_batch_20000_24999.zip
100%|███████████████████████████████████████| 1.10G/1.10G [00:06<00:00, 178MB/s]
Upload successful: test_batch_20000_24999.zip (1GB)
Starting upload for file test_batch_60000_64999.zip
100%|███████████████████████████████████████| 1.09G/1.09G [00:07<00:00, 161MB/s]
Upload successful: test_batch_60000_64999.zip (1GB)
Starting upload for file test_batch_70000_74999.zip
100%|███████████████████████████████████████| 1.10G/1.10G [00:05<00:00, 211MB/s]
Upload successful: test_batch_70000_74999.zip (1GB)
Starting upload for file test_batch_15000_19999.zip
100%|███████████████████████████████████████| 1.08G/1.08G [00:05<00:00, 193MB/s]
Upload successful: test_batch_15000_19999.zip (1GB)
Starting upload for file test_batch_45000_49999.zip
100%|███████████████████████████████████████| 1.10G/1.10G [00:09<00:00, 128MB/s]
Upload successful: test_batch_45000_49999.zip (1GB)
Starting upload for file test_batch_5000_9999.zip
100%|████████████████████

In [22]:
!kaggle datasets list --user artorias12


ref                                      title                                size  lastUpdated                 downloadCount  voteCount  usabilityRating  
---------------------------------------  ----------------------------  -----------  --------------------------  -------------  ---------  ---------------  
artorias12/decoration                    Decoration                         345353  2025-08-29 14:08:02.803000              0          0  0.11764706       
artorias12/testing                       testing                             41561  2025-08-29 14:12:14.943000              0          0  0.125            
artorias12/llamaindex                    LlamaIndex                          86921  2025-01-16 20:04:46.697000              0          0  0.0              
artorias12/the-verdict                   The-verdict                          9069  2024-08-19 15:39:37.837000              0          0  0.25             
artorias12/environment                   environment            