In [1]:
import os
import pandas as pd
from collections import Counter
from tqdm import tqdm
import requests
from urllib.parse import urlparse

In [5]:
# Paths
TRAIN_CSV = "train.csv"
OUTPUT_DIR = "organized_dataset/train"

# Step 1: Load Metadata
train_metadata = pd.read_csv(TRAIN_CSV)

# Step 2: Count Images per Landmark and Select Top 15 Landmarks
landmark_counts = train_metadata['landmark_id'].value_counts()
top_landmarks = landmark_counts.index[:15]

# Create the output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [25]:
def download_image(url, filepath):
    headers = {'User-Agent': 'CankutBot/0.1 (cnkt.er@gmail.com)'}
    """Download an image from a URL to the specified filepath."""
    try:
        response = requests.get(url, headers=headers,timeout=100)
        response.raise_for_status()
        with open(filepath, "wb") as f:
            f.write(response.content)
        return True
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        return False

In [26]:
# Process Top 15 Landmarks
print("Downloading images...")
for landmark_id in tqdm(top_landmarks):
    # Filter rows for the current landmark
    landmark_images = train_metadata[train_metadata['landmark_id'] == landmark_id]
    
    # Randomly sample 250 images
    sampled_images = landmark_images.sample(n=250, random_state=42)
    
    # Create directory for the landmark
    landmark_dir = os.path.join(OUTPUT_DIR, str(landmark_id))
    os.makedirs(landmark_dir, exist_ok=True)
    
    # Download sampled images
    for _, row in sampled_images.iterrows():
        image_id = row['id']
        url = row['url']
        parsed_url = urlparse(url)
        filename = os.path.basename(parsed_url.path)
        if not filename:
            filename = f"{image_id}.jpg"
        
        filepath = os.path.join(landmark_dir, filename)
        if not os.path.exists(filepath):
            download_image(url, filepath)

print("Image downloading complete.")

Downloading images...


  0%|          | 0/15 [00:00<?, ?it/s]

Failed to download https://upload.wikimedia.org/wikipedia/commons/9/90/ETH-BIB-F%C3%A8s%2C_Pont_Rsif-Dia_247-04915.tif: HTTPSConnectionPool(host='upload.wikimedia.org', port=443): Read timed out.


  0%|          | 0/15 [04:30<?, ?it/s]


KeyboardInterrupt: 