In [2]:
import os
import pandas as pd
from collections import Counter
from tqdm import tqdm
import shutil

In [3]:


# Paths
DATA_DIR = "train" 
TRAIN_CSV = "train.csv"
OUTPUT_DIR = "organized_dataset/train"

# Step 1: Load Metadata
train_metadata = pd.read_csv(TRAIN_CSV)

# Step 2: Collect Image IDs Using `os.walk`
downloaded_image_ids = []
for root, _, files in os.walk(DATA_DIR):
    for file in files:
        if file.endswith(".jpg"):
            image_id = file.split(".")[0]  # Get image ID (without .jpg)
            downloaded_image_ids.append(image_id)

print(f"Found {len(downloaded_image_ids)} images in '{DATA_DIR}'.")

# Step 3: Filter Metadata for Downloaded Images
filtered_metadata = train_metadata[train_metadata['id'].isin(downloaded_image_ids)]

# Step 4: Count Classes (Landmark IDs)
landmark_counts = Counter(filtered_metadata['landmark_id'])
print(f"Number of unique landmarks in the subset: {len(landmark_counts)}")

# Step 5: Select Top Classes (Optional)
TOP_N = 15
top_landmarks = [landmark for landmark, count in landmark_counts.most_common(TOP_N)]

# Filter metadata to include only top landmarks
reduced_metadata = filtered_metadata[filtered_metadata['landmark_id'].isin(top_landmarks)]

print(f"Selected {len(top_landmarks)} top landmarks for organization.")

# Step 6: Organize Images into Subdirectories by Landmark ID
print(f"Organizing images into {OUTPUT_DIR}...")

# Create output directories
for landmark_id in top_landmarks:
    os.makedirs(os.path.join(OUTPUT_DIR, str(landmark_id)), exist_ok=True)

# Move images into corresponding landmark directories
missing_files = []  # Track files that could not be found
for _, row in tqdm(reduced_metadata.iterrows(), total=len(reduced_metadata)):
    image_id = row['id']
    landmark_id = row['landmark_id']
    
    # Construct the source path dynamically
    src_path = None
    for root, _, files in os.walk(DATA_DIR):
        for file in files:
            if file.split(".")[0] == image_id:
                src_path = os.path.join(root, file)
                break
        if src_path:  # Break outer loop if file is found
            break

    if not src_path or not os.path.exists(src_path):  # Ensure the file exists before moving
        missing_files.append(image_id)
        continue

    dest_path = os.path.join(OUTPUT_DIR, str(landmark_id), os.path.basename(src_path))
    shutil.copy(src_path, dest_path)

print(f"Dataset preparation complete! Missing {len(missing_files)} files.")
if missing_files:
    print("Some files were not found:")
    for missing_file in missing_files[:10]:  # Show only first 10 missing files
        print(missing_file)


Found 413300 images in 'train'.
Number of unique landmarks in the subset: 117477
Selected 15 top landmarks for organization.
Organizing images into organized_dataset/train...


100%|██████████| 4379/4379 [06:33<00:00, 11.12it/s]

Dataset preparation complete! Missing 0 files.





In [3]:


# Paths
DATA_DIR = "test" 
TEST_CSV = "test.csv"
OUTPUT_DIR = "organized_dataset/test"

# Step 1: Load Metadata
train_metadata = pd.read_csv(TEST_CSV)

# Step 2: Collect Image IDs Using `os.walk`
downloaded_image_ids = []
for root, _, files in os.walk(DATA_DIR):
    for file in files:
        if file.endswith(".jpg"):
            image_id = file.split(".")[0]  # Get image ID (without .jpg)
            downloaded_image_ids.append(image_id)

print(f"Found {len(downloaded_image_ids)} images in '{DATA_DIR}'.")

# Step 3: Filter Metadata for Downloaded Images
filtered_metadata = train_metadata[train_metadata['id'].isin(downloaded_image_ids)]

# Step 4: Count Classes (Landmark IDs)
landmark_counts = Counter(filtered_metadata['landmark_id'])
print(f"Number of unique landmarks in the subset: {len(landmark_counts)}")

# Step 5: Select Top Classes (Optional)
TOP_N = 15
top_landmarks = [landmark for landmark, count in landmark_counts.most_common(TOP_N)]

# Filter metadata to include only top landmarks
reduced_metadata = filtered_metadata[filtered_metadata['landmark_id'].isin(top_landmarks)]

print(f"Selected {len(top_landmarks)} top landmarks for organization.")

# Step 6: Organize Images into Subdirectories by Landmark ID
print(f"Organizing images into {OUTPUT_DIR}...")

# Create output directories
for landmark_id in top_landmarks:
    os.makedirs(os.path.join(OUTPUT_DIR, str(landmark_id)), exist_ok=True)

# Move images into corresponding landmark directories
missing_files = []  # Track files that could not be found
for _, row in tqdm(reduced_metadata.iterrows(), total=len(reduced_metadata)):
    image_id = row['id']
    landmark_id = row['landmark_id']
    
    # Construct the source path dynamically
    src_path = None
    for root, _, files in os.walk(DATA_DIR):
        for file in files:
            if file.split(".")[0] == image_id:
                src_path = os.path.join(root, file)
                break
        if src_path:  # Break outer loop if file is found
            break

    if not src_path or not os.path.exists(src_path):  # Ensure the file exists before moving
        missing_files.append(image_id)
        continue

    dest_path = os.path.join(OUTPUT_DIR, str(landmark_id), os.path.basename(src_path))
    shutil.copy(src_path, dest_path)

print(f"Dataset preparation complete! Missing {len(missing_files)} files.")
if missing_files:
    print("Some files were not found:")
    for missing_file in missing_files[:10]:  # Show only first 10 missing files
        print(missing_file)


Found 5879 images in 'test'.


KeyError: 'landmark_id'