<a href="https://colab.research.google.com/github/Anubhab0410/Generative-AI-Sketch-to-Real-Interior-Designer/blob/main/IKEA_Dataset_Engineering_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import glob

# Where the raw IKEA images live and where the paired outputs are written.
dataset_path = 'ikea/images'  # change this if the repo was unpacked elsewhere
output_photos = 'dataset/photos'
output_sketches = 'dataset/sketches'

# Create both output folders up front; folders that already exist are left alone.
for folder in (output_photos, output_sketches):
    os.makedirs(folder, exist_ok=True)

In [None]:
import cv2
from PIL import Image

def process_ikea_dataset(input_folder, photo_folder, sketch_folder, size=(512, 512)):
    """Build paired photo / Canny-sketch images from a folder of JPEGs.

    Every ``.jpg`` found under ``input_folder`` (searched recursively) is
    resized to ``size``, converted to an edge map with Canny, and both images
    are saved as PNGs under the same ``ikea_NNNN.png`` name in
    ``photo_folder`` and ``sketch_folder`` respectively.

    Args:
        input_folder: Root directory searched recursively for ``*.jpg`` files.
        photo_folder: Directory receiving the resized photos (created if missing).
        sketch_folder: Directory receiving the edge maps (created if missing).
        size: ``(width, height)`` tuple handed to ``cv2.resize``.

    Returns:
        None. Progress and write failures are reported with ``print``.
    """
    # cv2.imwrite reports failure by returning False instead of raising, so a
    # missing output folder would otherwise lose every pair without any error.
    os.makedirs(photo_folder, exist_ok=True)
    os.makedirs(sketch_folder, exist_ok=True)

    # Only .jpg files are matched. glob returns paths in arbitrary order, so
    # sort them to make the index -> file name mapping reproducible.
    image_files = sorted(glob.glob(os.path.join(input_folder, '**/*.jpg'), recursive=True))

    print(f"Found {len(image_files)} images. Starting processing...")

    for i, img_path in enumerate(image_files):
        # 1. Load and resize the photo; unreadable files are skipped and
        #    their index is simply left unused.
        img = cv2.imread(img_path)
        if img is None:
            continue
        img = cv2.resize(img, size)

        # 2. Generate the Canny sketch. A slight blur keeps the edge map
        #    from becoming too noisy.
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        blurred = cv2.GaussianBlur(gray, (3, 3), 0)
        canny = cv2.Canny(blurred, 100, 200)

        # 3. Save both under the same name so the pair can be matched later.
        filename = f"ikea_{i:04d}.png"
        photo_ok = cv2.imwrite(os.path.join(photo_folder, filename), img)
        sketch_ok = cv2.imwrite(os.path.join(sketch_folder, filename), canny)
        if not (photo_ok and sketch_ok):
            print(f"Warning: could not write pair {filename} for {img_path}")

        if i % 100 == 0:
            print(f"Processed {i} images...")

# Run the pipeline on the configured folders (keyword form makes the roles explicit).
process_ikea_dataset(
    input_folder=dataset_path,
    photo_folder=output_photos,
    sketch_folder=output_sketches,
)

In [None]:
import matplotlib.pyplot as plt

# Preview the first 3 photo/sketch pairs produced by process_ikea_dataset.
# The folders are the ones that call was given (output_photos / output_sketches);
# PHOTO_DIR / SKETCH_DIR are not defined until later cells, so referring to them
# here fails with NameError on a fresh kernel.
preview_photo_dir = output_photos
preview_sketch_dir = output_sketches

if os.path.isdir(preview_photo_dir):
    # Sorted so the same three files are shown on every run.
    sample_files = sorted(os.listdir(preview_photo_dir))[:3]
    fig, axes = plt.subplots(3, 2, figsize=(10, 15))

    for i, filename in enumerate(sample_files):
        photo = Image.open(os.path.join(preview_photo_dir, filename))
        sketch = Image.open(os.path.join(preview_sketch_dir, filename))

        axes[i, 0].imshow(photo)
        axes[i, 0].set_title("Target Photo")
        axes[i, 1].imshow(sketch, cmap='gray')
        axes[i, 1].set_title("Conditioning Sketch (Canny)")

    plt.tight_layout()
    plt.show()
else:
    print(f"Error: {preview_photo_dir} does not exist. Please run the processing pipeline cell first.")

In [None]:
import os
import matplotlib.pyplot as plt
from PIL import Image

# Locations of the processed dataset (keep these in sync with the processing cell)
OUTPUT_DIR = '/content/interior_dataset'
PHOTO_DIR = os.path.join(OUTPUT_DIR, 'photos')
SKETCH_DIR = os.path.join(OUTPUT_DIR, 'sketches')

if not os.path.exists(PHOTO_DIR):
    # Nothing to preview until the photo folder has been produced.
    print(f"Error: {PHOTO_DIR} does not exist. Please run the processing pipeline cell first.")
else:
    # One row per pair: photo on the left, sketch on the right.
    # Takes the first three directory entries as listed (not a random sample).
    fig, axes = plt.subplots(3, 2, figsize=(10, 15))
    sample_files = os.listdir(PHOTO_DIR)[:3]

    for (photo_ax, sketch_ax), filename in zip(axes, sample_files):
        photo = Image.open(os.path.join(PHOTO_DIR, filename))
        sketch = Image.open(os.path.join(SKETCH_DIR, filename))

        photo_ax.imshow(photo)
        photo_ax.set_title(f"Target Photo: {filename}")
        sketch_ax.imshow(sketch, cmap='gray')
        sketch_ax.set_title(f"Conditioning Sketch: {filename}")

    plt.tight_layout()
    plt.show()

In [None]:
# Clone the IKEA image repository; the processing cells read images from its
# `images` folder (see dataset_path / INPUT_DIR).
# NOTE(review): this cell sits after cells that read those images — on a fresh
# runtime it has to be run before them.
!git clone https://github.com/IvonaTau/ikea.git

In [None]:
import os
import json

# 1. Paths of the processed dataset (re-declared so this cell runs on its own)
OUTPUT_DIR = '/content/interior_dataset'
PHOTO_DIR = os.path.join(OUTPUT_DIR, 'photos')
SKETCH_DIR = os.path.join(OUTPUT_DIR, 'sketches')
METADATA_FILE = os.path.join(OUTPUT_DIR, 'train.jsonl')

# 2. Generate the metadata file: one JSON object per line, one line per pair
if os.path.exists(PHOTO_DIR):
    photo_entries = sorted(os.listdir(PHOTO_DIR))

    # Keep only regular files that also have a sketch with the same name.
    # Listing every directory entry blindly would emit rows that point at a
    # missing conditioning image (or at a stray sub-directory).
    filenames = [
        name for name in photo_entries
        if os.path.isfile(os.path.join(PHOTO_DIR, name))
        and os.path.isfile(os.path.join(SKETCH_DIR, name))
    ]
    skipped = len(photo_entries) - len(filenames)

    with open(METADATA_FILE, 'w') as f:
        for filename in filenames:
            # Paths are relative to OUTPUT_DIR. The key names are the ones the
            # original author targets for 'diffusers' training — verify against
            # the training script actually used.
            entry = {
                "text": "a professional interior design of a room, IKEA style, high resolution",
                "image": f"photos/{filename}",
                "conditioning_image": f"sketches/{filename}"
            }
            f.write(json.dumps(entry) + '\n')

    if skipped:
        print(f"Skipped {skipped} entries without a matching photo/sketch pair.")
    print(f"✅ Success! Metadata created with {len(filenames)} entries.")
    print(f"Location: {METADATA_FILE}")
else:
    print("❌ Error: Processed folders not found. Please run the Processing Pipeline cell again.")

In [None]:
import cv2
import os
import glob
from tqdm import tqdm

# 1. Input folder (as shown in the Colab file sidebar) and output folders.
INPUT_DIR = '/content/ikea/images'
OUTPUT_DIR = '/content/interior_dataset'
PHOTO_DIR = os.path.join(OUTPUT_DIR, 'photos')
SKETCH_DIR = os.path.join(OUTPUT_DIR, 'sketches')

for out_dir in (PHOTO_DIR, SKETCH_DIR):
    os.makedirs(out_dir, exist_ok=True)

# 2. Trial run: only the first 100 paths returned by glob are processed.
image_paths = glob.glob(os.path.join(INPUT_DIR, '**/*.jpg'), recursive=True)
print(f"Found {len(image_paths)} images. Starting...")

trial_paths = image_paths[:100]
for i, img_path in enumerate(tqdm(trial_paths)):
    img = cv2.imread(img_path)
    if img is not None:
        # Resize, then derive the edge map from the grayscale version.
        img = cv2.resize(img, (512, 512))
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        edges = cv2.Canny(gray, 100, 200)

        # Photo and sketch share one file name so they can be paired later.
        file_id = f"ikea_{i:04d}.png"
        cv2.imwrite(os.path.join(PHOTO_DIR, file_id), img)
        cv2.imwrite(os.path.join(SKETCH_DIR, file_id), edges)

print("\n✅ Done! Now try your visualization code again.")

In [None]:
import os
import json

# Paths of the processed dataset (re-declared so this cell runs on its own)
OUTPUT_DIR = '/content/interior_dataset'
PHOTO_DIR = os.path.join(OUTPUT_DIR, 'photos')
SKETCH_DIR = os.path.join(OUTPUT_DIR, 'sketches')
METADATA_FILE = os.path.join(OUTPUT_DIR, 'train.jsonl')

if os.path.exists(PHOTO_DIR):
    # Only list photos that are regular files and have a sketch of the same
    # name; otherwise the manifest can point at a missing conditioning image.
    filenames = [
        name for name in sorted(os.listdir(PHOTO_DIR))
        if os.path.isfile(os.path.join(PHOTO_DIR, name))
        and os.path.isfile(os.path.join(SKETCH_DIR, name))
    ]
    with open(METADATA_FILE, 'w') as f:
        for filename in filenames:
            # One JSON object per line; paths are relative to OUTPUT_DIR.
            entry = {
                "text": "a professional interior design of a room, IKEA style, high resolution",
                "image": f"photos/{filename}",
                "conditioning_image": f"sketches/{filename}"
            }
            f.write(json.dumps(entry) + '\n')
    print(f"✅ Metadata generated with {len(filenames)} entries!")
else:
    # Report the missing folder instead of silently doing nothing.
    print("❌ Error: Processed folders not found. Please run the Processing Pipeline cell again.")

In [None]:
# 1. Clear the previous 100-entry test if you want a fresh start
# !rm -rf /content/interior_dataset

# 2. Re-run for the full dataset.
# Uses glob, os, cv2, tqdm, INPUT_DIR, PHOTO_DIR and SKETCH_DIR from the earlier cells.
image_paths = glob.glob(os.path.join(INPUT_DIR, '**/*.jpg'), recursive=True)
print(f"Found {len(image_paths)} total images. Processing full dataset...")

for i, img_path in enumerate(tqdm(image_paths)):
    file_id = f"ikea_{i:04d}.png"
    photo_path = os.path.join(PHOTO_DIR, file_id)
    sketch_path = os.path.join(SKETCH_DIR, file_id)

    # Resume support: skip an index only when BOTH halves of the pair exist.
    # Checking the photo alone would leave a pair permanently without its
    # sketch if an earlier run stopped between the two writes.
    # NOTE(review): skipping by index assumes glob returns the paths in the
    # same order as in the earlier trial run — confirm.
    if os.path.exists(photo_path) and os.path.exists(sketch_path):
        continue

    img = cv2.imread(img_path)
    if img is None:
        # Unreadable file: leave this index unused.
        continue

    img = cv2.resize(img, (512, 512))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 100, 200)

    cv2.imwrite(photo_path, img)
    cv2.imwrite(sketch_path, edges)

print(f"\n✅ Done! Processed all {len(os.listdir(PHOTO_DIR))} images.")

In [None]:
import os
import json

# Paths of the fully processed dataset (re-declared so this cell runs on its own)
OUTPUT_DIR = '/content/interior_dataset'
PHOTO_DIR = os.path.join(OUTPUT_DIR, 'photos')
SKETCH_DIR = os.path.join(OUTPUT_DIR, 'sketches')
METADATA_FILE = os.path.join(OUTPUT_DIR, 'train.jsonl')

if os.path.exists(PHOTO_DIR):
    # A row is written only for a photo that is a regular file and whose
    # same-named sketch exists, so every row refers to a complete pair.
    filenames = [
        name for name in sorted(os.listdir(PHOTO_DIR))
        if os.path.isfile(os.path.join(PHOTO_DIR, name))
        and os.path.isfile(os.path.join(SKETCH_DIR, name))
    ]
    with open(METADATA_FILE, 'w') as f:
        for filename in filenames:
            # One JSON object per line; paths are relative to OUTPUT_DIR.
            entry = {
                "text": "a professional interior design of a room, IKEA style, high resolution",
                "image": f"photos/{filename}",
                "conditioning_image": f"sketches/{filename}"
            }
            f.write(json.dumps(entry) + '\n')
    print(f"✅ Metadata generated with {len(filenames)} entries!")
else:
    # Report the missing folder instead of silently doing nothing.
    print("❌ Error: Processed folders not found. Please run the Processing Pipeline cell again.")

In [None]:
# Bundle the processed dataset folder (photos, sketches and train.jsonl, if they
# were generated) into a single zip archive. The archive name is relative, so it
# is written to the current working directory.
!zip -r interior_design_dataset_A.zip /content/interior_dataset