In [1]:
import os
import requests
import csv
import pandas as pd
import torch
from ultralytics import YOLO
import supervision as sv
import cv2
import numpy as np

# MODEL_PATH = '../00_model_checkpoints/yolov10x.pt'
# Location of the YOLO checkpoint loaded by YOLO_helper. Set the
# YOLO_MODEL_PATH environment variable to point at a different checkpoint
# without editing the notebook; the default below is used otherwise.
MODEL_PATH = os.environ.get(
    'YOLO_MODEL_PATH',
    '/mnt/damian/Projects/car_image_processor/postprocessing/yolo11x.pt',
)

# Class-id -> class-name lookup used when turning detections into text labels.
# The position of each name in the list below is its integer class id (0-79).
category_dict = dict(enumerate([
    'person', 'bicycle', 'car', 'motorcycle', 'airplane',
    'bus', 'train', 'truck', 'boat', 'traffic light',
    'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird',
    'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
    'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
    'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle',
    'wine glass', 'cup', 'fork', 'knife', 'spoon',
    'bowl', 'banana', 'apple', 'sandwich', 'orange',
    'broccoli', 'carrot', 'hot dog', 'pizza', 'donut',
    'cake', 'chair', 'couch', 'potted plant', 'bed',
    'dining table', 'toilet', 'tv', 'laptop', 'mouse',
    'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
    'toaster', 'sink', 'refrigerator', 'book', 'clock',
    'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush',
]))

class YOLO_helper:
    """Wrap an Ultralytics YOLO model and turn PIL images into text labels.

    Class names are looked up in ``category_dict``; detections are read through
    ``sv.Detections.from_ultralytics``.
    """

    # Confidence threshold passed to the model on every inference call.
    DETECTION_CONF = 0.25
    # In single-label mode the first detection must reach this confidence,
    # otherwise the image is labelled with the " " placeholder.
    SINGLE_LABEL_MIN_CONF = 0.89

    def __init__(self, device="cuda"):
        """Load the checkpoint at ``MODEL_PATH``, move it to ``device`` and call ``eval()``."""
        # torch.serialization.add_safe_globals([YOLO])
        # torch.serialization.safe_globals([YOLO])
        self.model = YOLO(MODEL_PATH).to(device)
        self.model.eval()

    @staticmethod
    def _pil_to_bgr(pil_img):
        """Return ``pil_img`` as a 3-channel BGR array for OpenCV / YOLO.

        The image is converted to RGB first: downloaded images may be grayscale,
        palette, RGBA or CMYK, and ``cv2.COLOR_RGB2BGR`` either raises or
        misinterprets the channels for those modes.
        """
        return cv2.cvtColor(np.array(pil_img.convert("RGB")), cv2.COLOR_RGB2BGR)

    def label_single_image(self, pil_img):
        """Detect objects in one image.

        Args:
            pil_img (PIL.Image): Image to label.
        Returns:
            list of str: One ``"<class name>:<confidence>"`` entry per detection
            (confidence formatted with two decimals); empty if nothing is detected.
        """
        cv2_image = self._pil_to_bgr(pil_img)
        # scale image to 640 on the longest side
        scale = 640 / max(cv2_image.shape[:2])
        cv2_image = cv2.resize(cv2_image, (0, 0), fx=scale, fy=scale)
        results = self.model(source=cv2_image, conf=self.DETECTION_CONF, verbose=False)[0]
        detections = sv.Detections.from_ultralytics(results)
        return [
            f"{category_dict[class_id]}:{confidence:.2f}"
            for class_id, confidence in zip(detections.class_id, detections.confidence)
        ]

    def label_batch_images(self, pil_images, single_label_per_image=False):
        """
        Label a batch of images using the YOLO model.
        Args:
            pil_images (list of PIL.Image): List of images to label.
            single_label_per_image (bool): If True, return only the first label for each image
                (the bare class name), or " " when that detection's confidence is below
                ``SINGLE_LABEL_MIN_CONF``.
        Returns:
            list of str: One label string per image. Without ``single_label_per_image`` the
            string is a ", "-separated list of "<class name>:<confidence>" entries. An image
            with no detections is labelled " ".
        """
        cv2_images = [self._pil_to_bgr(pil_img) for pil_img in pil_images]
        if not cv2_images:
            # Nothing to run inference on.
            return []

        results = self.model(source=cv2_images, conf=self.DETECTION_CONF, verbose=False)
        labels = []
        for result in results:
            detections = sv.Detections.from_ultralytics(result)
            if single_label_per_image and len(detections.class_id) > 0:
                # Only the first detection is considered.
                # NOTE(review): presumably detections arrive sorted by confidence - confirm.
                if detections.confidence[0] < self.SINGLE_LABEL_MIN_CONF:
                    internal_labels = " "
                else:
                    internal_labels = f"{category_dict[detections.class_id[0]]}"
            else:
                internal_labels = ", ".join(
                    f"{category_dict[class_id]}:{confidence:.2f}"
                    for class_id, confidence in zip(detections.class_id, detections.confidence)
                )
                if not internal_labels:
                    # Placeholder so every image still gets a non-empty label string.
                    internal_labels = " "
            labels.append(internal_labels)
        return labels

  import pynvml  # type: ignore[import]


In [2]:
import shutil
import uuid
import tempfile
from PIL import Image
from rembg import remove
import io
import time
import json
import random

def download_image(url, save_path=None, max_retries=3):
    """Download an image from a URL and return it as a PIL Image, with retry logic.

    Args:
        url (str): Address of the image.
        save_path (str | None): If given, the decoded image is also written to this path.
            When the target format cannot store the image's mode (e.g. an RGBA or
            palette image saved under a ``.jpg`` name) an RGB copy is written instead.
        max_retries (int): Maximum number of attempts.
    Returns:
        PIL.Image.Image | None: The decoded image, or None if every attempt failed or
        the server answered with a status other than 200/403.

    HTTP 403 responses and exceptions are retried with exponential backoff
    (1s, 2s, 4s, ...); any other non-200 status aborts immediately.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': 'https://www.autoevolution.com/',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'max-age=0',
    }

    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=15)
            if response.status_code == 200:
                img = Image.open(io.BytesIO(response.content))
                # Image.open is lazy: decode now so a corrupt payload fails here,
                # inside the retry loop, instead of later in the pipeline.
                img.load()
                if save_path:
                    try:
                        img.save(save_path)
                    except OSError:
                        # The target format cannot store this mode (e.g. RGBA/P as JPEG):
                        # write an RGB copy rather than treating a good download as failed.
                        img.convert("RGB").save(save_path)
                return img
            elif response.status_code == 403:
                print(f"Access forbidden (403) for {url}, retry {attempt+1}/{max_retries}")
            else:
                print(f"Failed to download image, status code: {response.status_code}")
                break  # Don't retry for non-403 errors
        except Exception as e:
            print(f"Error downloading image from {url}: {e}")

        # Exponential backoff, but do not wait after the final attempt
        if attempt < max_retries - 1:
            time.sleep(2 ** attempt)
    return None

def remove_background_and_rescale(img, target_size=512):
    """Remove the background and fit the remaining object into a square canvas.

    Args:
        img (PIL.Image): Input image, passed to rembg's ``remove``.
        target_size (int): Side length in pixels of the square output canvas.
    Returns:
        PIL.Image.Image | None: An RGBA image of size ``(target_size, target_size)`` with
        the object scaled (aspect ratio preserved) and centred on a fully transparent
        background, or None when no pixel has alpha > 0.
    """
    # Remove background
    img_no_bg = remove(img)

    # Get alpha channel for object dimensions
    alpha = np.array(img_no_bg.getchannel('A'))
    # Find object bounding box
    coords = np.argwhere(alpha > 0)
    if len(coords) == 0:  # No foreground object found
        return None

    y_min, x_min = coords.min(axis=0)
    y_max, x_max = coords.max(axis=0)

    # Crop to object bounding box. PIL crop boxes exclude the right/lower edge,
    # so add 1 to keep the last foreground row/column (and to avoid a zero-sized
    # crop when the object is a single pixel wide or tall).
    cropped = img_no_bg.crop((int(x_min), int(y_min), int(x_max) + 1, int(y_max) + 1))

    # Calculate scaling factor to fit within target size
    width, height = cropped.size
    scale_factor = min(target_size / width, target_size / height)

    # Resize while maintaining aspect ratio; clamp to 1px so extreme aspect
    # ratios cannot truncate a dimension to zero.
    new_width = max(1, int(width * scale_factor))
    new_height = max(1, int(height * scale_factor))
    resized = cropped.resize((new_width, new_height), Image.LANCZOS)

    # Create new image with a fully transparent background of target size
    final_img = Image.new('RGBA', (target_size, target_size), (0, 0, 0, 0))

    # Paste resized image in center, using its own alpha channel as the mask
    paste_x = (target_size - new_width) // 2
    paste_y = (target_size - new_height) // 2
    final_img.paste(resized, (paste_x, paste_y), resized)

    return final_img

def _split_image_urls(url_list):
    """Split a comma-separated URL string into stripped, non-empty URLs ([] for non-strings)."""
    if not isinstance(url_list, str):
        # e.g. NaN coming from a missing value in the source table
        return []
    return [url.strip() for url in url_list.split(',') if url.strip()]


def _safe_filename_part(value):
    """Turn a metadata value into a file-name fragment (spaces and path separators -> '_')."""
    return str(value).replace(' ', '_').replace('/', '_').replace('\\', '_')


def _append_metadata_rows(metadata_file, rows):
    """Append ``rows`` (non-empty list of dicts) to the CSV, writing a header if the file is empty."""
    with open(metadata_file, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=rows[0].keys())
        if os.stat(metadata_file).st_size == 0:  # File is empty, write header
            writer.writeheader()
        writer.writerows(rows)


def process_image_batch(image_urls, metadata_rows, yolo_helper, temp_dir, output_dir, metadata_file, gpu_batch_size=64, metadata_row_file_limit=25):
    """Process all available images from each URL list, using batching for YOLO inference.

    Args:
        image_urls (list of str): One comma-separated URL string per metadata row.
        metadata_rows (list of dict): Metadata for each row; 'brand', 'model' and
            'from_year' are used to build output file names.
        yolo_helper: Object exposing ``label_batch_images(images, single_label_per_image=True)``.
        temp_dir (str): Directory for temporary downloads (created if missing).
        output_dir (str): Directory for processed PNG files (created if missing).
        metadata_file (str): CSV file that metadata of saved images is appended to.
        gpu_batch_size (int): Number of images per YOLO inference call.
        metadata_row_file_limit (int): Maximum successful downloads per metadata row.
    Returns:
        None. Images labelled 'car' or 'truck' are background-stripped, saved to
        ``output_dir`` and recorded in ``metadata_file``.

    Metadata for images that were already saved is written even if processing is
    interrupted, so saved files are not left without a metadata row.
    """
    os.makedirs(temp_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)

    # Step 1: Download all available images from all URL lists
    all_images = []  # List to store all downloaded images
    all_temp_paths = []  # List to store all temporary file paths
    all_metadata_indices = []  # List to store metadata row indices for each image
    all_original_urls = []  # List to store the original URL for each image

    print(f"Starting to download images from {len(image_urls)} metadata rows")
    for i, (url_list, _metadata) in enumerate(zip(image_urls, metadata_rows)):
        urls = _split_image_urls(url_list)
        random.shuffle(urls)  # Randomize to distribute load

        # Try to download each image in the URL list
        successful_downloads = 0

        for j, url in enumerate(urls):
            if successful_downloads >= metadata_row_file_limit:  # Cap successful downloads per metadata row
                break

            temp_path = os.path.join(temp_dir, f"temp_{i}_{j}_{uuid.uuid4().hex[:6]}.jpg")
            img = download_image(url, temp_path)

            if img is not None:
                all_images.append(img)
                all_temp_paths.append(temp_path)
                all_metadata_indices.append(i)
                all_original_urls.append(url)
                successful_downloads += 1
            elif os.path.exists(temp_path):
                # Remove failed temp file if it exists
                os.remove(temp_path)

            # Small delay between downloads from the same metadata row
            time.sleep(0.5)

        # Report progress for this metadata row
        if successful_downloads > 0:
            print(f"Row {i}: Downloaded {successful_downloads} images successfully")
        else:
            print(f"Row {i}: Failed to download any images")

        # Add a small delay between processing different metadata rows
        time.sleep(1)

    if not all_images:
        print("No images were successfully downloaded in this batch")
        return

    print(f"Successfully downloaded {len(all_images)} images in total")

    # Step 2: Process downloaded images in batches with YOLO
    car_metadata = []  # List to store metadata for car images

    try:
        # Process images in batches to avoid GPU memory issues
        for batch_start in range(0, len(all_images), gpu_batch_size):
            batch_end = min(batch_start + gpu_batch_size, len(all_images))
            batch_images = all_images[batch_start:batch_end]
            batch_indices = all_metadata_indices[batch_start:batch_end]
            batch_urls = all_original_urls[batch_start:batch_end]
            batch_paths = all_temp_paths[batch_start:batch_end]

            print(f"Processing YOLO batch {batch_start//gpu_batch_size + 1}/{(len(all_images)+gpu_batch_size-1)//gpu_batch_size} with {len(batch_images)} images")

            # Detect cars in this batch
            labels = yolo_helper.label_batch_images(batch_images, single_label_per_image=True)

            # Process each image in the batch based on its label
            for img, label, metadata_idx, url in zip(batch_images, labels, batch_indices, batch_urls):
                if 'car' in label.lower() or 'truck' in label.lower():
                    print(f"Processing image with label: {label}")
                    # Remove background and rescale
                    processed_img = remove_background_and_rescale(img)
                    if processed_img is None:
                        print("  Failed to remove background or no foreground detected")
                        continue

                    # Generate unique filename; brand/model are sanitised so that a
                    # path separator in the metadata cannot break the output path
                    unique_id = str(uuid.uuid4())[:8]
                    brand = _safe_filename_part(metadata_rows[metadata_idx]['brand'])
                    model = _safe_filename_part(metadata_rows[metadata_idx]['model'])
                    year = metadata_rows[metadata_idx]['from_year']

                    filename = f"{brand}_{model}_{year}_{unique_id}.png"
                    output_path = os.path.join(output_dir, filename)

                    # Save processed image
                    processed_img.save(output_path)

                    # Add metadata
                    metadata_entry = metadata_rows[metadata_idx].copy()
                    metadata_entry['original_url'] = url
                    metadata_entry['saved_filename'] = filename
                    metadata_entry['detection_label'] = label
                    car_metadata.append(metadata_entry)

                    print(f"Saved car image: {filename}")
                else:
                    print(f"Skipping image with label: {label}")

            # Clean up temporary files for this batch
            for path in batch_paths:
                if os.path.exists(path):
                    os.remove(path)
    finally:
        # Also runs on errors / KeyboardInterrupt: drop temp files of batches that
        # were never reached and persist metadata for images already saved.
        for path in all_temp_paths:
            if os.path.exists(path):
                os.remove(path)

        # Step 3: Save metadata to CSV
        if car_metadata:
            _append_metadata_rows(metadata_file, car_metadata)
            print(f"Saved metadata for {len(car_metadata)} car images")

    if not car_metadata:
        print("No car images were found in the downloaded images")

    print(f"Batch processing complete. Processed {len(all_images)} images, found {len(car_metadata)} cars.")

In [3]:
# Reuse `data` when an earlier cell already defined it; otherwise read it from disk
if 'data' not in globals():
    data = pd.read_csv('./output/data_full.csv')

# Columns copied into every metadata record (only those actually present in `data`)
fields = ['brand', 'model', 'from_year', 'to_year', 'body_style', 'segment', 'title', 'description', 'model_url']
metadata_fields = {}
for field in fields:
    if field in data.columns:
        metadata_fields[field] = data[field].tolist()

# One metadata dictionary per row of `data`, keys in `fields` order
metadata_rows = [
    {field: metadata_fields[field][row_idx] for field in fields if field in metadata_fields}
    for row_idx in range(len(data))
]

# Comma-separated URL string for each row
image_urls = data['image_urls'].tolist()

# Detector used to decide which downloads are kept
yolo_helper = YOLO_helper()

# Working locations
temp_dir = './temp_downloads'
output_dir = './car_images'
metadata_file = './car_images_metadata.csv'

for directory in (temp_dir, output_dir):
    os.makedirs(directory, exist_ok=True)

# Make sure the metadata file exists (empty) so later appends can detect the missing header
if not os.path.exists(metadata_file):
    open(metadata_file, 'w', newline='').close()

# Batch sizes
index_batch_size = 5  # metadata rows handled per call
gpu_batch_size = 32   # images per YOLO inference call
total_images = len(image_urls)

# When True, stop after the first metadata batch
test_batch = True

for start_idx in range(0, total_images, index_batch_size):
    end_idx = min(start_idx + index_batch_size, total_images)
    print(f"Processing metadata batch {start_idx//index_batch_size + 1}/{(total_images+index_batch_size-1)//index_batch_size} (rows {start_idx}-{end_idx-1})")

    batch_urls = image_urls[start_idx:end_idx]
    batch_metadata = metadata_rows[start_idx:end_idx]
    process_image_batch(batch_urls, batch_metadata, yolo_helper, temp_dir, output_dir, metadata_file, gpu_batch_size)

    if test_batch:
        print("Test batch completed. Set test_batch = False to process all images.")
        break

    # Pause between batches to avoid overwhelming servers
    time.sleep(5)

# Remove the temporary download directory once finished
if os.path.exists(temp_dir):
    shutil.rmtree(temp_dir)

print(f"Processing complete. Car images saved to {output_dir} with metadata in {metadata_file}")

Processing metadata batch 1/763 (rows 0-4)
Starting to download images from 5 metadata rows
Row 0: Downloaded 24 images successfully
Row 1: Downloaded 25 images successfully
Row 2: Downloaded 25 images successfully
Row 3: Downloaded 25 images successfully
Row 4: Downloaded 25 images successfully
Successfully downloaded 124 images in total
Processing YOLO batch 1/4 with 32 images
Skipping image with label:  
Skipping image with label:  
Processing image with label: car
Saved car image: AC_AC__428_Convertible_1966_b0ea1119.png
Skipping image with label:  
Processing image with label: truck
Saved car image: AC_AC__428_Convertible_1966_528631a6.png
Skipping image with label:  
Skipping image with label:  
Processing image with label: car
Saved car image: AC_AC__428_Convertible_1966_bba604d6.png
Skipping image with label:  
Skipping image with label:  
Skipping image with label: motorcycle
Skipping image with label:  
Skipping image with label:  
Skipping image with label:  
Processing imag

KeyboardInterrupt: 

In [None]:
# Set to False to process all remaining images
test_batch = False

if not test_batch:
    # Continue processing from where we left off.
    # A row counts as processed when at least one of its URLs appears in the
    # metadata file, i.e. at least one image from that row was saved.
    processed_count = 0
    processed_indexes = set()
    
    if os.path.exists(metadata_file) and os.path.getsize(metadata_file) > 0:
        try:
            # Load the metadata to see what we've already processed
            processed_df = pd.read_csv(metadata_file)

            # Index every known URL once (URL -> rows listing it) so each processed
            # URL is an exact-match lookup instead of a substring scan over all rows.
            url_to_rows = {}
            for idx, url_list in enumerate(image_urls):
                if not isinstance(url_list, str):
                    continue  # missing URL list (e.g. NaN)
                for candidate in url_list.split(','):
                    url_to_rows.setdefault(candidate.strip(), set()).add(idx)

            for url in processed_df['original_url'].dropna():
                processed_indexes.update(url_to_rows.get(url, ()))
            processed_count = len(processed_indexes)
            print(f"Found {processed_count} already processed items")
        except Exception as e:
            print(f"Error reading metadata file: {e}")
            processed_count = 0
    
    # Process in batches, skipping already processed ones
    index_batch_size = 10   # Keep small batch size for metadata rows
    gpu_batch_size = 64    # GPU batch size for YOLO processing
    
    # Create a list of indices to process (skip already processed ones)
    indices_to_process = [i for i in range(total_images) if i not in processed_indexes]
    print(f"Processing {len(indices_to_process)} remaining items")
    
    # Process in random order to distribute across different model types
    random.shuffle(indices_to_process)
    
    for batch_start in range(0, len(indices_to_process), index_batch_size):
        batch_indices = indices_to_process[batch_start:batch_start+index_batch_size]
        print(f"Processing batch {batch_start//index_batch_size + 1}/{(len(indices_to_process)+index_batch_size-1)//index_batch_size}")
        
        batch_urls = [image_urls[i] for i in batch_indices]
        batch_metadata = [metadata_rows[i] for i in batch_indices]
        
        process_image_batch(batch_urls, batch_metadata, yolo_helper, temp_dir, output_dir, metadata_file, gpu_batch_size)
        
        # Add a delay between batches to avoid overwhelming servers
        time.sleep(5)
    
    # Clean up temp directory when done
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
    
    print(f"Processing complete. Car images saved to {output_dir} with metadata in {metadata_file}")
else:
    print("Set test_batch = False to process all remaining images.")

In [None]:
# Check results
# Imported up front so `display` is bound before its first use below.
from IPython.display import display, Image as IPImage

if not os.path.exists(metadata_file):
    print("No metadata file found. Run the processing cells first.")
elif os.path.getsize(metadata_file) == 0:
    # The file is created empty before processing starts; pd.read_csv would
    # raise EmptyDataError on it, so report the situation instead.
    print("Metadata file is empty: no car images have been saved yet.")
else:
    car_metadata_df = pd.read_csv(metadata_file)
    print(f"Total car images saved: {len(car_metadata_df)}")
    print("\nSample metadata:")
    display(car_metadata_df.head())
    
    # Display a few sample images
    if len(car_metadata_df) > 0:
        print("\nSample images:")
        sample_files = car_metadata_df['saved_filename'].sample(min(5, len(car_metadata_df))).tolist()
        for file in sample_files:
            img_path = os.path.join(output_dir, file)
            # Skip entries whose image file is no longer on disk
            if os.path.exists(img_path):
                display(IPImage(filename=img_path, width=300))