The following Python script queries Wikidata for information about cars, downloads their images, and extracts metadata, including image size, format, orientation, creation date, and EXIF data. The script handles potential errors during image processing and saves the metadata for the downloaded images in a JSON file. The goal is to create a comprehensive dataset of car images with associated metadata for further analysis or use.

In [1]:

import os
from SPARQLWrapper import SPARQLWrapper, JSON
from PIL import Image
import json
import urllib
import time
import re
from PIL.ExifTags import TAGS

endpoint_url = "https://query.wikidata.org/sparql"

query = """SELECT DISTINCT ?car ?carLabel ?image {
  ?car wdt:P31 wd:Q1420;
       wdt:P18 ?image.
 SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
LIMIT 10"""

def get_results(endpoint_url, query):
    user_agent = "me/1.0 (me@email.com)"
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()

def download_image_metadata(images_folder, data, limit=10):
    # Ensure the images folder exists
    if not os.path.exists(images_folder):
        os.makedirs(images_folder)

    # List to accumulate metadata for all images
    all_metadata = []

    for i, result in enumerate(data["results"]["bindings"]):
        if i >= limit:
            break

        car_label = result.get("carLabel", {}).get("value", f"UnknownCar_{i + 1}")
        # Remove special characters from the car label for file path
        car_label_cleaned = re.sub(r'\W+', '', car_label)
        image_url = result.get("image", {}).get("value", "")

        image_name = f"{car_label_cleaned}_image_{i + 1}.jpg"
        image_path = os.path.join(images_folder, image_name)

        try:
            # Download the image from Wikidata
            urllib.request.urlretrieve(image_url, image_path)

            print(f"Downloaded: {image_name}")

            # Get metadata and accumulate in the list
            metadata = get_metadata(image_path)
            all_metadata.append(metadata)

            # Introduce a delay between requests to comply with rate limits
            time.sleep(1)

        except Exception as e:
            print(f"Error processing image {image_name}: {e}")

    # Save all metadata to a single JSON file
    json_file_path = os.path.join(json_folder,"metadata.json")
    with open(json_file_path, "w") as json_file:
        json.dump(all_metadata, json_file, indent=4)

    print(f"All metadata saved to: {json_file_path}")


def get_exif_metadata(exif_data):
    if exif_data:
        exif_metadata = {}
        for tag, value in exif_data.items():
            tag_name = TAGS.get(tag, tag)
            # Exclude problematic values
            if isinstance(value, bytes):
                continue
            # Convert non-serializable types to string
            if isinstance(value, (str, int, float)):
                exif_metadata[tag_name] = value
            else:
                exif_metadata[tag_name] = str(value)
        return exif_metadata
    return {}

def get_metadata(image_path):
    with Image.open(image_path) as img:
        exif_data = img._getexif()  # Get EXIF data
        metadata = {
            "image_name": os.path.basename(image_path),
            "image_size": img.size,
            "image_format": img.format,
            "image_orientation": get_image_orientation(exif_data),
            "creation_date": get_creation_date(exif_data),
            "exif_metadata": get_exif_metadata(exif_data),
        }
        return metadata

def get_image_orientation(exif_data):
    if exif_data:
        orientation = exif_data.get(274)  # 274 corresponds to the 'Orientation' tag
        if orientation is not None:
            return orientation
    return "Unknown"

def get_creation_date(exif_data):
    if exif_data:
        date_time_original = exif_data.get(36867)  # 36867 corresponds to 'DateTimeOriginal' tag
        if date_time_original is not None:
            return date_time_original
    return "Unknown"

if __name__ == "__main__":
    # Specify the folder paths
    images_folder = "*images*"
    json_folder = "*json*"

    # Get results from Wikidata
    results = get_results(endpoint_url, query)

    # Download images and save metadata
    download_image_metadata(images_folder, results, limit=10)



Downloaded: Kharkovchanka_image_1.jpg
Downloaded: Ferrari250GTBoanoEllenacar_image_2.jpg
Downloaded: Fiat770_image_3.jpg
Downloaded: CampagnaTRex_image_4.jpg
Downloaded: CG1200S_image_5.jpg
Downloaded: VAZ2104_image_6.jpg
Downloaded: Fiat1500Lcar_image_7.jpg
Downloaded: Fiat500Giardiniera_image_8.jpg
Downloaded: Ferrari250P_image_9.jpg
Downloaded: GeelyKingKong_image_10.jpg


FileNotFoundError: [Errno 2] No such file or directory: '*json*/metadata.json'

Now, we are going to use the following script to determine the 3 predominant colors in each image by the k-means clustering algorithm.
So, we will use the `k-means` algorithm to determine the 3 predominant colors in each image. The `k-means` algorithm is a popular unsupervised learning algorithm that clusters data points into `k` groups based on their features. In this case, we will use the RGB values of the pixels in the image to cluster them into 3 groups, which will represent the 3 predominant colors in the image. We will then save the predominant colors and their frequencies in a JSON file for further analysis or use.

In [3]:
import pandas as pd
from sklearn.cluster import KMeans
from PIL import Image
import os
import json
from collections import Counter

def rgb_to_color_name(rgb_tuple):
    # Define a list of common color names
    color_names = [
        "black", "white", "red", "green", "blue", "yellow",
        "orange", "brown", "pink", "purple", "gray"
    ]

    # Calculate the Euclidean distance to find the closest color
    distances = [sum((rgb[i] - rgb_tuple[i]) ** 2 for i in range(3)) for rgb in color_list]
    closest_color_index = distances.index(min(distances))

    return color_names[closest_color_index]

def extract_predominant_colors(image_path, num_colors=3):
    with Image.open(image_path) as img:
        # Convert image to numpy array
        image_array = img.convert("RGB")
        image_array = np.array(image_array)
        # Flatten the array to apply K-Means clustering
        pixels = image_array.reshape((-1, 3))

        # Use K-Means clustering to find predominant colors
        kmeans = KMeans(n_clusters=num_colors, random_state=42)
        kmeans.fit(pixels)
        # Get cluster centers representing predominant colors
        predominant_colors = kmeans.cluster_centers_.astype(int)

        # Convert RGB values to color names
        color_names = [rgb_to_color_name(tuple(color)) for color in predominant_colors]

        # Count the frequency of each predominant color
        color_counts = Counter(color_names)

        # Create a dictionary with color names and their frequencies
        result_dict = dict(color_counts)

        return result_dict

def process_images(images_folder, output_json_path, num_colors=3):
    # Get the list of image files in the specified folder
    image_files = [f for f in os.listdir(images_folder) if f.endswith(('.jpg', '.jpeg', '.png'))]

    # Dictionary to store predominant colors for each image
    predominant_colors_dict = {}

    for image_file in image_files:
        image_path = os.path.join(images_folder, image_file)
        # Extract predominant colors for each image
        predominant_colors = extract_predominant_colors(image_path, num_colors)
        # Add to the dictionary
        predominant_colors_dict[image_file] = predominant_colors

    # Create a DataFrame from the dictionary
    df = pd.DataFrame.from_dict(predominant_colors_dict, orient='index')

    # Save the DataFrame to a JSON file
    df.to_json(output_json_path, orient='index', indent=4)

if __name__ == "__main__":
    # Specify the folder path containing images
    images_folder = "*images*"

    # Specify the output JSON file path
    output_json_path = "./*json*/predominant_colors.json"

    # Specify the number of predominant colors to extract (default: 3)
    num_colors = 3

    # Process images and save the result to a JSON file
    process_images(images_folder, output_json_path, num_colors)


NameError: name 'color_list' is not defined

After that, we will ask the user to select some images and add tags. For every user, we are now ready to build a user-preference profile, based on this selection. We may collect the following information manually, but the objective of this task is to obtain them using the selected images in an automated manner.
The following information will be collected:
    - Favorite colors
    - Favorite image orientation
    - Favorite image sizes (thumbnail images, large images, medium-size images, etc.)
    - Favorite tags


In [7]:
# Prompt the user to input their favorite colors
favorite_colors = input("Enter your favorite colors (separated by commas): ").split(",")

# Prompt the user to input their favorite image orientation
favorite_orientation = input("Enter your favorite image orientation: ")

# Prompt the user to input their favorite image sizes
favorite_sizes = input("Enter your favorite image sizes (separated by commas): ").split(",")

# Prompt the user to input their favorite tags
favorite_tags = input("Enter your favorite tags (separated by commas): ").split(",")

# Build the user-preference profile
user_preference_profile = {
    "favorite_colors": favorite_colors,
    "favorite_orientation": favorite_orientation,
    "favorite_sizes": favorite_sizes,
    "favorite_tags": favorite_tags
}


Now, we are going to analyze the user-preference profile and the car images to determine the best car images for the user based on their preferences. We will use the following criteria to determine the best car images for the user:
    - The predominant colors in the image match the user's favorite colors.
    - The image orientation matches the user's favorite image orientation.
    - The image size matches the user's favorite image sizes.
    - The tags associated with the image match the user's favorite tags.

NameError: name 'convert_color_to_rgb' is not defined