# Metal enjoyer

## Part 1: Data collection

This code retrieves information about music albums from Wikidata using a SPARQL query (falling back to MusicBrainz when the cover image or the track count is missing), downloads their cover images, extracts EXIF metadata and the dominant color from each image, and stores all the information in a JSON file for further analysis.

In [1]:
# Installation
!pip install sparqlwrapper
!pip install ipywidgets
!pip install numpy
!pip install scikit-learn
!pip install exifread



In [3]:
import requests
import os
import sys
import shutil
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plot
import matplotlib.colors as mcolors

from time import sleep
from PIL import Image
from PIL.ExifTags import TAGS
from sklearn.cluster import KMeans


# User-Agent header sent with the HTTP requests made in this notebook so that
# the remote APIs can identify the project.
HEADERS = {
    "User-Agent": "EngineeringStudend_DataMining_Project/0.0.1 (antoine.perrono@cpe.fr)"
}

# URL of the Wikidata SPARQL endpoint.
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# SPARQL query: returns at most 100 distinct rows describing albums
# (instances of wd:Q482994) whose genre is one of the five items listed in
# VALUES and whose ?date falls in a year strictly greater than 2010.
# The image (P18) and the track count (P2635) are optional; labels are
# requested in English through the wikibase:label service.
# NOTE: the "#" comments inside the string are SPARQL comments and are sent to
# the endpoint verbatim as part of the query text.
SPARQL_QUERY = """
SELECT DISTINCT ?album ?albumLabel ?image ?genreLabel ?bandLabel ?date ?nbTracks
WHERE {
  ?album wdt:P31 wd:Q482994;     # album
         wdt:P136 ?genre;        # Genre musical
         wdt:P175 ?band;
         wdt:P577 ?date.
  OPTIONAL { ?album wdt:P18 ?image }          # img
  OPTIONAL { ?album wdt:P2635 ?nbTracks }     # Nombre de pistes
  VALUES ?genre { wd:Q183862 wd:Q377910 wd:Q542703 wd:Q484344 wd:Q475221 }  # Genre 
  FILTER(YEAR(?date) > 2010)
         
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
LIMIT 100
"""

# Run the SPARQL query against the Wikidata endpoint
def execute_sparql_query(timeout=60):
    """Send ``SPARQL_QUERY`` to ``SPARQL_ENDPOINT`` and return the decoded JSON.

    Args:
        timeout: Seconds to wait for the endpoint before giving up. Without a
            timeout ``requests`` waits forever, so a stalled endpoint would
            hang the whole notebook.

    Returns:
        The parsed JSON document returned by the endpoint.

    Raises:
        requests.RequestException: On connection problems, timeouts or a
            non-2xx HTTP status.
    """
    response = requests.get(
        SPARQL_ENDPOINT,
        params={"query": SPARQL_QUERY, "format": "json"},
        headers=HEADERS,
        timeout=timeout,
    )
    response.raise_for_status()  # Fail loudly on HTTP errors
    return response.json()

def download_image(url, filename, timeout=30):
    """Download ``url`` to ``filename`` and extract the EXIF data of the result.

    Args:
        url: Address of the image to fetch.
        filename: Path the image is written to.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``(filename, exif_data)`` on success, ``(None, None)`` when the
        download or the write to disk fails. On failure no partial file is
        left behind, so later steps never see a truncated image.
    """
    try:
        # The context manager releases the streamed connection even on error.
        with requests.get(url, stream=True, headers=HEADERS, timeout=timeout) as response:
            response.raise_for_status()

            # Save the image chunk by chunk
            with open(filename, "wb") as file:
                for chunk in response.iter_content(8192):
                    file.write(chunk)
    except (requests.RequestException, OSError):
        # Drop whatever was written so far; a half-downloaded file would
        # otherwise be picked up when the image folder is processed.
        try:
            os.remove(filename)
        except OSError:
            pass
        return None, None

    # Return the path together with the extracted EXIF metadata
    return filename, extract_exif(filename)

def _exif_value_to_json(value):
    """Recursively convert one EXIF value into a JSON-serialisable object."""
    if value is None or isinstance(value, (bool, str)):
        return value
    if isinstance(value, bytes):
        return value.decode(errors="ignore")
    if isinstance(value, dict):
        # Nested IFDs (e.g. GPS data) arrive as dicts keyed by tag id.
        return {str(key): _exif_value_to_json(item) for key, item in value.items()}
    if isinstance(value, (tuple, list)):
        return tuple(_exif_value_to_json(item) for item in value)
    if hasattr(value, "numerator") and hasattr(value, "denominator"):
        # Rationals (and plain ints) become floats; 0-denominators have no value.
        if value.denominator == 0:
            return None
        return float(value.numerator) / float(value.denominator)
    if isinstance(value, float):
        return value
    # Unknown type: fall back to its text form so json.dump never fails.
    return str(value)


def extract_exif(image_path):
    """Extract the EXIF metadata of an image as a JSON-compatible dict.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        A dict that always contains the ``"Favorite"`` and
        ``"DominantColor1"`` defaults, plus one entry per EXIF tag found
        (``StripByteCounts`` is skipped). Reading is best-effort: if the file
        cannot be opened or has no EXIF data, only the defaults are returned.
    """
    exif_data = {"Favorite": "NotFavorite", "DominantColor1": "Undefined"}
    try:
        # The context manager closes the underlying file handle.
        with Image.open(image_path) as image:
            # Not every image class provides _getexif().
            read_exif = getattr(image, "_getexif", None)
            exif = read_exif() if read_exif else None
    except Exception:
        # Deliberate best-effort: a broken image must not stop the import.
        return exif_data

    if not exif:
        return exif_data

    for tag, value in exif.items():
        tag_name = TAGS.get(tag, tag)  # Turn the numeric id into a readable name
        if tag_name == "StripByteCounts":
            continue
        exif_data[tag_name] = _exif_value_to_json(value)

    return exif_data


# Single function fetching both the cover URL and the track count from MusicBrainz
def get_musicbrainz_data(artist, album, timeout=30):
    """Look up an album on MusicBrainz.

    Args:
        artist: Artist name used in the search query.
        album: Album title used in the search query.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        ``(image_url, nb_tracks)``. ``image_url`` is the Cover Art Archive
        "front" URL of the first matching release group; ``nb_tracks`` is the
        number of tracks summed over every medium of that group's first
        release. Either element is ``None`` when it could not be determined;
        a failed track lookup no longer discards the cover URL.
    """
    try:
        # Passing the query through ``params`` URL-encodes it, so names
        # containing "&", "#" or "?" no longer corrupt the request.
        # NOTE: the values are still not escaped for the search syntax itself.
        search = requests.get(
            "https://musicbrainz.org/ws/2/release-group/",
            params={"query": f"artist:{artist} AND release:{album}", "fmt": "json"},
            headers=HEADERS,
            timeout=timeout,
        )
        search.raise_for_status()
        release_groups = search.json().get("release-groups", [])
    except (ValueError, requests.RequestException):
        return None, None  # Search failed or returned invalid JSON

    if not release_groups:
        return None, None  # Nothing found

    best_match = release_groups[0]
    group_id = best_match.get("id")
    if not group_id:
        return None, None

    # Cover image of the release group (existence is checked at download time)
    image_url = f"https://coverartarchive.org/release-group/{group_id}/front"

    # Track count, taken from the first release of the group
    nb_tracks = None
    try:
        release_id = best_match["releases"][0]["id"]
        details = requests.get(
            f"https://musicbrainz.org/ws/2/release/{release_id}",
            params={"inc": "recordings", "fmt": "json"},
            headers=HEADERS,
            timeout=timeout,
        )
        details.raise_for_status()
        media = details.json().get("media", [])
        # Sum over all media so multi-disc releases are counted in full.
        nb_tracks = sum(len(medium.get("tracks", [])) for medium in media) or None
    except (IndexError, KeyError, ValueError, requests.RequestException):
        nb_tracks = None  # Keep the cover URL even if the track lookup fails

    return image_url, nb_tracks

print("Beginning of data importing...")

# Characters replaced by "_" when an album title is turned into a file name:
# spaces (as before) plus path separators and characters that are reserved on
# common file systems. Without this, a title containing "/" pointed into a
# non-existent sub-directory and crashed the whole import.
_UNSAFE_FILENAME_CHARS = str.maketrans({char: "_" for char in ' /\\:*?"<>|'})

# Run the SPARQL query
data = execute_sparql_query()

albums = {}

img_folder = "img"
# Start from an empty image folder: remove it if it exists, then recreate it
if os.path.exists(img_folder):
    shutil.rmtree(img_folder)
os.makedirs(img_folder)

for item in data["results"]["bindings"]:
    album_name = item.get("albumLabel", {}).get("value", "Unknown Album")
    artist_name = item.get("bandLabel", {}).get("value", "Unknown Artist")
    genre = item.get("genreLabel", {}).get("value", "Unknown Genre")
    release_date = item.get("date", {}).get("value", "Unknown Date")
    nb_tracks = item.get("nbTracks", {}).get("value", None)
    image_url = item.get("image", {}).get("value", None)

    # When the image or the track count is missing, ask MusicBrainz
    if not image_url or not nb_tracks:
        mb_image, mb_tracks = get_musicbrainz_data(artist_name, album_name)
        image_url = image_url or mb_image  # Wikidata first, MusicBrainz otherwise
        nb_tracks = nb_tracks or mb_tracks  # Wikidata first, MusicBrainz otherwise

    # Still no image: skip this album
    if not image_url:
        continue

    # The key keeps the "img/<file name>" form that later steps look up.
    image_filename = f"{img_folder}/{album_name.translate(_UNSAFE_FILENAME_CHARS)}.jpg"
    filename, exif = download_image(image_url, image_filename)

    # Record the album, keyed by its image path. Rows sharing the same album
    # title overwrite each other. The record is kept even when the download
    # failed (exif is then None and the defaults are stored).
    albums[image_filename] = {
        "album": album_name,
        "artist": artist_name,
        "genre": genre,
        "release_date": release_date,
        "tracks": nb_tracks if nb_tracks else "Unknown",
        "image": image_filename,
        "img_exif": exif or {"Favorite": "NotFavorite", "DominantColor1": "Undefined"}
    }
    sleep(1)  # Stay polite with the remote servers


# Save the results as JSON
with open("albums.json", "w", encoding="utf-8") as f:
    json.dump(albums, f, indent=4)

print("All data had been imported.")
print(f"{len(albums)} albums traités et sauvegardés.")

print("Dominant color begin ...")

def simplify_color(rgb):
    """Reduce an RGB triple to the name of its strongest channel.

    Args:
        rgb: Iterable of three comparable channel values (red, green, blue).

    Returns:
        ``'red'``, ``'green'`` or ``'blue'`` depending on which channel holds
        the maximum. Ties are resolved in that order, so a grey such as
        ``(10, 10, 10)`` yields ``'red'``. ``'Other'`` is returned only when
        no channel compares equal to the maximum.
    """
    r, g, b = rgb
    peak = max(r, g, b)
    # Checked in priority order: red, then green, then blue.
    for label, channel in (('red', r), ('green', g), ('blue', b)):
        if channel == peak:
            return label
    return 'Other'

# Dominant color
n = 1  # How many dominant colors we want
default_color = (255)  # NOTE: not used below; this is the int 255, not a tuple
image_folder = "./img/"
image_files = os.listdir(image_folder)
json_file = "./albums.json"

# Load the album records once instead of re-reading and rewriting the whole
# JSON file for every image.
with open(json_file, "r", encoding="utf-8") as f:
    data = json.load(f)

for image_file in image_files:
    if not image_file.endswith(('.jpg', '.jpeg')):
        continue

    # Records are keyed by "img/<file name>"; skip files without a record
    # instead of failing with a KeyError.
    img_key = f"img/{image_file}"
    album_record = data.get(img_key)
    if not album_record:
        continue

    image_path = os.path.join(image_folder, image_file)
    try:
        # The context manager closes the file; pixels end up as an (N, 3) array.
        with Image.open(image_path) as img:
            numarray = np.asarray(img.convert("RGB")).reshape(-1, 3)
    except OSError as error:
        # Downloads are saved as .jpg whatever they really contain, so some
        # files may not be readable images.
        print(f"Skipping unreadable image {image_path}: {error}")
        continue

    # Fixed seed so the result is reproducible when n > 1.
    clusters = KMeans(n_clusters=n, n_init=2, random_state=0)
    clusters.fit(numarray)
    colors_list = [tuple(map(int, center)) for center in clusters.cluster_centers_]

    simplified_colors_list = [simplify_color(color) for color in colors_list]  # Simplify colors

    img_exif = album_record["img_exif"]
    img_exif["Favorite"] = "NotFavorite"
    for i, color in enumerate(simplified_colors_list, start=1):
        img_exif[f"DominantColor{i}"] = color

# Write the updated records back in a single pass.
with open(json_file, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=4)


print("Dominant color done")


Beginning of data importing...
All data had been imported.
97 albums traités et sauvegardés.
Dominant color begin ...
Dominant color done


## Part 2: Labeling and annotation


In [4]:
import json
import os
import random
import ipywidgets as widgets
from IPython.display import display
from PIL import Image

# Load the album records from albums.json (the file written in Part 1, where
# each record is stored under its image path).
with open("albums.json", "r", encoding="utf-8") as f:
    albums_data = json.load(f)

# Pick a random selection of albums (20 by default)
def get_random_albums(num=20):
    """Return up to ``num`` distinct album records drawn at random.

    The sample is taken from the values of ``albums_data`` and is capped at
    the number of records available.
    """
    candidates = list(albums_data.values())
    sample_size = min(num, len(albums_data))
    return random.sample(candidates, sample_size)

# Build the widget row for one album
def display_album(album_info):
    """Build an ``HBox`` showing the cover and the details of one album.

    Args:
        album_info: Album record with the keys ``"image"``, ``"album"``,
            ``"artist"``, ``"genre"`` and ``"release_date"``.

    Returns:
        A ``widgets.HBox`` holding the cover image (or a "No Image" label
        when the file is missing) next to the album title, artist, genre
        and release year.
    """
    from html import escape  # Labels come from external data: escape them

    image_path = album_info["image"]

    # Only the first four characters of the date are shown; the
    # "Unknown Date" placeholder would otherwise be rendered as "Unkn".
    year = album_info['release_date'][:4]
    if not year.isdigit():
        year = "Unknown"

    # Text widgets
    album_label = widgets.HTML(f"<b>{escape(album_info['album'])}</b>")
    artist_label = widgets.HTML(f"Artist: {escape(album_info['artist'])}")
    genre_label = widgets.HTML(f"Genre: {escape(album_info['genre'])}")
    year_label = widgets.HTML(f"Year: {escape(year)}")

    # Cover image: read the bytes through a context manager so the file
    # handle is closed immediately.
    if os.path.isfile(image_path):
        with open(image_path, "rb") as image_file:
            image_widget = widgets.Image(value=image_file.read(), format='jpg', width=100, height=100)
    else:
        image_widget = widgets.Label("No Image")

    # Layout: image on the left, details stacked on the right
    album_box = widgets.HBox([
        image_widget,
        widgets.VBox([album_label, artist_label, genre_label, year_label])
    ])

    return album_box

# Render the album browser
def show_albums():
    """Display 20 randomly chosen albums stacked vertically."""
    selection = get_random_albums(20)
    rows = []
    for entry in selection:
        rows.append(display_album(entry))
    display(widgets.VBox(rows))

# Show the albums
show_albums()


VBox(children=(HBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H\x00H\x00\x00\xff…