In [26]:
import os
import time
import json
from PIL import Image
import urllib.request
from SPARQLWrapper import SPARQLWrapper, JSON
from urllib.error import HTTPError

endpoint_url = "https://query.wikidata.org/sparql"
img_data = {}

# Requête pour récupérer les infos de l'entité donnée - Q144/Chien par défaut
def query(wd='Q144', limit=5):
    return f"""SELECT ?item ?itemLabel ?pic
                WHERE {{
                  ?item wdt:P31 wd:{wd}.
                  ?item wdt:P18 ?pic.
                  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
                }}
                LIMIT {limit}"""

def get_results(endpoint_url, query):
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    
    # Boucle de tentative en cas d'erreur 429 Too Many Requests
    for i in range(3):
        try:
            return sparql.query().convert()
        except HTTPError as e:
            if e.response.status_code == 429:
                retry_after = int(e.response.headers.get('Retry-After', '90'))
                print(f"Too Many Requests. Tentative dans {retry_after} secondes.")
                sleep(retry_after)
            else:
                raise e
    
    raise Exception("Nombre maximum de tentatives pour atteindre Wikidata atteint.")

# Pour créer le dossier des images s'il n'existe pas
def createFolder(name='images'):
    if name not in os.listdir():
        os.mkdir(name)
    else:
        print('Dossier '+name+' déjà existant !')

# Pour vérifier si un dossier est vide - évitons de retélécharger des images inutilement.
def isEmpty(name='images'):
    if len(os.listdir(name)) == 0:
        return True
    return False

# Pour ajouter des éléments à la bdd
def addImg(wd, limit):
    results = get_results(endpoint_url, query(wd, limit))
    for result in results['results']['bindings']:
        name_img = result['item']['value'].split("/")[-1] + "-" + wd # nom donné aux images téléchargées "numéro d'image - numéro de catégorie"
        
        #img_data[name_img] = {'image': result['pic']['value']}
        
        img_data[name_img] = {}
        downloadImage(result['pic']['value'], name_img)
        

# Ajoute les images de toutes les entités spécifiées
def addAllImg(wd, limit):
    for w in wd:
        addImg(w, limit)

# Pour télécharger l'image d'un élément de la bdd
def downloadImage(url, name_img='img0'):
    full_path = 'images/' + name_img + '.jpg'
    urllib.request.urlretrieve(url, full_path)
    time.sleep(1)

# Si les images sont déjà téléchargées on initialise le tableau
def initAlreadyDownload(namefolder='images'):
    for image in os.listdir(namefolder):
        #img_data[image.split(".")[0]] = {'image': ""}
        img_data[image.split(".")[0]] = {}

# Pour télécharger toutes les images - limit par query
def downloadAllImages(wd=['Q144'], limit=5):
    createFolder()
    if not isEmpty():
        print('Images déjà téléchargées !')
        initAlreadyDownload()
        return False
    addAllImg(wd, limit)
    return True 



bdd = ['Q144','Q14660','Q23442','Q28803','Q3305213'] #Chien, drapeau, ile, sandwich, peinture,
downloadAllImages(wd=bdd,limit=25)
print(img_data)


{'Q5270723-Q144': {}, 'Q5288274-Q144': {}, 'Q5375926-Q144': {}, 'Q5515388-Q144': {}, 'Q5619202-Q144': {}, 'Q5686986-Q144': {}, 'Q5859102-Q144': {}, 'Q5902587-Q144': {}, 'Q5905086-Q144': {}, 'Q5970618-Q144': {}, 'Q6072863-Q144': {}, 'Q6104495-Q144': {}, 'Q6155312-Q144': {}, 'Q6188949-Q144': {}, 'Q6304312-Q144': {}, 'Q6343845-Q144': {}, 'Q6417173-Q144': {}, 'Q6469343-Q144': {}, 'Q6537509-Q144': {}, 'Q6819212-Q144': {}, 'Q6962578-Q144': {}, 'Q7033137-Q144': {}, 'Q7084299-Q144': {}, 'Q7123658-Q144': {}, 'Q7126106-Q144': {}, 'Q19010-Q14660': {}, 'Q19374-Q14660': {}, 'Q72298-Q14660': {}, 'Q121688-Q14660': {}, 'Q122482-Q14660': {}, 'Q145637-Q14660': {}, 'Q148002-Q14660': {}, 'Q163008-Q14660': {}, 'Q168935-Q14660': {}, 'Q172446-Q14660': {}, 'Q173201-Q14660': {}, 'Q369833-Q14660': {}, 'Q371718-Q14660': {}, 'Q404150-Q14660': {}, 'Q435309-Q14660': {}, 'Q435723-Q14660': {}, 'Q437841-Q14660': {}, 'Q458651-Q14660': {}, 'Q459788-Q14660': {}, 'Q495495-Q14660': {}, 'Q497887-Q14660': {}, 'Q498887-Q14660

In [27]:
import os
import json
from PIL import Image
import urllib.request
from SPARQLWrapper import SPARQLWrapper, JSON
from urllib.error import HTTPError
import time
import shutil
import numpy as np
from sklearn.cluster import MiniBatchKMeans
import webcolors

#Supprimer une image du dossier 
def removeImage(name):
    image_path = "images/" + name
    if os.path.isfile(image_path):
        os.remove(image_path)
        print(f"L'image {name} a été supprimée.")
    else:
        print(f"Impossible de trouver le fichier {image_path}")

#Obtenir les metas d'une image
def getMetaImg(name, imgfile):
    #clés Exif
    DATE_KEY = 36867
    MODEL_KEY = 272
    
    img_data[name].update({
        #'size': [getSizeImg(imgfile.size), imgfile.size],
        'size': getSizeImg(imgfile.size),
        'orientation': getOrientationImg(imgfile.size),
        'format': imgfile.format,
    })
    
    getColorsImg(name, imgfile)
    
    exif_data = imgfile._getexif()
    if exif_data is not None:
        if DATE_KEY in exif_data:
            img_data[name]['date'] = exif_data[DATE_KEY]
        if MODEL_KEY in exif_data:
            img_data[name]['model'] = exif_data[MODEL_KEY]

def getOrientationImg(size): #(largeur,hauteur)
    width,height = size
    return "landscape" if width > height else "portrait" if height > width else "squared"
    
def getSizeImg(size): #(largeur,hauteur)
    width,height = size
    return "large" if width*height>1920*1080 else "small" if width*height<640*480 else "medium"
    
def getColorsImg(name, imgfile, cluster_nbr=2, n_init=1):
    
    img_data[name]['colors'] = []
    
    # Redimensionner l'image pour accélérer le traitement
    imgfile = imgfile.resize((int(imgfile.size[0]/2), int(imgfile.size[1]/2)))

    # Convertir l'image en un tableau numpy
    img_array = np.array(imgfile)
    
    try:
        img_vector = img_array.reshape(-1, 3)

        clusters = MiniBatchKMeans(n_clusters=cluster_nbr, n_init=n_init, random_state=69).fit(img_vector) #Random state pour garder la meme seed ppour toutes les images

        for i, center in enumerate(clusters.cluster_centers_):
            rgb = tuple(map(int, center))
        

            img_data[name]['colors'].append(rgb)
            
    except Exception as e:
        print('Pas de couleur : {}'.format(e))
        #img_data[name]['colors'] = [-1]

def RGBtoName(rgb):
    try:
        # Attempt to find the exact color name
        color_name = webcolors.rgb_to_name(rgb)
    except ValueError:
        # If exact name not found, find the closest color name
        min_difference = None
        closest_color = None
        for name, hex_value in webcolors.CSS3_NAMES_TO_HEX.items():
            r_c, g_c, b_c = webcolors.hex_to_rgb(hex_value)
            difference = sum([(r_c - rgb[0]) ** 2, (g_c - rgb[1]) ** 2, (b_c - rgb[2]) ** 2])
            if min_difference is None or difference < min_difference:
                min_difference = difference
                closest_color = name
        color_name = closest_color
    return color_name


def openImgGetMeta(name): #name = 'img.jpg'
    try:
        with Image.open("images/"+name) as imgfile:
            #print(imgfile.size, imgfile.format)
            name = name.split(".")[0]
            getMetaImg(name, imgfile)

        imgfile.close()
        return True
    except FileNotFoundError as e:
        print('Impossible de trouver le fichier {}: {}'.format("images/"+name, e))
        return False
    except Exception as e:
        print('Erreur lors de l\'ouverture de l\'image {}: {}'.format("images/"+name, e))
        removeImage(name)
        return False
        
#Obtenir les metas de toutes nos images    
def openGetMetaAllImg(namefolder='images'):
    for image in os.listdir(namefolder):
        # Ignore .ipynb_checkpoints and non-image files
        if image.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
            openImgGetMeta(image)


openGetMetaAllImg()


Erreur lors de l'ouverture de l'image images/Q111589124-Q28803: _getexif
Impossible de trouver le fichier images/Q111589124-Q28803
Erreur lors de l'ouverture de l'image images/Q121688-Q14660.jpg: cannot identify image file 'images/Q121688-Q14660.jpg'
L'image Q121688-Q14660.jpg a été supprimée.
Erreur lors de l'ouverture de l'image images/Q122482-Q14660.jpg: cannot identify image file 'images/Q122482-Q14660.jpg'
L'image Q122482-Q14660.jpg a été supprimée.
Erreur lors de l'ouverture de l'image images/Q148002-Q14660.jpg: cannot identify image file 'images/Q148002-Q14660.jpg'
L'image Q148002-Q14660.jpg a été supprimée.
Erreur lors de l'ouverture de l'image images/Q163008-Q14660.jpg: cannot identify image file 'images/Q163008-Q14660.jpg'
L'image Q163008-Q14660.jpg a été supprimée.
Erreur lors de l'ouverture de l'image images/Q168935-Q14660.jpg: cannot identify image file 'images/Q168935-Q14660.jpg'
L'image Q168935-Q14660.jpg a été supprimée.
Erreur lors de l'ouverture de l'image images/Q172

In [30]:
import requests
import json

#Enregistre les metadonnées des images dans un JSON
def exportToJSON(data,name='img_data.json'):
    with open(name, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print("done")
#exportToJSON(data=img_data)

def openJSON(file="img_data.json"):
    # Ouvrir le fichier JSON en mode lecture (fermé automatiquement)
    with open(file, 'r') as f:
        data = json.load(f)
    return data
# Requête API à Wikidata pour récupérer les catégories de l'image
def wikidataAPI(_id,lang):
    url = f"https://www.wikidata.org/w/api.php?action=wbgetentities&ids={_id}&props=labels|claims&languages={lang}&format=json"
    #if lang != "":
    #    #categorie
    #    url = f"https://www.wikidata.org/w/api.php?action=wbgetentities&ids={_id}&props=labels&languages={lang}&format=json"
    #else:
    #    #image
    #    url = f"https://www.wikidata.org/w/api.php?action=wbgetentities&ids={_id}&props=claims&format=json"
    response = requests.get(url)
    return json.loads(response.text)


def findTagsImg(_id, lang='en'): #"Q7704028-Q144"
    
    image_id, cat_id = _id.split("-")
    
    # Requête API à Wikidata pour récupérer les catégories de l'image
    
    img_data = wikidataAPI(image_id,lang)
    cat_data = wikidataAPI(cat_id,lang)
    
    # Extraire le libellé de la catégorie dans la langue souhaitée
    main_tag = cat_data.get('entities', {}).get(cat_id, {}).get('labels', {}).get(lang, {}).get('value', '')
    
    # Trouver les catégories dans la réponse de la requête API sur l'image
    #claims = img_data.get("entities", {}).get(image_id, {}).get("claims", {})
    #print([x['mainsnak']['datavalue']['value'] for prop in ["P910", "P373"] for x in claims.get(prop, [])])
    #categories = formatCategory([x['mainsnak']['datavalue']['value'] for prop in ["P910", "P373"] for x in claims.get(prop, [])])
            
    #if main_tag not in categories:
        #categories.append(main_tag)
        
    categories = [main_tag]
    
    return categories


def formatCategory(categories):
    categories_ = []
    
    removeCar = ['(', ')']
    splitCar = [',', ' ']
    
    for category in categories:
        if isinstance(category, str):
            for car in removeCar:
                category = category.replace(car, '')
            for car in splitCar:
                parts = category.split(car)
                if len(parts) > 1:
                    categories_.extend(parts[1:])
                    category = parts[0]
            categories_.append(category.lower())
    
    return categories_


def addTagsJSON(image_id = "Q7704028", categories=[]):
    data = openJSON(file="img_data.json")
    data[image_id]['tags'] = categories


def addAllTagsJSON():
    for _id in data:
        data[_id]['tags'] = findTagsImg(_id)
    return data

data = openJSON(file="img_data.json")
data = addAllTagsJSON()
exportToJSON(data)


done


In [44]:
class User:
    def __init__(self, favorite_pictures=None):
        self._favorite_colors = []
        self._favorite_orientations = []
        self._favorite_sizes = []
        self._favorite_tags = []
        self._favorite_pictures = [] if favorite_pictures is None else favorite_pictures

    def __str__(self):
        return f"User with favorite color {self._favorite_colors}, orientation {self._favorite_orientations}, size {self._favorite_sizes}, tags {self._favorite_tags} and favorite pictures {self._favorite_pictures}"

    def add_favorite_picture(self, picture):
        self._favorite_pictures.append(picture)
        img_data = data[picture.split(".")[0]]
        self._favorite_colors.append(img_data["colors"])
        self._favorite_orientations.append(img_data["orientation"])
        self._favorite_tags.append(img_data["tags"])
        self._favorite_sizes.append(img_data["size"])

    def remove_favorite_picture(self, picture):
        picture_index = self._favorite_pictures.index(picture)
        self._favorite_pictures.pop(picture_index)
        self._favorite_colors.pop(picture_index)
        self._favorite_orientations.pop(picture_index)
        self._favorite_tags.pop(picture_index)
        self._favorite_sizes.pop(picture_index)

    def get_favorite_pictures(self):
        return self._favorite_pictures

    def get_favorite_tags(self):
        return self._favorite_tags

    def get_favorite_orientations(self):
        return self._favorite_orientations

    def get_favorite_colors(self):
        return self._favorite_colors

    def get_favorite_sizes(self):
        return self._favorite_sizes


In [45]:
# Créer une liste d'utilisateurs vides
number_users = 15
users = [None] * number_users

# Créer un utilisateur avec une liste aléatoire de photos préférées
for i in range(number_users):

    number_pictures_liked = random.randint(10, 30)

    max_pictures_liked = min(len(data), 30)
    if max_pictures_liked == 0:
        print("Il n'y a pas d'images dans le répertoire.")
    else:
        number_pictures_liked = random.randint(1, max_pictures_liked)

    user = User(favorite_pictures=random.sample(list(data.keys()), k=number_pictures_liked))
    users[i] = user
    print(user.get_favorite_pictures())

# Afficher les photos préférées du premier utilisateur
if users[0] is not None:
    print(users[0].get_favorite_pictures())

['Q113945861-Q28803', 'Q7792-Q23442', 'Q9679-Q23442', 'Q6059-Q23442', 'Q5619202-Q144', 'Q404150-Q14660', 'Q4648-Q23442', 'Q636537-Q3305213']
['Q3757-Q23442', 'Q5859102-Q144', 'Q2919135-Q28803', 'Q633587-Q3305213', 'Q630121-Q3305213', 'Q5515388-Q144', 'Q148002-Q14660', 'Q611583-Q3305213', 'Q497887-Q14660', 'Q6962578-Q144', 'Q109353074-Q28803', 'Q641666-Q3305213', 'Q3136-Q23442']
['Q6188949-Q144', 'Q5288274-Q144', 'Q459788-Q14660', 'Q611583-Q3305213', 'Q5859102-Q144', 'Q6059-Q23442', 'Q172446-Q14660', 'Q3049-Q23442', 'Q6537509-Q144', 'Q5736147-Q28803', 'Q371718-Q14660', 'Q614986-Q3305213', 'Q636537-Q3305213', 'Q458651-Q14660', 'Q626352-Q3305213', 'Q3237-Q23442', 'Q6469343-Q144', 'Q498887-Q14660', 'Q6962578-Q144', 'Q6072863-Q144', 'Q3492-Q23442', 'Q630121-Q3305213', 'Q3757-Q23442', 'Q3107-Q23442']
['Q6059-Q23442', 'Q6155312-Q144', 'Q626791-Q3305213', 'Q122482-Q14660', 'Q19010-Q14660', 'Q7084299-Q144', 'Q1462-Q23442', 'Q3136-Q23442', 'Q435723-Q14660', 'Q437841-Q14660', 'Q609986-Q3305213']


In [34]:
import pandas as pd 
dataFrame = pd.read_json('img_data.json').T
dataFrame

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,size,orientation,format,colors,tags,date,model
Q5270723-Q144,small,landscape,JPEG,"[[90, 90, 90], [175, 175, 175]]",[dog],,
Q5288274-Q144,large,landscape,JPEG,"[[40, 38, 26], [215, 214, 212]]",[dog],2018:07:21 12:02:43,LGM-K121L����������������������
Q5375926-Q144,large,portrait,JPEG,"[[43, 40, 41], [157, 145, 134]]",[dog],,
Q5515388-Q144,large,landscape,JPEG,"[[194, 187, 177], [122, 111, 107]]",[dog],2007:11:02 15:17:17,KODAK CX7330 ZOOM DIGITAL CAMERA
Q5619202-Q144,small,portrait,JPEG,"[[177, 177, 177], [72, 72, 72]]",[dog],,
...,...,...,...,...,...,...,...
Q641666-Q3305213,large,portrait,JPEG,"[[40, 33, 27], [181, 166, 146]]",[painting],,
Q642023-Q3305213,medium,landscape,JPEG,"[[87, 71, 51], [183, 164, 138]]",[painting],,
Q642198-Q3305213,large,portrait,JPEG,"[[180, 160, 124], [77, 54, 28]]",[painting],2010:04:07 01:32:24,
Q644106-Q3305213,large,portrait,JPEG,"[[119, 107, 97], [39, 28, 20]]",[painting],,
