In [388]:
from openai import OpenAI
from secret import OPENAI_API_KEY
from PIL import Image
import base64
import os
import json
import re
import pandas as pd
import numpy as np
import io
from tabulate import tabulate

In [389]:
client = OpenAI(api_key=OPENAI_API_KEY)
model = "gpt-4o-mini"

In [390]:
def encode_image(image_path, max_size=(512, 512), quality=80):
    image = Image.open(image_path)

    # Redimensionner l'image
    image.thumbnail(max_size)

    # Convertir en bytes avec compression
    buffer = io.BytesIO()
    image.save(buffer, format="JPEG", quality=quality)

    # Encoder en Base64
    encoded_string = base64.b64encode(buffer.getvalue()).decode("utf-8")

    return encoded_string

In [391]:
def send_chat_request(message):
    try:
        response = client.chat.completions.create(
            model=model, messages=message
        )

        result = response.choices[0].message.content.strip()
        result = extract_json(result)

        tokens = response.usage.total_tokens

        return result, tokens


    except Exception as e:
        print(f"Erreur OpenAI : {e}")
        return -1, None

In [392]:
def chat_get_key_words(image_paths):

    # Liste pour chaque image et chaque texte associé
    content_list = []
    for image_path in image_paths:
        base64_image = encode_image(image_path)
        image_name = os.path.basename(image_path)
        content_list.append({
            "type": "text",
            "text": f"""Décris moi l'image avec 5 mots-clés. Les mots-clés doivent en priorité inclure des actions, des objets et un lieu si identifiables.
            Retourne le résultat au format JSON : {{ {image_name} : [mot-clé1, mot-clé2, mot-clé3, mot-clé4, mot-clé5] }}
            """
        })
        content_list.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
        })

    messages = [
        {
            "role": "user",
            "content": content_list
        }
    ]

    return send_chat_request(messages)


In [393]:
def extract_json(response_text):
    """
    Extrait la portion JSON (délimitée par {}) de la réponse textuelle pour seulement avoir le dictionnaire et non le texte généré par l'ia.
    """
    match = re.search(r'\{.*\}', response_text, re.DOTALL)
    if match:
        json_str = match.group()
        try:
            return json.loads(json_str)
        except Exception as e:
            print(f"Erreur lors du chargement du JSON : {e}")
            return None
    else:
        print("Aucun JSON trouvé dans la réponse.")
        return None

In [425]:
def chat_get_categories(keywords_output, cat_list):
    """
    Utilise les mots-clés extraits pour regrouper les images similaires en catégories.
    Les images sont identifiées par leur ordre dans la liste.
    """

    # Préparation d'un prompt détaillé incluant le résultat des mots-clés et l'ordre des images
    prompt = f"""Tu es un agent intelligent spécialisé dans le tri et l'organisation d'images provenant d'une galerie d'un téléphone portable.
            L'objectif est de **classer un maximum de photos dans des catégories appropriées** afin de faciliter la gestion de la galerie.

            1. Regroupe les images en fonction de l'action, évènement ou de l'activité qu'elles représentent.
            2. Chaque catégorie est définie par un seul mot-clé descriptif.
            3. Une image ne peut appartenir qu'à une seule catégorie.
            4. Priorise les catégories existantes : {cat_list}. Si une image correspond à l'une d'elles, classe-la dedans.
            5. Si aucune catégorie existante ne convient, crée une nouvelle catégorie proche d’une activité de voyage ou une catégorie plus générique (ex: "Nature", "Repas", "Loisirs")
            6. Réduis autant que possible la catégorie "Autres". N’y mets une image que si elle est vraiment impossible à classer ailleurs.

            - Listes de mots-clés détectés pour chaque image (dans l'ordre) : {keywords_output}

            Retourne le résultat au format JSON : {{ "categorie1": [ "name", "name" ], "categorie2": [ "name", "name" ],...}}"""

    messages = [
        {
            "role": "user",
            "content": prompt
        }
    ]

    return send_chat_request(messages)

In [395]:
directory = "photos_victor"
allowed_extensions = {".jpg", ".jpeg", ".png"}
image_paths = [
    os.path.join(directory, filename)
    for filename in os.listdir(directory)
    if os.path.splitext(filename)[1].lower() in allowed_extensions
]

## Ajout des données au DataFrame

In [396]:
def create_df(image_paths):
    image_list = []
    for path in image_paths:
        image = Image.open(path)
        image_name = os.path.basename(path)
        exifdata = image._getexif()
        date_time, localisation = None, None
        if exifdata:
            for tag_id, value in exifdata.items():
                tag = Image.ExifTags.TAGS.get(tag_id, tag_id)
                if tag == "DateTime":
                    date_time = value
                elif tag == "GPSInfo":
                    localisation = value

            image_list.append((image_name, path, date_time, localisation))

        else:
            print("Aucune donnée EXIF trouvée.")

    df = pd.DataFrame(image_list, columns=["image_name", "path", "date_time", "localisation"])
    df["keywords"] = ""
    df["categories"] = ""

    return df

In [397]:
def add_keywords_to_df(image_data, keywords_output):
    if keywords_output:
        # Mise à jour uniquement pour les images présentes dans keywords_output
        image_data["keywords"] = image_data.apply(
            lambda row: keywords_output[row["image_name"]]
            if row["image_name"] in keywords_output else row["keywords"], axis=1
        )
    else:
        print("Aucun mot clé fourni ! ")
    return image_data


In [398]:
def add_categories_to_df(image_data, categories_output):
    if categories_output:
        #Inversion du dict : on associe une categorie a chaque image
        image_to_categories = {img: cat for cat, images in categories_output.items() for img in images}

        image_data.loc[image_data["image_name"].isin(image_to_categories.keys()), "categories"] = image_data["image_name"].map(image_to_categories)
    else:
        print("Aucune catégorisation trouvée !")

    return image_data

In [399]:
def get_missing_values(dictionnary):
    missing_values = {}
    for key, value in dictionnary.items():
        if value is None or value == "" or value == "nan" or value == "None":
            missing_values.update({key: value})

    return missing_values

In [400]:
def get_missing_path(paths, dictionnary):
    missing_paths = []
    for path in paths:
        image_name = os.path.split(path)[-1]
        #print(f"image_name : {image_name}")
        if image_name in dictionnary.keys() or path in dictionnary.keys():
            missing_paths.append(path)

    return missing_paths

In [401]:
def get_missing_values_path(paths, dictionnary):
    missing_values = get_missing_values(dictionnary)
    missing_paths = get_missing_path(paths, missing_values)
    print(f"Images détectées avec valeurs manquantes : {missing_values.keys()}")
    print(f"Chemins renvoyés pour traitement : {missing_paths}")
    return missing_paths

In [402]:
def checking_all_keywords(df):
    path_images_empty = []
    none_possibilities = [None, "", [], "None", ["None"]]
    for row in df.itertuples():
        keywords = row.keywords
        if isinstance(keywords, float) and pd.isna(keywords):
            path_images_empty.append(row.path)

        elif keywords in none_possibilities:
            path_images_empty.append(row.path)

    return path_images_empty

In [403]:
def keywords_call(df, image_paths, limit_size):
    total_keywords_tokens = 0
    for i in range(0, len(image_paths), limit_size):
        interval = [i, min(i + limit_size, len(image_paths))]
        subset_image_paths = image_paths[interval[0]:interval[1]]
        print(f"Image paths : {subset_image_paths}")

        keywords_output, keywords_tokens = chat_get_key_words(subset_image_paths)
        print(f"Keywords : {keywords_output}")
        total_keywords_tokens += keywords_tokens

        df = add_keywords_to_df(df, keywords_output)

        print(f"Total tokens : {total_keywords_tokens}")

    return df, total_keywords_tokens

In [404]:
def pipeline_keywords(image_paths, limit_size=10):
    image_data = create_df(image_paths)
    new_image_paths = image_paths[:]
    total_keywords_tokens = 0

    all_keywords = False
    only_once = False

    while not all_keywords :
        print("Entree dans le while")
        image_data, keywords_tokens = keywords_call(image_data, new_image_paths, limit_size)
        total_keywords_tokens += keywords_tokens

        if only_once:
            new_row = pd.DataFrame([{"image_name" : "IMG_20241228_132157.jpg","path": "photos_victor/IMG_20241228_132157.jpg"}])
            image_data = pd.concat([image_data, new_row], ignore_index=True)
            only_once = False

        new_image_paths = checking_all_keywords(image_data)
        print(f"Images à traiter après le premier passage : {new_image_paths}")

        if not new_image_paths:
            all_keywords = True

    return image_data, total_keywords_tokens

In [405]:
image_data, keywords_tokens = pipeline_keywords(image_paths, 1)

Entree dans le while
Image paths : ['photos_victor\\IMG_20241228_124125_1.jpg']
Keywords : {'IMG_20241228_124125_1.jpg': ['circulation', 'pluie', 'bande cyclable', 'personnes', 'bâtiments']}
Total tokens : 8634
Image paths : ['photos_victor\\IMG_20241228_124128.jpg']
Keywords : {'IMG_20241228_124128.jpg': ['pluie', 'véhicules', 'piste cyclable', 'bâtiments', 'piétons']}
Total tokens : 17266
Image paths : ['photos_victor\\IMG_20241228_124135.jpg']
Keywords : {'IMG_20241228_124135.jpg': ['cycliste', 'voie cyclable', 'pluie', 'bâtiments', 'rue']}
Total tokens : 25895
Image paths : ['photos_victor\\IMG_20241228_124137.jpg']
Keywords : {'IMG_20241228_124137.jpg': ['faire du vélo', 'route', 'pluie', 'bâtiments', 'circulation']}
Total tokens : 34523
Image paths : ['photos_victor\\IMG_20241228_124140.jpg']
Keywords : {'IMG_20241228_124140.jpg': ['vélos', 'cyclistes', 'pluie', 'bâtiments', 'piste cyclable']}
Total tokens : 43154
Image paths : ['photos_victor\\IMG_20241228_124648.jpg']
Keywords 

In [406]:
print(tabulate(image_data, headers="keys", tablefmt="psql"))

+----+--------------------------------+----------------------------------------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------+--------------+
|    | image_name                     | path                                         | date_time           | localisation                                                                                                                                                         | keywords                                                               | categories   |
|----+--------------------------------+----------------------------------------------+---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [426]:
copy_image_data = image_data.copy()

In [427]:
def checking_all_categories(df):
    keywords_empty_categories = {}
    none_possibilities = [None, "", [], "None", ["None"]]
    for row in df.itertuples():
        check = row.categories
        if isinstance(check, float) and pd.isna(check):
            keywords_empty_categories.update({row.image_name: row.keywords})
        elif check in none_possibilities:
            keywords_empty_categories.update({row.image_name: row.keywords})

    return keywords_empty_categories

In [428]:
def get_cat_list(df):
    cat_list = []
    for row in df.itertuples():
        if row.categories not in cat_list:
            cat_list.append(row.categories)

    return cat_list

In [429]:
def categories_call(df, keywords, limit_size, cat_list):
    total_categories_tokens = 0
    for i in range(0, len(keywords), limit_size):
        interval = [i, min(i + limit_size, len(df))]
        subset_keys = list(keywords.keys())[interval[0]:interval[1]]

        subset_keywords = {key: keywords[key] for key in subset_keys}
        #print(f"Keywords : {subset_keywords}")

        categories_output, categories_tokens = chat_get_categories(subset_keywords, cat_list)

        print(f"Categories : {categories_output}")
        total_categories_tokens += categories_tokens

        df = add_categories_to_df(df, categories_output)
        cat_list = get_cat_list(df)
        print(f"Liste des categories : {cat_list}")

    return df, total_categories_tokens

In [432]:
def pipeline_categories(image_data, limit_size=200):
    new_keywords = image_data.set_index("image_name")["keywords"].to_dict()
    #print(new_keywords)
    total_categories_tokens = 0
    cat_list = ["Paysage", "Ville", "Plage", "Randonnée", "Sport", "Musée", "Restaurant"]

    all_categories = False
    only_once = True

    while not all_categories:
        image_data, categories_tokens = categories_call(image_data, new_keywords, limit_size, cat_list)
        total_categories_tokens += categories_tokens

        if only_once:
            image_data.loc[1, "categories"] = None
            only_once = False


        new_keywords = checking_all_categories(image_data)
        print(f"Kewords à repasser après checking : {new_keywords}")
        cat_list = get_cat_list(image_data)

        if not new_keywords:
            all_categories = True


    return image_data, total_categories_tokens

In [433]:
copy_image_data, categories_tokens = pipeline_categories(copy_image_data, 50)

Erreur OpenAI : Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
Categories : -1


TypeError: unsupported operand type(s) for +=: 'int' and 'NoneType'

In [413]:
print(tabulate(copy_image_data, headers="keys", tablefmt="psql"))

+----+--------------------------------+----------------------------------------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------+--------------+
|    | image_name                     | path                                         | date_time           | localisation                                                                                                                                                         | keywords                                                               | categories   |
|----+--------------------------------+----------------------------------------------+---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------