# 1-Import des bibliothèques

In [16]:
import tensorflow as tf
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import collections
import numpy as np
import time
import json
from PIL import Image
from tqdm import tqdm

from tensorflow.keras.preprocessing.text import tokenizer_from_json


# 2-Création d'un encodeur et décodeur avec une architecture identique au livrable 3
Paramètres pour le modèle :

In [4]:
embedding_dim = 256
units = 512 # Taille de la couche cachée dans le RNN
top_k = 5000 # Nombre de mots à utiliser dans le vocabulaire
vocab_size = top_k + 1

# 2.1-Création de l'encodeur

In [5]:
class CNN_Encoder(tf.keras.Model):
    # Comme les images sont déjà prétraités par InceptionV3 est représenté sous forme compacte
    # L'encodeur CNN ne fera que transmettre ces caractéristiques à une couche dense
    def __init__(self, embedding_dim):
        super(CNN_Encoder, self).__init__()
        # forme après fc == (batch_size, 64, embedding_dim)
        self.fc = tf.keras.layers.Dense(embedding_dim)

    def call(self, x):
        x = self.fc(x)
        x = tf.nn.relu(x)
        return x

# 2.2-Création du décodeur

In [6]:
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        # features(CNN_encoder output) forme == (batch_size, 64, embedding_dim)

        # forme de la couche cachée == (batch_size, hidden_size)
        hidden_with_time_axis = tf.expand_dims(hidden, 1)

        attention_hidden_layer = (tf.nn.tanh(self.W1(features) +
                                             self.W2(hidden_with_time_axis)))

        # Cela vous donne un score non normalisé pour chaque caractéristique de l'image.
        score = self.V(attention_hidden_layer)

        attention_weights = tf.nn.softmax(score, axis=1)

        context_vector = attention_weights * features
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights


class RNN_Decoder(tf.keras.Model):
    def __init__(self, embedding_dim, units, vocab_size):
        super(RNN_Decoder, self).__init__()
        self.units = units

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        #Couche dense qui aura pour entrée la sortie du GRU
        self.fc1 = tf.keras.layers.Dense(self.units)
        # Dernière couche dense
        self.fc2 = tf.keras.layers.Dense(vocab_size)

        self.attention = BahdanauAttention(self.units)

    def call(self, x, features, hidden):
        # L'attention est defini par un modèle a part
        context_vector, attention_weights = self.attention(features, hidden)
        # Passage du mot courant à la couche embedding
        x = self.embedding(x)
        # Concaténation
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # Passage du vecteur concaténé à la gru
        output, state = self.gru(x)

        # Couche dense
        y = self.fc1(output)

        y = tf.reshape(y, (-1, x.shape[2]))

        # Couche dense
        y = self.fc2(y)

        return y, state, attention_weights

    def reset_state(self, batch_size):
        return tf.zeros((batch_size, self.units))

In [7]:
# Création de l'encodeur
encoder = CNN_Encoder(embedding_dim)
# Création du décodeur
decoder = RNN_Decoder(embedding_dim, units, vocab_size)

2023-10-25 14:21:17.451166: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-10-25 14:21:17.459974: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-10-25 14:21:17.460008: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-10-25 14:21:17.460828: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-10-25 14:21:17.460861: I tensorflow/compile

# 3-Initialisation du gestionnaire de checkpoints

In [10]:
checkpoint_path = os.path.abspath("checkpoints/")  # Modifiez ce chemin si nécessaire.
ckpt = tf.train.Checkpoint(encoder=encoder,
                           decoder=decoder,
                           optimizer=tf.keras.optimizers.Adam())  # Assurez-vous d'utiliser les mêmes paramètres d'optimiseur que l'original.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)


# 4-Restaurez le dernier checkpoint

In [11]:
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Dernier checkpoint restauré !")

Dernier checkpoint restauré !


# 5-Utilisation du modèle sur une image

In [14]:
def load_and_preprocess_image(image_path):
    """
    Charge et prétraite l'image pour le modèle.
    
    Args:
    - image_path (str): Chemin vers l'image.
    
    Returns:
    - tf.Tensor: Image tensor prétraitée.
    """
    # Chargez l'image
    img = tf.io.read_file(image_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (299, 299))
    img = tf.keras.applications.inception_v3.preprocess_input(img)
    return img

In [15]:
def generate_caption(image_path, encoder, decoder, tokenizer, max_length):
    """
    Génère une légende pour l'image donnée.
    
    Args:
    - image_path (str): Chemin vers l'image.
    - encoder (tf.keras.Model): Le modèle encodeur.
    - decoder (tf.keras.Model): Le modèle décodeur.
    - tokenizer (tf.keras.preprocessing.text.Tokenizer): Le tokenizer utilisé pour le prétraitement des légendes.
    - max_length (int): La longueur maximale d'une légende.
    
    Returns:
    - str: La légende générée pour l'image.
    """

    # Chargez et pré-traitez l'image
    temp_input = tf.expand_dims(load_and_preprocess_image(image_path), 0)
    img_tensor_val = encoder(temp_input)
    img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))

    # Initialisez l'entrée du décodeur avec le jeton de départ
    dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)
    result = []

    # Réinitialisez l'état caché du décodeur
    hidden = decoder.reset_state(batch_size=1)

    for i in range(max_length):
        predictions, hidden = decoder(dec_input, img_tensor_val, hidden)
        predicted_id = tf.random.categorical(predictions, 1)[0][0].numpy()
        result.append(tokenizer.index_word[predicted_id])

        if tokenizer.index_word[predicted_id] == '<end>':
            break

        dec_input = tf.expand_dims([predicted_id], 0)

    caption = ' '.join(result).replace('<start>', '').replace('<end>', '').strip()
    return caption


In [17]:

# Load the JSON string from the file
with open('tokenizer.json', 'r', encoding='utf-8') as f:
    tokenizer_json = json.load(f)

# Recreate the tokenizer from the JSON string
tokenizer = tokenizer_from_json(tokenizer_json)


In [19]:
image_folder = "./photo/Photo/"

image_file = [os.path.join(image_folder, filename) for filename in os.listdir(image_folder) if filename.endswith('.jpg')]

rid = np.random.randint(0, len(image_file))

image_test = image_file[rid]

caption = generate_caption(image_test, 
                           encoder, 
                           decoder, 
                           tokenizer, 
                           47)
print("Légende générée:", caption)

Image.open(image_test)

ValueError: Exception encountered when calling layer 'cnn__encoder' (type CNN_Encoder).

In this `tf.Variable` creation, the initial value's shape ((2048, 256)) is not compatible with the explicitly supplied `shape` argument ((3, 256)).

Call arguments received by layer 'cnn__encoder' (type CNN_Encoder):
  • x=tf.Tensor(shape=(1, 299, 299, 3), dtype=float32)