In [1]:
import tensorflow as tf
print(tf.config.list_physical_devices('GPU'))

2025-01-12 10:23:49.226500: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736673829.269872    3619 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736673829.282840    3619 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-12 10:23:49.393237: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(f"GPU memory growth setting failed: {e}")

In [3]:
import tensorflow as tf
from tensorflow.keras import Model, layers
from transformers import TFGPT2Model, GPT2Tokenizer
import numpy as np
from PIL import Image
import os
import json
import random
from tqdm import tqdm


class ImageFeatureExtractor(layers.Layer):
    def __init__(self, output_dim):
        super(ImageFeatureExtractor, self).__init__()
        base_model = tf.keras.applications.MobileNetV2(
            include_top=False,
            weights='imagenet',
            input_shape=(224, 224, 3)
        )
        base_model.trainable = False
        self.cnn = base_model
        self.global_pool = layers.GlobalAveragePooling2D()
        self.projection = layers.Dense(output_dim, activation='relu')

    def call(self, images):
        x = self.cnn(images)
        x = self.global_pool(x)
        return self.projection(x)

class ProjectionLayer(layers.Layer):
    def __init__(self, embedding_dim):
        super(ProjectionLayer, self).__init__()
        self.dense = layers.Dense(embedding_dim)

    def call(self, image_features):
        return self.dense(image_features)

class ImageCaptioningModel(Model):
    def __init__(self, max_length=50, vocab_size=50257):
        super(ImageCaptioningModel, self).__init__()
        self.gpt2 = TFGPT2Model.from_pretrained('gpt2')
        self.gpt2.trainable = False
        self.image_encoder = ImageFeatureExtractor(
            output_dim=self.gpt2.config.hidden_size
        )
        self.projection = ProjectionLayer(self.gpt2.config.hidden_size)
        self.output_layer = layers.Dense(vocab_size, activation='softmax')
        self.max_length = max_length

    def call(self, inputs):
        images, text_tokens = inputs
        image_features = self.image_encoder(images)
        projected_features = self.projection(image_features)
        text_outputs = self.gpt2(text_tokens, return_dict=True)
        text_features = text_outputs.last_hidden_state
        expanded_features = tf.expand_dims(projected_features, axis=1)
        expanded_features = tf.tile(
            expanded_features,
            [1, tf.shape(text_features)[1], 1]
        )
        combined_features = text_features + expanded_features
        outputs = self.output_layer(combined_features)
        return outputs


def preprocess_image(image_path):
    """Load and preprocess a single image."""
    img = Image.open(image_path)
    img = img.convert('RGB')
    img = img.resize((224, 224))
    img_array = tf.keras.preprocessing.image.img_to_array(img)
    img_array = tf.keras.applications.mobilenet_v2.preprocess_input(img_array)
    return img_array

def load_coco_annotations(annotations_file):
    """Load COCO dataset annotations."""
    print(f"Loading annotations from {annotations_file}")
    with open(annotations_file, 'r') as f:
        annotations = json.load(f)

    image_captions = {}
    for annotation in annotations['annotations']:
        image_id = str(annotation['image_id']).zfill(12)
        if image_id not in image_captions:
            image_captions[image_id] = []
        image_captions[image_id].append(annotation['caption'])

    print(f"Loaded annotations for {len(image_captions)} images")
    return image_captions


def create_coco_dataset(image_dir, annotations_file, max_length, batch_size=16):
    """Create a dataset from COCO images and captions."""
    print("Loading COCO annotations...")
    image_captions = load_coco_annotations(annotations_file)
    print(f"Loaded {len(image_captions)} image captions.")
    
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token
    
    # Adjust max_length to account for the shift between input and target
    effective_length = max_length - 1

    def generator():
        image_ids = list(image_captions.keys())
        while True:
            random.shuffle(image_ids)
            for image_id in image_ids:
                image_path = os.path.join(image_dir, f"{image_id}.jpg")
                if not os.path.exists(image_path):
                    continue

                try:
                    img = preprocess_image(image_path)
                    caption = random.choice(image_captions[image_id])
                    
                    # Add special tokens and encode
                    caption = f"{tokenizer.bos_token}{caption}{tokenizer.eos_token}"
                    tokens = tokenizer.encode(
                        caption,
                        max_length=max_length,
                        padding='max_length',
                        truncation=True
                    )
                    
                    # Create input and target sequences with consistent length
                    input_tokens = tokens[:effective_length]  # First n-1 tokens
                    target_tokens = tokens[1:effective_length+1]  # Shifted sequence
                    
                    # Pad sequences if necessary
                    if len(input_tokens) < effective_length:
                        padding = effective_length - len(input_tokens)
                        input_tokens.extend([tokenizer.pad_token_id] * padding)
                        target_tokens.extend([-100] * padding)
                    
                    # Convert to tensors
                    input_tokens = tf.convert_to_tensor(input_tokens, dtype=tf.int32)
                    target_tokens = tf.convert_to_tensor(target_tokens, dtype=tf.int32)
                    
                    yield (img, input_tokens), target_tokens

                except Exception as e:
                    print(f"Error processing {image_path}: {str(e)}")
                    continue

    output_shapes = (
        (
            tf.TensorShape([224, 224, 3]),      # Image shape
            tf.TensorShape([effective_length])   # Input tokens shape
        ),
        tf.TensorShape([effective_length])       # Target tokens shape
    )

    output_types = (
        (tf.float32, tf.int32),  # Types for (image, input_tokens)
        tf.int32                 # Type for target_tokens
    )

    # Create the dataset from the generator
    dataset = tf.data.Dataset.from_generator(
        generator,
        output_types=output_types,
        output_shapes=output_shapes
    )

    # Apply batching after the dataset is created
    return dataset.batch(batch_size, drop_remainder=True).prefetch(tf.data.AUTOTUNE)

def train_model(model, dataset, epochs=100, checkpoint_path='checkpoints/model'):
    """Train the model and save checkpoints."""
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=False,
        reduction=tf.keras.losses.Reduction.NONE
    )
    max_gradient_norm = 1.0
    
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    steps_per_epoch = 3709
    
    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}/{epochs}")
        total_loss = 0
        num_batches = 0
        
        progress_bar = tqdm(total=steps_per_epoch, desc=f"Training")
        
        for batch_inputs, batch_targets in dataset.take(steps_per_epoch):
            with tf.GradientTape() as tape:
                predictions = model(batch_inputs)
                
                # Calculate loss with proper masking
                mask = tf.not_equal(batch_targets, -100)
                mask = tf.cast(mask, dtype=tf.float32)
                
                # Calculate per-token loss
                token_losses = loss_fn(batch_targets, predictions)
                
                # Apply mask and calculate mean loss
                masked_loss = token_losses * mask
                loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(mask)

            gradients = tape.gradient(loss, model.trainable_variables)
            gradients, _ = tf.clip_by_global_norm(gradients, max_gradient_norm)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))
            
            total_loss += loss
            num_batches += 1
            progress_bar.update(1)
            progress_bar.set_postfix({'loss': float(loss)})
            
            if num_batches % 100 == 0:
                tf.keras.backend.clear_session()

        progress_bar.close()
        avg_loss = total_loss / num_batches
        print(f"Average Loss: {avg_loss:.4f}")
        
        
        model.save_weights(f"{checkpoint_path}_epoch_from_train{epoch + 1}.weights.h5")


def generate_caption(model, image_path, max_length=50):
    """Generate a caption for a single image."""
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token

    img = preprocess_image(image_path)
    img = tf.expand_dims(img, 0)
    current_tokens = tf.convert_to_tensor([[tokenizer.bos_token_id]], dtype=tf.int32)
    generated_caption = []

    for _ in range(max_length):
        predictions = model((img, current_tokens))
        next_token = tf.argmax(predictions[:, -1, :], axis=-1)
        token_id = int(next_token.numpy()[0])
        generated_caption.append(token_id)

        if token_id == tokenizer.eos_token_id:
            break

        current_tokens = tf.convert_to_tensor([generated_caption], dtype=tf.int32)

    caption = tokenizer.decode(generated_caption, skip_special_tokens=True)
    return caption


if __name__ == "__main__":
    # Set parameters
    max_length = 50
    epochs = 10
    batch_size = 16
    checkpoint_path = 'checkpoints/image_captioning_model'
    print("set parameteres")
    # COCO dataset paths
    image_dir = 'archive/coco2017/train2017'
    annotations_file = 'archive/coco2017/annotations/captions_train2017.json'

    # Create model and dataset
    model = ImageCaptioningModel(max_length=max_length)
    dataset = create_coco_dataset(
        image_dir,
        annotations_file,
        max_length,
        batch_size=batch_size
    )

    # Train model
    train_model(model, dataset, epochs=epochs, checkpoint_path=checkpoint_path)

    # Test the model (optional)
    test_image_path = 'archive/coco2017/test2017/000000000001.jpg'  # Replace with your test image path
    if os.path.exists(test_image_path):
        generated_caption = generate_caption(model, test_image_path)
        print(f"Generated caption: {generated_caption}")

  from .autonotebook import tqdm as notebook_tqdm


set parameteres


I0000 00:00:1736673832.028444    3619 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22086 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:01:00.0, compute capability: 8.6
All PyTorch model weights were used when initializing TFGPT2Model.

All the weights of TFGPT2Model were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.


Loading COCO annotations...
Loading annotations from archive/coco2017/annotations/captions_train2017.json
Loaded annotations for 118287 images
Loaded 118287 image captions.

Epoch 1/10


Training:   0%|          | 0/3709 [00:00<?, ?it/s]I0000 00:00:1736673838.481438    3619 cuda_dnn.cc:529] Loaded cuDNN version 90300
Training: 100%|██████████| 3709/3709 [32:01<00:00,  1.98it/s, loss=0.844]2025-01-12 10:55:56.510306: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
Training: 100%|██████████| 3709/3709 [32:01<00:00,  1.93it/s, loss=0.844]


Average Loss: 1.0186

Epoch 2/10


Training: 100%|██████████| 3709/3709 [30:13<00:00,  2.11it/s, loss=0.738]2025-01-12 11:26:10.010363: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
Training: 100%|██████████| 3709/3709 [30:13<00:00,  2.05it/s, loss=0.738]


Average Loss: 0.7774

Epoch 3/10


Training: 100%|██████████| 3709/3709 [30:51<00:00,  2.00it/s, loss=0.752]


Average Loss: 0.7327

Epoch 4/10


Training: 100%|██████████| 3709/3709 [31:02<00:00,  1.95it/s, loss=0.761]2025-01-12 12:28:04.838314: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
Training: 100%|██████████| 3709/3709 [31:02<00:00,  1.99it/s, loss=0.761]


Average Loss: 0.7094

Epoch 5/10


Training: 100%|██████████| 3709/3709 [31:04<00:00,  1.99it/s, loss=0.765]


Average Loss: 0.6909

Epoch 6/10


Training: 100%|██████████| 3709/3709 [31:04<00:00,  1.99it/s, loss=0.675]


Average Loss: 0.6788

Epoch 7/10


Training: 100%|██████████| 3709/3709 [31:10<00:00,  1.98it/s, loss=0.733]


Average Loss: 0.6684

Epoch 8/10


Training: 100%|██████████| 3709/3709 [31:10<00:00,  2.01it/s, loss=0.664]2025-01-12 14:32:36.202929: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
Training: 100%|██████████| 3709/3709 [31:10<00:00,  1.98it/s, loss=0.664]


Average Loss: 0.6607

Epoch 9/10


Training: 100%|██████████| 3709/3709 [31:09<00:00,  1.98it/s, loss=0.764]


Average Loss: 0.6537

Epoch 10/10


Training: 100%|██████████| 3709/3709 [31:13<00:00,  1.98it/s, loss=0.714]


Average Loss: 0.6481
Generated caption: AA truck parked next to a truck parked next to a truck parked next to a truck parked on the side of a road.


In [6]:
# Test the model (optional)
test_image_path = 'archive/coco2017/test2017/000000000001.jpg'  # Replace with your test image path
if os.path.exists(test_image_path):
    generated_caption = generate_caption(model, test_image_path)
    print(f"Generated caption: {generated_caption}")

Generated caption: AA truck parked next to a truck parked next to a truck parked next to a truck parked on the side of a road.
