In [8]:
pip install --upgrade tensorflow

Collecting tensorflow
  Downloading tensorflow-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (615.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m615.3/615.3 MB[0m [31m771.6 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting ml-dtypes<0.5.0,>=0.4.0
  Downloading ml_dtypes-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Collecting tensorboard<2.19,>=2.18
  Downloading tensorboard-2.18.0-py3-none-any.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m76.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Installing collected packages: ml-dtypes, tensorboard, tensorflow
  Attempting uninstall: ml-dtypes
    Found existing installation: ml-dtypes 0.3.2
    Uninstalling ml-dtypes-0.3.2:
      Successfully uninstalled ml-dtypes-0.3.2
  Attempting uninstall

In [26]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Add, GlobalAveragePooling2D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.applications import MobileNetV2

# Load and prepare the CIFAR-10 dataset
(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# CIFAR-10 classes
class_names = [
    "airplane", "automobile", "bird", "cat", "deer",
    "dog", "frog", "horse", "ship", "truck"
]

# Prepare captions (limit to 5000 samples)
num_samples = 5000
captions = [f"This is a {class_names[label[0]]}" for label in y_train[:num_samples]]

# Tokenize captions
tokenizer = Tokenizer(num_words=5000, oov_token="<unk>")
tokenizer.fit_on_texts(captions)
sequences = tokenizer.texts_to_sequences(captions)
max_len = max([len(seq) for seq in sequences])
padded_captions = pad_sequences(sequences, maxlen=max_len, padding='post')

# Prepare target data
y_train_target = np.array(sequences)[:, 1:]  # Remove first word (start token)
y_train_target = pad_sequences(y_train_target, maxlen=max_len-1, padding="post")

class ImageCaptioningModel(tf.keras.Model):
    def __init__(self, vocab_size, max_len, **kwargs):
        super(ImageCaptioningModel, self).__init__(**kwargs)
        
        # Image processing layers
        self.base_model = MobileNetV2(
            input_shape=(96, 96, 3),
            include_top=False,
            weights='imagenet'
        )
        self.base_model.trainable = False
        
        self.global_pool = GlobalAveragePooling2D()
        self.image_dense = Dense(256, activation='relu')
        self.image_dropout = Dropout(0.3)
        self.repeat_vector = tf.keras.layers.RepeatVector(max_len-1)
        
        # Text processing layers
        self.embedding = Embedding(vocab_size, 256, mask_zero=True)
        self.text_lstm = LSTM(256, return_sequences=True, dropout=0.3)
        
        # Decoder layers
        self.decoder_lstm = LSTM(512, return_sequences=True, dropout=0.3)
        self.decoder_dense1 = Dense(512, activation='relu')
        self.decoder_dropout = Dropout(0.3)
        self.output_dense = Dense(vocab_size, activation='softmax')
        
    def call(self, inputs):
        image_input, text_input = inputs
        
        # Process image
        x_img = self.base_model(image_input)
        x_img = self.global_pool(x_img)
        x_img = self.image_dense(x_img)
        x_img = self.image_dropout(x_img)
        x_img = self.repeat_vector(x_img)
        
        # Process text
        x_text = self.embedding(text_input)
        x_text = self.text_lstm(x_text)
        x_text = x_text[:, :-1, :]  # Remove last timestep
        
        # Combine features
        decoder_inputs = x_img + x_text
        
        # Decode
        x = self.decoder_lstm(decoder_inputs)
        x = self.decoder_dense1(x)
        x = self.decoder_dropout(x)
        outputs = self.output_dense(x)
        
        return outputs

# Prepare the data
print("Preparing training data...")
x_train_resized = tf.image.resize(x_train[:num_samples], (96, 96))
x_train_resized = tf.cast(x_train_resized, tf.float32) / 255.0

# Create and compile model
print("Creating model...")
vocab_size = len(tokenizer.word_index) + 1
model = ImageCaptioningModel(vocab_size, max_len)

# Use a lower learning rate for stability
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Prepare target data for training
y_train_target_sparse = np.expand_dims(y_train_target, axis=-1)

# Create tf.data.Dataset
train_dataset = tf.data.Dataset.from_tensor_slices((
    {
        "image_input": x_train_resized,
        "text_input": padded_captions
    },
    y_train_target_sparse
))

# Shuffle and batch the dataset
train_dataset = train_dataset.shuffle(1000).batch(16)

# Train the model
print("Training model...")
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split

# Assuming `x_train_resized` and `y_train_target_sparse` are already NumPy arrays
x_train_resized_np = x_train_resized  # Use the existing NumPy array
y_train_target_np = y_train_target_sparse  # Use the existing NumPy array

# Split into training and validation sets
x_train, x_val, y_train, y_val = train_test_split(x_train_resized_np, y_train_target_np, test_size=0.2)

# Ensure the data is in the correct shape
# The x_train_resized and y_train_target arrays should be in the shape: (batch_size, height, width, channels)
# and (batch_size, seq_length), respectively

# Create tf.data.Dataset
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(64)
val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val)).batch(64)

# Assuming `model` is already defined and compiled
history = model.fit(
    train_dataset,  # Training data
    epochs=10,      # Number of epochs
    validation_data=val_dataset,  # Validation data
    batch_size=64   # Batch size
)


# Save the model
model.save_weights("captioning_model_mobilenetv2_weights.h5")

Preparing training data...
Creating model...
Training model...


TypeError: Only integers, slices (`:`), ellipsis (`...`), tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid indices, got array([   5, 2074, 4891, ..., 4031, 2156, 3208])