<a href="https://colab.research.google.com/github/2303A51155/CODSOFT/blob/main/Task3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

IMAGE CAPTIONING

Combine computer vision and natural language processing to build an image captioning AI. Use pre-trained image recognition models like VGG or ResNet to extract features from images, and then use a recurrent neural network (RNN) or transformer-based model to generate captions for those images.

In [7]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Add, Input
from tensorflow.keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
import os

# Load the pre-trained VGG16 model
_VGG16_EXTRACTOR = None


def _get_vgg16_extractor():
    """Return the shared VGG16 feature extractor, building it on first use."""
    global _VGG16_EXTRACTOR
    if _VGG16_EXTRACTOR is None:
        # Constructing VGG16 loads the ImageNet weights; do that once for the
        # whole process instead of once per image.
        _VGG16_EXTRACTOR = VGG16(weights='imagenet', include_top=False, pooling='avg')
    return _VGG16_EXTRACTOR


def extract_features(image_path):
    """Return the VGG16 features for the image stored at ``image_path``.

    The image is loaded at 224x224, converted to an array, given a leading
    batch axis and run through VGG16's ``preprocess_input`` before being fed
    to the network.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The result of ``model.predict`` for a batch holding this single
        image. With ``include_top=False`` and ``pooling='avg'`` VGG16 emits
        one 512-value vector per image, so the result has shape ``(1, 512)``.
    """
    model = _get_vgg16_extractor()
    image = load_img(image_path, target_size=(224, 224))
    image = img_to_array(image)
    # The network expects a batch, so add a leading axis of size 1.
    image = np.expand_dims(image, axis=0)
    image = tf.keras.applications.vgg16.preprocess_input(image)
    return model.predict(image)

# Prepare the dataset (images and captions)
def load_dataset(images, captions):
    """Extract features for every image and return them with the captions.

    Args:
        images: Iterable of image paths; each one is passed to
            ``extract_features``.
        captions: Captions belonging to the images. Returned unchanged.

    Returns:
        A ``(features, captions)`` tuple. ``features`` is a NumPy array built
        from the per-image results, in the same order as ``images``.
    """
    # NOTE(review): nothing here checks that ``captions`` lines up with
    # ``images`` one-to-one -- confirm the caller guarantees that.
    # NOTE(review): each entry keeps whatever shape ``extract_features``
    # returns, so a leading batch axis per image is preserved in the stacked
    # array -- confirm that is what the training code expects.
    per_image = [extract_features(path) for path in images]
    return np.array(per_image), captions

# Tokenize captions
def tokenize_captions(captions):
    """Fit a Keras ``Tokenizer`` on the captions and report the vocabulary size.

    Args:
        captions: The caption texts to fit the tokenizer on.

    Returns:
        A ``(tokenizer, vocab_size)`` tuple, where ``vocab_size`` is the
        number of distinct words in ``tokenizer.word_index`` plus one.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(captions)
    # Keras word indices start at 1 (0 is never assigned to a word), hence
    # the extra slot.
    return fitted, 1 + len(fitted.word_index)

# Create the RNN model
def create_model(vocab_size, feature_dim=512):
    """Build and compile the captioning model.

    The model has two inputs: an image feature vector, projected to 256
    units, and a sequence of word ids, embedded and summarised by an LSTM
    into 256 units. The two 256-unit vectors are added and mapped to a
    softmax over the vocabulary.

    Args:
        vocab_size: Vocabulary size; used as the embedding's input dimension
            and as the width of the softmax output.
        feature_dim: Length of the image feature vector. Defaults to 512,
            the size produced by ``extract_features`` (VGG16 with
            ``include_top=False`` and ``pooling='avg'``). Pass 4096 when
            feeding features taken from VGG16's fully connected layers.

    Returns:
        A compiled ``Model`` taking ``[image_features, caption_sequence]``,
        using categorical cross-entropy loss and the Adam optimizer.
    """
    # Image feature input
    image_input = Input(shape=(feature_dim,))
    image_features = Dense(256, activation='relu')(image_input)

    # Caption input: variable-length sequence of word ids
    caption_input = Input(shape=(None,))
    caption_embedding = Embedding(vocab_size, 256)(caption_input)
    caption_lstm = LSTM(256)(caption_embedding)

    # Both branches end in 256 units so they can be summed element-wise.
    combined = Add()([image_features, caption_lstm])
    output = Dense(vocab_size, activation='softmax')(combined)

    model = Model(inputs=[image_input, caption_input], outputs=output)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

# Generate captions for a new image
def generate_caption(model, tokenizer, image_path, max_length):
    """Generate a caption for an image by greedy word-by-word decoding.

    Starting from the ``'startseq'`` id, the current id sequence is padded to
    ``max_length``, the model predicts the next word, and the highest-scoring
    id is appended. Decoding stops after ``max_length`` predictions or as
    soon as the ``'endseq'`` id is produced.

    Args:
        model: Model whose ``predict`` accepts ``[features, sequence]``.
        tokenizer: Fitted tokenizer exposing ``word_index`` and
            ``index_word``; its vocabulary must contain ``'startseq'`` and
            ``'endseq'``.
        image_path: Path of the image to caption.
        max_length: Maximum number of words to predict; also the length the
            id sequence is padded to.

    Returns:
        The decoded words joined by spaces. The marker words themselves are
        part of the result, and ids with no entry in ``index_word`` are
        skipped.

    Raises:
        KeyError: If ``'startseq'`` or ``'endseq'`` is missing from the
            tokenizer vocabulary.
    """
    # Resolve both marker ids up front so a vocabulary without them fails
    # before the expensive feature extraction, not halfway through decoding.
    try:
        start_id = tokenizer.word_index['startseq']
        end_id = tokenizer.word_index['endseq']
    except KeyError as err:
        raise KeyError(
            f"tokenizer vocabulary has no {err.args[0]!r} token; captions "
            "must be wrapped in 'startseq ... endseq' before fitting"
        ) from err

    features = extract_features(image_path)
    caption = [start_id]
    for _ in range(max_length):
        sequence = pad_sequences([caption], maxlen=max_length)
        prediction = model.predict([features, sequence])
        # Plain int so the id list holds one consistent type.
        word = int(np.argmax(prediction))
        caption.append(word)
        if word == end_id:
            break
    return ' '.join(tokenizer.index_word[i] for i in caption if i in tokenizer.index_word)

# Main function to run the image captioning
def main():
    # Example images and captions
    # Assuming your images are in a folder
    image_folder = 'path/to/your/images'
    images = [os.path.join(image_folder, filename) for filename in os.listdir(image_folder)]
    captions = ['caption for image 1', 'caption for image 2', ...]  # Replace with your captions


    # Load and prepare the dataset
    features, captions = load_dataset(images, captions)

    # Tokenize the captions
    tokenizer, vocab_size = tokenize_captions(captions)

    # Create the model
    model = create_model(vocab_size)

    # Train the model
    # ...

    # Generate captions for new images
    new_image_path = 'path/to/your/new/image.jpg'
    max_length = 30
