<a href="https://colab.research.google.com/github/AhmedBaari/Deep-Learning-Essentials/blob/main/8%20-%20Image%20Captioning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# FIXED: Proper VGG16 initialization and error handling
import os, numpy as np
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, add
from tensorflow.keras.utils import to_categorical

# Dataset locations inside the Colab working directory
img_dir = '/content/Images/Images/'
caption_file = '/content/Images/captions.txt'

# 1. Load VGG16 a single time and expose its second-to-last layer as the
#    feature extractor, so `vgg.predict` yields that layer's activations
#    rather than class scores.
print("Loading VGG16...")
base_model = VGG16(weights='imagenet')
penultimate_output = base_model.layers[-2].output
vgg = Model(inputs=base_model.input, outputs=penultimate_output)
print("✓ VGG16 loaded")

# 2. EXTRACT FEATURES
# Runs every selected image through `vgg` and stores one feature vector per
# image in `features`, keyed by the filename text before the first '.'.
print("\nExtracting features...")
MAX_IMAGES = 500  # cap on how many images are processed, for speed
features = {}
# Sorted so the selected subset is the same on every run; os.listdir returns
# entries in arbitrary order.
img_files = sorted(f for f in os.listdir(img_dir) if f.endswith('.jpg'))
print(f"Found {len(img_files)} images")

selected_files = img_files[:MAX_IMAGES]
skipped = []  # (filename, exception) for every image that could not be processed
for i, fname in enumerate(selected_files):
    if (i+1) % 50 == 0:
        # Report against the real number of selected files, which can be
        # smaller than MAX_IMAGES.
        print(f"  {i+1}/{len(selected_files)}...")

    try:
        img_path = os.path.join(img_dir, fname)
        img = image.load_img(img_path, target_size=(224, 224))
        img_array = image.img_to_array(img)
        img_array = np.expand_dims(img_array, axis=0)  # add the batch axis
        img_array = preprocess_input(img_array)

        feature = vgg.predict(img_array, verbose=0)
        img_id = fname.split('.')[0]
        features[img_id] = feature[0]
    except Exception as e:
        # Best effort: keep going, but remember what failed so it is visible.
        skipped.append((fname, e))

print(f"✓ Extracted features for {len(features)} images")
if skipped:
    first_name, first_err = skipped[0]
    print(f"  Skipped {len(skipped)} images; first failure: {first_name} ({first_err!r})")

if not features:
    raise RuntimeError("No features extracted! Check images.")

# 3. LOAD CAPTIONS
# Builds `captions`: image id -> list of 'startseq ... endseq' strings, keeping
# only images that have an entry in `features`.
print("\nLoading captions...")
captions = {}

# Explicit encoding so parsing does not depend on the platform default.
with open(caption_file, 'r', encoding='utf-8') as fh:
    lines = fh.read().split('\n')[1:]  # the first line is dropped (treated as a header)

for line in lines:
    if len(line) < 2:
        continue
    # Split on the first comma only: everything after it is the caption text.
    parts = line.split(',', 1)
    if len(parts) < 2:
        continue

    img_id = parts[0].split('.')[0]

    if img_id in features:
        cap = 'startseq ' + parts[1].lower().strip() + ' endseq'
        captions.setdefault(img_id, []).append(cap)

print(f"✓ Loaded captions for {len(captions)} images")

# Fail here with a clear message instead of later inside max() on an empty list.
if not captions:
    raise RuntimeError("No captions matched the extracted images! Check the caption file.")

# 4. TOKENIZE
# Fits the vocabulary on every caption and derives the two sizes the model
# needs: `vocab_size` and `max_len`.
all_caps = [c for caps in captions.values() for c in caps]
if not all_caps:
    raise RuntimeError("No captions available to tokenize.")

tok = Tokenizer()
tok.fit_on_texts(all_caps)
vocab_size = len(tok.word_index) + 1  # +1 because index 0 is reserved for padding

# Measure length on the tokenized captions rather than on str.split():
# Tokenizer's default filters drop punctuation, so whitespace-split counts
# overstate the real sequence length and would pad every input more than needed.
max_len = max(len(seq) for seq in tok.texts_to_sequences(all_caps))

print(f"✓ Vocab: {vocab_size}, Max: {max_len}")

# 5. CREATE SEQUENCES
# Every caption of length n yields n-1 training samples:
# (image feature, first i tokens) -> token i, for i = 1 .. n-1.
print("\nCreating sequences...")
X1, X2, y = [], [], []
for img_id, caps in captions.items():
    img_feature = features[img_id]
    for seq in tok.texts_to_sequences(caps):
        for i in range(1, len(seq)):
            X1.append(img_feature)
            X2.append(seq[:i])   # raw prefix; padded in one call below
            y.append(seq[i])     # raw target index; one-hot encoded in one call below

X1 = np.array(X1)
# Pad and one-hot encode once for the whole dataset instead of once per sample;
# this avoids building one small array per sample.
X2 = pad_sequences(X2, maxlen=max_len)
y = to_categorical(y, num_classes=vocab_size)
print(f"✓ Created {len(X1)} sequences")

# 6. BUILD MODEL
# Two-input network: an image-feature branch and a caption-prefix branch are
# summed and passed through a dense layer to a softmax over the vocabulary.
print("\nBuilding model...")
inp1 = Input(shape=(4096,))     # image feature vector
inp2 = Input(shape=(max_len,))  # padded caption prefix

# Image branch
image_branch = Dense(256, activation='relu')(inp1)

# Text branch (index 0 is masked out as padding)
embedded = Embedding(vocab_size, 256, mask_zero=True)(inp2)
text_branch = LSTM(256)(embedded)

# Merge both branches and predict the next word
merged = add([image_branch, text_branch])
hidden = Dense(256, activation='relu')(merged)
outputs = Dense(vocab_size, activation='softmax')(hidden)

model = Model(inputs=[inp1, inp2], outputs=outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy')

print("✓ Model built")

# 7. TRAIN
print("\nTraining...")
model.fit(x=[X1, X2], y=y, batch_size=32, epochs=5, verbose=1)

# 8. GENERATE CAPTION
def gen_cap(img_id):
    """Greedily decode a caption for one image.

    Starting from 'startseq', repeatedly feeds the image feature and the text
    generated so far to `model`, appends the highest-scoring word, and stops
    at 'endseq', at an index with no vocabulary entry (e.g. padding index 0),
    or after `max_len` steps.

    Args:
        img_id: key into the module-level `features` dict.

    Returns:
        The generated caption with 'startseq' removed and whitespace stripped.

    Raises:
        KeyError: if `img_id` has no entry in `features`.
    """
    # The image input is the same at every step, so reshape it once.
    photo = features[img_id].reshape(1, -1)
    text = 'startseq'
    for _ in range(max_len):
        seq = pad_sequences(tok.texts_to_sequences([text]), maxlen=max_len)
        pred = int(np.argmax(model.predict([photo, seq], verbose=0)))
        # Direct index -> word lookup instead of scanning word_index each step.
        word = tok.index_word.get(pred)
        if word is None or word == 'endseq':
            break
        text += ' ' + word
    return text.replace('startseq', '').strip()

# 9. TEST
# Compare the generated caption with the first stored caption for the first
# image in `captions`.
test_id = list(captions)[0]
generated = gen_cap(test_id)
print(f"\n✓ Generated: {generated}")
reference = captions[test_id][0]
print(f"✓ Actual: {reference}")


Loading VGG16...
✓ VGG16 loaded

Extracting features...
Found 8091 images
  50/500...
  100/500...
  150/500...
  200/500...
  250/500...
  300/500...
  350/500...
  400/500...
  450/500...
  500/500...
✓ Extracted features for 500 images

Loading captions...
✓ Loaded captions for 500 images
✓ Vocab: 2202, Max: 36

Creating sequences...
✓ Created 29623 sequences

Building model...
✓ Model built

Training...
Epoch 1/5
[1m926/926[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m231s[0m 191ms/step - loss: 5.2475
Epoch 2/5
[1m926/926[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 197ms/step - loss: 3.7976
Epoch 3/5
[1m926/926[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 196ms/step - loss: 3.3128
Epoch 4/5
[1m926/926[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m170s[0m 184ms/step - loss: 2.9149
Epoch 5/5
[1m926/926[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 190ms/step - loss: 2.5919

✓ Generated: a white dog is jumping after a ball in a ball
✓ Actual:

In [None]:
# Download from awsaf49's repository
!wget "https://github.com/awsaf49/flickr-dataset/releases/download/v1.0/flickr8k.zip"
!unzip -q flickr8k.zip -d Images/
!rm flickr8k.zip


--2025-10-29 03:34:32--  https://github.com/awsaf49/flickr-dataset/releases/download/v1.0/flickr8k.zip
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://release-assets.githubusercontent.com/github-production-release-asset/753516996/d7c62b13-1e50-40ea-8fae-f34a44b1695f?sp=r&sv=2018-11-09&sr=b&spr=https&se=2025-10-29T04%3A18%3A38Z&rscd=attachment%3B+filename%3Dflickr8k.zip&rsct=application%2Foctet-stream&skoid=96c2d410-5711-43a1-aedd-ab1947aa7ab0&sktid=398a6654-997b-47e9-b12b-9515b896b4de&skt=2025-10-29T03%3A17%3A43Z&ske=2025-10-29T04%3A18%3A38Z&sks=b&skv=2018-11-09&sig=RJeB5MnUDNYNWBhFaS3ICJz3o2GNmNNZVlbFNIgo8kE%3D&jwt=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmVsZWFzZS1hc3NldHMuZ2l0aHVidXNlcmNvbnRlbnQuY29tIiwia2V5Ijoia2V5MSIsImV4cCI6MTc2MTcxMjQ3MiwibmJmIjoxNzYxNzA4ODcyLCJwYXRoIjoicmVsZWFzZWFzc2V0cHJvZHVjdGlvbi5ibG9iLmN

In [None]:
import os
# Show what sits at the top level of the Colab working directory.
content_entries = os.listdir('/content')
print(content_entries)

['.config', 'Images', 'sample_data']


In [None]:
# RUN THIS FIRST TO DEBUG
import os

# List the top level of the working directory
print("Contents of /content/:")
print(os.listdir('/content/'))

# Report on the Images folder, or list the folders that do exist
if not os.path.exists('/content/Images'):
    print("\n✗ Images folder NOT found!")
    available = []
    for d in os.listdir('/content/'):
        if os.path.isdir(os.path.join('/content/', d)):
            available.append(d)
    print("Available folders:", available)
else:
    print("\n✓ Images folder exists")
    files = os.listdir('/content/Images/')
    print(f"Files in Images folder: {len(files)}")
    print(f"First 10 files: {files[:10]}")


Contents of /content/:
['.config', 'Images', 'sample_data']

✓ Images folder exists
Files in Images folder: 2
First 10 files: ['captions.txt', 'Images']
