# Flickr8K Image Captioning using InceptionV3 + LSTM (Updated)

โปรเจค Image Captioning ที่ใช้ InceptionV3 (State-of-the-art CNN) สำหรับ extract features และ LSTM สำหรับ generate captions

## 1. Import Libraries

In [None]:
import numpy as np
import pandas as pd
import os
import pickle
import matplotlib.pyplot as plt
from PIL import Image
import string
from tqdm import tqdm

# TensorFlow and Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, Add
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

print(f"TensorFlow version: {tf.__version__}")

## 2. Download Flickr8k Dataset

In [None]:
import urllib.request
import zipfile

# Create the local 'data' directory that the download/extract step writes into.
# exist_ok=True makes re-running this cell a no-op when it already exists.
os.makedirs('data', exist_ok=True)

# Download dataset
def download_flickr8k():
    """Download and unpack the two Flickr8k archives into ``data/``.

    An archive whose final path ``data/<name>.zip`` already exists is treated
    as fully downloaded *and* extracted, and is skipped.

    The download is staged under a ``.part`` name and only moved to its final
    name after extraction succeeded. This way an interrupted download or a
    failed extraction never leaves behind a file that later runs would
    mistake for a complete archive.

    Raises:
        urllib.error.URLError: If an archive cannot be fetched.
        zipfile.BadZipFile: If a downloaded archive is not a valid zip file.
    """
    base_url = "https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/"
    files = ["Flickr8k_Dataset.zip", "Flickr8k_text.zip"]

    for file in files:
        filepath = f"data/{file}"
        if os.path.exists(filepath):
            print(f"{file} already exists")
            continue

        # Stage under a temporary name; a leftover .part file from an earlier
        # failed run is simply overwritten here.
        partial_path = filepath + ".part"
        print(f"Downloading {file}...")
        urllib.request.urlretrieve(base_url + file, partial_path)

        print(f"Extracting {file}...")
        with zipfile.ZipFile(partial_path, 'r') as zip_ref:
            zip_ref.extractall('data')

        # Promote to the final name only now, so its existence really means
        # "downloaded and extracted".
        os.replace(partial_path, filepath)

# Fetch and unpack the archives (archives already present in data/ are skipped).
download_flickr8k()

# Set paths
# NOTE(review): these assume the archives unpack to exactly this layout under
# data/. Some distributions of Flickr8k use a different image folder name
# (e.g. 'Flicker8k_Dataset/') — confirm against the extracted contents.
images_path = 'data/Images/' # Corrected path
captions_file = 'data/Flickr8k.token.txt'
train_images_file = 'data/Flickr_8k.trainImages.txt'
test_images_file = 'data/Flickr_8k.testImages.txt'

## 3. Load and Preprocess Captions

In [None]:
def load_captions(filename):
    """Read the raw captions file and return its whole content as one string.

    Args:
        filename: Path of the caption file to read.

    Returns:
        The complete file content as a ``str``.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(filename, 'r', encoding='utf-8') as file:
        return file.read()

def create_captions_dict(text):
    """Group the caption lines of the raw caption text by image id.

    Each usable line has the form ``<image name><TAB><caption>``. Lines with
    fewer than two tab-separated fields are ignored, and only the second
    field is kept as the caption.

    Args:
        text: Full content of the captions file.

    Returns:
        A dict mapping image id to the list of its captions, in file order.
    """
    grouped = {}
    for record in text.split('\n'):
        fields = record.split('\t')
        if len(fields) >= 2:
            # The id is the part of the first field before the first dot,
            # which drops the extension and the '#<n>' caption counter.
            stem = fields[0].split('.')[0]
            grouped.setdefault(stem, []).append(fields[1])
    return grouped

def clean_captions(captions):
    """Normalise every caption in place and wrap it in sequence markers.

    Each caption is lower-cased, stripped of ASCII punctuation, reduced to
    the words longer than one character, and surrounded by the ``startseq``
    and ``endseq`` markers.

    Args:
        captions: Dict mapping image id to a list of caption strings. The
            lists are modified in place.

    Returns:
        None.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def normalise(raw):
        words = raw.lower().translate(strip_punctuation).split()
        kept = ' '.join(word for word in words if len(word) > 1)
        return 'startseq ' + kept + ' endseq'

    for caption_list in captions.values():
        for position, raw in enumerate(caption_list):
            caption_list[position] = normalise(raw)

# Run the caption pipeline: read the raw token file, group the captions by
# image id, then normalise them. clean_captions mutates `captions` in place
# and returns nothing, so its result is deliberately not assigned.
text = load_captions(captions_file)
captions = create_captions_dict(text)
clean_captions(captions)
print(f"Total images: {len(captions)}")

## 4. Extract Features using InceptionV3

In [None]:
# Input resolution the images are resized to before being fed to InceptionV3.
IMAGE_SIZE = (299, 299)

def extract_features(directory):
    """Compute an InceptionV3 feature vector for every ``.jpg`` in a directory.

    The ImageNet-pretrained network is cut at its second-to-last layer, so
    the output of that layer is used as the image representation instead of
    the class probabilities.

    Args:
        directory: Folder containing the images. Only file names ending in
            ``.jpg`` are processed.

    Returns:
        A dict mapping image id (file name up to the first dot) to the array
        returned by ``predict`` for that single-image batch, i.e. with a
        leading batch axis of length 1.
    """
    base_model = InceptionV3(weights='imagenet')
    model = Model(inputs=base_model.inputs, outputs=base_model.layers[-2].output)
    # summary() prints the table itself and returns None; wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()

    features = {}
    image_files = [f for f in os.listdir(directory) if f.endswith('.jpg')]

    for image_file in tqdm(image_files, desc="Extracting Features"):
        image_path = os.path.join(directory, image_file)
        image = img_to_array(load_img(image_path, target_size=IMAGE_SIZE))
        # Add the batch axis expected by the network, then apply the
        # InceptionV3-specific input scaling.
        image = preprocess_input(np.expand_dims(image, axis=0))

        feature = model.predict(image, verbose=0)
        image_id = image_file.split('.')[0]
        features[image_id] = feature

    return features

# Image features are expensive to compute, so they are cached in a pickle
# next to the notebook and reused on later runs.
features_file = 'features_inception.pkl'

if not os.path.exists(features_file):
    print("Extracting features... (This may take a while)")
    features = extract_features(images_path)
    with open(features_file, 'wb') as cache_out:
        pickle.dump(features, cache_out)
else:
    print("Loading existing features...")
    with open(features_file, 'rb') as cache_in:
        features = pickle.load(cache_in)

# Report the shape of one stored feature array as a quick sanity check.
print(f"Features shape: {list(features.values())[0].shape}")

## 5. Prepare Tokenizer and Data

In [None]:
# Flatten the per-image caption lists into one list of caption strings.
all_captions = [
    caption
    for caption_list in captions.values()
    for caption in caption_list
]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
# One more than the number of known words; the model uses index 0 as padding
# (see mask_zero on the Embedding layer).
vocab_size = 1 + len(tokenizer.word_index)
# Longest caption, measured in whitespace-separated words.
max_length = max(map(len, map(str.split, all_captions)))

print(f"Vocab Size: {vocab_size}")
print(f"Max Length: {max_length}")

# Persist the fitted tokenizer so captions can be generated in another session.
with open('tokenizer.pkl', 'wb') as tokenizer_out:
    pickle.dump(tokenizer, tokenizer_out)

# Save config
import json
with open('config.json', 'w') as config_out:
    json.dump({'max_length': max_length, 'vocab_size': vocab_size, 'model_type': 'inception_v3'}, config_out)

# Determine training set: one image file name per line, reduced to the part
# before the first dot so it matches the keys of `captions` and `features`.
with open(train_images_file, 'r') as split_in:
    train_ids = {line.split('.')[0] for line in split_in.read().split('\n') if line}

# Determine test set (same file format as the training list).
with open(test_images_file, 'r') as split_in:
    test_ids = {line.split('.')[0] for line in split_in.read().split('\n') if line}

print(f"Training images: {len(train_ids)}")
print(f"Test images: {len(test_ids)}")

## 6. Create Data Generator

In [None]:
def data_generator(captions, features, tokenizer, max_length, vocab_size, batch_size):
    """Endlessly yield training batches of next-word prediction samples.

    Every caption of every eligible image is expanded into one sample per
    word after the first: the padded caption prefix is the text input and
    the following word, one-hot encoded, is the target. Samples left over at
    the end of a pass over ``captions`` are carried into the first batch of
    the next pass, so every yielded batch has exactly ``batch_size`` rows.

    Only images whose id is in the module-level ``train_ids`` set and that
    have an entry in ``features`` are used.

    Args:
        captions: Dict mapping image id to a list of cleaned captions.
        features: Dict mapping image id to a feature array whose first row
            is the image's feature vector.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        max_length: Length the caption prefixes are padded to.
        vocab_size: Number of classes of the one-hot targets.
        batch_size: Number of samples per yielded batch.

    Yields:
        ``((image_features, padded_prefixes), one_hot_targets)`` as float32,
        int32 and float32 arrays respectively.

    Raises:
        ValueError: If ``batch_size`` is not positive, or if a complete pass
            over ``captions`` produced no sample at all. Without these checks
            the generator would grow its buffers forever or spin forever
            without yielding.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    X1, X2, y = [], [], []
    while True:
        samples_in_pass = 0
        for image_id, caption_list in captions.items():
            if image_id not in train_ids or image_id not in features:
                continue

            # Features are stored with a leading batch axis; take the vector.
            feature = features[image_id][0]

            for seq in tokenizer.texts_to_sequences(caption_list):
                # Split the caption at every position: words [0, i) predict word i.
                for i in range(1, len(seq)):
                    in_seq = pad_sequences([seq[:i]], maxlen=max_length)[0]
                    out_seq = to_categorical([seq[i]], num_classes=vocab_size)[0]

                    X1.append(feature)
                    X2.append(in_seq)
                    y.append(out_seq)
                    samples_in_pass += 1

                    if len(X1) == batch_size:
                        yield (np.array(X1).astype('float32'), np.array(X2).astype('int32')), np.array(y).astype('float32')
                        X1, X2, y = [], [], []

        if samples_in_pass == 0:
            raise ValueError(
                "data_generator found no training samples: no caption entry is "
                "both in train_ids and in features with at least two tokens"
            )

## 7. Model Training

In [None]:
BATCH_SIZE = 32
EPOCHS = 20

# Image branch: the 2048-d image feature vector is regularised with dropout
# and projected to 256 dimensions.
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# Text branch: the padded caption prefix is embedded and summarised by an
# LSTM. mask_zero=True marks index 0 (the padding value) as masked.
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)

# Decoder: both 256-d branch outputs are summed and mapped to a probability
# distribution over the vocabulary for the next word.
decoder1 = Add()([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line.
model.summary()

# Setup Dataset Pipeline
# The generator emits one sample per caption *word* (every prefix predicts its
# next word), not one per caption. Derive the number of batches per epoch from
# the real sample count so that one epoch is one full pass over the training
# captions; otherwise the loss monitored by the callbacks below would be
# measured on a different slice of the data in every "epoch".
train_caption_sequences = tokenizer.texts_to_sequences([
    caption
    for image_id, caption_list in captions.items()
    if image_id in train_ids and image_id in features
    for caption in caption_list
])
steps = max(1, sum(max(len(seq) - 1, 0) for seq in train_caption_sequences) // BATCH_SIZE)

def generator_wrapper():
    """Zero-argument generator factory required by ``Dataset.from_generator``."""
    gen = data_generator(captions, features, tokenizer, max_length, vocab_size, BATCH_SIZE)
    for batch in gen:
        yield batch

dataset = tf.data.Dataset.from_generator(
    generator_wrapper,
    output_signature=(
        (
            tf.TensorSpec(shape=(None, 2048), dtype=tf.float32),
            tf.TensorSpec(shape=(None, max_length), dtype=tf.int32)
        ),
        tf.TensorSpec(shape=(None, vocab_size), dtype=tf.float32)
    )
)

# No validation data is passed to fit(), so both callbacks track the training loss.
checkpoint = ModelCheckpoint('best_model_inception.keras', monitor='loss', save_best_only=True, verbose=1)
early_stop = EarlyStopping(monitor='loss', patience=3)

print("Starting training...")
# The dataset is infinite, so steps_per_epoch is what delimits an epoch.
history = model.fit(
    dataset,
    epochs=EPOCHS,
    steps_per_epoch=steps,
    callbacks=[checkpoint, early_stop],
    verbose=1
)

model.save('final_model_inception.keras')

## 8. Generate Captions

In [None]:
def idx_to_word(integer, tokenizer):
    """Return the word belonging to a token index, or ``None`` if unknown.

    Args:
        integer: Token index to look up (a Python or NumPy integer).
        tokenizer: Object exposing ``word_index`` (word -> index) and,
            optionally, ``index_word`` (index -> word).

    Returns:
        The matching word, or ``None`` when no word has that index.
    """
    # Prefer the tokenizer's own reverse mapping: a single dict lookup instead
    # of scanning the whole vocabulary for every generated word.
    index_word = getattr(tokenizer, 'index_word', None)
    if index_word:
        return index_word.get(integer)

    # Fallback for tokenizer-like objects that only provide word_index.
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

def generate_caption(model, tokenizer, photo, max_length):
    """Generate a caption for one image by picking the likeliest word each step.

    Starting from the ``startseq`` marker, the text produced so far is
    encoded, padded and fed to the model together with the image feature.
    The highest-scoring word is appended. Generation stops at ``endseq``, at
    an index with no known word, or after ``max_length`` steps.

    Args:
        model: Trained captioning model taking ``[photo, sequence]``.
        tokenizer: Fitted tokenizer used to encode the text.
        photo: Image feature batch passed to the model as its first input.
        max_length: Padding length and maximum number of generated words.

    Returns:
        The generated caption with the ``startseq`` marker removed.
    """
    words = ['startseq']
    for _ in range(max_length):
        encoded = tokenizer.texts_to_sequences([' '.join(words)])[0]
        padded = pad_sequences([encoded], maxlen=max_length)
        probabilities = model.predict([photo, padded], verbose=0)
        next_word = idx_to_word(np.argmax(probabilities), tokenizer)
        if next_word is None or next_word == 'endseq':
            break
        words.append(next_word)
    return ' '.join(words).replace('startseq', '').strip()

## 9. Plot Training History

In [None]:
# Plot the training curves only when training was run in this session.
if 'history' in globals():
    plt.figure(figsize=(12, 4))

    # One panel per recorded metric: (history key, title, y label, legend position).
    panels = (
        ('loss', 'Model Loss', 'Loss', 'upper right'),
        ('accuracy', 'Model Accuracy', 'Accuracy', 'lower right'),
    )
    for position, (metric, panel_title, y_label, legend_loc) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(history.history[metric])
        plt.title(panel_title)
        plt.ylabel(y_label)
        plt.xlabel('Epoch')
        plt.legend(['Train'], loc=legend_loc)

    plt.tight_layout()
    plt.show()

## 10. Evaluate on Test Set

In [None]:
def evaluate_model(model, features, captions, tokenizer, max_length, num_samples=5):
    """Show predicted and reference captions for a few images.

    For each selected image the generated caption is displayed as the title
    of the image plot and printed together with the reference captions.
    Image files are read from the module-level ``images_path`` directory.

    Args:
        model: Trained captioning model.
        features: Dict mapping image id to its feature array.
        captions: Dict mapping image id to its cleaned reference captions.
        tokenizer: Fitted tokenizer used for generation.
        max_length: Maximum caption length used for generation.
        num_samples: Maximum number of images to show.

    Returns:
        None.
    """
    # Filter before slicing so that images without features do not silently
    # reduce the number of samples shown.
    sample_ids = [image_id for image_id in captions if image_id in features][:num_samples]

    for image_id in sample_ids:
        actual_captions = captions[image_id]
        photo = features[image_id]
        predicted_caption = generate_caption(model, tokenizer, photo, max_length)

        image_path = os.path.join(images_path, image_id + '.jpg')
        plt.figure(figsize=(10, 4))
        # Close the file handle as soon as the pixels were handed to matplotlib.
        with Image.open(image_path) as image:
            plt.imshow(image)
        plt.axis('off')
        plt.title(f'Predicted: {predicted_caption}', fontsize=12, wrap=True)
        plt.show()

        print(f"\nImage ID: {image_id}")
        print(f"Predicted Caption: {predicted_caption}")
        print("\nActual Captions:")
        for i, caption in enumerate(actual_captions, 1):
            # Reference captions still carry the training markers; hide them.
            clean_caption = caption.replace('startseq', '').replace('endseq', '').strip()
            print(f"  {i}. {clean_caption}")
        print("-" * 80)

print("Evaluating model on test images...\n")
# Restrict the reference captions to the images listed in the test split.
test_captions = {
    image_id: caption_list
    for image_id, caption_list in captions.items()
    if image_id in test_ids
}
evaluate_model(model, features, test_captions, tokenizer, max_length, num_samples=10)

## 11. BLEU Score Evaluation

In [None]:
from nltk.translate.bleu_score import corpus_bleu

def evaluate_bleu(model, features, captions, tokenizer, max_length):
    """Print corpus-level BLEU-1 and BLEU-2 of the generated captions.

    A caption is generated for every image that has features and compared
    against all of that image's reference captions, with the ``startseq``
    and ``endseq`` markers removed from the references.

    Args:
        model: Trained captioning model.
        features: Dict mapping image id to its feature array.
        captions: Dict mapping image id to its cleaned reference captions.
        tokenizer: Fitted tokenizer used for generation.
        max_length: Maximum caption length used for generation.

    Returns:
        None; the scores are printed.
    """
    def reference_tokens(caption):
        return caption.replace('startseq', '').replace('endseq', '').strip().split()

    references_per_image = []
    hypotheses = []

    for image_id, caption_list in tqdm(captions.items(), desc="Calculating BLEU"):
        if image_id in features:
            generated = generate_caption(model, tokenizer, features[image_id], max_length)
            references_per_image.append([reference_tokens(caption) for caption in caption_list])
            hypotheses.append(generated.split())

    # Unigram-only weights give BLEU-1; equal unigram/bigram weights give BLEU-2.
    unigram_score = corpus_bleu(references_per_image, hypotheses, weights=(1.0, 0, 0, 0))
    bigram_score = corpus_bleu(references_per_image, hypotheses, weights=(0.5, 0.5, 0, 0))

    print(f"BLEU-1: {unigram_score:.4f}")
    print(f"BLEU-2: {bigram_score:.4f}")

# Score the generated captions against the reference captions of the test
# images (test_captions holds only the ids listed in the test split).
print("\nCalculating BLEU scores on test set...")
evaluate_bleu(model, features, test_captions, tokenizer, max_length)

## 12. Generate Caption for Custom Image

In [None]:
# Lazily created InceptionV3 feature extractor, shared by all calls so the
# network is built and its weights are loaded only once.
_feature_extractor = None


def _get_feature_extractor():
    """Return the shared InceptionV3 feature extractor, building it on first use."""
    global _feature_extractor
    if _feature_extractor is None:
        model_inc = InceptionV3(weights='imagenet')
        _feature_extractor = Model(inputs=model_inc.inputs, outputs=model_inc.layers[-2].output)
    return _feature_extractor


def predict_caption_for_image(image_path):
    """Generate, display and return a caption for an arbitrary image file.

    The image is preprocessed exactly like the training images (resized to
    ``IMAGE_SIZE``, InceptionV3 input scaling), its feature vector is
    extracted, and the module-level ``model``, ``tokenizer`` and
    ``max_length`` are used to generate the caption.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        The generated caption string.
    """
    feature_extractor = _get_feature_extractor()

    image = img_to_array(load_img(image_path, target_size=IMAGE_SIZE))
    # Add the batch axis expected by the network before scaling the pixels.
    image = preprocess_input(np.expand_dims(image, axis=0))

    feature = feature_extractor.predict(image, verbose=0)
    caption = generate_caption(model, tokenizer, feature, max_length)

    plt.figure(figsize=(8, 8))
    # Close the file handle as soon as the pixels were handed to matplotlib.
    with Image.open(image_path) as original_image:
        plt.imshow(original_image)
    plt.axis('off')
    plt.title(f'Caption: {caption}', fontsize=14)
    plt.show()

    return caption

# Usage:
# predict_caption_for_image('path/to/image.jpg')