In [3]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\bhuva\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [1]:
import os

def load_captions(captions_file_path):
    """
    Loads and parses the captions from the captions.txt file.

    Args:
        captions_file_path (str): Path to the captions.txt file.

    Returns:
        dict: A dictionary where keys are image filenames and values are lists of captions.
    """
    caption_mapping = {}
    with open(captions_file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:  # Skip empty lines
                continue
            image_filename, caption = line.split(',', 1) # Split at the first comma
            image_filename = image_filename.strip() # clean up filenames
            caption = caption.strip() # clean up caption

            if image_filename not in caption_mapping:
                caption_mapping[image_filename] = []
            caption_mapping[image_filename].append(caption)
    return caption_mapping

if __name__ == '__main__':
    captions_file = r"C:\Users\bhuva\Desktop\image_captioning\data\raw\flickr8k\captions.txt" # Assuming 'Flickr8k.token.txt' is the captions file name
    if not os.path.exists(captions_file):
        print(f"Error: Captions file not found at {captions_file}. Please check the dataset download.")
    else:
        caption_data = load_captions(captions_file)
        # Let's print a few examples to check
        example_filenames = list(caption_data.keys())[:5] # Get the first 5 image filenames
        for filename in example_filenames:
            print(f"Image: {filename}")
            for caption in caption_data[filename]:
                print(f"- {caption}")
            print("-" * 20)

Image: image
- caption
--------------------
Image: 1000268201_693b08cb0e.jpg
- A child in a pink dress is climbing up a set of stairs in an entry way .
- A girl going into a wooden building .
- A little girl climbing into a wooden playhouse .
- A little girl climbing the stairs to her playhouse .
- A little girl in a pink dress going into a wooden cabin .
--------------------
Image: 1001773457_577c3a7d70.jpg
- A black dog and a spotted dog are fighting
- A black dog and a tri-colored dog playing with each other on the road .
- A black dog and a white dog with brown spots are staring at each other in the street .
- Two dogs of different breeds looking at each other on the road .
- Two dogs on pavement moving toward each other .
--------------------
Image: 1002674143_1b742ab4b8.jpg
- A little girl covered in paint sits in front of a painted rainbow with her hands in a bowl .
- A little girl is sitting in front of a large painted rainbow .
- A small girl in the grass plays with fingerpain

In [6]:
import re
import nltk # You might need to install nltk: pip install nltk
from nltk.tokenize import word_tokenize # If needed: nltk.download('punkt')
import pickle
def clean_text(text):
    """
    Cleans the input text by lowercasing and removing punctuation.

    Args:
        text (str): Input text caption.

    Returns:
        str: Cleaned text.
    """
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation using regex (keep alphanumeric and whitespace)
    return text

def tokenize_caption(text):
    """
    Tokenizes the cleaned text into words.

    Args:
        text (str): Cleaned text caption.

    Returns:
        list: List of words (tokens).
    """
    # Using nltk.word_tokenize for more robust tokenization (handles contractions, etc.)
    tokens = word_tokenize(text)
    return tokens

def preprocess_captions(caption_mapping):
    """
    Preprocesses all captions in the caption mapping.

    Args:
        caption_mapping (dict): Dictionary of image filenames to lists of captions.

    Returns:
        dict: Dictionary with preprocessed captions (lists of tokens).
    """
    preprocessed_caption_mapping = {}
    for image_filename, captions in caption_mapping.items():
        preprocessed_captions = []
        for caption in captions:
            cleaned_caption = clean_text(caption)
            tokens = tokenize_caption(cleaned_caption)
            preprocessed_captions.append(tokens)
        preprocessed_caption_mapping[image_filename] = preprocessed_captions
    return preprocessed_caption_mapping


if __name__ == '__main__':
    # ... (Previous code to load captions) ...

    if os.path.exists(captions_file):
        caption_data = load_captions(captions_file)
        preprocessed_data_file = r"C:\Users\bhuva\Desktop\image_captioning\outputs\preprocessed_captions.pkl"
        preprocessed_caption_data = preprocess_captions(caption_data)


        # Print a few examples of preprocessed captions
        example_filenames = list(preprocessed_caption_data.keys())[:3]
        for filename in example_filenames:
            print(f"Image: {filename}")
            for tokens in preprocessed_caption_data[filename]:
                print(f"- Tokens: {tokens}")
            print("-" * 20)

    if os.path.exists(preprocessed_data_file):
        print(f"Loading preprocessed caption data from: {preprocessed_data_file}")
        with open(preprocessed_data_file, 'rb') as f: # 'rb' mode for reading binary file
            preprocessed_caption_data = pickle.load(f)
        print("Preprocessed data loaded successfully.")

    else:
        print("Preprocessing caption data...")
        if not os.path.exists(captions_file):
            print(f"Error: Captions file not found at {captions_file}. Please check the dataset download.")
        else:
            caption_data = load_captions(captions_file)
            preprocessed_caption_data = preprocess_captions(caption_data)

            print(f"Saving preprocessed caption data to: {preprocessed_data_file}")
            with open(preprocessed_data_file, 'wb') as f: # 'wb' mode for writing binary file
                pickle.dump(preprocessed_caption_data, f) # Save the dictionary
            print("Preprocessed data saved.")

    
        # ... (Rest of your previous code, like caption length analysis, can be adapted if needed for preprocessed tokens) ...

Image: image
- Tokens: ['caption']
--------------------
Image: 1000268201_693b08cb0e.jpg
- Tokens: ['a', 'child', 'in', 'a', 'pink', 'dress', 'is', 'climbing', 'up', 'a', 'set', 'of', 'stairs', 'in', 'an', 'entry', 'way']
- Tokens: ['a', 'girl', 'going', 'into', 'a', 'wooden', 'building']
- Tokens: ['a', 'little', 'girl', 'climbing', 'into', 'a', 'wooden', 'playhouse']
- Tokens: ['a', 'little', 'girl', 'climbing', 'the', 'stairs', 'to', 'her', 'playhouse']
- Tokens: ['a', 'little', 'girl', 'in', 'a', 'pink', 'dress', 'going', 'into', 'a', 'wooden', 'cabin']
--------------------
Image: 1001773457_577c3a7d70.jpg
- Tokens: ['a', 'black', 'dog', 'and', 'a', 'spotted', 'dog', 'are', 'fighting']
- Tokens: ['a', 'black', 'dog', 'and', 'a', 'tricolored', 'dog', 'playing', 'with', 'each', 'other', 'on', 'the', 'road']
- Tokens: ['a', 'black', 'dog', 'and', 'a', 'white', 'dog', 'with', 'brown', 'spots', 'are', 'staring', 'at', 'each', 'other', 'in', 'the', 'street']
- Tokens: ['two', 'dogs', 'of

In [8]:
# ... (rest of your previous imports and functions: load_captions, clean_text, tokenize_caption, preprocess_captions) ...

# Special tokens
PAD_TOKEN = '<pad>'
START_TOKEN = '<start>'
END_TOKEN = '<end>'
UNK_TOKEN = '<unk>'
SPECIAL_TOKENS = [PAD_TOKEN, START_TOKEN, END_TOKEN, UNK_TOKEN]

def create_vocabulary(preprocessed_caption_data, vocab_threshold=5):
    """
    Creates a vocabulary from the preprocessed captions.

    Args:
        preprocessed_caption_data (dict): Dictionary of image filenames to lists of tokenized captions.
        vocab_threshold (int): Minimum word frequency threshold to be included in the vocabulary.

    Returns:
        dict: word_to_index dictionary (vocabulary).
    """
    word_counts = {}
    for captions_list in preprocessed_caption_data.values():
        for tokens in captions_list:
            for token in tokens:
                word_counts[token] = word_counts.get(token, 0) + 1

    # Filter words based on threshold and create vocabulary
    vocab = {PAD_TOKEN: 0, START_TOKEN: 1, END_TOKEN: 2, UNK_TOKEN: 3} # Start with special tokens
    next_index = len(SPECIAL_TOKENS)
    for word, count in word_counts.items():
        if count >= vocab_threshold:
            vocab[word] = next_index
            next_index += 1

    return vocab

def captions_to_indices(preprocessed_caption_data, vocab, max_length=20): # Setting a default max_length for now
    """
    Converts tokenized captions to sequences of numerical indices and pads them.

    Args:
        preprocessed_caption_data (dict): Dictionary of image filenames to lists of tokenized captions.
        vocab (dict): word_to_index vocabulary.
        max_length (int): Maximum caption length for padding.

    Returns:
        dict: Dictionary of image filenames to lists of numericalized and padded caption sequences.
    """
    indexed_caption_mapping = {}
    for image_filename, captions_list in preprocessed_caption_data.items():
        indexed_captions = []
        for tokens in captions_list:
            indexed_tokens = [vocab.get(token, vocab[UNK_TOKEN]) for token in tokens] # Use UNK for unknown words
            indexed_tokens = [vocab[START_TOKEN]] + indexed_tokens + [vocab[END_TOKEN]] # Add start and end tokens

            # Padding or Truncating
            if len(indexed_tokens) > max_length:
                indexed_tokens = indexed_tokens[:max_length] # Truncate if longer than max_length
            else:
                indexed_tokens = indexed_tokens + [vocab[PAD_TOKEN]] * (max_length - len(indexed_tokens)) # Pad if shorter

            indexed_captions.append(indexed_tokens)
        indexed_caption_mapping[image_filename] = indexed_captions
    return indexed_caption_mapping


if __name__ == '__main__':
    # ... (rest of your __main__ block - loading preprocessed data from pickle) ...

    if os.path.exists(preprocessed_data_file):
        with open(preprocessed_data_file, 'rb') as f:
            preprocessed_caption_data = pickle.load(f)

        print("Creating vocabulary...")
        vocabulary = create_vocabulary(preprocessed_caption_data)
        print(f"Vocabulary size: {len(vocabulary)}")

        print("Converting captions to indices and padding...")
        indexed_caption_data = captions_to_indices(preprocessed_caption_data, vocabulary)
        print("Captions indexed and padded.")

        # Example of indexed captions
        example_filenames = list(indexed_caption_data.keys())[:3]
        for filename in example_filenames:
            print(f"\nImage: {filename}")
            for indexed_tokens in indexed_caption_data[filename]:
                print(f"- Indices: {indexed_tokens}")

        # Save vocabulary and indexed data (optional, but good practice - you can save vocab to a separate file too)
        processed_data_output_file = r"C:\Users\bhuva\Desktop\image_captioning\outputs\preprocessed_captions.pkl"
        print(f"\nSaving processed data (vocabulary and indexed captions) to: {processed_data_output_file}")
        data_to_save = {'vocab': vocabulary, 'indexed_captions': indexed_caption_data}
        with open(processed_data_output_file, 'wb') as f:
            pickle.dump(data_to_save, f)
        print("Processed data saved.")

Creating vocabulary...
Vocabulary size: 2995
Converting captions to indices and padding...
Captions indexed and padded.

Image: image
- Indices: [1, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Image: 1000268201_693b08cb0e.jpg
- Indices: [1, 4, 5, 6, 4, 7, 8, 9, 10, 11, 4, 12, 13, 14, 6, 15, 3, 16, 2, 0]
- Indices: [1, 4, 17, 18, 19, 4, 20, 21, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
- Indices: [1, 4, 22, 17, 10, 19, 4, 20, 23, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
- Indices: [1, 4, 22, 17, 10, 24, 14, 25, 26, 23, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0]
- Indices: [1, 4, 22, 17, 6, 4, 7, 8, 18, 19, 4, 20, 3, 2, 0, 0, 0, 0, 0, 0]

Image: 1001773457_577c3a7d70.jpg
- Indices: [1, 4, 27, 28, 29, 4, 30, 28, 31, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0]
- Indices: [1, 4, 27, 28, 29, 4, 33, 28, 34, 35, 36, 37, 38, 24, 39, 2, 0, 0, 0, 0]
- Indices: [1, 4, 27, 28, 29, 4, 40, 28, 35, 41, 42, 31, 43, 44, 36, 37, 6, 24, 45, 2]
- Indices: [1, 46, 47, 13, 48, 49, 50, 44, 36, 37, 38, 24, 39, 2, 0, 0, 0, 0, 0, 0]
-

In [10]:
import pickle
import os

processed_data_file = r"C:\Users\bhuva\Desktop\image_captioning\outputs\preprocessed_captions.pkl" # Path to your saved file

if os.path.exists(processed_data_file):
    with open(processed_data_file, 'rb') as f:
        loaded_data = pickle.load(f)

    vocabulary = loaded_data['vocab']
    indexed_captions = loaded_data['indexed_captions']

    print(f"Loaded Vocabulary size: {len(vocabulary)}")
    example_filenames = list(indexed_captions.keys())[:3]
    for filename in example_filenames:
        print(f"\nImage: {filename}")
        for indexed_tokens in indexed_captions[filename]:
            print(f"- Indices: {indexed_tokens}")
else:
    print(f"Error: {processed_data_file} not found.")

Loaded Vocabulary size: 2995

Image: image
- Indices: [1, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Image: 1000268201_693b08cb0e.jpg
- Indices: [1, 4, 5, 6, 4, 7, 8, 9, 10, 11, 4, 12, 13, 14, 6, 15, 3, 16, 2, 0]
- Indices: [1, 4, 17, 18, 19, 4, 20, 21, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
- Indices: [1, 4, 22, 17, 10, 19, 4, 20, 23, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
- Indices: [1, 4, 22, 17, 10, 24, 14, 25, 26, 23, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0]
- Indices: [1, 4, 22, 17, 6, 4, 7, 8, 18, 19, 4, 20, 3, 2, 0, 0, 0, 0, 0, 0]

Image: 1001773457_577c3a7d70.jpg
- Indices: [1, 4, 27, 28, 29, 4, 30, 28, 31, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0]
- Indices: [1, 4, 27, 28, 29, 4, 33, 28, 34, 35, 36, 37, 38, 24, 39, 2, 0, 0, 0, 0]
- Indices: [1, 4, 27, 28, 29, 4, 40, 28, 35, 41, 42, 31, 43, 44, 36, 37, 6, 24, 45, 2]
- Indices: [1, 46, 47, 13, 48, 49, 50, 44, 36, 37, 38, 24, 39, 2, 0, 0, 0, 0, 0, 0]
- Indices: [1, 46, 47, 38, 51, 52, 53, 36, 37, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
