<a href="https://colab.research.google.com/github/DevFreak-ui/MMAD/blob/main/Filter_Captions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Filter Captions**

This block filters the dataset and produces one caption for each image instead of five.
For each image, the five captions are embedded and the semantic similarity (cosine similarity) between each caption's embedding and the mean of the image's caption embeddings is calculated; the caption with the highest score is selected to represent the image.

In [None]:
# Set config values
# Path of the captions CSV that a later cell loads with pd.read_csv.
# NOTE(review): the /content/drive prefix presumably requires Google Drive to
# be mounted first; no mount call is visible in this notebook — confirm.
dataset_path = '/content/drive/MyDrive/flickr8k_captions.csv'

In [None]:
import pandas as pd
import numpy as np
import torch

# Pick the compute device: GPU when CUDA is usable, CPU otherwise
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Echo the selected device as the cell output
device

'cuda'

In [None]:
# Load the captions table from the configured CSV path
captions_df = pd.read_csv(dataset_path)

# Fail fast with a clear message if the columns used by the later cells are
# missing, rather than hitting a bare KeyError inside the caption loop.
missing_columns = {'image', 'caption'} - set(captions_df.columns)
if missing_columns:
    raise ValueError(
        f"{dataset_path} is missing required column(s): {sorted(missing_columns)}"
    )

# Preview the first two rows to verify the dataset
captions_df.head(2)

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .


In [None]:
!pip install -q sentence-transformers

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/227.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m225.3/227.1 kB[0m [31m7.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from sentence_transformers import SentenceTransformer

# Load the Sentence-BERT model on the device selected earlier in the notebook,
# so the `device` choice is explicit instead of being left to auto-detection.
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Similarity Index Calc.
def cosine_similarity(vec1, vec2):
    """Compute the cosine similarity between two vectors.

    Args:
        vec1: First 1-D vector (anything accepted by ``np.dot``).
        vec2: Second 1-D vector of the same length as ``vec1``.

    Returns:
        The dot product of the two vectors divided by the product of their
        Euclidean norms. If either vector has zero norm the cosine is
        undefined; ``0.0`` is returned instead of dividing by zero and
        producing NaN.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # A NaN score here would also poison any argmax taken over the scores.
        return 0.0
    return np.dot(vec1, vec2) / norm_product

In [None]:
import gc

# Store the best caption per image, keyed by the value of the 'image' column
best_captions = {}

# Group the captions by image in one pass instead of re-filtering the whole
# DataFrame with a boolean mask for every image. sort=False keeps the images
# in order of first appearance, matching captions_df['image'].unique().
for image, caption_group in captions_df.groupby('image', sort=False)['caption']:
    image_captions = caption_group.tolist()

    # Compute one embedding per caption
    caption_embeddings = model.encode(image_captions, batch_size=16, show_progress_bar=False)

    # Use the mean of the embeddings as the reference point for comparison
    mean_embedding = caption_embeddings.mean(axis=0)

    # Cosine similarity between the mean embedding and each caption embedding
    similarities = [cosine_similarity(mean_embedding, embedding) for embedding in caption_embeddings]

    # Keep the caption whose embedding scores highest against the mean
    best_captions[image] = image_captions[np.argmax(similarities)]

# Collect garbage once after the loop; doing it inside the loop forced a full
# collection pass for every single image.
gc.collect()

In [None]:
# Turn the {image: caption} mapping into a two-column table
best_captions_df = pd.DataFrame(
    data=list(best_captions.items()),
    columns=['image', 'best_caption'],
)

# Persist the table as CSV, leaving out the positional index
best_captions_df.to_csv(path_or_buf='flickr8k_best_captions.csv', index=False)

# Show the resulting table as the cell output
best_captions_df

Unnamed: 0,image,best_caption
0,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
1,1001773457_577c3a7d70.jpg,A black dog and a tri-colored dog playing with...
2,1002674143_1b742ab4b8.jpg,A little girl covered in paint sits in front o...
3,1003163366_44323f5815.jpg,A man lays on a bench while his dog sits by him .
4,1007129816_e794419615.jpg,A man wears an orange hat and glasses .
...,...,...
8086,990890291_afc72be141.jpg,A man is doing a wheelie on a mountain bike .
8087,99171998_7cc800ceef.jpg,A group of people sit atop a snowy mountain .
8088,99679241_adc853a5c0.jpg,A tall bird is standing on the sand beside the...
8089,997338199_7343367d7f.jpg,A woman standing near a decorated wall writes .
