In [None]:
# remove repeated words
# Step 2 After Caption Generation

import json
import re

# Read the raw key -> caption mapping written by the caption-generation step.
with open("unfilteredCaption.json", mode="r", encoding="utf-8") as f:
    raw_text = f.read()
json_data = json.loads(raw_text)

def has_repeated_words(caption):
    """Return True if any whitespace-separated word occurs more than once.

    The comparison is exact (case-sensitive, punctuation included), so
    "Dog dog" does not count as a repetition.

    Args:
        caption: Caption text; split on arbitrary whitespace.

    Returns:
        bool: True when at least one word is repeated, False otherwise
        (including for an empty caption).
    """
    words = caption.split()
    # A set drops duplicates, so a size mismatch means some word repeated.
    # This is a single pass instead of calling words.count() for every word.
    return len(set(words)) != len(words)

# Keep only the entries whose caption has no repeated word, then persist them.
updated_json_data = {}
for key, value in json_data.items():
    if has_repeated_words(value):
        continue
    updated_json_data[key] = value

with open("filtered_captions.json", mode="w", encoding="utf-8") as f:
    json.dump(updated_json_data, f, indent=4)


In [None]:
# Remove Image Not in filtered_captions.json
# Step 3 After Step 2

import os
import shutil
import json

# Load the captions that survived the previous filtering step; its keys are
# compared against the file names found in the dataset folder.
with open("filtered_captions.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)

dataset_folder = "originalDataset"
new_folder = "dataset_json"  # destination for files that have no caption entry

# exist_ok avoids the check-then-create race and is a no-op when the folder exists.
os.makedirs(new_folder, exist_ok=True)

image_names = set(json_data.keys())

# Move every regular file whose name is not a JSON key out of the dataset folder.
for image_file in os.listdir(dataset_folder):
    image_path = os.path.join(dataset_folder, image_file)
    if not os.path.isfile(image_path):
        # os.listdir also returns sub-directories; leave those untouched.
        continue
    if image_file not in image_names:
        new_image_path = os.path.join(new_folder, image_file)
        shutil.move(image_path, new_image_path)


In [None]:
# calculate average clip score
# Step 4 After Step 3

import os
import json
import numpy as np
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Processor and model are loaded from the same pretrained checkpoint.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch16"
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)

def get_features(image_path, caption):
    """Return the CLIP image and text embeddings for one image/caption pair.

    Uses the module-level ``processor`` and ``model``.

    Args:
        image_path: Path of the image file, opened with PIL.
        caption: Text handed to the processor as ``text``.

    Returns:
        tuple: ``(image_embeds, text_embeds)`` taken from the model output.
    """
    import torch  # local import: this cell does not import torch at the top

    # The context manager closes the file handle once the processor has read the pixels.
    with Image.open(image_path) as image:
        # truncation=True cuts over-long captions to the tokenizer's maximum
        # length instead of letting the model fail on them.
        inputs = processor(
            text=caption,
            images=image,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
    # Pure inference: no autograd graph is needed for scoring.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.image_embeds, outputs.text_embeds

def calculate_similarity(image_path, caption):
    """Return the cosine similarity between an image and its caption.

    Embeddings come from ``get_features``; each is scaled to unit L2 norm
    along the last dimension before ``cosine_similarity`` is applied. Only
    the ``[0][0]`` entry of the resulting matrix is returned.
    """
    def _unit(embedding):
        # Divide by the L2 norm of the last dimension.
        return embedding / embedding.norm(p=2, dim=-1, keepdim=True)

    image_embeds, text_embeds = get_features(image_path, caption)
    image_array = _unit(image_embeds).detach().numpy()
    text_array = _unit(text_embeds).detach().numpy()
    score_matrix = cosine_similarity(image_array, text_array)
    return score_matrix[0][0]

def calculate_average_clip_score(image_folder, json_file):
    """Print and return the mean image/caption similarity over a captioned folder.

    Every ``image name -> caption`` entry of ``json_file`` whose image exists
    in ``image_folder`` is scored with ``calculate_similarity``.

    Args:
        image_folder: Folder that holds the image files.
        json_file: JSON file mapping image file names to captions.

    Returns:
        float | None: The mean score, or None when no listed image was found.
    """
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    similarity_scores = []
    missing_count = 0

    for image_name, caption in tqdm(data.items(), desc="Processing Images", unit="image"):
        image_path = os.path.join(image_folder, image_name)

        if not os.path.exists(image_path):
            missing_count += 1
            continue
        similarity_scores.append(calculate_similarity(image_path, caption))

    # Report skipped entries: a wrong folder would otherwise yield no output at all.
    if missing_count:
        print(f"Skipped {missing_count} of {len(data)} entries: image not found in {image_folder!r}")

    if not similarity_scores:
        print("No images were scored; average CLIP score is undefined.")
        return None

    average_score = float(np.mean(similarity_scores))
    print(f"Average CLIP Score: {average_score:.4f}")
    return average_score

# Inputs for the average-score run.
image_folder, json_file = "filteredDataset", "filtered_captions.json"

calculate_average_clip_score(image_folder=image_folder, json_file=json_file)

In [None]:
# Filter caption with image
# Step 5 After Step 4

import os
import json
import shutil
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Processor and model are loaded from the same pretrained checkpoint.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch16"
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)

def get_features(image_path, caption):
    """Return the CLIP image and text embeddings for one image/caption pair.

    Uses the module-level ``processor`` and ``model``.

    Args:
        image_path: Path of the image file, opened with PIL.
        caption: Text handed to the processor as ``text``.

    Returns:
        tuple: ``(image_embeds, text_embeds)`` taken from the model output.
    """
    import torch  # local import: this cell does not import torch at the top

    # The context manager closes the file handle once the processor has read the pixels.
    with Image.open(image_path) as image:
        # truncation=True cuts over-long captions to the tokenizer's maximum
        # length instead of letting the model fail on them.
        inputs = processor(
            text=caption,
            images=image,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
    # Pure inference: no autograd graph is needed for scoring.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.image_embeds, outputs.text_embeds

def calculate_similarity(image_path, caption):
    """Return the cosine similarity between an image and its caption.

    Embeddings come from ``get_features``; each is scaled to unit L2 norm
    along the last dimension before ``cosine_similarity`` is applied. Only
    the ``[0][0]`` entry of the resulting matrix is returned.
    """
    def _unit(embedding):
        # Divide by the L2 norm of the last dimension.
        return embedding / embedding.norm(p=2, dim=-1, keepdim=True)

    image_embeds, text_embeds = get_features(image_path, caption)
    image_array = _unit(image_embeds).detach().numpy()
    text_array = _unit(text_embeds).detach().numpy()
    score_matrix = cosine_similarity(image_array, text_array)
    return score_matrix[0][0]

def filter_and_copy_images(image_folder, json_file, output_folder, output_json_file, threshold=0.2688):
    """Copy images whose caption similarity exceeds ``threshold`` and save their captions.

    Args:
        image_folder: Folder that holds the source images.
        json_file: JSON file mapping image file names to captions.
        output_folder: Folder that receives copies of the accepted images
            (created if missing).
        output_json_file: Path of the JSON file written with the accepted
            ``image name -> caption`` entries.
        threshold: Strict lower bound on the similarity score.
            NOTE(review): the default 0.2688 is presumably the average from
            the previous step; confirm.
    """
    # exist_ok avoids the check-then-create race and is a no-op when the folder exists.
    os.makedirs(output_folder, exist_ok=True)

    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    high_threshold_data = {}
    missing_count = 0

    for image_name, caption in tqdm(data.items(), desc="Processing Images", unit="image"):
        image_path = os.path.join(image_folder, image_name)

        if not os.path.exists(image_path):
            missing_count += 1
            continue

        similarity_score = calculate_similarity(image_path, caption)

        if similarity_score > threshold:
            new_image_path = os.path.join(output_folder, image_name)
            shutil.copy(image_path, new_image_path)
            high_threshold_data[image_name] = caption

    # Report skipped entries: a wrong folder would otherwise silently produce an empty result.
    if missing_count:
        print(f"Skipped {missing_count} of {len(data)} entries: image not found in {image_folder!r}")

    with open(output_json_file, 'w', encoding='utf-8') as f:
        json.dump(high_threshold_data, f, indent=4)

# Inputs and outputs for the similarity-threshold filter.
image_folder, json_file = "filteredDataset", "filtered_captions.json"
output_folder, output_json_file = "filteredImageV1", "filtered_captionsV1.json"

filter_and_copy_images(
    image_folder=image_folder,
    json_file=json_file,
    output_folder=output_folder,
    output_json_file=output_json_file,
)

In [1]:
# txt file formatter From filtered_captionsV1.json Json
# Step 6 After Step 5

import json

# NOTE(review): the cell header names filtered_captionsV1.json, but the file
# read here is processed/filtered_captions.json; confirm which one is intended.
with open("processed/filtered_captions.json", "r", encoding="utf-8") as infile:
    data = json.load(infile)

# Write one "<key>#<index>\t<caption>" line per caption. The encoding is set
# explicitly because the later cells read caption txt files as UTF-8; the
# platform default would corrupt non-ASCII captions on non-UTF-8 systems.
with open("training_data/pseudo_caption/pseudo_caption.txt", "w", encoding="utf-8") as outfile:
    for key, value in data.items():
        if isinstance(value, list):
            # Several captions for one key: number them in list order.
            for i, caption in enumerate(value):
                outfile.write(f"{key}#{i}\t{caption}\n")
        else:
            outfile.write(f"{key}#0\t{value}\n")

In [None]:
# Rename images and update the caption txt file to match
# Step 7 After Step 6

import os

image_folder = "training_data/dataset"
caption_file = "training_data/pseudo_caption.txt"

with open(caption_file, "r", encoding="utf-8") as f:
    lines = f.readlines()

start_index = 0
updated_lines = []
# Old image name -> new image name. Keyed per image (not per line) so that
# every caption line of the same image points at the same renamed file.
rename_map = {}

for line in lines:
    record = line.strip()
    if not record:
        # Tolerate blank lines instead of failing on the tuple unpack below.
        continue
    old_name, caption = record.split("\t", 1)

    # Split on the LAST '#' so file names that contain '#' are not truncated,
    # and keep the caption index instead of forcing every line to "#0".
    if "#" in old_name:
        old_image_name, caption_index = old_name.rsplit("#", 1)
    else:
        old_image_name, caption_index = old_name, "0"

    if old_image_name not in rename_map:
        # NOTE: the new name always ends in .jpg, whatever the original extension was.
        new_image_name = f"image{start_index + len(rename_map)}.jpg"
        rename_map[old_image_name] = new_image_name
        old_image_path = os.path.join(image_folder, old_image_name)
        new_image_path = os.path.join(image_folder, new_image_name)
        # NOTE: os.rename may replace an existing target file on POSIX systems.
        if os.path.exists(old_image_path):
            os.rename(old_image_path, new_image_path)

    new_caption = f"{rename_map[old_image_name]}#{caption_index}\t{caption}"
    updated_lines.append(new_caption)

# The caption file is rewritten in place, without a trailing newline.
with open(caption_file, "w", encoding="utf-8") as f:
    f.write("\n".join(updated_lines))

In [None]:
# Remove identical images from the filteredImageV1 folder
# If necessary

import os
import json
import hashlib

def get_image_hash(image_path):
    """Return the hexadecimal MD5 digest of a file's bytes.

    The digest is used only to detect byte-identical files, not for security.

    Args:
        image_path: Path of the file to hash.

    Returns:
        str: 32-character hexadecimal MD5 digest.
    """
    digest = hashlib.md5()
    with open(image_path, 'rb') as f:
        # Hash in 1 MiB chunks so large files are never loaded whole into memory.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()

def remove_identical_images(datasetV1_folder, datasetV2_folder, json_file):
    """Delete files in ``datasetV1_folder`` that are byte-identical to a file in ``datasetV2_folder``.

    Files are compared by the digest from ``get_image_hash``. Each deleted
    file name is also dropped from the mapping in ``json_file``, which is
    then rewritten in place.

    Args:
        datasetV1_folder: Folder whose duplicate files are deleted.
        datasetV2_folder: Reference folder; its files are only read.
        json_file: JSON file keyed by file name; updated in place.
    """
    with open(json_file, 'r') as handle:
        captions = json.load(handle)

    # Digests of every regular file in the reference folder.
    reference_digests = set()
    for entry in os.listdir(datasetV2_folder):
        entry_path = os.path.join(datasetV2_folder, entry)
        if os.path.isfile(entry_path):
            reference_digests.add(get_image_hash(entry_path))

    # Collect the duplicates first, delete afterwards.
    duplicates = []
    for entry in os.listdir(datasetV1_folder):
        entry_path = os.path.join(datasetV1_folder, entry)
        if os.path.isfile(entry_path) and get_image_hash(entry_path) in reference_digests:
            duplicates.append(entry)

    for entry in duplicates:
        os.remove(os.path.join(datasetV1_folder, entry))
        captions.pop(entry, None)

    with open(json_file, 'w') as handle:
        json.dump(captions, handle, indent=4)

# Folder to clean, reference folder, and the caption JSON kept in sync.
datasetV1_folder, datasetV2_folder = "filteredImage", "datasetV1"
json_file = "filtered_captions.json"

remove_identical_images(
    datasetV1_folder=datasetV1_folder,
    datasetV2_folder=datasetV2_folder,
    json_file=json_file,
)

In [None]:
# Remove similar pseudo_caption
# If necessary

from difflib import SequenceMatcher

def is_similar(a, b, threshold=0.8):
    """Return True when the SequenceMatcher ratio of ``a`` and ``b`` is strictly above ``threshold``."""
    matcher = SequenceMatcher(None, a, b)
    similarity_ratio = matcher.ratio()
    return similarity_ratio > threshold

def remove_duplicates(file_path, output_path):
    """Write a copy of a caption file without near-duplicate captions.

    Each input line is ``<image id>\\t<caption>``. A caption is kept only if
    ``is_similar`` reports it as dissimilar to every caption kept so far, so
    the first of a group of similar captions wins.

    Args:
        file_path: Tab-separated caption file to read (UTF-8).
        output_path: File to write the kept lines to (UTF-8).
    """
    captions = {}

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            record = line.strip()
            if not record:
                # Skip blank lines instead of failing on the tuple unpack below.
                continue
            image_id, caption = record.split('\t', 1)
            if not any(is_similar(caption, existing) for existing in captions.values()):
                captions[image_id] = caption

    with open(output_path, 'w', encoding='utf-8') as file:
        for image_id, caption in captions.items():
            file.write(f"{image_id}\t{caption}\n")

# Source caption file and the de-duplicated copy to write.
input_file, output_file = "caption/pseudo_caption.txt", "caption/pseudo_captionV1.txt"
remove_duplicates(file_path=input_file, output_path=output_file)


In [None]:
# Unique caption only filter in json file
# If necessary

import json

with open("filtered_captions.json", mode="r", encoding="utf-8") as f:
    data = json.load(f)

# Keep the first key seen for each distinct caption value; later keys whose
# value equals an already kept one are dropped.
unique_values = {}
kept_captions = []
for key, value in data.items():
    if value in kept_captions:
        continue
    kept_captions.append(value)
    unique_values[key] = value

with open("filtered_captionsV1.json", mode="w", encoding="utf-8") as f:
    json.dump(unique_values, f, indent=4)

In [None]:
# Keep only the txt lines whose caption does not appear in the json file
# If necessary

import json

with open("filtered_captions.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)

with open("pseudo_caption.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

# Materialise the JSON values once instead of building the view for every line.
known_captions = list(json_data.values())

# Keep a line only when its caption (the text after the first tab) is not
# one of the JSON values.
unique_lines = []
for line in lines:
    _, tab, caption = line.partition("\t")
    if not tab:
        # No tab means no caption field; indexing [1] on such a line would raise.
        # Blank lines are dropped, any other tab-less line is kept as is.
        if line.strip():
            unique_lines.append(line)
        continue
    if caption.strip() not in known_captions:
        unique_lines.append(line)

# NOTE: pseudo_caption.txt is overwritten in place.
with open("pseudo_caption.txt", "w", encoding="utf-8") as f:
    f.writelines(unique_lines)

In [None]:
# Remove json not in image
# If necessary

import os
import json

with open("filtered_captions.json", mode="r", encoding="utf-8") as f:
    json_data = json.load(f)

dataset_folder = "datasetV1"

# Names of everything currently present in the dataset folder.
image_names = set(os.listdir(dataset_folder))

# Keep only the entries whose key matches a name in the folder.
updated_json_data = {}
for key, value in json_data.items():
    if key in image_names:
        updated_json_data[key] = value

with open("filtered_captionsV1.json", mode="w", encoding="utf-8") as f:
    json.dump(updated_json_data, f, indent=4)
