In [None]:
import json
import os
import random
random.seed(42)

In [None]:
def sample_and_split_annotations(annotation_path, test_size, test_output_path, train_output_path):
    """Split a JSON annotation list into a random test set and a training set.

    ``test_size`` annotations are drawn without replacement using the global
    ``random`` generator, so seed it beforehand for a reproducible split. The
    training set keeps every annotation whose ``image_id`` does not belong to
    a sampled annotation; annotations that share an ``image_id`` with a test
    sample are therefore excluded from training as well.

    Args:
        annotation_path: JSON file containing a list of dicts, each with an
            ``image_id`` key.
        test_size: Number of annotations to draw for the test set.
        test_output_path: Destination JSON file for the test set.
        train_output_path: Destination JSON file for the training set.

    Raises:
        ValueError: If the file holds fewer than ``test_size`` annotations.
    """
    # JSON text is UTF-8; do not rely on the platform's locale encoding.
    with open(annotation_path, 'r', encoding='utf-8') as f:
        annotations = json.load(f)

    # Ensure the dataset is large enough
    if len(annotations) < test_size:
        raise ValueError("The dataset is smaller than the requested test size.")

    # Sample `test_size` annotations for the test set
    test_set = random.sample(annotations, test_size)

    # Build the training set from everything whose image_id was not sampled
    test_set_ids = {ann['image_id'] for ann in test_set}
    train_set = [ann for ann in annotations if ann['image_id'] not in test_set_ids]

    # Save the test set
    with open(test_output_path, 'w', encoding='utf-8') as f:
        json.dump(test_set, f, indent=4)

    # Save the training set
    with open(train_output_path, 'w', encoding='utf-8') as f:
        json.dump(train_set, f, indent=4)

In [None]:
# Source annotation file and the two output files for the RedCaps train/test split.
path_2 = "/project/Deep-Clustering/data/redcaps_plus/redcaps.json"
test_output_path = '/project/Deep-Clustering/data/redcaps_plus/redcaps_test.json'
train_output_path = '/project/Deep-Clustering/data/redcaps_plus/redcaps_train.json'
# Number of annotations to hold out for the test set.
test_size = 5000

In [None]:
# sample_and_split_annotations(path_2, test_size, test_output_path, train_output_path)

In [None]:
# # Create small, medium and full training set
# redcaps_annotation = json.load(open(train_output_path))

# redcaps_small = random.sample(redcaps_annotation, int(len(redcaps_annotation) * 0.1))
# redcaps_medium = random.sample(redcaps_annotation, int(len(redcaps_annotation) * 0.5))

# # Save the small training set
# with open('/project/Deep-Clustering/data/redcaps_plus/redcaps_train_s.json', 'w') as f:
#     json.dump(redcaps_small, f, indent=4)

# # Save the medium training set
# with open('/project/Deep-Clustering/data/redcaps_plus/redcaps_train_m.json', 'w') as f:
#     json.dump(redcaps_medium, f, indent=4)

In [None]:
def transform_first_type_testset(first_test_path, image_prefix="coco/images", max_captions=5):
    """Flatten a multi-caption annotation file into one record per caption.

    Each input entry is expected to carry an ``image`` path and a ``caption``
    field. Every kept caption yields its own ``{'image', 'caption'}`` record,
    with ``image_prefix`` joined in front of the image path.

    Args:
        first_test_path: JSON file containing a list of entries.
        image_prefix: Directory prefix joined onto each ``image`` value.
            Defaults to ``"coco/images"``.
        max_captions: Maximum number of captions kept per entry (``None``
            keeps all). Defaults to 5.

    Returns:
        A list of dicts with ``image`` and ``caption`` keys, in input order.
    """
    # JSON text is UTF-8; do not rely on the platform's locale encoding.
    with open(first_test_path, 'r', encoding='utf-8') as f:
        first_test_set = json.load(f)

    transformed_first_test_set = []
    for entry in first_test_set:
        image_path = entry['image']

        captions = entry['caption']
        # A plain string would otherwise be sliced to its first characters and
        # emitted one character per record; treat it as a single caption.
        if isinstance(captions, str):
            captions = [captions]

        for caption in captions[:max_captions]:
            transformed_first_test_set.append({
                'image': os.path.join(image_prefix, image_path),
                'caption': caption
            })

    return transformed_first_test_set

In [None]:
def combine_testsets(first_transformed_testset, second_testset, output_path):
    """Concatenate two test sets and write the result to ``output_path`` as JSON.

    The records of ``first_transformed_testset`` come first, followed by those
    of ``second_testset``. The output is indented with four spaces. Nothing is
    returned.
    """
    merged_records = first_transformed_testset + second_testset
    with open(output_path, 'w') as out_file:
        out_file.write(json.dumps(merged_records, indent=4))

## MSCOCO


In [None]:
# COCO Karpathy-split test annotations, and the output file for the combined test set.
path_1 = "/data/SSD/coco/annotations/coco_karpathy_test.json"
combined_test_output_path = '/project/Deep-Clustering/data/redcaps_plus/redcoco_test.json'

In [None]:
# Transform the COCO test annotations with the function defined above.
transformed_first_test_set = transform_first_type_testset(path_1)

# Load the second test set from test_output_path. The file must already exist:
# the call that would generate it is commented out earlier in the notebook.
with open(test_output_path, 'r') as f:
    second_test_set = json.load(f)

In [None]:
# Combine the test sets (transformed first set, then the second set) and
# write them as JSON to combined_test_output_path.
combine_testsets(transformed_first_test_set, second_test_set, combined_test_output_path)

In [None]:
# Combine mscoco and redcaps_train
# Load the mscoco training set

def load_first_type(annotation_path, prefix):
    """Load a JSON annotation list and prepend ``prefix`` to every image path.

    Args:
        annotation_path: JSON file containing a list of dicts with an
            ``image`` key.
        prefix: Directory prefix joined onto each ``image`` value.

    Returns:
        The loaded list, with each record's ``image`` rewritten in place.
    """
    with open(annotation_path) as handle:
        records = json.load(handle)

    for record in records:
        record['image'] = os.path.join(prefix, record['image'])
    return records

# Load the COCO Karpathy training annotations with 'coco/images' prefixed to each image path.
mscoco_train = load_first_type('/data/SSD/coco/annotations/coco_karpathy_train.json', 'coco/images')
    

# Load the redcaps training set
with open('/project/Deep-Clustering/data/redcaps_plus/redcaps_train.json', 'r') as f:
    redcaps_train = json.load(f)
    
# Combine the two datasets (COCO records first, then RedCaps)
combined_train = mscoco_train + redcaps_train

# Save the combined dataset (written without indentation)
with open('/project/Deep-Clustering/data/redcaps_plus/redcoco_train.json', 'w') as f:
    json.dump(combined_train, f)

In [None]:
# Display the first combined record as a quick check of its structure.
combined_train[0]

In [None]:
# Re-assigns the same values as the earlier MSCOCO path cell.
path_1 = "/data/SSD/coco/annotations/coco_karpathy_test.json"
combined_test_output_path = '/project/Deep-Clustering/data/redcaps_plus/redcoco_test.json'

In [None]:
# Transform the COCO test annotations again (same call as the earlier MSCOCO cell).
transformed_first_test_set = transform_first_type_testset(path_1)

# Load the second test set from test_output_path; the file must already exist on disk.
with open(test_output_path, 'r') as f:
    second_test_set = json.load(f)

In [None]:
# Combine the test sets and write them to combined_test_output_path,
# overwriting any file already at that path.
combine_testsets(transformed_first_test_set, second_test_set, combined_test_output_path)

In [None]:
# Combine mscoco and redcaps_train
# Load the mscoco training set
with open('/data/SSD/coco/annotations/coco_karpathy_train.json', 'r') as f:
    mscoco_train = json.load(f)

# Prefix the COCO image paths with 'coco/images', matching the prefix the COCO
# test records receive. Without this, the saved training file would hold bare
# COCO paths that do not line up with the test set's image paths.
for annotation in mscoco_train:
    annotation['image'] = os.path.join('coco/images', annotation['image'])

# Load the redcaps training set
with open('/project/Deep-Clustering/data/redcaps_plus/redcaps_train.json', 'r') as f:
    redcaps_train = json.load(f)

# Combine the two datasets (COCO records first, then RedCaps)
combined_train = mscoco_train + redcaps_train

# Save the combined dataset
with open('/project/Deep-Clustering/data/redcaps_plus/redcoco_train.json', 'w') as f:
    json.dump(combined_train, f)

## Flickr30k


In [30]:
def transform_first_type_testset_1(first_test_path):
    """Flatten a multi-caption annotation file into one record per caption.

    For every entry, at most the first five items of ``caption`` are kept, and
    each becomes its own ``{'image', 'caption'}`` record whose image path is
    prefixed with ``flickr30k/images``.

    Args:
        first_test_path: JSON file containing a list of entries with
            ``image`` and ``caption`` keys.

    Returns:
        A flat list of ``{'image', 'caption'}`` dicts, in input order.
    """
    with open(first_test_path, 'r') as handle:
        entries = json.load(handle)

    flattened = []
    for entry in entries:
        image_rel = entry['image']
        flattened.extend(
            {'image': os.path.join("flickr30k/images", image_rel), 'caption': text}
            for text in entry['caption'][:5]
        )
    return flattened

def transform_first_type_testset(first_test_path):
    """Build one record per image, keeping its captions grouped (the 1v5 case).

    Each output record holds the image path prefixed with ``flickr30k/images``
    and a ``caption`` value made of at most the first five items of the
    entry's ``caption`` field.

    Note: this reuses the name of a function defined earlier in the notebook
    and replaces it once this cell has run.

    Args:
        first_test_path: JSON file containing a list of entries with
            ``image`` and ``caption`` keys.

    Returns:
        A list of ``{'image', 'caption'}`` dicts, one per input entry.
    """
    with open(first_test_path, 'r') as handle:
        entries = json.load(handle)

    grouped = []
    for entry in entries:
        image_rel, texts = entry['image'], entry['caption'][:5]
        grouped.append(dict(image=os.path.join("flickr30k/images", image_rel), caption=texts))
    return grouped

In [31]:
# Flickr30k test annotations (rebinds path_1 from the MSCOCO section).
path_1 = "/data/SSD/flickr30k/annotations/test.json"

# Output files for the three test variants written below:
# Flickr30k only, RedCaps only, and both combined.
combined_test_output_path = '/project/Deep-Clustering/data/redcaps_plus/redflickr_test_flickr.json'
combined_test_output_path_2 = '/project/Deep-Clustering/data/redcaps_plus/redflickr_test_redcaps.json'
combined_test_output_path_3 = '/project/Deep-Clustering/data/redcaps_plus/redflickr_test_hybrid.json'

In [32]:
# Transform the Flickr30k test annotations.
transformed_first_test_set = transform_first_type_testset(path_1)
# Dump the transformed test set; the context manager guarantees the file is
# flushed and closed instead of leaving the handle open.
with open(combined_test_output_path, 'w') as f:
    json.dump(transformed_first_test_set, f, indent=4)

# Load the second test set, keeping at most its first 5000 records
with open(test_output_path, 'r') as f:
    second_test_set = json.load(f)[:5000]

# Dump the second test set
with open(combined_test_output_path_2, 'w') as f:
    json.dump(second_test_set, f, indent=4)

In [33]:
# Combine the test sets (Flickr30k records first, then the second set) and
# write the hybrid test set to combined_test_output_path_3.
combine_testsets(transformed_first_test_set, second_test_set, combined_test_output_path_3)

In [None]:
# Combine Flickr30k and redcaps_train into a small ("mini") training set
# Load the Flickr30k training set (stored in the variable `mscoco_train` below)

def load_first_type(annotation_path, prefix):
    """Read a JSON annotation list and join ``prefix`` onto each image path.

    This repeats an identical definition that appears earlier in the notebook.

    Args:
        annotation_path: JSON file containing a list of dicts with an
            ``image`` key.
        prefix: Directory prefix joined onto each ``image`` value.

    Returns:
        The loaded list; every record's ``image`` value is updated in place.
    """
    with open(annotation_path) as src:
        entries = json.loads(src.read())
    for entry in entries:
        entry['image'] = os.path.join(prefix, entry['image'])
    return entries

# NOTE: despite the variable name, this holds the Flickr30k training
# annotations, with 'flickr30k/images' prefixed to each image path.
mscoco_train = load_first_type('/data/SSD/flickr30k/annotations/train.json', 'flickr30k/images')

# Keep only the first 5000 records (a prefix slice, not a random sample).
mscoco_train = mscoco_train[:5000]

# Load the redcaps training set, likewise truncated to its first 5000 records
with open('/project/Deep-Clustering/data/redcaps_plus/redcaps_train.json', 'r') as f:
    redcaps_train = json.load(f)[:5000]
    
# Combine the two datasets (Flickr30k records first, then RedCaps)
combined_train = mscoco_train + redcaps_train

# Save the combined dataset
with open('/project/Deep-Clustering/data/redcaps_plus/redflickr_train_mini.json', 'w') as f:
    json.dump(combined_train, f, indent=4)

In [None]:
# Display the first combined record as a quick check of its structure.
combined_train[0]