In [2]:
# Importing useful dependencies
import os

import boto3


In [4]:
# Setup S3 client for MinIO (MinIO implements Amazon S3 API).
# Connection settings are read from the environment when present, so real
# credentials do not have to be written into the notebook. The fallbacks are
# the literal values this cell used before, so behaviour is unchanged when
# none of the variables are set.
s3 = boto3.client(
    "s3",
    endpoint_url=os.environ.get("MINIO_ENDPOINT", "http://127.0.0.1:9000"), # MinIO API endpoint
    aws_access_key_id=os.environ.get("MINIO_ACCESS_KEY", "minioadmin"), # User name
    aws_secret_access_key=os.environ.get("MINIO_SECRET_KEY", "minioadmin"), # Password
)

In [5]:
# Create a new bucket in MinIO to store our augmented data

# Collect the names of the buckets the server currently reports
buckets = [
    bucket_info["Name"]
    for bucket_info in s3.list_buckets()["Buckets"]
]

# Function that given a name, creates a bucket
def createBucket(name, list_buckets):
    """Create bucket `name` with the module-level `s3` client unless it is already listed.

    Args:
        name: Name of the bucket to create.
        list_buckets: Collection of bucket names considered to exist already;
            membership is checked with `in`, no request is made to verify it.

    Returns:
        None. A message describing the outcome is printed.
    """
    already_listed = name in list_buckets

    if not already_listed:
        s3.create_bucket(Bucket=name)
        print(f"Created bucket: {name}")
        return

    print(f"Bucket '{name}' already exists!")

# Create the bucket named training-data-construction-zone (skipped if it is already in `buckets`)
createBucket("training-data-construction-zone", buckets)
# Folder placeholder: an object with no body whose key ends in "/" ("paired_multimodal/").
# The response dict returned by put_object is the last expression, so it is shown as the cell output.
s3.put_object(Bucket="training-data-construction-zone", Key="paired_multimodal/")

Bucket 'training-data-construction-zone' already exists!


{'ResponseMetadata': {'RequestId': '187A2773AFD1537A',
  'HostId': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'accept-ranges': 'bytes',
   'content-length': '0',
   'etag': '"d41d8cd98f00b204e9800998ecf8427e"',
   'server': 'MinIO',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'vary': 'Origin, Accept-Encoding',
   'x-amz-checksum-crc32': 'AAAAAA==',
   'x-amz-checksum-type': 'FULL_OBJECT',
   'x-amz-id-2': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
   'x-amz-request-id': '187A2773AFD1537A',
   'x-content-type-options': 'nosniff',
   'x-ratelimit-limit': '2109',
   'x-ratelimit-remaining': '2109',
   'x-xss-protection': '1; mode=block',
   'date': 'Fri, 21 Nov 2025 22:53:02 GMT'},
  'RetryAttempts': 0},
 'ETag': '"d41d8cd98f00b204e9800998ecf8427e"',
 'ChecksumCRC32': 'AAAAAA==',
 'ChecksumType': 'FULL_OBJECT'}

In [62]:
# Text-augmentation suffix (key) -> image-augmentation suffix (value).
# Used to find the image file that belongs to an augmented text file.
mapping = dict(
    delete_aug="crop",
    swap_spelling_aug="y_rotate",
    swap_sym_word_aug="x_rotate",
)
def _image_name_for_text_key(key):
    """Return the image file name (without prefix) paired with the text object at `key`."""
    # Parse only the file name: the directory part of the key may itself
    # contain "_" (e.g. "raw_texts/"), which would shift the split fields.
    filename = key.rsplit("/", 1)[-1]

    # Strip the extension only when it is a trailing ".txt".
    if filename.endswith(".txt"):
        filename = filename[: -len(".txt")]

    # "<name>_<id>" or "<name>_<id>_<augmentation>"; the augmentation may contain "_".
    part = filename.split("_", 2)
    image_name = "image_" + part[1]

    if len(part) > 2:
        augmentation = part[2]
        if augmentation not in mapping:
            raise KeyError(
                f"No image augmentation mapped to text augmentation '{augmentation}' (object '{key}')"
            )
        image_name += "_" + mapping[augmentation]

    return image_name + ".png"


def mapping_augmented_multimodal_data(bucket, src_prefix1="texts/", src_prefix2="images/",dest_prefix="paired_multimodal/"):
    """Copy every text object and its matching image into `dest_prefix` under a shared id.

    Each object listed under `src_prefix1` is paired with the image under
    `src_prefix2` whose name is derived from the text file name: the second
    "_"-separated field is reused as the image id and, when an augmentation
    suffix follows, it is translated through the module-level `mapping`.
    Both objects are then written as `text_<id>.txt` and `image_<id>.png`
    below `dest_prefix`, where `<id>` is a 6-digit zero-padded counter that
    starts at 1 and follows the order in which the paginator returns objects.

    Args:
        bucket: Bucket that holds the sources and receives the pairs.
        src_prefix1: Key prefix of the text objects.
        src_prefix2: Key prefix of the image objects.
        dest_prefix: Key prefix under which the numbered pairs are written.

    Returns:
        None. One line is printed per created pair, plus a final summary line.

    Raises:
        KeyError: If a text file carries an augmentation suffix that is not a key of `mapping`.
        IndexError: If a text file name contains no "_" separator.
    """
    # Incremental id assigned to each image-text pair
    id_counter = 0

    paginator = s3.get_paginator("list_objects_v2") # It returns objects in pages and not all at once.
    for page in paginator.paginate(Bucket=bucket, Prefix=src_prefix1):

        for obj in page.get("Contents", []):

            key = obj["Key"]

            if obj["Size"] == 0 and key.endswith("/"): # skip the folder itself
                continue

            # Resolve the partner image before consuming an id.
            image_key = src_prefix2 + _image_name_for_text_key(key)

            id_counter += 1
            pair_id = str(id_counter).zfill(6)
            new_key_text = dest_prefix + "text_" + pair_id + ".txt"
            new_key_image = dest_prefix + "image_" + pair_id + ".png"

            # Fetch both sources before writing anything, so a missing image
            # does not leave a text object without its partner.
            body = s3.get_object(Bucket=bucket, Key=key)["Body"].read()
            bodyimg = s3.get_object(Bucket=bucket, Key=image_key)["Body"]

            s3.put_object(Bucket=bucket, Key=new_key_text, Body=body, ContentType="text/plain")
            # The image body is handed over as a file-like object rather than read into memory here.
            s3.upload_fileobj(bodyimg, Bucket=bucket, Key=new_key_image, ExtraArgs={"ContentType": "image/png"})

            print(f"✅ Augmented pair #{id_counter} created successfully. Sources: {key} and {image_key} to Target: {new_key_text} and {new_key_image}")

    print("✅ All augmented pairs have been successfully created.")
        


In [None]:
# Build the pairs with the default prefixes: texts/ + images/ -> paired_multimodal/
mapping_augmented_multimodal_data("training-data-construction-zone")

✅ Augmented pair #1 created successfully. Sources: texts/text_000001.txt and images/image_000001.png to Target: paired_multimodal/text_000001.txt and paired_multimodal/image_000001.png
✅ Augmented pair #2 created successfully. Sources: texts/text_000001_delete_aug.txt and images/image_000001_crop.png to Target: paired_multimodal/text_000002.txt and paired_multimodal/image_000002.png
✅ Augmented pair #3 created successfully. Sources: texts/text_000001_swap_spelling_aug.txt and images/image_000001_y_rotate.png to Target: paired_multimodal/text_000003.txt and paired_multimodal/image_000003.png
✅ Augmented pair #4 created successfully. Sources: texts/text_000001_swap_sym_word_aug.txt and images/image_000001_x_rotate.png to Target: paired_multimodal/text_000004.txt and paired_multimodal/image_000004.png
✅ Augmented pair #5 created successfully. Sources: texts/text_000002.txt and images/image_000002.png to Target: paired_multimodal/text_000005.txt and paired_multimodal/image_000005.png
✅ Aug