In [25]:
#!pip install nlpaug
#!pip install nltk

Collecting nltk


[notice] A new release of pip is available: 23.2.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip



  Obtaining dependency information for nltk from https://files.pythonhosted.org/packages/60/90/81ac364ef94209c100e12579629dc92bf7a709a84af32f8c551b02c07e94/nltk-3.9.2-py3-none-any.whl.metadata
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   -------- ------------------------------- 0.3/1.5 MB 6.3 MB/s eta 0:00:01
   ------------------------------ --------- 1.1/1.5 MB 10.4 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 10.7 MB/s eta 0:00:00
Installing collected packages: nltk
Successfully installed nltk-3.9.2


In [4]:
# Importing useful dependencies
import boto3
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
import re
from nlpaug.util import Action
import nltk
from nltk.corpus import wordnet
import torch
import open_clip
import numpy as np

In [2]:
# Setup S3 client for MinIO (MinIO implements Amazon S3 API)
s3 = boto3.client(
    "s3",
    endpoint_url="http://127.0.0.1:9000", # MinIO API endpoint
    aws_access_key_id="minioadmin", # User name
    aws_secret_access_key="minioadmin", # Password
)

In [3]:
# We create a new Bucket in Min-IO to store our augmented data

# List existing buckets
buckets = [b["Name"] for b in s3.list_buckets()["Buckets"]]

# Function that given a name, creates a bucket
def createBucket(name, list_buckets):
    if name in list_buckets:
        print(f"Bucket '{name}' already exists!")
    else:
        s3.create_bucket(Bucket=name)
        print(f"Created bucket: {name}")

# Create a bucket named landing_zone
createBucket("augmentation-zone", buckets)
# Sub-bucket: Baseline Training Data
s3.put_object(Bucket="augmentation-zone", Key="texts/")

Bucket 'augmentation-zone' already exists!


{'ResponseMetadata': {'RequestId': '187915C0DA2AB6D2',
  'HostId': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'accept-ranges': 'bytes',
   'content-length': '0',
   'etag': '"d41d8cd98f00b204e9800998ecf8427e"',
   'server': 'MinIO',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'vary': 'Origin, Accept-Encoding',
   'x-amz-checksum-crc32': 'AAAAAA==',
   'x-amz-checksum-type': 'FULL_OBJECT',
   'x-amz-id-2': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
   'x-amz-request-id': '187915C0DA2AB6D2',
   'x-content-type-options': 'nosniff',
   'x-ratelimit-limit': '2110',
   'x-ratelimit-remaining': '2110',
   'x-xss-protection': '1; mode=block',
   'date': 'Tue, 18 Nov 2025 11:17:27 GMT'},
  'RetryAttempts': 0},
 'ETag': '"d41d8cd98f00b204e9800998ecf8427e"',
 'ChecksumCRC32': 'AAAAAA==',
 'ChecksumType': 'FULL_OBJECT'}

In [5]:
def get_text(bucket, key):
    resp = s3.get_object(Bucket=bucket, Key=key)
    body = resp["Body"].read()
    text = body.decode("utf-8")
    return text

In [6]:
# Just in case our device has gpu
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model
model, _, _ = open_clip.create_model_and_transforms("hf-hub:laion/CLIP-ViT-L-14-laion2B-s32B-b82K")
tokenizer = open_clip.get_tokenizer("hf-hub:laion/CLIP-ViT-L-14-laion2B-s32B-b82K") # Tokenizer for texts
model.to(device)

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-23): 24 x ResidualAttentionBlock(
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((1024,), eps=1e-05, elementwi

In [7]:
@torch.no_grad()
# The next function returns the embedding of the given text
def embed_text(model, tokenizer, texts: str):
    tokens = tokenizer([texts]).to(device) # tokenized batch
    feats = model.encode_text(tokens)
    feats = feats / feats.norm(dim=-1, keepdim=True) # normalize
    return feats.cpu().numpy()[0]

In [48]:
#Generate an augmented version of original text and save the reuslt to S3
def generate_and_save_augmented(
    aug,
    suffix: str,
    body: str,
    embedded_orig: np.ndarray,
    model_text,
    tokenizer,
    dest_bucket: str,
    dest_prefix: str,
    new_key_infix: str,
    max_try: int = 10,
    min_sim_accept: float = 0.9,
    max_sim_accept: float = 0.99,
):
    best_text = None
    best_sim = -1.0

    for attempt in range(max_try):
        # 1. Generate augmented text
        #    For a single string input, aug.augment(...) returns a string.
        aug_text = aug.augment(body)

        # 2. Compute cosine similarity with the original text embedding
        emb_aug = embed_text(model_text, tokenizer, aug_text)
        sim = float(np.dot(embedded_orig, emb_aug))
        print(f"[{suffix}] attempt {attempt + 1}, sim={sim:.4f}")

        # Track the best candidate regardless of whether it is in the range
        if sim > best_sim:
            best_sim = sim
            best_text = aug_text

        # If similarity is within the acceptable range, save and return early
        if min_sim_accept <= sim <= max_sim_accept:
            augment_key = f"{dest_prefix}{new_key_infix}_{suffix}.txt"
            s3.put_object(
                Bucket=dest_bucket,
                Key=augment_key,
                Body=aug_text.encode("utf-8"),
                ContentType="text/plain",
            )
            return best_sim

    # If no candidate falls into [min_sim_accept, max_sim_accept],
    # use the best candidate found as a fallback.
    if best_text is not None:
        augment_key = f"{dest_prefix}{new_key_infix}_{suffix}.txt"
        print(f"[{suffix}] use best_sim={best_sim:.4f} as fallback")
        s3.put_object(
            Bucket=dest_bucket,
            Key=augment_key,
            Body=best_text.encode("utf-8"),
            ContentType="text/plain",
        )

    return best_sim

In [53]:
def text_augmentation(src_bucket, dest_bucket, model_text, tokenizer, dest_prefix="texts/"):
    # Incremental id assigned to each image-text pair
    id_counter = 0
    stopwords = [ 'Epics', 'Steam'] # don't change name of plataform
    stopwords_regex = r'\d+(\.\d+)?'  # don't change numbers
    augsim = naw.SynonymAug(
        aug_src='wordnet',
        lang='eng',
        aug_p=0.3,
        aug_min=5,
        aug_max=20, 
        stopwords=stopwords,
        stopwords_regex=stopwords_regex
    )
    
    spelling_aug = naw.SpellingAug()
    augswap = naw.RandomWordAug(action="swap")
    swap_sym_word_aug = nafc.Sequential([augswap,augsim])
    delete_aug = naw.RandomWordAug()
    paginator = s3.get_paginator("list_objects_v2") # It returns objects in pages and not all at once.
    for page in paginator.paginate(Bucket=src_bucket, Prefix="baseline-training-data/"):

        for obj in page.get("Contents", []):
            key = obj["Key"]

            if "text" in key:
                body = get_text(src_bucket, key)
                embedded = embed_text(model_text,tokenizer,body)
                new_key_infix = key.split("/")[1].split(".")[0]
                new_key = dest_prefix + new_key_infix + ".txt"
                copy_source_text = {"Bucket": src_bucket, "Key": key}
                s3.copy_object(Bucket=dest_bucket, Key=new_key, CopySource=copy_source_text)
                # 1) word-level insert spelling error
                generate_and_save_augmented(
                    aug=spelling_aug,
                    suffix="spelling_aug",
                    body=body,
                    embedded_orig=embedded,
                    model_text=model_text,
                    tokenizer=tokenizer,
                    dest_bucket=dest_bucket,
                    dest_prefix=dest_prefix,
                    new_key_infix=new_key_infix,
                    max_try=10,
                    min_sim_accept=0.9,
                    max_sim_accept=0.99,
                )
    
                # 2) Word-level delete
                generate_and_save_augmented(
                    aug=delete_aug,
                    suffix="delete_aug",
                    body=body,
                    embedded_orig=embedded,
                    model_text=model_text,
                    tokenizer=tokenizer,
                    dest_bucket=dest_bucket,
                    dest_prefix=dest_prefix,
                    new_key_infix=new_key_infix,
                    max_try=10,
                    min_sim_accept=0.9,
                    max_sim_accept=0.99,
                )
    
                # 3) Word-level swap + synonym
                generate_and_save_augmented(
                    aug=swap_sym_word_aug,
                    suffix="swap_sym_word_aug",
                    body=body,
                    embedded_orig=embedded,
                    model_text=model_text,
                    tokenizer=tokenizer,
                    dest_bucket=dest_bucket,
                    dest_prefix=dest_prefix,
                    new_key_infix=new_key_infix,
                    max_try=10,
                    min_sim_accept=0.9,
                    max_sim_accept=0.99,
                )

                

In [54]:
text_augmentation(src_bucket = "training-data-construction-zone", dest_bucket = "augmentation-zone", model_text = model, tokenizer=tokenizer)

AttributeError: 'list' object has no attribute 'find'