In [None]:
%pip install torch torchvision pandas scikit-learn open_clip_torch[training]

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import subprocess
import torch
INDEX_CSV = 'index.csv'     # existing manual captions
TRAIN_ORIGINAL_CSV = 'train_original.csv'
AUGMENTED_TRAIN = "train.csv"
VAL_CSV = 'val.csv'


In [2]:
df = pd.read_csv(INDEX_CSV)
print(f"Total samples: {len(df)}")
df.head()
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
print(f"Train samples: {len(train_df)} | Validation samples: {len(val_df)}")


Total samples: 16
Train samples: 12 | Validation samples: 4


In [3]:
train_df.to_csv(TRAIN_ORIGINAL_CSV, index=False)
val_df.to_csv(VAL_CSV, index=False)
print(f"Saved {TRAIN_ORIGINAL_CSV} and {VAL_CSV}")
df_train = pd.read_csv(TRAIN_ORIGINAL_CSV)
df_val = pd.read_csv(VAL_CSV)

print("Columns in  index.csv: ",df.columns.tolist())
print("Columns in train_original.csv:", df_train.columns.tolist())
print("Columns in val.csv:", df_val.columns.tolist())

Saved train_original.csv and val.csv
Columns in  index.csv:  ['filepath', 'caption']
Columns in train_original.csv: ['filepath', 'caption']
Columns in val.csv: ['filepath', 'caption']


In [4]:
print("installing local llm (llama-cpp-python) dependencies..")
%pip install llama-cpp-python huggingface_hub --quiet
print("Dependencies done")

installing local llm (llama-cpp-python) dependencies..
Note: you may need to restart the kernel to use updated packages.
Dependencies done




In [5]:
device = "cuda" if torch.cuda.is_available else "cpu"
print(device)
CUDA_PATH = os.environ.get("CUDA_PATH")
print(CUDA_PATH)

cuda
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6


In [6]:
#manual augment (type synonyms for captions)
#print("\n--- Running Manual Synonym Augmentation ---")
# !python text_augment.py --augmentation_method manual --num_aug_per_original 5

#llm augmentation
print("\n--- Running LLM-based Augmentation ---")
!python text_augment.py --augmentation_method llm --num_aug_per_original 3

train_augmented_df = pd.read_csv(AUGMENTED_TRAIN)
print(f"Final train.csv has {len(train_augmented_df)} entries.")
print(train_augmented_df.head())


--- Running LLM-based Augmentation ---
LLM model 'Phi-3-mini-4k-instruct-q4.gguf' already exists at local_llm_models\Phi-3-mini-4k-instruct-q4.gguf.
Local LLM 'Phi-3-mini-4k-instruct-q4.gguf' loaded successfully.
Starting text augmentation using 'llm' method...
Generated 2 paraphrases for: 'church x3'
Generated 2 paraphrases for: 'street at night'
Generated 2 paraphrases for: 'standing under lamp'
Generated 2 paraphrases for: 'church blurry'
Generated 2 paraphrases for: 'me again'
Generated 2 paraphrases for: 'sky at university'
Generated 2 paraphrases for: 'joel sitting on a rock at night'
Generated 2 paraphrases for: 'boat during night'
Generated 2 paraphrases for: 'andother church'
Generated 2 paraphrases for: 'henry'
Generated 2 paraphrases for: 'me starting down'
Generated 2 paraphrases for: 'standing under lamp'
Text augmentation complete. Saved augmented train data to train.csv with 36 entries.
Final train.csv has 36 entries.
               filepath                             

llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


In [None]:
aug_cfg_args = [
    "use_timm=True",
    "scale=(0.4,1.0)",
    "ratio=(0.75,1.3333333333333333)",
    "color_jitter=(0.4,0.4,0.4,0.1)",
    "color_jitter_prob=0.8",
    "re_prob=0.25",
    "re_count=1",
    "gray_scale_prob=0.2"
]



cmd = [
    "python", "-m", "open_clip_train.main",
    "--train-data", "train.csv",
    "--val-data", "val.csv",
    "--csv-img-key", "filepath",
    "--csv-caption-key", "caption",
    "--csv-separator", ",",
    "--model", "ViT-B-32",
    "--pretrained", "openai",
    "--report-to", "tensorboard",
    "--log-every-n-steps", "50",
    "--batch-size", "32",
    "--lr", "1e-4",
    "--epochs", "10",
    "--warmup", "10000",
    "--workers", "4",
    "--device", device,
    "--aug-cfg"
] + aug_cfg_args

# Print the command for verification
print("COMMAND:")
print(" ".join(cmd))

# Run it
result = subprocess.run(cmd, capture_output=True, text=True)

# Output logs
print("STDOUT:\n", result.stdout)
print("STDERR:\n", result.stderr)
print("Exit code:", result.returncode)

COMMAND:
python -m open_clip_train.main --train-data train.csv --val-data val.csv --csv-img-key filepath --csv-caption-key caption --csv-separator , --model ViT-B-32 --pretrained openai --report-to tensorboard --log-every-n-steps 50 --batch-size 32 --lr 1e-4 --epochs 10 --warmup 10000 --workers 4 --device cuda --use_timm true --aug-cfg scale=(0.4,1.0) ratio=(0.75,1.3333333333333333) color_jitter=(0.4,0.4,0.4,0.1) color_jitter_prob=0.8 re_prob=0.25 re_count=1 gray_scale_prob=0.2
STDOUT:
 
STDERR:
 usage: main.py [-h] [--train-data TRAIN_DATA]
               [--train-data-upsampling-factors TRAIN_DATA_UPSAMPLING_FACTORS]
               [--val-data VAL_DATA] [--train-num-samples TRAIN_NUM_SAMPLES]
               [--val-num-samples VAL_NUM_SAMPLES]
               [--dataset-type {webdataset,csv,synthetic,auto}]
               [--dataset-resampled] [--csv-separator CSV_SEPARATOR]
               [--csv-img-key CSV_IMG_KEY] [--csv-caption-key CSV_CAPTION_KEY]
               [--imagenet-val IM