#Installation

In [None]:
%%capture
# Install the packages this notebook imports; %%capture keeps pip's log out of the cell output.
!pip install torch unsloth huggingface_hub

#Data prep

####Setup

In [None]:
# Enter your dataset name: the Hugging Face Hub dataset repo id whose .jpg files
# are listed, sampled and resized by the cells below.
dataset_to_process = "AIMLOps-C4-G16/indian_classical_dances_resized_sampled"

In [None]:
from huggingface_hub import HfApi, hf_hub_download
from google.colab import userdata

# Read the Hugging Face access token stored under the 'HF_TOKEN' key of Colab's user data.
hf_token = userdata.get('HF_TOKEN')
# Hub API client created with that token; used below to query the dataset's metadata.
hf_api = HfApi(token=hf_token)

In [None]:
# Fetch the dataset repo's metadata, then keep the repo-relative paths that end in ".jpg".
info = hf_api.dataset_info(dataset_to_process)

repo_files = [sibling.rfilename for sibling in info.siblings]
images = [path for path in repo_files if path.endswith(".jpg")]

In [None]:
#images[:10]

['images/04HYj7dpoIh2xKrB.jpg',
 'images/0BO7aMqQRGsSpKRw.jpg',
 'images/0DTmIH5VtRUihdfo.jpg',
 'images/0HzsdqFeRpxlhaVj.jpg',
 'images/0KJmZppw7He8CLWB.jpg',
 'images/0SUySBnmSMVljXE6.jpg',
 'images/0hrxvDAMb8N9sdU4.jpg',
 'images/0htEHTM3gHktm4X8.jpg',
 'images/0iaMX4ynFxyyhqr2.jpg',
 'images/0qkiNfIh08a92iCl.jpg']

In [None]:
#len(images)

561

####Sample images from the main dataset

In [None]:
# Maximum number of images to keep; a larger file list is randomly sampled down to this count.
num_samples = 150

In [None]:
import math
import os
import PIL
import random
import tempfile

In [None]:
# Down-sample the file list to at most `num_samples` entries.
# A dedicated, seeded RNG makes the selected subset reproducible across
# "Restart & Run All" without touching the global `random` state.
sampling_seed = 42
if len(images) > num_samples:
    images = random.Random(sampling_seed).sample(images, num_samples)

In [None]:
#len(images)

150

####Resize images

In [None]:
!rm -r images_resized_sampled
!mkdir -p images_resized_sampled

processed_images_folder = 'images_resized_sampled/'

rm: cannot remove 'images_resized_sampled': No such file or directory


In [None]:
from PIL import Image  # `import PIL` alone does not guarantee the PIL.Image submodule is loaded

# Upper bound on the pixel area (width * height) of a processed image.
max_pixels = 150_000

# Download every sampled image, shrink it when it exceeds `max_pixels`, and save it under
# `processed_images_folder` using the same relative path it has in the dataset repo.
# Downloads go to a temporary cache directory that is removed when the block exits.
with tempfile.TemporaryDirectory() as tmpdir:
    for filename in images:
        img_file = hf_hub_download(
            repo_id=dataset_to_process,
            filename=filename,
            repo_type="dataset",
            cache_dir=tmpdir,
            token=hf_token
        )

        # convert() returns a fully loaded copy, so the source file can be closed right away.
        # Normalising to RGB gives every saved image the same mode.
        with Image.open(img_file) as src:
            img = src.convert('RGB')

        w, h = img.size
        if w * h > max_pixels:
            # Scale both sides by the same factor so the area drops to ~max_pixels
            # while the aspect ratio is preserved.
            reduction_factor = math.sqrt(w * h / max_pixels)

            # max(1, ...) guards against a zero-sized side on extreme aspect ratios.
            w_new = max(1, int(w / reduction_factor))
            h_new = max(1, int(h * w_new / w))

            # Image.resize returns a NEW image; the result must be kept.
            img = img.resize((w_new, h_new), Image.Resampling.LANCZOS)

        save_path = os.path.join(processed_images_folder, filename)
        os.makedirs(os.path.dirname(save_path), exist_ok=True)

        img.save(save_path)

01b61dd04dce43d3.jpg:   0%|          | 0.00/1.68M [00:00<?, ?B/s]

0acd4ec5e7744304.jpg:   0%|          | 0.00/119k [00:00<?, ?B/s]

0b498c9b230040a3.jpg:   0%|          | 0.00/609k [00:00<?, ?B/s]

0be32e8756a04250.jpg:   0%|          | 0.00/884k [00:00<?, ?B/s]

0ce3440aa98c4b52.jpg:   0%|          | 0.00/108k [00:00<?, ?B/s]

0d2d1870fe394998.jpg:   0%|          | 0.00/142k [00:00<?, ?B/s]

0db73e2acba4487a.jpg:   0%|          | 0.00/84.6k [00:00<?, ?B/s]

0dff5e6411e14832.jpg:   0%|          | 0.00/145k [00:00<?, ?B/s]

0e2c2ad04d434ba1.jpg:   0%|          | 0.00/35.0k [00:00<?, ?B/s]

0e658e2278444e09.jpg:   0%|          | 0.00/141k [00:00<?, ?B/s]

1c6f67199cf24bcf.jpg:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

1e6c19b184e64eee.jpg:   0%|          | 0.00/49.9k [00:00<?, ?B/s]

1e7ac604e93b4cc5.jpg:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

1e8f42747d6945ba.jpg:   0%|          | 0.00/341k [00:00<?, ?B/s]

1feac7f916e34b8c.jpg:   0%|          | 0.00/59.0k [00:00<?, ?B/s]

2a342daec2094206.jpg:   0%|          | 0.00/92.0k [00:00<?, ?B/s]

2b22a920b2124d12.jpg:   0%|          | 0.00/324k [00:00<?, ?B/s]

2ccca508f6704013.jpg:   0%|          | 0.00/247k [00:00<?, ?B/s]

3b2c237b12f24d79.jpg:   0%|          | 0.00/101k [00:00<?, ?B/s]

3b890c187e50475e.jpg:   0%|          | 0.00/222k [00:00<?, ?B/s]

####Upload processed images to new dataset

In [None]:
from huggingface_hub import create_repo, upload_folder

# Name of the Hub dataset repo that receives the processed images.
processed_dataset_name = dataset_to_process + '_resized_sampled'

# exist_ok=True lets this cell be re-run after the repo has already been created;
# without it a second run fails because the repo exists.
create_repo(
    repo_id=processed_dataset_name,
    repo_type="dataset",
    token=hf_token,
    exist_ok=True
)

# Upload the whole processed folder in a single commit.
upload_folder(
    repo_id=processed_dataset_name,
    folder_path = processed_images_folder,
    repo_type="dataset",
    commit_message=f"uploading resized versions of {num_samples} images sampled from {dataset_to_process}",
    token=hf_token
)

Uploading...:   0%|          | 0.00/3.48M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/AIMLOps-C4-G16/indian_traditional_food_small_resized_sampled/commit/404320bc78f4cc6daaa0204b0eed4437d9e44898', commit_message='uploading resized versions of 150 images sampled from AIMLOps-C4-G16/indian_traditional_food_small', commit_description='', oid='404320bc78f4cc6daaa0204b0eed4437d9e44898', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/AIMLOps-C4-G16/indian_traditional_food_small_resized_sampled', endpoint='https://huggingface.co', repo_type='dataset', repo_id='AIMLOps-C4-G16/indian_traditional_food_small_resized_sampled'), pr_revision=None, pr_num=None)

In [None]:
# Optional: Zip the images and save to disk

%%capture
!cd images_resized_sampled && zip -r ../images_resized_sampled.zip *

#Model

####Setup

In [None]:
import torch
from unsloth import FastLanguageModel

# Load the Llama 3.2 Vision Instruct checkpoint in 4-bit, together with its tokenizer object.
load_options = {
    "model_name": "unsloth/Llama-3.2-11B-Vision-Instruct",
    "max_seq_length": 2048,
    "load_in_4bit": True,
}
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

# Switch the loaded model to unsloth's inference mode.
FastLanguageModel.for_inference(model)

# Single-turn chat prompt: one image placeholder followed by the captioning instruction.
caption_prompt = "In a short sentence, briefly describe what you see in this image."
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": caption_prompt},
        ],
    }
]

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.7.5: Fast Mllama patching. Transformers: 4.53.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def generate_caption(image_file):
    """Generate one sampled caption for an image with the loaded vision model.

    Relies on the notebook-level `model`, `tokenizer` and `messages` objects and
    runs the inputs on the "cuda" device.

    Args:
        image_file: Path (or file object) of the image to caption.

    Returns:
        The assistant's reply as a single line of text. Runs of whitespace
        (newlines, tabs, repeated spaces) are collapsed to one space, so words on
        different lines are not glued together and the caption is safe to store
        as one tab-separated field.
    """
    from PIL import Image  # make sure the PIL.Image submodule is loaded

    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

    # Close the image file as soon as it has been turned into model inputs.
    with Image.open(image_file) as image:
        inputs = tokenizer(
            image,
            input_text,
            add_special_tokens = False,
            return_tensors = "pt",
        ).to("cuda")

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=100, do_sample=True, temperature=0.8, top_p=0.9)

    # The decoded text contains the prompt too; keep only what follows the assistant
    # header, up to the end-of-turn marker (absent when max_new_tokens is reached).
    decoded = tokenizer.decode(outputs[0])
    caption = decoded.split('assistant<|end_header_id|>')[1].split('<|eot_id|>')[0]
    return ' '.join(caption.split())

####Perform inference

In [None]:
# Number of captions generated for every image (each one is a separate generate_caption call).
num_captions_per_image = 4

In [None]:
# Settings for the inference section, declared again so it can run on its own.
dataset_to_process = "AIMLOps-C4-G16/indian_traditional_food_small"
processed_dataset_name = f'{dataset_to_process}_resized_sampled'

# Local folder (with trailing slash) holding the resized images to caption.
processed_images_folder = 'images_resized_sampled/'

In [None]:
def dirpaths(directory):
    """Lazily yield the path of every file under `directory`, including subfolders.

    Each path is the walked folder path joined with the file name, so it starts
    with `directory` exactly as it was passed in. Nothing is yielded when the
    directory does not exist or contains no files.
    """
    for folder, _subdirs, names in os.walk(directory):
        yield from (os.path.join(folder, name) for name in names)

In [None]:
# Lazy generator over every file under the processed images folder; it can be consumed only once,
# so re-run this cell before iterating again.
images_to_caption = dirpaths(processed_images_folder)

In [None]:
# Captions file name: the part of the processed dataset name after the last '/',
# followed by '.generated_captions.txt'.
captions_filename = f'{processed_dataset_name.split("/")[-1]}.generated_captions.txt'

In [None]:
# This step should take ~1 hour for 150 images on an L4 instance, and roughly 1 hour 30 minutes on a T4
# Output: one tab-separated line per image -- the image path relative to
# `processed_images_folder`, followed by `num_captions_per_image` captions.
with open(captions_filename, 'w') as _file:
    for image_file in images_to_caption:
        captions = [generate_caption(image_file) for _ in range(num_captions_per_image)]

        # Relative path computed from the configured folder, so it stays correct
        # even if that folder is nested more than one level deep.
        image_filename = os.path.relpath(image_file, processed_images_folder)
        _file.write('\t'.join([image_filename] + captions) + '\n')
        # Flush after every image so an interrupted long run keeps the rows finished so far.
        _file.flush()

####Upload generated captions to HF dataset

In [None]:
from huggingface_hub import upload_file

# Commit the generated captions file to the processed dataset repo under the same file name.
upload_file(
    path_or_fileobj=captions_filename,
    path_in_repo=captions_filename,
    repo_id=processed_dataset_name,
    repo_type="dataset",
    token=hf_token,
    commit_message="uploading generated captions",
)

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/AIMLOps-C4-G16/indian_traditional_food_small_resized_sampled/commit/7f7d8c742de3ea7abd8873425c68bf50c8849ade', commit_message='uploading generated captions', commit_description='', oid='7f7d8c742de3ea7abd8873425c68bf50c8849ade', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/AIMLOps-C4-G16/indian_traditional_food_small_resized_sampled', endpoint='https://huggingface.co', repo_type='dataset', repo_id='AIMLOps-C4-G16/indian_traditional_food_small_resized_sampled'), pr_revision=None, pr_num=None)

#Annotation loop - no GPU required

####Setup

In [None]:
%%capture
!unzip images_resized_sampled.zip -d images_resized_sampled

In [None]:
# Prerequisites for this section:
#   * the images live in the images_resized_sampled/ folder as generated by the
#     previous code (subfolders are fine), and
#   * the captions file '<dataset_resized_sampled>.generated_captions.txt' is present.

dataset_to_process = "AIMLOps-C4-G16/indian_traditional_food_small"
processed_dataset_name = f'{dataset_to_process}_resized_sampled'

# Both text files are named after the part of the dataset name that follows the last '/'.
dataset_basename = processed_dataset_name.split("/")[-1]
captions_filename = dataset_basename + '.generated_captions.txt'
annotations_filename = dataset_basename + '.annotations.txt'

processed_images_folder = 'images_resized_sampled/'

In [None]:
from matplotlib import pyplot as plt
import PIL

from IPython.display import clear_output
import time


# Line-length threshold (in characters) that print_data uses when wrapping captions.
screen_width = 100

def print_data(img, caps):
    """Show an image followed by its numbered, word-wrapped captions.

    Args:
        img: Image to draw with matplotlib (axes hidden).
        caps: Iterable of caption strings; they are labelled [1], [2], ...

    Each caption is split on single spaces and re-flowed: when the current line
    length plus the next word's length exceeds the module-level `screen_width`,
    the line is printed and the word starts a new line indented by four spaces.
    A blank line is printed after the image and after every caption.
    """
    plt.axis('off')
    plt.imshow(img, aspect='auto')
    plt.show()
    print()

    for number, caption in enumerate(caps, start=1):
        current = f'[{number}]'
        for word in caption.split(' '):
            fits = len(current) + len(word) <= screen_width
            if fits:
                current = f'{current} {word}'
            else:
                print(current)
                current = '    ' + word
        # `current` always holds at least the label or the indent, so it is never empty here.
        print(current)

        print()


def run_annotation_loop(start_from_line=1):
    """Interactively record the best caption for each image.

    Reads the tab-separated captions file (`captions_filename`: image path, then
    its captions), shows each image with its captions, asks for the number of the
    best caption (0 is accepted as well), and appends the original row plus the
    answer to `annotations_filename`.

    Args:
        start_from_line: 1-based line of the captions file to start from, so an
            interrupted session can be resumed.

    Returns:
        List of the tab-separated rows written during this call (without the
        trailing newline).
    """
    from PIL import Image  # make sure the PIL.Image submodule is loaded

    with open(captions_filename, 'r') as f:
        lines = f.readlines()

    output = []

    with open(annotations_filename, 'a') as f:
        for line in lines[start_from_line - 1:]:
            # Remove only the line terminator: a plain strip() would also eat trailing
            # tabs, silently dropping empty trailing captions and shifting the answer column.
            row = line.rstrip('\r\n')
            if not row:
                # A blank line (e.g. at the end of the file) has no image to show.
                continue

            cols = row.split('\t')
            caps = cols[1:]

            # Accepted answers follow the number of captions actually present on the row.
            numbered = [str(i) for i in range(1, len(caps) + 1)]
            valid_answers = set(numbered) | {'0'}
            prompt = f"Best caption? ({', '.join(numbered)} or 0): "

            with Image.open(processed_images_folder + cols[0]) as img:
                print_data(img, caps)

            # NOTE(review): presumably gives the figure time to render before the prompt appears.
            time.sleep(.5)
            inp = ''
            while inp not in valid_answers:
                inp = input(prompt).strip()

            clear_output()

            oline = '\t'.join([cols[0]] + caps + [inp])

            output.append(oline)
            f.write(oline+'\n')
            # Flush per answer so the file on disk always reflects the progress made so far.
            f.flush()

    return output


####Generate annotations

In [None]:
# Before running, please ensure that the annotations file
# '<dataset_resized_sampled>.annotations.txt' is either NOT present,
# OR,
# if you are resuming with `start_from_line = n`, that it currently holds
# exactly `n - 1` lines of data (new annotations are appended to the file).

output = run_annotation_loop(start_from_line=1)

####Upload annotations to HF dataset

In [None]:
from huggingface_hub import HfApi, hf_hub_download
from google.colab import userdata

# Read the Hugging Face access token stored under the 'HF_TOKEN' key of Colab's user data.
hf_token = userdata.get('HF_TOKEN')

In [None]:
from huggingface_hub import upload_file

# Commit the annotations file to the processed dataset repo under the same file name.
upload_file(
    path_or_fileobj=annotations_filename,
    path_in_repo=annotations_filename,
    repo_id=processed_dataset_name,
    repo_type="dataset",
    token=hf_token,
    commit_message="uploading annotations",
)

CommitInfo(commit_url='https://huggingface.co/datasets/AIMLOps-C4-G16/indian_traditional_food_small_resized_sampled/commit/f979aa8fec46b8654ef549c0a1dddb406eb5a614', commit_message='uploading annotations', commit_description='', oid='f979aa8fec46b8654ef549c0a1dddb406eb5a614', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/AIMLOps-C4-G16/indian_traditional_food_small_resized_sampled', endpoint='https://huggingface.co', repo_type='dataset', repo_id='AIMLOps-C4-G16/indian_traditional_food_small_resized_sampled'), pr_revision=None, pr_num=None)