# Generation

In [13]:
# import libraries

import kagglehub
import os
import random
import json

from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

In [2]:
# download the Instagram posts dataset from Kaggle; the returned value is
# used further down as the root folder that contains the `Data` directory

dataset = kagglehub.dataset_download(
    'thecoderenroute/instagram-posts-dataset'
)

In [3]:
# load the BLIP captioning checkpoint: the processor prepares the model
# inputs and decodes the outputs, the model generates the caption tokens

processor = BlipProcessor.from_pretrained(
    "Salesforce/blip-image-captioning-large"
)
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large"
)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [9]:
def select_social_media_image() -> "Image.Image | None":
    '''
    Selects a random JPG image from the downloaded dataset.

    Recursively walks the `Data` sub-folder of the module-level `dataset`
    path, collects every file whose name ends with ".jpg" (case-insensitive),
    picks one at random and opens it.

    Returns:
        The chosen image converted to RGB mode, or None (after printing a
        message) when no JPG file is found.
    '''
    # os.path.join uses the separator of the host OS; the previous hard-coded
    # "\\" only produced a valid folder path on Windows.
    main_folder = os.path.join(dataset, "Data")
    jpg_files = [
        os.path.join(root, file)
        for root, _, files in os.walk(main_folder)
        for file in files
        if file.lower().endswith(".jpg")
    ]
    if not jpg_files:
        print("No JPG files found in the folder.")
        return None

    image_path = random.choice(jpg_files)
    # convert() returns a new image object, so the underlying file can be
    # closed as soon as the conversion is done.
    with Image.open(image_path) as source_image:
        return source_image.convert('RGB')

In [None]:
def generate_caption(image : Image) -> str:
    '''
    Produce a text caption for `image` with the BLIP model.

    The module-level `processor` encodes the image together with the prompt
    'a photograph of'; the module-level `model` then generates output with
    sampling enabled, and the first returned sequence is decoded with
    special tokens skipped.
    '''
    prompt = 'a photograph of'
    encoded = processor(image, prompt, return_tensors="pt")

    # generation settings, forwarded to generate() as keyword arguments
    generation_settings = dict(
        temperature=0.3,
        top_p=0.9,
        max_new_tokens=50,
        do_sample=True,
        repetition_penalty=1.2,
    )
    generated = model.generate(**encoded, **generation_settings)
    return processor.decode(generated[0], skip_special_tokens=True)

In [10]:
# generate many captions

captions = []
num_captions = 1000

for _ in range(num_captions):
    image = select_social_media_image()
    # select_social_media_image() returns None when it finds no JPG files;
    # stop here with a clear message instead of failing inside the BLIP
    # processor with an unrelated-looking error.
    if image is None:
        raise RuntimeError("No image available to caption; check the dataset folder.")
    caption = generate_caption(image)
    print(caption)
    captions.append(caption)

a photograph of a woman in green pants and a white jacket
a photograph of two men in blue uniforms standing on top of a field
a photograph of a woman is doing yoga on a mat
a photograph of an image of a man sitting in front of a booth
a photograph of two women are holding up wine glasses in the air
a photograph of a woman is drinking coffee and making the peace sign
a photograph of three men laying on the ground with one man holding his leg
a photograph of a woman in a red bikini walking on the beach
a photograph of two women sitting in the back seat of a car
a photograph of two men are sitting on the ground in front of a stadium
a photograph of a group of people posing for a picture
a photograph of a plate with four different types of food on it
a photograph of a man in a red shirt is smiling
a photograph of a woman in a black dress posing for the camera
a photograph of a woman in a pink dress and jewelry
a photograph of indian cricket team celebrating after winning the match
a photog

In [11]:
# output folder for the generated social-media data
# (a relative path, so it is resolved against the current working directory)

data_folder = "../../data/social_media"

In [14]:
# save captions

# make sure the target folder exists, so a long captioning run is not lost
# to a FileNotFoundError at the very last step
os.makedirs(data_folder, exist_ok=True)

# write the list of caption strings as an indented JSON array
with open(f"{data_folder}/captions.json", 'w', encoding='utf-8') as file:
    json.dump(captions, file, indent=4)