# **Loading the image captioning models**
For generating the image captions, LLaVA was preferred over ShareGPT because its inference time was shorter than that of ShareGPT, and the captions produced by the two models were of comparable quality.

# LlaVa loader

In [1]:
#cloning the github repo into folder /LlaVA and then installing all the required modules
%cd /content
!git clone -b v1.0 https://github.com/camenduru/LLaVA
%cd /content/LLaVA
!pip install -q gradio .

/content
Cloning into 'LLaVA'...
remote: Enumerating objects: 1960, done.[K
remote: Counting objects: 100% (693/693), done.[K
remote: Compressing objects: 100% (125/125), done.[K
remote: Total 1960 (delta 615), reused 568 (delta 568), pack-reused 1267[K
Receiving objects: 100% (1960/1960), 13.32 MiB | 18.71 MiB/s, done.
Resolving deltas: 100% (1214/1214), done.
/content/LLaVA
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.6/16.6 MB[0m [31m49.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m304.8/304.8 kB[0m [31m33

In [2]:
#importing required libraries
import os
import torch
import requests
from PIL import Image
from io import BytesIO
from transformers import TextStreamer
from llava.utils import disable_torch_init
from llava.model import LlavaLlamaForCausalLM
from transformers import AutoTokenizer, BitsAndBytesConfig
from llava.conversation import conv_templates, SeparatorStyle
from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN

[2023-12-15 13:26:39,295] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [3]:
# Load the LLaVA checkpoint with 4-bit (NF4) quantization and set up the
# tokenizer and image processor used by caption_image.
model_path = "4bit/llava-v1.5-13b-3GB"
kwargs = {
    "device_map": "auto",
    "load_in_4bit": True,
    "quantization_config": BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type='nf4',
    ),
}

model = LlavaLlamaForCausalLM.from_pretrained(
    model_path,
    low_cpu_mem_usage=True,
    **kwargs,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)

# The vision tower may not be loaded yet; load it on demand, then move it to
# the GPU and take its image processor for preprocessing input images.
vision_tower = model.get_vision_tower()
if not vision_tower.is_loaded:
    vision_tower.load_model()
vision_tower.to(device='cuda')
image_processor = vision_tower.image_processor

# Generate a caption for an image, given its link (or local path) and a
# suitable prompt for image caption generation.
def caption_image(image_file, prompt, timeout=30):
    """Caption a single image with the loaded LLaVA model.

    Args:
        image_file: URL (anything starting with ``http``) or local path of
            the image.
        prompt: Instruction text sent to the model together with the image.
        timeout: Seconds to wait for the HTTP download before giving up.

    Returns:
        The decoded model output, cut at the last ``</s>`` marker.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the image cannot be opened or decoded by PIL.
    """
    # A single "http" prefix test also matches "https" URLs.
    if image_file.startswith('http'):
        # Without a timeout one stalled download would block a whole batch.
        response = requests.get(image_file, timeout=timeout)
        # Fail on 4xx/5xx instead of handing an error page to PIL.
        response.raise_for_status()
        image = Image.open(BytesIO(response.content)).convert('RGB')
    else:
        image = Image.open(image_file).convert('RGB')

    disable_torch_init()
    conv = conv_templates["llava_v0"].copy()
    image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'].half().cuda()

    # NOTE(review): the role name is embedded in the message text here and
    # passed again as the role to append_message below; kept as in the
    # original so the prompt seen by the model is unchanged.
    inp = f"{conv.roles[0]}: {prompt}"
    inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp
    conv.append_message(conv.roles[0], inp)
    # Empty assistant turn: the model fills this in.
    conv.append_message(conv.roles[1], None)
    raw_prompt = conv.get_prompt()

    input_ids = tokenizer_image_token(raw_prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
    # Stop generating as soon as the conversation separator is produced.
    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)

    with torch.inference_mode():
        output_ids = model.generate(input_ids, images=image_tensor, do_sample=True, temperature=0.2,
                                    max_new_tokens=1024, use_cache=True, stopping_criteria=[stopping_criteria])

    # Decode only the newly generated tokens, not the echoed prompt.
    outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
    conv.messages[-1][-1] = outputs
    return outputs.rsplit('</s>', 1)[0]

config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/33.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/9 [00:00<?, ?it/s]

pytorch_model-00001-of-00009.bin:   0%|          | 0.00/2.97G [00:00<?, ?B/s]

pytorch_model-00002-of-00009.bin:   0%|          | 0.00/2.93G [00:00<?, ?B/s]

pytorch_model-00003-of-00009.bin:   0%|          | 0.00/2.89G [00:00<?, ?B/s]

pytorch_model-00004-of-00009.bin:   0%|          | 0.00/2.96G [00:00<?, ?B/s]

pytorch_model-00005-of-00009.bin:   0%|          | 0.00/2.89G [00:00<?, ?B/s]

pytorch_model-00006-of-00009.bin:   0%|          | 0.00/2.98G [00:00<?, ?B/s]

pytorch_model-00007-of-00009.bin:   0%|          | 0.00/2.87G [00:00<?, ?B/s]

pytorch_model-00008-of-00009.bin:   0%|          | 0.00/2.89G [00:00<?, ?B/s]

pytorch_model-00009-of-00009.bin:   0%|          | 0.00/2.72G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.76k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/9 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/154 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

# ShareGPT loader

In [1]:
#cloning the github repo into folder /InternLM-XComposer and then installing all the required modules
%cd /content
!git clone -b dev https://github.com/camenduru/InternLM-XComposer
!pip install -q lida llmx gradio https://github.com/camenduru/LLaVA/releases/download/v1.1.3/llava-ShareGPT4V-1.1.3-py3-none-any.whl
%cd /content/InternLM-XComposer/projects/ShareGPT4V
!pip install -q gradio .

/content
Cloning into 'InternLM-XComposer'...
remote: Enumerating objects: 657, done.[K
remote: Counting objects: 100% (183/183), done.[K
remote: Compressing objects: 100% (84/84), done.[K
remote: Total 657 (delta 147), reused 99 (delta 99), pack-reused 474[K
Receiving objects: 100% (657/657), 10.72 MiB | 17.45 MiB/s, done.
Resolving deltas: 100% (363/363), done.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.6/128.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.6/16.6 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.7/59.7 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.7/45.7 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━

In [2]:
#importing required libraries
import os
import torch
import requests
from PIL import Image
from io import BytesIO
from transformers import TextStreamer
from llava.utils import disable_torch_init
from llava.model import LlavaLlamaForCausalLM
from llava.model.builder import load_pretrained_model
from transformers import AutoTokenizer, BitsAndBytesConfig
from llava.conversation import conv_templates, SeparatorStyle
from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN

In [3]:
# Model initialization: load the ShareGPT model and set up its image processor.
model_path = "4bit/ShareGPT4V-7B-5GB"
model_name = "llava-v1.5-7b"

# NOTE(review): this dict is never passed to load_pretrained_model below, so
# the quantization settings in it have no effect on how the model is loaded.
# It is kept only so the module-level name `kwargs` still exists.
kwargs = {"device_map": "auto"}
kwargs['load_in_4bit'] = True
kwargs['quantization_config'] = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4'
)

# Use the names declared above instead of repeating the literals, so the
# checkpoint only has to be changed in one place.
# NOTE(review): in upstream LLaVA the 4th and 5th positional parameters are
# load_8bit and load_4bit, which would make this an 8-bit load - confirm
# against the installed fork before changing the flags.
tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name, True, False)
vision_tower = model.get_vision_tower()

if not vision_tower.is_loaded:
    vision_tower.load_model()
vision_tower.to(device='cuda')
# Replaces the image_processor returned by load_pretrained_model above.
image_processor = vision_tower.image_processor

# Generate a caption for an image, given its link (or local path) and a
# suitable prompt for image caption generation.
def caption_image(image_file, prompt, timeout=30):
    """Caption a single image with the loaded ShareGPT model.

    Args:
        image_file: URL (anything starting with ``http``) or local path of
            the image.
        prompt: Instruction text sent to the model together with the image.
        timeout: Seconds to wait for the HTTP download before giving up.

    Returns:
        A tuple ``(image, output)``: the RGB PIL image that was captioned and
        the decoded model output, cut at the last ``</s>`` marker.
        NOTE(review): callers that pass the result straight to ``str()`` will
        get the string form of the whole tuple, not just the caption.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the image cannot be opened or decoded by PIL.
    """
    # A single "http" prefix test also matches "https" URLs.
    if image_file.startswith('http'):
        # Without a timeout one stalled download would block a whole batch.
        response = requests.get(image_file, timeout=timeout)
        # Fail on 4xx/5xx instead of handing an error page to PIL.
        response.raise_for_status()
        image = Image.open(BytesIO(response.content)).convert('RGB')
    else:
        image = Image.open(image_file).convert('RGB')

    disable_torch_init()
    conv = conv_templates["llava_v0"].copy()
    image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'].half().cuda()

    # NOTE(review): the role name is embedded in the message text here and
    # passed again as the role to append_message below; kept as in the
    # original so the prompt seen by the model is unchanged.
    inp = f"{conv.roles[0]}: {prompt}"
    inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp
    conv.append_message(conv.roles[0], inp)
    # Empty assistant turn: the model fills this in.
    conv.append_message(conv.roles[1], None)
    raw_prompt = conv.get_prompt()

    input_ids = tokenizer_image_token(raw_prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
    # Stop generating as soon as the conversation separator is produced.
    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)

    with torch.inference_mode():
        output_ids = model.generate(input_ids, images=image_tensor, do_sample=True, temperature=0.2,
                                    max_new_tokens=1024, use_cache=True, stopping_criteria=[stopping_criteria])

    # Decode only the newly generated tokens, not the echoed prompt.
    outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
    conv.messages[-1][-1] = outputs
    output = outputs.rsplit('</s>', 1)[0]
    return image, output

tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/76.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

pytorch_model-00001-of-00003.bin:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

pytorch_model-00002-of-00003.bin:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

pytorch_model-00003-of-00003.bin:   0%|          | 0.00/4.24G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of the model checkpoint at 4bit/ShareGPT4V-7B-5GB were not used when initializing LlavaLlamaForCausalLM: ['model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_pro

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/518 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/607M [00:00<?, ?B/s]

# **Audio Captioning**

For this particular task, it was determined that using the captions of the audio present in the video or transcribing its audio will not reduce the quality of the output presented by the model to the user.

Installing necessary modules and whisper from the github repo and its initialization

In [4]:
# This part is optional. You can use audio captioning if there is no particular time limit set on the task that you are doing.
!pip install wget
!pip install openai cohere
!pip install git+https://github.com/openai/whisper.git -q

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9655 sha256=0cf1b70b2960649275969896cecf61eb7ccb1f5eac5aea0047d28ab54a63c02e
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Collecting openai
  Downloading openai-1.4.0-py3-none-any.whl (221 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m221.9/221.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cohere
  Downloading cohere-4.38-py3-none-any.whl (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.6/51.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting backoff<3.0,>=2.0 (from cohere)
  Downloading backof

In [5]:
# Importing the modules and whisper initialization.
import whisper
import wget
import os
import requests

# Here the 'tiny' form of whisper is used as its size is less, and the quality is still quite good.
audio_model = whisper.load_model('tiny')

100%|██████████████████████████████████████| 72.1M/72.1M [00:00<00:00, 118MiB/s]


# Functions defined

All the functions required for the task are defined in a single place for better readability.

In [4]:
# Prompts for generating the captions of the media attached to the tweet.
def prompt(index):
    """Build the caption prompt for the tweet at row ``index`` of ``data``.

    Reads the module-level ``data`` table, takes the row's ``content`` field
    and embeds it, quoted, in a request for a caption of at most 150 words.
    """
    content = str(data.iloc[index]['content'])
    return (
        f'This is the content of the tweet that I have "{content}". '
        "Give me the caption of the media attached to it with respect to "
        "this tweet having at most 150 words."
    )

# Extracting the thumbnail of the video which also serves as the function to extract the images for the photos and GIFs.
def thumb_extract(url):
    """Return the text between the first and second single quote in ``url``.

    Raises IndexError when ``url`` contains no single quote.
    """
    # Only the first quoted field is needed, so stop splitting after it.
    pieces = url.split("'", 2)
    return pieces[1]

# This part should be commented out when audio captioning is not being done and only image captions are used.
def extract_link(video_link):
    """Return the second-to-last piece of ``video_link`` split on single quotes.

    For a string whose last quoted field is the video URL, this is that URL.
    Raises IndexError when ``video_link`` contains no single quote.
    """
    # Splitting from the right at most twice yields the same [-2] element as
    # a full split, without cutting up the rest of the string.
    tail_pieces = video_link.rsplit("'", 2)
    return tail_pieces[-2]

# This function downloads the video, gets its path, gives it to whisper,
# transcribes the video, and then deletes it.
def audio_caption(video_link, name):
    """Download a video, transcribe its audio with Whisper and delete the file.

    Args:
        video_link: URL of the video to download.
        name: Base file name (without extension) for the temporary download.

    Returns:
        The ``'text'`` field of the transcription result from ``audio_model``.
    """
    # Use the path reported by wget rather than rebuilding it from `name`:
    # the file actually written can differ from the requested one (for
    # example when a file with that name already exists).
    downloaded = wget.download(video_link, f"{name}.mp4")
    path = os.path.abspath(downloaded)
    try:
        text = audio_model.transcribe(path)
    finally:
        # Always clean up, so a failed transcription does not leave a stale
        # video behind for the next call.
        os.remove(path)
    return text['text']

def caption_media(url):
    """Describe the media attached to a tweet, including video audio.

    Args:
        url: String form of the tweet's media field; the media kind is
            detected from the ``Video(``, ``Photo(`` or ``Gif(`` marker.

    Returns:
        A description string for the detected media kind, or
        ``"There is nothing here"`` when no known marker is present or
        captioning fails.
    """
    prompt = 'Describe the image and color details.'
    fallback = "There is nothing here"
    try:
        # Generating the captions of video media: thumbnail caption plus
        # audio transcription.
        if 'Video(' in url:
            img_cap = caption_image(thumb_extract(url), prompt)
            vid_cap = audio_caption(extract_link(url), "video")
            return "Image describes " + str(img_cap) + " and the video's audio describes" + str(vid_cap)
        # Generating the captions of image media.
        if 'Photo(' in url:
            img_cap = caption_image(thumb_extract(url), prompt)
            return "Image describes " + str(img_cap)
        # Generating the captions of GIF media.
        if 'Gif(' in url:
            img_cap = caption_image(thumb_extract(url), prompt)
            return "GIF describes " + str(img_cap)
    # Exception handling: best-effort captioning, but let KeyboardInterrupt
    # and SystemExit propagate so a long run can still be stopped.
    except Exception:
        return fallback
    # No recognised media marker: return the fallback text instead of None.
    return fallback
# The part to be commented out ends here.

# This part is the one used in the actual caption generation for the test data, as it was decided that audio transcribing is not feasible for it, and that image caption generation will suffice for this

def caption_media(url, prompt):
    """Caption the image (or thumbnail) referenced by a tweet's media field.

    Args:
        url: String form of the tweet's media field; the link is taken from
            its first single-quoted value.
        prompt: Instruction text passed on to ``caption_image``.

    Returns:
        ``"Image describes <caption>"``, or ``"No media present"`` when the
        link cannot be extracted or captioning fails.
    """
    try:
        img_cap = caption_image(thumb_extract(url), prompt)
        return "Image describes " + str(img_cap)
    # Best-effort: any ordinary failure means "no usable media". A bare
    # `except:` would also swallow KeyboardInterrupt, making it impossible to
    # stop a multi-hour batch run.
    except Exception:
        return "No media present"

# Reading the testing file

The excel file had been converted into csv format beforehand using an online tool.

In [None]:
import pandas as pd
# Load the four test CSV files. '/path/to/the/file/' is a placeholder and has
# to be replaced with the real directory before running.
# Variable suffixes follow the file names: b/c = behaviour/content simulation,
# c/t = company/time.
df_bc = pd.read_csv('/path/to/the/file/behaviour_simulation_test_company.csv')
df_bt = pd.read_csv('/path/to/the/file/behaviour_simulation_test_time.csv')
df_cc = pd.read_csv('/path/to/the/file/content_simulation_test_company.csv')
df_ct = pd.read_csv('/path/to/the/file/content_simulation_test_time.csv')

# Generating the captions of the media present in the tweet

In [None]:
import time
import csv

# Images were sent in batches of 500 for generating their captions. Each batch took nearly two hours to generate the captions.
start = 1000
end = 1500
data = df_cc
# Short label for the output file name. Interpolating `data` itself would put
# the whole DataFrame's text representation into the path.
data_name = 'df_cc'
generated_caption_filepath = f'/path/to/store/the/captions/{data_name}_captions_{start}-{end}.csv'

start_time = time.time()

for x in range(start, end):
    # Separate name for the per-image timer so the batch index `start` is not
    # overwritten with a timestamp.
    iter_start = time.time()
    tweet = data.iloc[x]
    # Prompt not required as we didn't have the content for the task.
    # pr = prompt(x)
    st = str(caption_media(tweet['media'], "Define the details of the image in atmost 150 words."))
    print(x+1)
    # The csv file is updated in real time, i.e., as soon as a new caption is generated, it is appended to the end of the file.
    # Some generated captions contain non-ASCII characters; writing with an
    # explicit UTF-8 encoding avoids depending on the platform default
    # encoding for those rows.
    with open(generated_caption_filepath, 'a', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow([st])
    # Time taken for this image.
    print(time.time()-iter_start)

# Time taken for the whole batch.
print(time.time()-start_time)

# Time required for generating the captions for each image as well as the particular batch is provided through the above code.