In [1]:
from constants import PATH_JSON_ANNOTATIONS, PATH_JSON_QUESTIONS, OPENAI_KEY, PROMPT_CONSTRUCAO_TEXTO, PROMPT_MIX_RESPONSE, PROMPT_SANITY_CHECK
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers import BitsAndBytesConfig
from PIL import Image
import json
import os
import pandas as pd
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# verificando se GPU está disponível

print(torch.cuda.is_available())
print("GPU atual:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "Nenhuma")

True
GPU atual: NVIDIA GeForce RTX 4050 Laptop GPU


In [4]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

In [None]:
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto", quantization_config=quantization_config
)

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

`torch_dtype` is deprecated! Use `dtype` instead!
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Exception in thread Thread-5 (_readerthread):
Traceback (most recent call last):
  File "c:\Users\gabri\anaconda3\envs\rodar_modelos\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "c:\Users\gabri\anaconda3\envs\rodar_modelos\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "c:\Users\gabri\anaconda3\envs\rodar_modelos\lib\subprocess.py", line 1515, in _readerthread
    buffer.append(fh.read())
  File "c:\Users\gabri\anaconda3\envs\rodar_modelos\lib\codecs.py", line 322, in decode
    (result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc6 in position 

In [None]:


messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)


## Generate answers functions

In [7]:
def generate_answers_image_only(df: pd.DataFrame, model):

    answers = []
    model_df = df.copy()
    base_image_path = 'images/'

    for i in model_df.index:
        image_infos = model_df.iloc[i]
        image_question = image_infos['question']
        image_path = base_image_path + image_infos['image_path']
        image = Image.open(image_path).convert("RGB")
        prompt_text = image_question + '' + 'Respond only with the final answer, without explanation or full sentences.'

        prompt_image_only = f"[INST] <image>\n{prompt_text} [/INST]"

        inputs = processor(
            text=prompt_image_only,
            images=image,
            return_tensors="pt",
            # padding=True
        ).to(model.device)

        output = model.generate(
            **inputs,
            max_new_tokens=100,
            pad_token_id=processor.tokenizer.eos_token_id
        )

        answer = processor.decode(
            output[0],
            skip_special_tokens=True,
        ).replace(image_question, "").strip()

        if "[/INST]" in answer:
            answer = answer.split("[/INST]")[-1].strip()
        else:
            answer = answer.strip()

        print(f"Resposta gerada para a imagem {i}: {answer}")
        answers.append(answer)

        del inputs
        del output  
        gc.collect() 
        
        if model.device.type == 'cuda':
            torch.cuda.empty_cache()

        if i % 100 == 0:
            print(f"Resposta gerada para {i} imagens.")

    return answers


In [6]:
def generate_answers_text_only(text_col: str, df: pd.DataFrame, model):

    answers = []
    model_df = df.copy()

    for i in model_df.index:
        image_infos = model_df.iloc[i]
        image_description = image_infos[text_col]
        image_question = image_infos['question']

        prompt_formatado = PROMPT_SANITY_CHECK.format(text=image_description, question=image_question)
   
        inputs = processor(
            text=prompt_formatado,
            return_tensors="pt"
        ).to(model.device)

        output = model.generate(**inputs, max_new_tokens=64, pad_token_id=processor.tokenizer.eos_token_id)
        answer = processor.decode(output[0], skip_special_tokens=True).replace(prompt_formatado, "").strip()
        answers.append(answer)

        if i % 100 == 0:
            print(f"Resposta gerada para {i} imagens.")

    return answers


In [5]:
def generate_answers_mix(text_col: str, df: pd.DataFrame, model):

    answers = []
    model_df = df.copy()
    base_image_path = 'images/'

    for i in model_df.index:
        image_infos = model_df.iloc[i]
        image_description = image_infos[text_col]
        image_question = image_infos['question']
        image_path = base_image_path + image_infos['image_path']
        image = Image.open(image_path).convert("RGB")

        prompt_formatado = PROMPT_MIX_RESPONSE.format(text_information=image_description, question=image_question)
        prompt_formatado = f"[INST] <image>\n{prompt_formatado} [/INST]"

        inputs = processor(
            text=prompt_formatado,
            images=image,
            return_tensors="pt",
            # padding=True
        ).to(model.device)

        output = model.generate(
            **inputs,
            max_new_tokens=100,
            pad_token_id=processor.tokenizer.eos_token_id
        )

        answer = processor.decode(
            output[0],
            skip_special_tokens=True,
        ).replace(image_question, "").strip()

        if "[/INST]" in answer:
            answer = answer.split("[/INST]")[-1].strip()
        else:
            answer = answer.strip()

        print(f"Resposta gerada para a imagem {i}: {answer}")
        answers.append(answer)

        del inputs
        del output  
        gc.collect() 
        
        if model.device.type == 'cuda':
            torch.cuda.empty_cache()

        if i % 100 == 0:
            print(f"Resposta gerada para {i} imagens.")

    return answers


In [8]:
df = pd.read_csv('data/final_data_descriptions.csv')

In [9]:
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,image_path,image_id,question,answer,Tm,Tc,Ans,Ti
0,0,0,COCO_val2014_000000000042.jpg,42,What color are the gym shoes?,white,A curly-haired dog is sleeping on a shoe rack ...,A curly-haired dog is sleeping on a shoe rack ...,red,"Several days after the low dissipated , the r..."
1,1,1,COCO_val2014_000000000073.jpg,73,What is the license number?,sv-6260,The motorcycle in the image has a license plat...,The motorcycle in the image has a license plat...,red,Kawaguchi 's Center Body of troops was planni...
2,2,2,COCO_val2014_000000000074.jpg,74,Does this dog have a collar?,no,The image shows a white dog sleeping on a cobb...,The image shows a white dog sleeping on a cobb...,red,"In the centre , the main attack along the Bui..."
3,3,3,COCO_val2014_000000000133.jpg,133,What color is lamp?,blue,The image shows a wooden loft bed with a small...,The image shows a wooden loft bed with a small...,red,Mount Elbert was named by miners in honor of ...
4,4,4,COCO_val2014_000000000136.jpg,136,Is this in a museum?,no,The image shows two giraffes in an indoor zoo ...,The image shows two giraffes in a natural sava...,red,"Runs west through Jackson , Mississippi , eve..."
...,...,...,...,...,...,...,...,...,...,...
991,991,991,COCO_val2014_000000014135.jpg,14135,Is it daytime?,yes,The image shows a skateboarder performing a tr...,The image shows a skateboarder performing a tr...,no,The documentary film Tim Richmond : To The Li...
992,992,992,COCO_val2014_000000014151.jpg,14151,Is this at the Olympics?,yes,A ski jumper is in mid-air above a snow-covere...,A ski jumper is in mid-air above a snow-covere...,"No, this is not at the Olympics.",Kevin Spacey as David <unk> \n
993,993,993,COCO_val2014_000000014167.jpg,14167,Is the skateboarder wearing safety gear?,no,The skateboarder is performing a trick down a ...,The skateboarder is performing a trick down a ...,blue,The Island Def Jam rapper Big K.R.I.T. was bo...
994,994,994,COCO_val2014_000000014175.jpg,14175,What is sticking up from the fire hydrant?,nothing,The image shows a street scene with a fire hyd...,The image shows a street scene with a fire hyd...,red,"A total of 2 @,@ 000 people attended Slammive..."


In [10]:
llava_df = df[['image_path', 'question', 'answer', 'Tm', 'Tc', 'Ti']]

In [11]:
llava_df

Unnamed: 0,image_path,question,answer,Tm,Tc,Ti
0,COCO_val2014_000000000042.jpg,What color are the gym shoes?,white,A curly-haired dog is sleeping on a shoe rack ...,A curly-haired dog is sleeping on a shoe rack ...,"Several days after the low dissipated , the r..."
1,COCO_val2014_000000000073.jpg,What is the license number?,sv-6260,The motorcycle in the image has a license plat...,The motorcycle in the image has a license plat...,Kawaguchi 's Center Body of troops was planni...
2,COCO_val2014_000000000074.jpg,Does this dog have a collar?,no,The image shows a white dog sleeping on a cobb...,The image shows a white dog sleeping on a cobb...,"In the centre , the main attack along the Bui..."
3,COCO_val2014_000000000133.jpg,What color is lamp?,blue,The image shows a wooden loft bed with a small...,The image shows a wooden loft bed with a small...,Mount Elbert was named by miners in honor of ...
4,COCO_val2014_000000000136.jpg,Is this in a museum?,no,The image shows two giraffes in an indoor zoo ...,The image shows two giraffes in a natural sava...,"Runs west through Jackson , Mississippi , eve..."
...,...,...,...,...,...,...
991,COCO_val2014_000000014135.jpg,Is it daytime?,yes,The image shows a skateboarder performing a tr...,The image shows a skateboarder performing a tr...,The documentary film Tim Richmond : To The Li...
992,COCO_val2014_000000014151.jpg,Is this at the Olympics?,yes,A ski jumper is in mid-air above a snow-covere...,A ski jumper is in mid-air above a snow-covere...,Kevin Spacey as David <unk> \n
993,COCO_val2014_000000014167.jpg,Is the skateboarder wearing safety gear?,no,The skateboarder is performing a trick down a ...,The skateboarder is performing a trick down a ...,The Island Def Jam rapper Big K.R.I.T. was bo...
994,COCO_val2014_000000014175.jpg,What is sticking up from the fire hydrant?,nothing,The image shows a street scene with a fire hyd...,The image shows a street scene with a fire hyd...,"A total of 2 @,@ 000 people attended Slammive..."


In [None]:
llava_tm_responses = generate_answers_text_only('Tm', llava_df, model)

In [35]:
llava_ti_responses = generate_answers_text_only('Ti', llava_df, model)

Resposta gerada para 0 imagens.
Resposta gerada para 100 imagens.
Resposta gerada para 200 imagens.
Resposta gerada para 300 imagens.
Resposta gerada para 400 imagens.
Resposta gerada para 500 imagens.
Resposta gerada para 600 imagens.
Resposta gerada para 700 imagens.
Resposta gerada para 800 imagens.
Resposta gerada para 900 imagens.


In [34]:
llava_tc_responses = generate_answers_text_only('Tc', llava_df, model)

Resposta gerada para 0 imagens.
Resposta gerada para 100 imagens.
Resposta gerada para 200 imagens.
Resposta gerada para 300 imagens.
Resposta gerada para 400 imagens.
Resposta gerada para 500 imagens.
Resposta gerada para 600 imagens.
Resposta gerada para 700 imagens.
Resposta gerada para 800 imagens.
Resposta gerada para 900 imagens.


In [37]:
llava_df['Tc_responses'] = pd.Series(llava_tc_responses)
llava_df['Tm_responses'] = pd.Series(llava_tm_responses)
llava_df['Ti_responses'] = pd.Series(llava_ti_responses)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  llava_df['Tc_responses'] = pd.Series(llava_tc_responses)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  llava_df['Tm_responses'] = pd.Series(llava_tm_responses)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  llava_df['Ti_responses'] = pd.Series(llava_ti_responses)


In [38]:
llava_df

Unnamed: 0,image_path,question,answer,Tm,Tc,Ti,Tc_responses,Tm_responses,Ti_responses
0,COCO_val2014_000000000042.jpg,What color are the gym shoes?,white,A curly-haired dog is sleeping on a shoe rack ...,A curly-haired dog is sleeping on a shoe rack ...,"Several days after the low dissipated , the r...",Red,Blue,No gym shoes in the text.
1,COCO_val2014_000000000073.jpg,What is the license number?,sv-6260,The motorcycle in the image has a license plat...,The motorcycle in the image has a license plat...,Kawaguchi 's Center Body of troops was planni...,AB-1234,SV-6260,Answer the following question given the text:\...
2,COCO_val2014_000000000074.jpg,Does this dog have a collar?,no,The image shows a white dog sleeping on a cobb...,The image shows a white dog sleeping on a cobb...,"In the centre , the main attack along the Bui...",Yes,Yes,Answer the following question given the text:\...
3,COCO_val2014_000000000133.jpg,What color is lamp?,blue,The image shows a wooden loft bed with a small...,The image shows a wooden loft bed with a small...,Mount Elbert was named by miners in honor of ...,Red,Blue,Answer the following question given the text:\...
4,COCO_val2014_000000000136.jpg,Is this in a museum?,no,The image shows two giraffes in an indoor zoo ...,The image shows two giraffes in a natural sava...,"Runs west through Jackson , Mississippi , eve...",No,No,No
...,...,...,...,...,...,...,...,...,...
991,COCO_val2014_000000014135.jpg,Is it daytime?,yes,The image shows a skateboarder performing a tr...,The image shows a skateboarder performing a tr...,The documentary film Tim Richmond : To The Li...,No,Yes,Yes
992,COCO_val2014_000000014151.jpg,Is this at the Olympics?,yes,A ski jumper is in mid-air above a snow-covere...,A ski jumper is in mid-air above a snow-covere...,Kevin Spacey as David <unk> \n,No,Yes,Answer the following question given the text:\...
993,COCO_val2014_000000014167.jpg,Is the skateboarder wearing safety gear?,no,The skateboarder is performing a trick down a ...,The skateboarder is performing a trick down a ...,The Island Def Jam rapper Big K.R.I.T. was bo...,Yes,No,Answer the following question given the text:\...
994,COCO_val2014_000000014175.jpg,What is sticking up from the fire hydrant?,nothing,The image shows a street scene with a fire hyd...,The image shows a street scene with a fire hyd...,"A total of 2 @,@ 000 people attended Slammive...",Red object,Blue object,No fire hydrant in the text.


In [59]:
# ti responses ficou poluido com prompt em alguns casos. Tratando isso
cleaned_llava_ti_responses = []

for ti in llava_ti_responses:

    if 'full sentences' in ti:
        t = ti.split('full sentences')[-1].replace('\n','').replace('.','')
        cleaned_llava_ti_responses.append(t)
    else:
        cleaned_llava_ti_responses.append(ti)

In [60]:
llava_df['Ti_responses'] = pd.Series(cleaned_llava_ti_responses)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  llava_df['Ti_responses'] = pd.Series(cleaned_llava_ti_responses)


In [61]:
llava_df

Unnamed: 0,image_path,question,answer,Tm,Tc,Ti,Tc_responses,Tm_responses,Ti_responses
0,COCO_val2014_000000000042.jpg,What color are the gym shoes?,white,A curly-haired dog is sleeping on a shoe rack ...,A curly-haired dog is sleeping on a shoe rack ...,"Several days after the low dissipated , the r...",Red,Blue,No gym shoes in the text.
1,COCO_val2014_000000000073.jpg,What is the license number?,sv-6260,The motorcycle in the image has a license plat...,The motorcycle in the image has a license plat...,Kawaguchi 's Center Body of troops was planni...,AB-1234,SV-6260,0
2,COCO_val2014_000000000074.jpg,Does this dog have a collar?,no,The image shows a white dog sleeping on a cobb...,The image shows a white dog sleeping on a cobb...,"In the centre , the main attack along the Bui...",Yes,Yes,No
3,COCO_val2014_000000000133.jpg,What color is lamp?,blue,The image shows a wooden loft bed with a small...,The image shows a wooden loft bed with a small...,Mount Elbert was named by miners in honor of ...,Red,Blue,No lamp in the text
4,COCO_val2014_000000000136.jpg,Is this in a museum?,no,The image shows two giraffes in an indoor zoo ...,The image shows two giraffes in a natural sava...,"Runs west through Jackson , Mississippi , eve...",No,No,No
...,...,...,...,...,...,...,...,...,...
991,COCO_val2014_000000014135.jpg,Is it daytime?,yes,The image shows a skateboarder performing a tr...,The image shows a skateboarder performing a tr...,The documentary film Tim Richmond : To The Li...,No,Yes,Yes
992,COCO_val2014_000000014151.jpg,Is this at the Olympics?,yes,A ski jumper is in mid-air above a snow-covere...,A ski jumper is in mid-air above a snow-covere...,Kevin Spacey as David <unk> \n,No,Yes,No
993,COCO_val2014_000000014167.jpg,Is the skateboarder wearing safety gear?,no,The skateboarder is performing a trick down a ...,The skateboarder is performing a trick down a ...,The Island Def Jam rapper Big K.R.I.T. was bo...,Yes,No,No
994,COCO_val2014_000000014175.jpg,What is sticking up from the fire hydrant?,nothing,The image shows a street scene with a fire hyd...,The image shows a street scene with a fire hyd...,"A total of 2 @,@ 000 people attended Slammive...",Red object,Blue object,No fire hydrant in the text.


In [23]:
# selecionando apenas 10 amostras pelo custo

mini_llava_df = llava_df.sample(n=10, random_state=42)
mini_llava_df = mini_llava_df.reset_index(drop=True)

In [24]:
mini_llava_df

Unnamed: 0,image_path,question,answer,Tm,Tc,Ti
0,COCO_val2014_000000011703.jpg,Was this picture taken in front of a door way?,yes,The image shows the interior of a train with a...,The image shows the interior of a train with a...,A Little Matter of Genocide . San Francisco C...
1,COCO_val2014_000000013867.jpg,What object is the person carrying?,frisbee,"The person is standing on a grassy field, wear...","The person is standing on a grassy field, wear...",Perhaps the most enduring legacy of the Mongo...
2,COCO_val2014_000000001268.jpg,What color is the grass?,green,The image shows a scene under a bridge by a ri...,The image shows a scene under a bridge by a ri...,"Adrien Begrand of PopMatters remarked that "" ..."
3,COCO_val2014_000000008292.jpg,What room is this?,kitchen,"The image shows a narrow kitchen with a stove,...",The image shows a narrow bathroom with a batht...,"McCarty grew up in Muskogee , Oklahoma . Afte..."
4,COCO_val2014_000000006471.jpg,What do their shirts' say?,10,"The players' shirts have the word ""Bowie"" writ...","The players' shirts have the word ""Blue"" writt...","The writer of the scenario is unknown , but i..."
5,COCO_val2014_000000003817.jpg,What is this man hauling?,bananas,"A man is riding a motorcycle on a rural road, ...","A man is riding a motorcycle on a rural road, ...","In this episode , Federal Bureau of Investiga..."
6,COCO_val2014_000000004134.jpg,What is the person on the left doing with thei...,shaking,The person on the left is shaking hands with t...,The person on the left is holding a glass of w...,NME lauded the song as the opening track by s...
7,COCO_val2014_000000002061.jpg,Who is in the toilet?,no one,"The toilet is empty, with a blue cleaning brus...",The toilet is occupied by a person wearing a b...,The video begins with an aerial shot of a blo...
8,COCO_val2014_000000009448.jpg,Is this girl eating a cookie?,yes,The girl is holding a blue umbrella and appear...,The girl is holding a blue umbrella and appear...,"With its sequels for the Genesis , Sonic the ..."
9,COCO_val2014_000000008128.jpg,Is the sky dark and overcast?,no,The sky is clear and blue with some light clou...,The sky is dark and overcast with heavy clouds...,"In a return match on 19 January , Yorkshire f..."


In [25]:
llava_base_response = generate_answers_image_only(mini_llava_df, model)

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
llava_mix_ti_responses = generate_answers_mix('Ti',mini_llava_df, model)

In [None]:
llava_mix_tc_responses = generate_answers_mix('Tc',mini_llava_df, model)

In [22]:
llava_mix_tm_responses = generate_answers_mix('Tm',mini_llava_df, model)

Resposta gerada para a imagem 0: Yes
Resposta gerada para 0 imagens.
Resposta gerada para a imagem 1: Frisbee
Resposta gerada para a imagem 2: Green
Resposta gerada para a imagem 3: Kitchen
Resposta gerada para a imagem 4: Bowie
Resposta gerada para a imagem 5: Bananas
Resposta gerada para a imagem 6: Shaking
Resposta gerada para a imagem 7: No one
Resposta gerada para a imagem 8: Yes
Resposta gerada para a imagem 9: No
Resposta gerada para a imagem 10: Green and white
Resposta gerada para a imagem 11: Parked
Resposta gerada para a imagem 12: Oven
Resposta gerada para a imagem 13: No
Resposta gerada para a imagem 14: Blue
Resposta gerada para a imagem 15: 1.39
Resposta gerada para a imagem 16: S
Resposta gerada para a imagem 17: 4
Resposta gerada para a imagem 18: No
Resposta gerada para a imagem 19: No
Resposta gerada para a imagem 20: Yes
Resposta gerada para a imagem 21: No
Resposta gerada para a imagem 22: Stripes
Resposta gerada para a imagem 23: Yes
Resposta gerada para a imagem 

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
mini_llava_df['base_responses'] = pd.Series(llava_base_response)
mini_llava_df['mix_tc_responses'] = pd.Series(llava_mix_tc_responses)
mini_llava_df['mix_tm_responses'] = pd.Series(llava_mix_tm_responses)
mini_llava_df['mix_ti_responses'] = pd.Series(llava_mix_ti_responses)

In [None]:
mini_llava_df.to_csv('data/mini_llava_7b_responses.csv')