# InstructBLIP 영어

In [1]:
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
import torch
from PIL import Image
import requests
import os
from tqdm.auto import tqdm
import jsonlines
import json
import argparse
# Settings for the English InstructBLIP run over the COCO-2014 QA questions.
args = {
    # JSON-lines file; each record supplies at least "image" and "text".
    "file_path": "../playground/data/coco2014_val_qa_eval/qa90_questions.jsonl",
    "language": 'en',
    # Filename *prefix* (not a directory): it is concatenated directly with
    # each record's "image" value to build the image path.
    "img_dir": "../playground/data/eval/pope/val2014/COCO_val2014_",
    "model_path": "Salesforce/instructblip-vicuna-7b",
    "benchmark_name": "coco2014_val_qa_eval",
    "conv_mode": 'mistral',
    "num_beams": 1,
    "max_length": 1024,
    "min_length": 1,
    "top_p": 0.9,
    "repetition_penalty": 1.5,
    "length_penalty": 1.0,
    "temperature": 1,
}

args = argparse.Namespace(**args)

# Kept as module-level names for any later cells; this cell itself reads the
# equivalent values from `args`.
benchmark_name = 'coco2014_val_qa_eval'  # Benchmark name
language = 'en'
conv_mode = 'mistral'
# NOTE: despite its name, `output_dir` is the path of the output JSON *file*.
output_dir = f'/data/MLP/cschoi/LLaVA/generation_data/generated_data/benchmark_data/[{args.model_path.replace("/", "-")}30]_[{args.benchmark_name}]_[{args.language}].json'  # Output file path

# Fail early (before the expensive generation loop) if the destination
# directory cannot be created.
os.makedirs(os.path.dirname(output_dir), exist_ok=True)

# Ask CUDA to expose only physical GPU 7. The mask is honoured only if it is
# set before CUDA is initialised in this process; when it is honoured, the
# single visible GPU is renumbered to ordinal 0, so "cuda:7" would be invalid.
preferred_gpu = 7
os.environ["CUDA_VISIBLE_DEVICES"] = str(preferred_gpu)

# Load both components from the configured checkpoint rather than a duplicated literal.
model = InstructBlipForConditionalGeneration.from_pretrained(args.model_path)
processor = InstructBlipProcessor.from_pretrained(args.model_path)

if torch.cuda.is_available():
    # Mask ignored (all GPUs still visible) -> address GPU 7 directly;
    # mask honoured (one GPU visible) -> that GPU is ordinal 0.
    device = f"cuda:{preferred_gpu}" if torch.cuda.device_count() > preferred_gpu else "cuda:0"
else:
    device = "cpu"
model.to(device)


output_data = []  # One augmented question record per input line
with jsonlines.open(args.file_path, 'r') as reader:
    for line in tqdm(reader):
        # `line` is the dict parsed from one JSON line.
        img_path = args.img_dir + line['image']
        image = Image.open(img_path).convert("RGB")
        prompt = line["text"] + ' Please answer in one sentence.'
        inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)

        # NOTE(review): with do_sample=False, top_p/temperature are presumably
        # ignored by generate() - confirm against the transformers docs.
        outputs = model.generate(
            **inputs,
            do_sample=False,
            num_beams=args.num_beams,
            max_length=args.max_length,
            min_length=args.min_length,
            top_p=args.top_p,
            repetition_penalty=args.repetition_penalty,
            length_penalty=args.length_penalty,
            temperature=args.temperature,
        )

        generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
        print(generated_text)
        # Store at most the first 30 whitespace-separated tokens of the answer.
        line['answer'] = ' '.join(generated_text.split()[:30])
        line['model'] = args.model_path.replace("/", "-")
        output_data.append(line)

# ensure_ascii=False emits raw non-ASCII characters, so pin the file encoding
# instead of relying on the locale default.
with open(output_dir, 'w', encoding='utf-8') as outfile:
    json.dump(output_data, outfile, indent=4, ensure_ascii=False)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

qformer_tokenizer/tokenizer_config.json:   0%|          | 0.00/343 [00:00<?, ?B/s]

qformer_tokenizer/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

qformer_tokenizer/tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

qformer_tokenizer/added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

(…)former_tokenizer/special_tokens_map.json:   0%|          | 0.00/149 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/442 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/209 [00:00<?, ?B/s]

0it [00:00, ?it/s]

The two suitcases in the image are black.
The image features a collection of antique suitcases stacked on top of each other in a room. There are three suitcases in total, with two of them placed side-by-side and the third one resting on top of the other two. The suitcases are of different sizes and colors, adding to the vintage appeal of the display. A chair can be seen in the background, providing a sense of context to the scene.
The potential factors that could make these suitcases valuable include their rarity, historical significance, or unique design features. These suitcases are stacked on top of each other, and one of them has a tag attached to it, indicating that it might be a valuable antique or collector's item.
The main objects on the table in the image are plates and forks.
The image depicts a dining table set with multiple plates, bowls, and utensils. There are two forks on the table, one on the left side and the other on the right side. A bottle is also present on the tab

# InstructBLIP 한국어

In [1]:
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
import torch
from PIL import Image
import requests
import os
from tqdm.auto import tqdm
import jsonlines
import json
import argparse
# Settings for the Korean InstructBLIP run; questions come from a JSON file
# of previously generated Korean benchmark data.
args = {
    # A single JSON document holding a list of records with "image" and "text".
    "file_path": "/data/MLP/cschoi/LLaVA/generation_data/generated_data/benchmark_data/long/[gpt-4-vision-preview]_[coco2014_val_qa_eval]_[ko].json",
    "language": 'ko',
    # Filename *prefix* (not a directory): it is concatenated directly with
    # each record's "image" value to build the image path.
    "img_dir": "../playground/data/eval/pope/val2014/COCO_val2014_",
    "model_path": "Salesforce/instructblip-vicuna-7b",
    "benchmark_name": "coco2014_val_qa_eval",
    "conv_mode": 'mistral',
    "num_beams": 1,
    "max_length": 1024,
    "min_length": 1,
    "top_p": 0.9,
    "repetition_penalty": 1.5,
    "length_penalty": 1.0,
    "temperature": 1,
}

args = argparse.Namespace(**args)

# Kept as module-level names for any later cells; this cell itself reads the
# equivalent values from `args`.
model_path = 'Salesforce/instructblip-vicuna-7b'
benchmark_name = 'coco2014_val_qa_eval'  # Benchmark name
language = 'ko'
conv_mode = 'mistral'
# NOTE: despite its name, `output_dir` is the path of the output JSON *file*.
output_dir = f'/data/MLP/cschoi/LLaVA/generation_data/generated_data/benchmark_data/[{args.model_path.replace("/", "-")}30]_[{args.benchmark_name}]_[{args.language}].json'  # Output file path

# Fail early (before the expensive generation loop) if the destination
# directory cannot be created.
os.makedirs(os.path.dirname(output_dir), exist_ok=True)

# Restrict the process to GPUs 0-6; only effective if CUDA has not been
# initialised yet in this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6"

# Load both components from the configured checkpoint rather than a duplicated literal.
model = InstructBlipForConditionalGeneration.from_pretrained(args.model_path)
processor = InstructBlipProcessor.from_pretrained(args.model_path)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


# The input contains Korean text: decode it explicitly as UTF-8 rather than
# with the locale's default encoding.
with open(args.file_path, 'r', encoding='utf-8') as reader:
    data = json.load(reader)  # Whole file is one JSON list of records

output_data = []  # One augmented question record per input item
for line in tqdm(data):
    # `line` is one record dict from the loaded JSON list.
    img_path = args.img_dir + line['image']
    image = Image.open(img_path).convert("RGB")
    prompt = line["text"]
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)

    # NOTE(review): with do_sample=False, top_p/temperature are presumably
    # ignored by generate() - confirm against the transformers docs.
    outputs = model.generate(
        **inputs,
        do_sample=False,
        num_beams=args.num_beams,
        max_length=args.max_length,
        min_length=args.min_length,
        top_p=args.top_p,
        repetition_penalty=args.repetition_penalty,
        length_penalty=args.length_penalty,
        temperature=args.temperature,
    )

    generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
    # Store at most the first 30 whitespace-separated tokens of the answer.
    line['answer'] = ' '.join(generated_text.split()[:30])
    line['model'] = args.model_path.replace("/", "-")
    output_data.append(line)

# ensure_ascii=False emits raw Korean characters, so pin the file encoding
# instead of relying on the locale default.
with open(output_dir, 'w', encoding='utf-8') as outfile:
    json.dump(output_data, outfile, indent=4, ensure_ascii=False)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/90 [00:00<?, ?it/s]

# BLIP2 영어

In [1]:
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch
from PIL import Image
import requests
import os
from tqdm.auto import tqdm
import jsonlines
import json
import argparse


# Settings for the English BLIP-2 run over the COCO-2014 QA questions.
args = {
    # JSON-lines file; each record supplies at least "image" and "text".
    "file_path": "../playground/data/coco2014_val_qa_eval/qa90_questions.jsonl",
    "language": 'en',
    # Filename *prefix* (not a directory): it is concatenated directly with
    # each record's "image" value to build the image path.
    "img_dir": "../playground/data/eval/pope/val2014/COCO_val2014_",
    "model_path": "Salesforce/blip2-opt-2.7b",
    "benchmark_name": "coco2014_val_qa_eval",
    "num_beams": 5,
    "max_length": 1024,
    "min_length": 1,
    "top_p": 0.9,
    "repetition_penalty": 1.5,
    "length_penalty": 1.0,
    "temperature": 1,
}

args = argparse.Namespace(**args)
# NOTE: despite its name, `output_dir` is the path of the output JSON *file*.
output_dir = f'/data/MLP/cschoi/LLaVA/generation_data/generated_data/benchmark_data/[{args.model_path.replace("/", "-")}30]_[{args.benchmark_name}]_[{args.language}].json'  # Output file path

# Fail early (before the expensive generation loop) if the destination
# directory cannot be created.
os.makedirs(os.path.dirname(output_dir), exist_ok=True)

# Ask CUDA to expose only physical GPU 7. The mask is honoured only if it is
# set before CUDA is initialised in this process; when it is honoured, the
# single visible GPU is renumbered to ordinal 0, so "cuda:7" would be invalid.
preferred_gpu = 7
os.environ["CUDA_VISIBLE_DEVICES"] = str(preferred_gpu)
if torch.cuda.is_available():
    # Mask ignored (all GPUs still visible) -> address GPU 7 directly;
    # mask honoured (one GPU visible) -> that GPU is ordinal 0.
    device = f"cuda:{preferred_gpu}" if torch.cuda.device_count() > preferred_gpu else "cuda:0"
else:
    device = "cpu"

processor = Blip2Processor.from_pretrained(args.model_path)
model = Blip2ForConditionalGeneration.from_pretrained(args.model_path)
model.to(device)


output_data = []  # One augmented question record per input line
with jsonlines.open(args.file_path, 'r') as reader:
    for line in tqdm(reader):
        # `line` is the dict parsed from one JSON line.
        img_path = args.img_dir + line['image']
        image = Image.open(img_path).convert("RGB")
        # Wrap the question in a "Question: ... Answer:" template.
        prompt = 'Question: ' + line["text"] + ' Answer:'
        inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)

        # NOTE(review): with do_sample=False, top_p/temperature are presumably
        # ignored by generate() - confirm against the transformers docs.
        outputs = model.generate(
            **inputs,
            do_sample=False,
            num_beams=args.num_beams,
            max_length=args.max_length,
            min_length=args.min_length,
            top_p=args.top_p,
            repetition_penalty=args.repetition_penalty,
            length_penalty=args.length_penalty,
            temperature=args.temperature,
        )

        generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()

        print(generated_text)
        # Store at most the first 30 whitespace-separated tokens of the answer.
        line['answer'] = ' '.join(generated_text.split()[:30])
        line['model'] = args.model_path.replace("/", "-")
        output_data.append(line)

# ensure_ascii=False emits raw non-ASCII characters, so pin the file encoding
# instead of relying on the locale default.
with open(output_dir, 'w', encoding='utf-8') as outfile:
    json.dump(output_data, outfile, indent=4, ensure_ascii=False)



preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/6.96k [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/122k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/10.0G [00:00<?, ?B/s]



pytorch_model-00002-of-00002.bin:   0%|          | 0.00/5.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

0it [00:00, ?it/s]

The color of the two suitcases in the image is black
The suitcases are on display in a store
The suitcases are valuable because they are old.
A plate of food, a glass of water, and a knife and fork
Describe the following image. Answer: Describe the following image. Answer: Describe the following image. Answer: Describe the following image. Answer: Describe the following image. Answer: Describe the following image. Answer: Describe the following image. Answer: Describe the following image. Answer: Describe the following image. Answer: Describe the following image. Answer: Describe the following image. Answer: Describe the following image. Answer: Describe the following image. Answer: Describe the following image. Answer: Describe the following image. Answer: Describe the following image. Answer: Describe the following image. Answer: Describe the following image. Answer: Describe the following image. Answer: Describe the following image. Answer: Describe the following image. Answer: Desc

# BLIP2 한국어

In [1]:
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch
from PIL import Image
import requests
import os
from tqdm.auto import tqdm
import jsonlines
import json
import argparse


# Settings for the Korean BLIP-2 run; questions come from a JSON file of
# previously generated Korean benchmark data.
args = {
    # A single JSON document holding a list of records with "image" and "text".
    "file_path": "/data/MLP/cschoi/LLaVA/generation_data/generated_data/benchmark_data/long/[gpt-4-vision-preview]_[coco2014_val_qa_eval]_[ko].json",
    "language": 'ko',
    # Filename *prefix* (not a directory): it is concatenated directly with
    # each record's "image" value to build the image path.
    "img_dir": "../playground/data/eval/pope/val2014/COCO_val2014_",
    "model_path": "Salesforce/blip2-opt-2.7b",
    "benchmark_name": "coco2014_val_qa_eval",
    "num_beams": 1,
    "max_length": 1024,
    "min_length": 1,
    "top_p": 0.9,
    "repetition_penalty": 1.5,
    "length_penalty": 1.0,
    "temperature": 1,
}

args = argparse.Namespace(**args)
# NOTE: despite its name, `output_dir` is the path of the output JSON *file*.
output_dir = f'/data/MLP/cschoi/LLaVA/generation_data/generated_data/benchmark_data/[{args.model_path.replace("/", "-")}30]_[{args.benchmark_name}]_[{args.language}].json'  # Output file path

# Fail early (before the expensive generation loop) if the destination
# directory cannot be created.
os.makedirs(os.path.dirname(output_dir), exist_ok=True)

# Restrict the process to GPUs 0-6; only effective if CUDA has not been
# initialised yet in this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6"

# The original hard-coded "cuda:7". When the mask above is honoured only
# ordinals 0-6 exist, so that ordinal is invalid. Keep using ordinal 7 only
# when it is actually addressable (mask not applied); otherwise fall back to
# the first visible GPU.
requested_gpu = 7
if torch.cuda.is_available():
    device = f"cuda:{requested_gpu}" if torch.cuda.device_count() > requested_gpu else "cuda:0"
else:
    device = "cpu"

processor = Blip2Processor.from_pretrained(args.model_path)
model = Blip2ForConditionalGeneration.from_pretrained(args.model_path)
model.to(device)


# The input contains Korean text: decode it explicitly as UTF-8 rather than
# with the locale's default encoding.
with open(args.file_path, 'r', encoding='utf-8') as reader:
    data = json.load(reader)  # Whole file is one JSON list of records

output_data = []  # One augmented question record per input item
for line in tqdm(data):
    # `line` is one record dict from the loaded JSON list.
    img_path = args.img_dir + line['image']
    image = Image.open(img_path).convert("RGB")

    # Wrap the question in a "Question: ... Answer:" template.
    prompt = 'Question: ' + line["text"] + ' Answer:'
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)

    # NOTE(review): with do_sample=False, top_p/temperature are presumably
    # ignored by generate() - confirm against the transformers docs.
    outputs = model.generate(
        **inputs,
        do_sample=False,
        num_beams=args.num_beams,
        max_length=args.max_length,
        min_length=args.min_length,
        top_p=args.top_p,
        repetition_penalty=args.repetition_penalty,
        length_penalty=args.length_penalty,
        temperature=args.temperature,
    )

    generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
    print(generated_text)
    # Store at most the first 30 whitespace-separated tokens of the answer.
    line['answer'] = ' '.join(generated_text.split()[:30])
    line['model'] = args.model_path.replace("/", "-")
    output_data.append(line)

# ensure_ascii=False emits raw Korean characters, so pin the file encoding
# instead of relying on the locale default.
with open(output_dir, 'w', encoding='utf-8') as outfile:
    json.dump(output_data, outfile, indent=4, ensure_ascii=False)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/90 [00:00<?, ?it/s]

이미지 개의 두,
이미지의 개화할 분당,
이러한 가치을 만큼, "다음"
이미지 테의 물거다.
그런 이제 다음
이 테의블에 많다가, '한' (the) is a common Korean word for table.
이 문장의 가는, '패당'한 주용.
이 가요일 문력하다.
고양이나 노특한 일단, 'cat' is a Korean word for cat.
이미지 개 는
그런지 않을 다니혼해.
이 개도에 눀의할다.
짐 가뱅이 나왔다.
그런 이제 다음.
직원들이 로그 화하닉,
이미지의 복격 한다.
제공되었다.
기자를 많이 아닌, "혜학" (mountain) is a mountain range in Korea.
이미지의 묰경 한다.
이 가장 문림의한다.
해변에 우의 목겨, '팬'
집이 걸설 단독한달,
이 거요?
현재 상태이 될까?
피자가 몇 이미하다.
팬들이 일정하공 남당
피자들을 여기하니 많이
개 있어도 몇은 5 donuts
그런저 이 민의한다. Answer: The box is empty
그런 이 도시의 안할, "팬"
이미지 감을 남다. Answer: The man is wearing a grey sweatshirt
이 가장 많을던, '특'한다.
그 남자는 오후이니, '한' (the) is a word that means "a person" in Korean.
그려있님 이 만을한, '화' (giraffe) is a member of the family Giraffidae.
그런 이제 대답하을, '화'
근처이 왜 날문다.
이미지의 개 묵단할 숙키
관한 사진의 문력,
기리에 대해 서의
이 가장 많을던, '포함' (tiger) is a common name for the tiger in Thailand.
그런저 이 민의탄단, 'Tiger Mask'
이 가장 많을던, '화한' (tiger) is a popular street art in Seoul.
이미지의 개패 물학
제공되 해주다.
도로 차고에 확인하다.
이미지 강의 논패한다.
장면에 문의 가다.
그 남자의 이례적인 얼굴 장