In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers.generation import GenerationConfig
from src.models.qwen.modeling_qwen import QWenLMHeadModel
import torch
torch.manual_seed(1234)



<torch._C.Generator at 0x7f9333aae670>

In [3]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL", trust_remote_code=True)

A new version of the following files was downloaded from https://huggingface.co/Qwen/Qwen-VL:
- tokenization_qwen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


In [4]:
model = QWenLMHeadModel.from_pretrained(
    "Qwen/Qwen-VL", 
    device_map="auto", 
    trust_remote_code=True, 
    torch_dtype=torch.float16,
    load_in_4bit=True,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        llm_int8_skip_modules=['visual']
    ),
    fp16=True,
).eval()


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


Downloading shards:   0%|          | 0/10 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

In [5]:
query = tokenizer.from_list_format([
    {'image': 'notebook/mmmu_image.png'},
    # {'image': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'},
    # {'text': 'Each of the following situations relates to a different company. For company B, find the missing amounts. \n\n A: $63,020 \n\n B: $58,410 \n\n C: $71,320 \n\n D: $77,490 \n\n Answer:'},
])
print(query)
inputs = tokenizer(query, return_tensors='pt')
inputs = inputs.to(model.device)

print(inputs.keys(), inputs.input_ids.shape, print(inputs.input_ids))

with torch.no_grad():
    pred = model.generate(**inputs, do_sample=False)

print(pred.shape)

response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)
print(response)



Picture 1: <img>notebook/mmmu_image.png</img>

tensor([[ 24669,    220,     16,     25,    220, 151857,    110,    111,    116,
            101,     98,    111,    111,    107,     47,    109,    109,    109,
            117,     95,    105,    109,     97,    103,    101,     46,    112,
            110,    103, 151859, 151859, 151859, 151859, 151859, 151859, 151859,
         151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859,
         151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859,
         151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859,
         151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859,
         151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859,
         151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859,
         151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859, 151859,
         151859, 151859, 151859, 151859, 151859, 151859, 15185

2024-02-06 15:59:07.565884: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-02-06 15:59:07.565941: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


torch.Size([1, 412])
Picture 1: <img>notebook/mmmu_image.png</img>
Company A Company B Company C Company D 1 Revenues ? $1,480,500 $103,950 $1,054,116 2 Expenses $455,490 1,518,300 78,120 ? 3 Gains 0 ? 4,725 8,505 4 Losses 32,760 0 5,670 39,312 5 Net Income or (Loss) 32,130 39,690 ? ? (58,275)<|endoftext|>


In [6]:
with torch.no_grad():
    output = model(**inputs)

In [7]:
output.logits.shape

torch.Size([1, 264, 151936])

In [5]:
import datasets, json

In [3]:
ds = datasets.load_dataset('MMMU/MMMU', 'Accounting')

In [17]:
json.loads(ds['validation'][0]['options'].replace("'", '"'))

['$6', '$7', '$8', '$9']

In [None]:
query = tokenizer.from_list_format([
    {'image': ds['validation'][0]['image_1']},
    # {'image': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'},
    # {'text': 'Each of the following situations relates to a different company. For company B, find the missing amounts. \n\n A: $63,020 \n\n B: $58,410 \n\n C: $71,320 \n\n D: $77,490 \n\n Answer:'},
])
query

TypeError: can only concatenate str (not "PngImageFile") to str

In [25]:
datasets.load_dataset('./cache/MMMU/Accounting', 'dev')

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    validation: Dataset({
        features: ['id', 'question', 'options', 'explanation', 'image_1', 'image_2', 'image_3', 'image_4', 'image_5', 'image_6', 'image_7', 'img_type', 'answer', 'topic_difficulty', 'question_type', 'subfield'],
        num_rows: 35
    })
    test: Dataset({
        features: ['id', 'question', 'options', 'explanation', 'image_1', 'image_2', 'image_3', 'image_4', 'image_5', 'image_6', 'image_7', 'img_type', 'answer', 'topic_difficulty', 'question_type', 'subfield'],
        num_rows: 380
    })
})



tensor(9.0417e-07)

In [27]:
import json
json.loads('["One pathogen", "Two pathogens", "Three pathogens", "There is no pathogen involved", "I don\\"t know and I don\\"t want to guess"]')

['One pathogen',
 'Two pathogens',
 'Three pathogens',
 'There is no pathogen involved',
 'I don"t know and I don"t want to guess']

In [28]:
eval("['a']")

['a']