In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Local directory of the Qwen1.5-4B-Chat checkpoint whose tokenizer is inspected here.
# NOTE(review): the "modify_" prefix suggests the tokenizer files were edited to add "<image>" — confirm.
modify_qwen_tokenizer_dir = "/root/autodl-tmp/models/Qwen1.5-4B-Chat"
modify_qwen_tokenizer = AutoTokenizer.from_pretrained(modify_qwen_tokenizer_dir)

# Display the ids that "<image>" encodes to; a single id means it is one vocabulary entry.
modify_qwen_tokenizer.encode("<image>")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[151646]

In [5]:
# Tokenizer length; compare it with the "<image>" id to see where that token sits in the vocabulary.
len(modify_qwen_tokenizer)

151647

# llava模型初始化

In [2]:
# Local checkpoint directories: the CLIP ViT-L/14-336 vision encoder and the
# Qwen1.5-4B-Chat language model that are combined into the LLaVA model.
clip_model_name_or_path = "/root/autodl-tmp/models/openai/clip-vit-large-patch14-336"
qwen_model_name_or_path = "/root/autodl-tmp/models/Qwen1.5-4B-Chat"

In [3]:
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoProcessor

# Load both pretrained backbones onto the first GPU. They provide the
# sub-configs and the pretrained modules for the LLaVA model assembled later.
target_device = "cuda:0"

clip_model = AutoModel.from_pretrained(
    clip_model_name_or_path,
    device_map=target_device,
)
llm_model = AutoModelForCausalLM.from_pretrained(
    qwen_model_name_or_path,
    device_map=target_device,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Tokenizer that belongs to the Qwen language model checkpoint.
llm_tokenizer = AutoTokenizer.from_pretrained(qwen_model_name_or_path)
# Display the ids produced for the "<image>" placeholder.
llm_tokenizer.encode("<image>")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[151646]

In [5]:
from transformers import (
LlavaForConditionalGeneration,
LlavaConfig
)

将 clip 模型和 llm_model 模型的 config 拿出来，初始化一个 llava model

In [6]:
# Vision sub-config: taken from the vision tower of the loaded CLIP model.
vision_config = clip_model.vision_model.config

# Text sub-config: taken from the loaded Qwen language model.
text_config = llm_model.config

# Combine the two sub-configs into one LLaVA configuration.
configuration = LlavaConfig(
    vision_config=vision_config,
    text_config=text_config,
)

# Build a LLaVA model from the combined configuration. No checkpoint is loaded
# at this point; the pretrained sub-modules are attached in a later cell.
model = LlavaForConditionalGeneration(configuration)

In [7]:
# Embedding layer used by the vision tower of the LLaVA model (a CLIPVisionEmbeddings module).
model.vision_tower.vision_model.embeddings

CLIPVisionEmbeddings(
  (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
  (position_embedding): Embedding(577, 1024)
)

In [9]:
# The same embedding layer on the standalone CLIP model, for comparison with the LLaVA vision tower.
clip_model.vision_model.embeddings

CLIPVisionEmbeddings(
  (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
  (position_embedding): Embedding(577, 1024)
)

# 把预训练的语言模型和视觉模型（连同权重）挂接到 LLaVA 上

In [8]:
# Attach the pretrained modules to the LLaVA model. These are plain attribute
# assignments, so LLaVA references the same module objects as `llm_model` and
# `clip_model` rather than holding separate copies.
model.language_model = llm_model
model.vision_tower.vision_model = clip_model.vision_model

In [10]:
# First two columns of the token-embedding matrix as seen through the LLaVA model.
model.language_model.model.embed_tokens.weight.data[:, :2]

tensor([[ 4.7302e-03, -6.2866e-03],
        [-5.9814e-03,  1.1841e-02],
        [-5.8899e-03,  1.5747e-02],
        ...,
        [-3.3379e-05,  8.5831e-06],
        [-3.2425e-05,  7.1824e-06],
        [-3.1948e-05, -2.9206e-06]], device='cuda:0')

In [11]:
# The same slice read from the original Qwen model, to compare against the LLaVA model's embeddings.
llm_model.model.embed_tokens.weight.data[:, :2]

tensor([[ 4.7302e-03, -6.2866e-03],
        [-5.9814e-03,  1.1841e-02],
        [-5.8899e-03,  1.5747e-02],
        ...,
        [-3.3379e-05,  8.5831e-06],
        [-3.2425e-05,  7.1824e-06],
        [-3.1948e-05, -2.9206e-06]], device='cuda:0')

将 Qwen tokenizer 的 pad_token_id 复制到 LLaVA 模型的 config 中

In [12]:
# Use the Qwen tokenizer's pad token id as the LLaVA config's pad token id.
model.config.pad_token_id = llm_tokenizer.pad_token_id
# Display the stored value as a check.
model.config.pad_token_id

151643

In [13]:
# Image placeholder token id currently stored in the LLaVA config, before it is overridden below.
model.config.image_token_index

32000

In [14]:
# First id the Qwen tokenizer produces for the "<image>" placeholder.
llm_tokenizer.encode("<image>")[0]

151646

In [15]:
# Resolve the id of the "<image>" placeholder from the tokenizer. It must map
# to exactly one token: if "<image>" were split into several pieces, taking
# `[0]` would silently store the id of the first piece and the model would
# look for image positions at the wrong token.
image_token_ids = llm_tokenizer.encode("<image>")
assert len(image_token_ids) == 1, (
    f'"<image>" must encode to a single token, got {image_token_ids}'
)
model.config.image_token_index = image_token_ids[0]
# Display the stored value as a check.
model.config.image_token_index

151646

In [16]:
# Save the assembled LLaVA model to the output checkpoint directory.
model.save_pretrained("/root/autodl-tmp/LLaVA/model/model001")

In [17]:
# Save the Qwen tokenizer files into the same directory as the model.
llm_tokenizer.save_pretrained("/root/autodl-tmp/LLaVA/model/model001")

('/root/autodl-tmp/LLaVA/model/model001/tokenizer_config.json',
 '/root/autodl-tmp/LLaVA/model/model001/special_tokens_map.json',
 '/root/autodl-tmp/LLaVA/model/model001/vocab.json',
 '/root/autodl-tmp/LLaVA/model/model001/merges.txt',
 '/root/autodl-tmp/LLaVA/model/model001/added_tokens.json',
 '/root/autodl-tmp/LLaVA/model/model001/tokenizer.json')

In [18]:
autoprocessor = AutoProcessor.from_pretrained(clip_model_name_or_path)

# Write the image-processor config (preprocessor_config.json) directly into the
# directory that already holds the saved model and tokenizer, so the checkpoint
# can be loaded with LlavaProcessor without copying files by hand.
autoprocessor.image_processor.save_pretrained("/root/autodl-tmp/LLaVA/model/model001")

# Also keep the full export of the CLIP processor in its own directory.
autoprocessor.save_pretrained("/root/autodl-tmp/LLaVA/model/model002")

[]

注意：需要把 model/model002 里的 preprocessor_config.json 文件复制到 model/model001 里，这样后面的 LlavaProcessor 才能从 model001 同时加载 tokenizer 和图像预处理配置。

# 重启，测试效果

In [1]:
from transformers import LlavaProcessor, LlavaForConditionalGeneration
import torch


# Directory holding the assembled LLaVA checkpoint (model, tokenizer and
# image-processor config).
model_name_or_path = "/root/autodl-tmp/LLaVA/model/model001"

# Load the processor, then the model in bfloat16 on the first GPU.
load_kwargs = {"device_map": "cuda:0", "torch_dtype": torch.bfloat16}
llava_processor = LlavaProcessor.from_pretrained(model_name_or_path)
model = LlavaForConditionalGeneration.from_pretrained(model_name_or_path, **load_kwargs)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [2]:
from PIL import Image

# The "<image>" placeholder marks where the image is inserted into the prompt.
prompt_text = "<image>\nWhat are these?"

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt_text},
]
# Render the conversation to a plain string with the tokenizer's chat template;
# tokenisation happens later inside the processor call.
prompt = llava_processor.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

image_path = "/root/autodl-tmp/LLaVA/MainAfter.jpg"
# Image.open is lazy and keeps the file handle open. Decode inside a context
# manager so the pixels are loaded and the file is closed right away.
with Image.open(image_path) as image_file:
    image = image_file.convert("RGB")

inputs = llava_processor(text=prompt, images=image, return_tensors="pt")
# Move every tensor in the batch to the model's device in one call.
inputs = inputs.to(model.device)

generate_ids = model.generate(**inputs, max_new_tokens=20)
# Keep special tokens so the full chat markup is visible in the printed text.
gen_text = llava_processor.batch_decode(
    generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
)[0]

print(gen_text)

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
<image>
What are these?<|im_end|>
<|im_start|>assistant
These are the words in the sentence: "prends", "prends", "prends",
