In [2]:
import torch
from transformers import AutoModelForCausalLM
from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
from deepseek_vl.utils.io import load_pil_images

# 0. Prerequisite: the deepseek-vl package must be installed.
# Run in a terminal: pip install deepseek-vl

# 1. Load the model and its processor.
# Choose "deepseek-ai/deepseek-vl-7b-chat" or the smaller "deepseek-ai/deepseek-vl-1.3b-chat".
model_path = "deepseek-ai/deepseek-vl-1.3b-chat"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

# The checkpoint above is a first-generation DeepSeek-VL model (not DeepSeek-VL2),
# so the log message names it accordingly.
print(f"Loading DeepSeek-VL model: {model_path}...")
# Use torch.bfloat16 when the GPU supports it; otherwise fall back to float16.
model: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
)
# Move to the GPU and make inference mode explicit, matching how the
# DeepSeek-VL2 cell in this notebook prepares its model.
model = model.to("cuda").eval()

# 2. Build the conversation and answer a single image + prompt query.
def run_deepseek(image_path, prompt):
    """Ask the loaded DeepSeek-VL model a question about one image.

    Relies on the module-level ``vl_chat_processor``, ``tokenizer`` and
    ``model`` objects created earlier in the notebook.

    Args:
        image_path: Path of the image, passed through to ``load_pil_images``
            via the conversation's ``"images"`` entry.
        prompt: Text appended after the ``<image_placeholder>`` tag.

    Returns:
        The decoded generation output with special tokens skipped.
    """
    conversation = [
        {
            "role": "User",
            "content": f"<image_placeholder>{prompt}",
            "images": [image_path]
        },
        {
            # Empty assistant turn: the model fills this in.
            "role": "Assistant",
            "content": ""
        }
    ]

    # 3. Process the inputs.
    pil_images = load_pil_images(conversation)
    # NOTE(review): the processor batch's `.to()` appears to cast pixel values
    # to bfloat16 by default. Pass the model dtype explicitly so the float16
    # fallback chosen at load time does not produce a dtype mismatch -- confirm
    # against deepseek_vl's processing code.
    prepare_inputs = vl_chat_processor(
        conversations=conversation,
        images=pil_images,
        force_batchify=True
    ).to(model.device, dtype=model.dtype)

    # Inference only: keep both the embedding pass and generation out of
    # autograd so no gradient state is tracked.
    with torch.no_grad():
        # 4. Build the combined (image + text) input embeddings.
        inputs_embeds = model.prepare_inputs_embeds(**prepare_inputs)

        # 5. Generate the answer (greedy decoding, up to 512 new tokens).
        outputs = model.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    return answer

# --- Test run ---
img_path = "./samples/1.png"  # make sure this image path exists

# The model loaded in this cell is first-generation DeepSeek-VL, so the banner
# says so (DeepSeek-VL2 is exercised separately in the next cell).
print("--- DeepSeek-VL Test ---")
# 1. Describe the image.
print("Caption:", run_deepseek(img_path, "Describe this image in detail."))

# 2. Ask a question about the image.
print("Answer:", run_deepseek(img_path, "What is the main object in the picture?"))

Loading DeepSeek-VL2 model: deepseek-ai/deepseek-vl-1.3b-chat...
--- DeepSeek-VL2 Test ---
Caption: The image captures a serene scene of a rocky shore at sunset. The sky, awash with hues of pink and orange, serves as a stunning backdrop to the tranquil water. The rocks, varying in size and shape, are scattered across the water, their gray and black colors contrasting beautifully with the vibrant sky. The perspective of the image is from the shore, looking out towards the water, giving a sense of depth and distance. In the distance, the silhouette of mountains can be seen, adding a sense of grandeur to the scene. The image does not provide any specific details that could be used to identify the landmark as 'sa_1300'.
Answer: The main object in the picture is a group of rocks or boulders floating on water.


In [4]:
import torch
from transformers import AutoModelForCausalLM

from deepseek_vl2.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM
from deepseek_vl2.utils.io import load_pil_images


# NOTE(review): this cell needs the `deepseek_vl2` package; the recorded
# ModuleNotFoundError means it is not installed in this kernel. It is presumably
# installed from the DeepSeek-VL2 GitHub repository (clone, then
# `pip install -e .`) -- confirm against that repository's README.
#
# NOTE: this cell rebinds `model_path`, `vl_chat_processor` and `tokenizer`,
# which `run_deepseek` in the earlier cell reads as globals. Re-run the earlier
# cell before calling `run_deepseek` again.

# specify the path to the model
model_path = "deepseek-ai/deepseek-vl2-tiny"
vl_chat_processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

## single image conversation example
## Please note that <|ref|> and <|/ref|> are designed specifically for the object localization feature. These special tokens are not required for normal conversations.
## If you would like to experience the grounded captioning functionality (responses that include both object localization and reasoning), you need to add the special token <|grounding|> at the beginning of the prompt. Examples could be found in Figure 9 of our paper.
conversation = [
    {
        "role": "<|User|>",
        "content": "<image>\n<|ref|>The giraffe at the back.<|/ref|>.",
        "images": ["./images/visual_grounding_1.jpeg"],
    },
    # Empty assistant turn: the model fills this in.
    {"role": "<|Assistant|>", "content": ""},
]

# load images and prepare for inputs
pil_images = load_pil_images(conversation)
prepare_inputs = vl_chat_processor(
    conversations=conversation,
    images=pil_images,
    force_batchify=True,
    system_prompt=""
).to(vl_gpt.device)

# Inference only: run the image encoder and generation without autograd
# tracking, as the earlier cell does for generation.
with torch.no_grad():
    # run image encoder to get the image embeddings
    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    # run the model to get the response (greedy decoding, up to 512 new tokens)
    outputs = vl_gpt.language.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=prepare_inputs.attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
        do_sample=False,
        use_cache=True
    )

# Special tokens are kept so grounding markup in the answer stays visible.
answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=False)
print(f"{prepare_inputs['sft_format'][0]}", answer)

ModuleNotFoundError: No module named 'deepseek_vl2'