In [None]:
import os
import sys
import torch
import transformers

sys.path.append('../MLLM')
from conversation import conv_templates
from constants import DEFAULT_MMODAL_TOKEN, MMODAL_TOKEN_INDEX
from mm_utils import get_model_name_from_path, tokenizer_MMODAL_token, process_video, process_image
from model.builder import load_pretrained_model

# Checkpoint locations: the base checkpoint directory and the pretrained
# checkpoint directory that is loaded on top of it.
model_base = '/data/vllm/VideoLLaMA2-main/mistral_ckpt/Mistral-7B-Instruct-v0___2'
model_path = '/data/vllm/VideoLLaMA2-main/work_dirs/mamba_pretrain_v1/mamba_pretrain_v1'

# The loader takes a model name derived from the checkpoint path.
model_name = get_model_name_from_path(model_path)

# Build tokenizer, model and visual processor, then place the model on the GPU.
tokenizer, model, processor, context_len = load_pretrained_model(model_path, model_base, model_name)
model = model.to('cuda')

# Key into `conv_templates` selecting the prompt format used at inference time.
conv_mode = 'mistral'

In [3]:
from peft import PeftModel
# Directory holding the fine-tuned LoRA adapter weights.
lora_weight = '../work_dirs/mamba_sft/finetune_mamba_sft_91_2stage_1mamba_2mlp_add_pos'
print(f"Loading LoRA weights from {lora_weight}")

# Wrap the already-loaded base model with the adapter, then move the wrapped
# model to the GPU as a separate step.
peft_model = PeftModel.from_pretrained(model, lora_weight)
peft_model = peft_model.cuda()

Loading LoRA weights from /data/vllm/VideoLLaMA2-main/work_dirs/mamba_sft/finetune_mamba_sft_91_2stage_1mamba_2mlp_add_pos


In [None]:

# Path to the checkpoint file stored alongside the LoRA adapter; the cell below
# pulls the `mm_projector` entries out of it.
pretrain_mm_mlp_adapter_v2 = '../work_dirs/mamba_sft/finetune_mamba_sft_91_2stage_1mamba_2mlp_add_pos/non_lora_trainables.bin'

# Load onto the CPU first; the values are copied into the projector module by
# `load_state_dict` further down.
# NOTE(review): torch.load unpickles the file, so only point this at
# checkpoints from a trusted source.
mm_projector_weights = torch.load(pretrain_mm_mlp_adapter_v2, map_location='cpu')


def get_w(weights, keyword):
    """Extract the entries stored under ``keyword`` from a flat weight mapping.

    Args:
        weights: Mapping from fully qualified parameter names
            (e.g. ``'model.mm_projector.0.weight'``) to their values.
        keyword: Name of the sub-module whose entries should be selected
            (e.g. ``'mm_projector'``).

    Returns:
        A new dict holding only the entries whose key contains
        ``keyword + '.'``. Each key is replaced by the text that follows the
        first occurrence of that dotted prefix. Entries whose key does not
        contain the dotted prefix are skipped.
    """
    prefix = keyword + '.'
    # Filter on the same dotted prefix that is used for splitting, so a key
    # that merely contains the keyword (without the dot) cannot raise
    # IndexError. Split only once so a nested module that repeats the keyword
    # keeps its full remaining name.
    return {k.split(prefix, 1)[1]: v for k, v in weights.items() if prefix in k}

# Copy the projector weights selected by `get_w` into the projector module
# nested inside the PEFT wrapper. With strict=False, keys that are missing or
# unexpected do not raise; the call's return value (shown as the cell output)
# should be checked to confirm what was actually matched.
peft_model.base_model.model.model.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector'), strict=False)

In [6]:
def inference(question, video):
    """Ask a single question about one image or video and return the answer.

    The function relies on notebook-level state created in earlier cells:
    ``tokenizer``, ``model``, ``processor``, ``peft_model`` and ``conv_mode``.

    Args:
        question: Text prompt to ask about the visual input.
        video: Path to the media file. Despite the parameter name, a path
            whose extension is ``.jpg``, ``.jpeg`` or ``.png``
            (case-insensitive) is processed as a still image; any other path
            is processed as a video.

    Returns:
        The first sequence produced by ``peft_model.generate``, decoded with
        special tokens skipped. Sampling is enabled (``do_sample=True``,
        ``temperature=0.2``), so repeated calls may return different text.
    """
    # Decide the modality from the file extension only. A substring test on
    # the whole path would misclassify videos stored under a directory whose
    # name contains "png", and would miss ".jpeg" or upper-case extensions.
    extension = os.path.splitext(video)[1].lower()
    modal = 'image' if extension in ('.jpg', '.jpeg', '.png') else 'video'
    modal_list = [modal]

    # Visual preprocess (load & transform image or video).
    if modal == 'video':
        tensor = process_video(video, processor, model.config.image_aspect_ratio, num_frames=16).to(dtype=torch.float16, device='cuda', non_blocking=True)
        default_mm_token = DEFAULT_MMODAL_TOKEN["VIDEO"]
        modal_token_index = MMODAL_TOKEN_INDEX["VIDEO"]
    else:
        # Only the first element of the processed image result is used.
        tensor = process_image(video, processor, model.config.image_aspect_ratio)[0].to(dtype=torch.float16, device='cuda', non_blocking=True)
        default_mm_token = DEFAULT_MMODAL_TOKEN["IMAGE"]
        modal_token_index = MMODAL_TOKEN_INDEX["IMAGE"]
    # generate() is given a list of visual inputs alongside `modal_list`.
    tensor = [tensor]

    # Text preprocess (tag process & generate prompt): the modality
    # placeholder token goes first, followed by the user's question.
    question = default_mm_token + "\n" + question
    conv = conv_templates[conv_mode].copy()
    conv.append_message(conv.roles[0], question)
    # An empty assistant turn leaves the prompt open for the model's reply.
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()
    input_ids = tokenizer_MMODAL_token(prompt, tokenizer, modal_token_index, return_tensors='pt').unsqueeze(0).to('cuda')

    with torch.inference_mode():
        output_ids = peft_model.generate(
            input_ids,
            images_or_videos=tensor,
            modal_list=modal_list,
            do_sample=True,
            temperature=0.2,
            max_new_tokens=1024,
            use_cache=True,
        )

    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return outputs[0]

In [None]:
# Sample clip to run through the model.
video_path = '/data/vllm/datasets/othervideos/combine_dataset/00073.mp4'

# Prompt asking for a numbered list of causes behind the abnormal event.
question = "Please analyze the content of the video and comprehensively summarize the reasons causing the abnormal events. Ensure that you cover all possible reasons for the abnormal events as comprehensively as possible. The output format must strictly follow the following formats:1.xxxx\n2.xxxxx\nFor example,1.The quarrel between the two men became more and more serious.\n2.The car that collided was speeding.\n3.The thief wanted to steal valuables."

# Run the query and leave the answer as the cell's displayed value.
res = inference(question=question, video=video_path)
res