In [1]:
import av
import torch
import numpy as np
from PIL import Image
from transformers import AutoProcessor, AutoModel, XCLIPVisionModel
import torch.nn.functional as F
import torch.nn as nn
import types
np.random.seed(0)
from torchvision import transforms
from videollava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from videollava.conversation import conv_templates, SeparatorStyle
from videollava.model.builder import load_pretrained_model
from videollava.utils import disable_torch_init
from videollava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the processor and the full model for the X-CLIP base (patch32) checkpoint.
# No device move happens here; only `model.vision_model` is sent to the GPU in a later cell.
# NOTE(review): `processor` is rebound by the `load_pretrained_model(...)` cell further down,
# so after that cell runs this name no longer refers to the X-CLIP processor.
processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")



In [37]:
# Load the processor and model for the X-CLIP large (patch14, 16-frame) checkpoint;
# the whole model is moved to the GPU and put in eval mode.
# NOTE(review): a later cell deletes both names, yet cells after that still use them —
# the notebook relies on out-of-order execution here.
XCLIP_processor = AutoProcessor.from_pretrained("microsoft/xclip-large-patch14-16-frames")
XCLIP_model = AutoModel.from_pretrained("microsoft/xclip-large-patch14-16-frames").to("cuda").eval()

`id2label` is found in both `text_config_dict` and `text_config` but with different values. The value `text_config_dict["id2label"]` will be used instead.


In [56]:
# Loader flags; the model-loading cell assigns the same values again before using them.
load_4bit = True
load_8bit = False

In [4]:
import gc

# Drop the notebook-level references to the X-CLIP objects. `pop(..., None)` instead of
# `del` keeps the cell re-runnable: it no longer raises NameError when the names are
# already gone (second run, or a fresh kernel where they were never created).
globals().pop("XCLIP_processor", None)
globals().pop("XCLIP_model", None)
gc.collect()                   # release Python-level object references
torch.cuda.empty_cache()       # return PyTorch's cached GPU memory to the driver

In [4]:
# Input clip (path relative to the notebook's working directory).
video_path = "pts-daily-20241114-224-1.mp4"
# Seven Traditional-Chinese text snippets: four long news captions (three about
# earthquakes, one about a cold wave) followed by three short phrases.
# NOTE(review): presumably candidate texts for video-text matching; no visible cell
# reads `text` — confirm whether it is still needed.
text = [
    "關心昨天晚間6點36分，花蓮富里鄉、發生芮氏規模5.8地震，幾乎全台有感，所幸都沒有傳出嚴重災情，氣象署地震測報中心不排除今明後三天，還可能會有規模4.5到5.5的餘震，大家多加留意＃。",
    "關心早上地牛翻身，07:05花蓮外海發生規模6.2的有感地震，震度3級以上地區，包括有宜蘭、花蓮、新北、新竹、嘉義、雲林、彰化。更多地震消息，請鎖定公視各節新聞。",
    "嘉義縣新港鄉昨天下午五點半，發生芮氏規模5.5地震，深度8.5公里，地震明顯，使得高鐵兩班列車暫時停車，嘉義民雄鄉有民宅圍牆被震倒，還有雲林古坑鄉149甲線內湖明隧道則發生坍方落石，所幸無人受傷。",
    "兔年春節10天連假已經結束，今天開始上班，不少人早上起不來，今年過年冷颼颼，寒流發威，昨天清晨苗栗頭屋出現4度低溫，是入冬以來最低氣溫紀錄。光是1月27日到1月28日二天，全台超過140人，疑似因天冷猝死。氣象局指示，今天寒流雖然開始減弱，不過，明後天，各地都還是日夜溫差大，大家出門要做好保暖工作。",
    "關心地震消息", 
    "今天晚餐吃什麼", 
    "歡迎收看公視新聞"]

In [5]:
def read_video_pyav(container, indices):
    '''
    Decode selected frames of a video with the PyAV decoder.

    The stream is decoded sequentially from the start; decoding stops as soon as
    the frame counter passes the last requested index.

    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): Frame indices to decode. The first and last entries
            bound the decode window, so they are expected in ascending order.
    Returns:
        result (np.ndarray): np array of decoded frames of shape
            (num_frames, height, width, 3), in decode order. A repeated index
            yields a single frame.
    Raises:
        IndexError: if `indices` is empty.
        ValueError: if none of the requested indices could be decoded (e.g. the
            indices lie beyond the end of the stream).
    '''
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    # Hash-based membership: `i in indices` on a list/ndarray is a linear scan
    # (and an array comparison) repeated for every decoded frame.
    wanted = set(np.asarray(indices).tolist())

    frames = []
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in wanted:
            frames.append(frame.to_ndarray(format="rgb24"))

    if not frames:
        # np.stack([]) would also raise ValueError, but with an unhelpful message.
        raise ValueError(
            f"no frames decoded for requested index range [{start_index}, {end_index}]"
        )
    return np.stack(frames)
def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    '''
    Sample a given number of frame indices from a random window of the video.

    A window of `clip_len * frame_sample_rate` frames is placed at a random
    position (drawn from numpy's global RNG) and `clip_len` evenly spaced
    indices are taken from it.

    Args:
        clip_len (`int`): Total number of frames to sample.
        frame_sample_rate (`int`): Sample every n-th frame.
        seg_len (`int`): Maximum allowed index of sample's last frame.
    Returns:
        indices (`np.ndarray`): int64 array of `clip_len` sampled frame indices.
    Raises:
        ValueError: if `seg_len` is not larger than `clip_len * frame_sample_rate`
            (video too short, or the container reported 0 frames).
    '''
    converted_len = int(clip_len * frame_sample_rate)
    if seg_len <= converted_len:
        # np.random.randint would raise the same exception type, but with the
        # opaque message "low >= high".
        raise ValueError(
            f"seg_len ({seg_len}) must be greater than "
            f"clip_len * frame_sample_rate ({converted_len})"
        )
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    # linspace includes end_idx itself; clip so the last index stays inside the window.
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices


In [6]:
# Open the clip, pick 16 frame indices from a random 16-frame window and decode them.
# The `with` block closes the container (and its file handle) once decoding is done;
# the original cell left it open for the lifetime of the kernel.
with av.open(video_path) as container:
    indices = sample_frame_indices(
        clip_len=16,
        frame_sample_rate=1,
        seg_len=container.streams.video[0].frames,
    )
    video = read_video_pyav(container, indices)

In [39]:
# Shape of the decoded clip: (num_frames, height, width, 3) per read_video_pyav's docstring.
video.shape

(16, 1080, 1920, 3)

In [77]:
# Preprocess the decoded frames with the X-CLIP image processor: resize each frame to
# 336x336, skip centre-cropping, and return PyTorch tensors.
# NOTE(review): 336 presumably matches the large checkpoint's input resolution — a
# 224x224 input is recorded further down as failing with a 257-vs-577 size mismatch.
video_inputs = XCLIP_processor.image_processor(
    images=list(video),
    return_tensors="pt",
    size={"height": 336, "width":336},  # explicit target size
    do_resize=True,
    do_center_crop=False  
)
# Recorded shape of this tensor: [1, 16, 3, 336, 336].
pixel_values = video_inputs["pixel_values"].to("cuda")

In [78]:
# Derive everything from `video_inputs` rather than from `pixel_values` itself, so that
# re-running this cell is idempotent. The original `pixel_values = pixel_values.permute(...)`
# swapped the axes back on every second run.
print(video_inputs["pixel_values"].shape)
_, num_frames, num_channels, height, width = video_inputs["pixel_values"].shape
print(video_inputs["pixel_values"].shape)
# [B, T, C, H, W] -> [B, C, T, H, W]
pixel_values = video_inputs["pixel_values"].to("cuda").permute(0, 2, 1, 3, 4)
print(pixel_values.shape)

torch.Size([1, 16, 3, 336, 336])
torch.Size([1, 16, 3, 336, 336])
torch.Size([1, 3, 16, 336, 336])


In [None]:
# Move the model's vision tower to the GPU and run it without gradient tracking.
# NOTE(review): this cell has no execution count and its recorded output is
# [8, 50, 768], whereas the permute cell records `pixel_values` as
# [1, 3, 16, 336, 336]; presumably it was run against a different `pixel_values`
# — confirm before relying on it.
model.vision_model = model.vision_model.to("cuda")
with torch.no_grad():
    vit_out = model.vision_model(pixel_values=pixel_values)  # ran without error when recorded
    patch_features = vit_out.last_hidden_state  # [8, num_tokens, 768]
print(patch_features.shape)

torch.Size([8, 50, 768])


In [None]:
# Move the large X-CLIP vision tower to the GPU and run it without gradient tracking.
# NOTE(review): no execution count; the recorded output [16, 577, 1024] suggests the
# input was a frame batch of 16 rather than the 5-D tensor the permute cell produces
# — confirm which `pixel_values` this was run with.
XCLIP_model.vision_model = XCLIP_model.vision_model.to("cuda")
with torch.no_grad():
    vit16_out = XCLIP_model.vision_model(pixel_values=pixel_values)  # ran without error when recorded
    patch16_features = vit16_out.last_hidden_state 
print(patch16_features.shape)

torch.Size([16, 577, 1024])


In [9]:
print("LLaVA開始")
disable_torch_init()

# Checkpoint and loader settings.
model_path = 'LanguageBind/Video-LLaVA-7B'
model_name = 'Video-LLaVA'
cache_dir = 'cache_dir'
device = 'cuda'
load_4bit = True
load_8bit = False

print("LLaVA載入模型") 
# Positional order is (model_path, model_base, model_name, load_8bit, load_4bit).
# Note that this rebinds `processor` and `_`.
tokenizer, LLaVA_model, processor, _ = load_pretrained_model(
    model_path,
    None,
    model_name,
    load_8bit,
    load_4bit,
    device=device,
    cache_dir=cache_dir,
)
print("LLaVA結束載入")

LLaVA開始
LLaVA載入模型


Loading checkpoint shards: 100%|██████████| 2/2 [03:55<00:00, 117.74s/it]
Some weights of the model checkpoint at LanguageBind/Video-LLaVA-7B were not used when initializing LlavaLlamaForCausalLM: ['model.video_tower.video_tower.encoder.layers.17.temporal_attn.out_proj.weight', 'model.video_tower.video_tower.encoder.layers.2.layer_norm1.weight', 'model.video_tower.video_tower.encoder.layers.17.mlp.fc2.bias', 'model.video_tower.video_tower.encoder.layers.4.temporal_layer_norm1.weight', 'model.video_tower.video_tower.encoder.layers.5.temporal_attn.v_proj.bias', 'model.video_tower.video_tower.encoder.layers.6.temporal_attn.v_proj.weight', 'model.video_tower.video_tower.encoder.layers.2.temporal_attn.q_proj.bias', 'model.image_tower.image_tower.encoder.layers.13.layer_norm1.bias', 'model.video_tower.video_tower.encoder.layers.23.self_attn.out_proj.bias', 'model.video_tower.video_tower.encoder.layers.6.self_attn.q_proj.bias', 'model.video_tower.video_tower.encoder.layers.2.self_attn.q_proj.

LLaVA結束載入


In [None]:
# Count the IMAGE_TOKEN_INDEX placeholders in the prompt (debugging aid; not used below).
num_image_tokens = (input_ids == IMAGE_TOKEN_INDEX).sum().item()
# NOTE(review): this run is recorded as failing with "The size of tensor a (257) must
# match the size of tensor b (577)" after the tower received 224x224 frames. A later
# copy of this cell passes `pixel_values` instead of `video_tensor`.
# `input_ids` and `stopping_criteria` are defined in a cell further down the file, so
# this cell cannot run top-to-bottom on a fresh kernel.
with torch.inference_mode():
    output_ids = LLaVA_model.generate(
        input_ids=input_ids,
        images=video_tensor,  
        do_sample=True,
        temperature=0.1,
        max_new_tokens=1024,
        use_cache=True,
        stopping_criteria=[stopping_criteria],
    )

# Decode only the tokens generated after the prompt.
output_text = tokenizer.decode(output_ids[0, input_ids.shape[1]:], skip_special_tokens=True)
print("LLaVA 回答：", output_text)

in tower_videos.shape torch.Size([16, 3, 224, 224])


RuntimeError: The size of tensor a (257) must match the size of tensor b (577) at non-singleton dimension 1

In [10]:
# Show the first layer of the multimodal projector (recorded: Linear4bit, 1024 -> 4096).
print(LLaVA_model.model.mm_projector[0]) 

Linear4bit(in_features=1024, out_features=4096, bias=True)


In [11]:
# Build a two-message "llava_v1" conversation: the first role carries the image
# placeholder token followed by the question; the second role's message is None
# (presumably the slot the model is expected to fill).
prompt = "This is a sign language video. Please translate it into English."
conv = conv_templates["llava_v1"].copy()
conv.messages = []  
conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + '\n' + prompt)
conv.append_message(conv.roles[1], None)
# `prompt` is rebound here from the bare question to the fully templated prompt.
prompt = conv.get_prompt()
# Tokenise (using IMAGE_TOKEN_INDEX for the image placeholder), add a batch axis, move to GPU.
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
# Generation stops on the conversation template's separator string.
stop_str = conv.sep
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

In [64]:
# Synthetic stand-in clip: uniform noise scaled to [0, 255], laid out as
# [batch, channels, frames, height, width], created on CPU and then moved to the GPU.
video_tensor = (
    torch.rand(1, 3, 16, 336, 336)
    .mul(255)
    .clamp(min=0, max=255)
    .float()
    .to("cuda")
)
print(video_tensor.shape)

torch.Size([1, 3, 16, 336, 336])


In [70]:
from transformers import XCLIPModel, XCLIPProcessor, CLIPVisionConfig
class XCLIPVideoTower(nn.Module):
    """Wrap the vision encoder of an X-CLIP checkpoint as a video tower.

    `forward` takes a clip batch shaped [B, C, T, H, W], runs every frame through
    `XCLIPModel.vision_model` and returns per-frame token features shaped
    [B, T, L, C'].

    Args:
        video_tower_path: checkpoint name or path passed to `from_pretrained`.
        args: optional object; `mm_vision_select_layer` (default -2) and
            `mm_vision_select_feature` (default 'patch') are read from it if present.
        delay_load: when True, weights are not loaded until `load_model()` or the
            first `forward` call.
    """

    def __init__(self, video_tower_path, args=None, delay_load=False):
        super().__init__()
        self.is_loaded = False
        self.video_tower_path = video_tower_path
        # Set unconditionally: previously these only existed when delay_load=False,
        # so a delay-loaded tower crashed in feature_select().
        self.select_layer = getattr(args, 'mm_vision_select_layer', -2)
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
        # Config-only fallback used by `config` while the weights are not loaded;
        # fetched lazily so construction with delay_load=True does no I/O.
        self.cfg_only = None

        if not delay_load:
            self.load_model()

    def load_model(self):
        """Load the checkpoint in fp16 on the GPU (no-op if already loaded)."""
        if not self.is_loaded:
            self.video_tower = XCLIPModel.from_pretrained(self.video_tower_path).half().cuda()
            self.is_loaded = True

    def feature_select(self, video_forward_outs):
        """Pick the configured hidden layer and optionally drop the first token.

        Raises:
            ValueError: if `select_feature` is neither 'patch' nor 'cls_patch'.
        """
        video_features = video_forward_outs.hidden_states[self.select_layer]
        if self.select_feature == 'patch':
            # Drop the first token of every sequence, keep the rest.
            return video_features[:, 1:]
        if self.select_feature == 'cls_patch':
            return video_features
        raise ValueError(f'Unexpected select feature: {self.select_feature}')

    def forward(self, videos, **kwargs):
        """Encode a [B, C, T, H, W] clip batch into [B, T, L, C'] features."""
        if not self.is_loaded:
            self.load_model()
        # Follow the tower's own device/dtype instead of hard-coding fp16.
        videos = videos.to(device=self.device, dtype=self.dtype)
        # [B, C, T, H, W] -> [B*T, C, H, W]
        b, c, t, h, w = videos.shape
        frames = videos.transpose(1, 2).reshape(-1, c, h, w)

        print("in tower_videos.shape", frames.shape)
        video_forward_outs = self.video_tower.vision_model(
            frames,
            output_hidden_states=True
        )

        # [B*T, L, C'] -> [B, T, L, C']
        video_features = self.feature_select(video_forward_outs)
        _, num_tokens, feat_dim = video_features.shape
        print(video_features.shape)
        return video_features.reshape(b, t, num_tokens, feat_dim)

    @property
    def dummy_feature(self):
        """All-zero feature of shape (1, hidden_size) on the tower's device/dtype."""
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def device(self):
        """Device of the tower's first parameter, or None while not loaded."""
        return next(self.video_tower.parameters()).device if self.is_loaded else None

    @property
    def dtype(self):
        """Dtype of the tower's first parameter, or None while not loaded."""
        return next(self.video_tower.parameters()).dtype if self.is_loaded else None

    @property
    def config(self):
        """Config of the loaded model, else a config fetched from the checkpoint path."""
        if self.is_loaded:
            return self.video_tower.config
        if self.cfg_only is None:
            # Previously `cfg_only` was never assigned, so this branch raised AttributeError.
            self.cfg_only = XCLIPModel.config_class.from_pretrained(self.video_tower_path)
        return self.cfg_only

    @property
    def _vision_config(self):
        """Vision sub-config when the config is composite, else the config itself."""
        cfg = self.config
        return getattr(cfg, 'vision_config', cfg)

    @property
    def hidden_size(self):
        """Hidden width of the vision encoder."""
        return self._vision_config.hidden_size

    @property
    def num_patches(self):
        """Number of patches per frame: (image_size // patch_size) ** 2."""
        vision_cfg = self._vision_config
        return (vision_cfg.image_size // vision_cfg.patch_size) ** 2

In [23]:
# Keep a reference to the model's current video tower (presumably so it can be
# inspected or restored after the X-CLIP tower is assigned in a later cell).
llava_video_tower = LLaVA_model.model.video_tower 

In [71]:
# Settings for the replacement video tower. A SimpleNamespace replaces the ad-hoc
# `type('Args', (), {...})()` class: same attribute access, readable repr.
model_args = types.SimpleNamespace(
    video_tower='microsoft/xclip-large-patch14-16-frames',
    mm_vision_select_layer=-2,
    mm_vision_select_feature='patch',
)
# Build the X-CLIP video tower (delay_load defaults to False, so weights load here).
video_tower = XCLIPVideoTower(model_args.video_tower, model_args)

# Install it on the LLaVA model in fp16 on the GPU.
LLaVA_model.model.video_tower = video_tower.half().cuda()

`id2label` is found in both `text_config_dict` and `text_config` but with different values. The value `text_config_dict["id2label"]` will be used instead.


In [30]:
# Inspect the model configuration: active video tower, projector input width, projector type.
print(f"Video tower: {LLaVA_model.get_video_tower()}")
print(f"Hidden size: {LLaVA_model.config.mm_hidden_size}")
print(f"Projector type: {LLaVA_model.config.mm_projector_type}")

Video tower: XCLIPVideoTower(
  (video_tower): XCLIPModel(
    (text_model): XCLIPTextTransformer(
      (embeddings): XCLIPTextEmbeddings(
        (token_embedding): Embedding(49408, 768)
        (position_embedding): Embedding(77, 768)
      )
      (encoder): XCLIPEncoder(
        (layers): ModuleList(
          (0-11): 12 x XCLIPEncoderLayer(
            (self_attn): XCLIPAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (mlp): XCLIPMLP(
              (activation_fn): QuickGELUActivation()
              (fc1): Linear(in_features=768, out_features=3072, bias=True)
              (fc2): Linear(in_features=3072, ou

In [52]:
# Shape check of the synthetic clip (recorded: [1, 3, 16, 336, 336]).
video_tensor.shape

torch.Size([1, 3, 16, 336, 336])

In [80]:
# Count of IMAGE_TOKEN_INDEX placeholders in the prompt (debugging aid; not used below).
num_image_tokens = int((input_ids == IMAGE_TOKEN_INDEX).sum())

# Sample a response for the preprocessed clip without autograd bookkeeping.
with torch.inference_mode():
    output_ids = LLaVA_model.generate(
        input_ids=input_ids, images=pixel_values,
        do_sample=True, temperature=0.1,
        max_new_tokens=1024, use_cache=True,
        stopping_criteria=[stopping_criteria],
    )

# Skip the prompt tokens and decode only the continuation.
output_text = tokenizer.decode(
    output_ids[0][input_ids.shape[1]:],
    skip_special_tokens=True,
)
print("LLaVA 回答：", output_text)

in tower_videos.shape torch.Size([16, 3, 336, 336])
torch.Size([16, 576, 1024])
in encode_videos video_features= tensor([[[[ 0.3623, -0.7271,  1.0098,  ...,  0.9409,  0.1445,  0.5776],
          [-0.0818, -0.8091,  0.9385,  ...,  1.1729,  0.3562,  0.5205],
          [-0.6396, -0.9053,  1.2012,  ...,  0.5811, -0.5024, -0.3679],
          ...,
          [-0.3066, -0.6606,  0.6318,  ...,  0.9624,  0.9106,  0.8086],
          [ 0.2788, -0.2385,  1.5967,  ...,  0.3689,  0.7212,  0.1658],
          [ 0.3850, -0.7690,  1.1074,  ...,  0.8979,  0.2253,  0.5488]],

         [[-0.1271, -1.0342,  1.1426,  ...,  0.6133,  0.5542,  0.9150],
          [-0.1790, -0.9780,  0.9219,  ...,  0.9756,  0.2280,  0.8452],
          [-0.7939, -0.9902,  0.7412,  ...,  0.3032, -0.0082,  0.2034],
          ...,
          [-0.1885, -0.4106,  0.9741,  ...,  0.5967,  0.6279,  0.4541],
          [ 0.1301, -0.7227,  0.7188,  ...,  0.8760,  0.5317,  0.6445],
          [ 0.2417, -0.4995,  0.9238,  ...,  0.6260,  0.0552,  