In [None]:
!mkdir video_frames
!mkdir  CLIP_features

In [2]:
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

# Prefer GPU 7, but fall back to the CPU when CUDA is unavailable so the cell
# still runs (the CLIP cell further down guards its device the same way).
device = "cuda:7" if torch.cuda.is_available() else "cpu"

# NOTE: trust_remote_code=True runs tokenizer code fetched from the model repo.
tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4v-9b", trust_remote_code=True)

# Instruction prompt that accompanies every frame sent to the captioner.
query = """You are a highly advanced assistant analyzing video frames and generating precise and contextually accurate descriptions of the actions occurring in each frame. Your task is to observe each video frame and produce a detailed caption that describes the current action taking place. Each description should be clear, specific, and concise, capturing the essence of the action accurately.

Guidelines:

Detail-Oriented Descriptions: Focus on the specific action being performed in the frame. Avoid general or vague descriptions. The static, non-interactive, background objects or scenes should NOT be your focus.
Conciseness: Provide precise descriptions in one or two sentences.
Consistency: Maintain a consistent style and level of detail throughout the video sequence."""


# Single sample image used to smoke-test the prompt. convert() returns a fully
# loaded copy, so the underlying file can be closed as soon as it is made.
with Image.open("cutting.jpg") as img:
    image = img.convert('RGB')
inputs = tokenizer.apply_chat_template([{"role": "user", "image": image, "content": query}],
                                       add_generation_prompt=True, tokenize=True, return_tensors="pt",
                                       return_dict=True)  # chat mode




tokenizer_config.json:   0%|          | 0.00/3.25k [00:00<?, ?B/s]

tokenization_chatglm.py:   0%|          | 0.00/17.5k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/THUDM/glm-4v-9b:
- tokenization_chatglm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer.model:   0%|          | 0.00/2.62M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:

# Put the tokenized prompt (text + image) on the same device as the model.
inputs = inputs.to(device)

# Load GLM-4V in bfloat16, move it to the target device and switch to eval mode.
# NOTE: trust_remote_code=True runs modelling code fetched from the model repo.
model = AutoModelForCausalLM.from_pretrained(
    "THUDM/glm-4v-9b",
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
)
model = model.to(device)
model = model.eval()

# Generation settings shared with the captioning loop below.
# top_k=1 restricts sampling to the single most likely token at each step.
# NOTE(review): max_length presumably counts the prompt tokens as well, so the
# room left for the caption shrinks as the prompt grows -- confirm 256 suffices.
gen_kwargs = dict(max_length=256, do_sample=True, top_k=1)

with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)

# Drop the echoed prompt tokens so that only the newly generated text is decoded.
outputs = outputs[:, inputs['input_ids'].size(1):]
print(tokenizer.decode(outputs[0]))

Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

Frame 1: A person's hands are shown expertly chopping green herbs with a chef's knife on a wooden cutting board. The chopping motion is swift and precise, with the knife's blade slicing through the herbs, leaving behind finely chopped pieces. <|endoftext|>


In [16]:
import cv2
import os
import pickle
from tqdm import tqdm


# Caption every extracted frame and write one "<index>\t<caption>" row per frame.
#
# os.listdir() returns names in arbitrary order, so the listing is sorted: the
# zero-padded frame_XXXX.jpg names written by video_to_frames then sort in
# decoding order and the row index matches the frame number. Non-image entries
# (for example a .ipynb_checkpoints directory) are skipped.
frame_files = sorted(
    name for name in os.listdir("video_frames")
    if name.lower().endswith((".jpg", ".jpeg", ".png"))
)

with open("captions.txt", "w") as f:
    for i, file in enumerate(tqdm(frame_files)):
        # convert() returns a loaded copy, so the file handle can be closed at once.
        with Image.open(os.path.join("video_frames", file)) as img:
            frame = img.convert('RGB')
        inputs = tokenizer.apply_chat_template(
            [{"role": "user", "image": frame, "content": query}],
            add_generation_prompt=True, tokenize=True, return_tensors="pt",
            return_dict=True)  # chat mode
        inputs = inputs.to(device)
        with torch.no_grad():
            outputs = model.generate(**inputs, **gen_kwargs)
        # Keep only the tokens generated after the prompt.
        outputs = outputs[:, inputs['input_ids'].shape[1]:]
        caption = tokenizer.decode(outputs[0])
        print(caption)
        # Collapse newlines/tabs inside the caption so the file stays strictly
        # one row per frame with a single tab separator.
        f.write(f"{i}\t" + " ".join(caption.split()) + "\n")

In [6]:


def video_to_frames(video_path, output_folder):
    """Write every frame of a video into ``output_folder`` as numbered JPEG files.

    Frames are saved as ``frame_0000.jpg``, ``frame_0001.jpg``, ... in the
    order they are decoded; the number in the name is the position of the
    frame in the decoded stream.

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory that receives the images; created if missing.

    Returns:
        The number of frames actually written to disk (0 when the video
        cannot be opened).
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_folder, exist_ok=True)

    cap = cv2.VideoCapture(video_path)
    frames_read = 0
    frames_written = 0
    try:
        if not cap.isOpened():
            print("Error: Could not open video.")
            return 0

        # This is the count reported by the capture backend; it can differ from
        # the number of frames that actually decode (a recorded run of this
        # notebook reported 836 here but extracted 834).
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        print(f"Total frames in the video: {frame_count}")

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame_filename = os.path.join(output_folder, f"frame_{frames_read:04d}.jpg")
            # imwrite signals failure through its return value; only count
            # frames that really reached the disk.
            if cv2.imwrite(frame_filename, frame):
                frames_written += 1
            else:
                print(f"Warning: Could not write {frame_filename}")
            frames_read += 1
    finally:
        # Release the capture on every path, including early return and errors.
        cap.release()

    print(f"Extracted {frames_written} frames to {output_folder}")
    return frames_written

if __name__ == "__main__":
    # Source clip that gets split into individual frames.
    video_path = "/home/xinyu/ActionSegmentation/cam01_P03_cereals.avi"

    # Destination directory for the extracted frame images.
    output_folder = "video_frames"

    # Run the extraction with the settings above.
    video_to_frames(video_path=video_path, output_folder=output_folder)


Total frames in the video: 836
Extracted 834 frames to video_frames


In [None]:
import cv2
import numpy as np
import torch
import clip
from PIL import Image
from sklearn.decomposition import PCA


from math import trunc


import os
import re

# The "<index>\t" column that the captioning cell writes in front of each caption.
_ROW_INDEX = re.compile(r"^\d+\t")
# A leading "Frame N:" label such as the "Frame 1:" seen in the captioner output.
_FRAME_LABEL = re.compile(r"^Frame\s*\d+\s*:\s*")


def _clean_caption(text):
    """Remove the end-of-text marker, collapse whitespace and drop a leading "Frame N:" label."""
    text = " ".join(text.replace("<|endoftext|>", "").split())
    return _FRAME_LABEL.sub("", text).strip()


# Read captions.txt into one caption per frame.
# - The row index is stripped so it does not end up inside the text embedding.
# - In an indexed file, a line without an index is the continuation of a
#   caption that contained a newline; it is merged into the previous row so the
#   number of captions still equals the number of frames.
raw_captions = []
indexed = False
with open("captions.txt", "r") as f:
    for line in f:
        row = _ROW_INDEX.match(line)
        if row:
            indexed = True
            raw_captions.append(line[row.end():])
        elif indexed and raw_captions:
            raw_captions[-1] += " " + line
        else:
            raw_captions.append(line)
captions = [_clean_caption(c) for c in raw_captions]

if not captions:
    raise ValueError("captions.txt contains no captions")

# NOTE: this rebinds `device` and `model`, which earlier cells use for the
# GLM-4V captioner; re-run those cells before captioning again.
device = "cuda:6" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
model.eval()

# tokenize() accepts the whole list and returns one row of token ids per caption;
# truncate=True clips captions that exceed CLIP's context length.
text_inputs = clip.tokenize(captions, truncate=True).to(device)
with torch.no_grad():
    text_features = model.encode_text(text_inputs)
    # L2-normalise each caption embedding.
    text_features /= text_features.norm(dim=-1, keepdim=True)

text_features = text_features.cpu().numpy()

name = 'P03_cam01_P03_cereals'
# PCA to 64 dimensions; scikit-learn rejects n_components larger than
# min(n_samples, n_features), so this needs at least 64 captions.
pca = PCA(n_components=64)
reduced_features = pca.fit_transform(text_features)
print("reduced_features.shape: ", reduced_features.shape)
path = f'./CLIP_features/{name}.txt'
# Make sure the output directory exists even if the setup cell was skipped.
os.makedirs(os.path.dirname(path), exist_ok=True)
np.savetxt(path, reduced_features)