In [1]:
import cv2
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import networkx as nx
import numpy as np
import supervision as sv
import math
import time
import requests
import torch
import os
import torchreid
import json
import base64
import io
import shutil
import logging
logging.getLogger("ultralytics").setLevel(logging.ERROR)


from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from ultralytics import YOLO
from transformers import pipeline
from PIL import Image
from qwen_vl_utils import process_vision_info
from our_utils import *
from tqdm import tqdm


def main():

    spacing = 10
    eval_frames = 100
    
    video_metadata = {}
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    qwen = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct",torch_dtype=torch.bfloat16,attn_implementation="flash_attention_2",device_map="cuda:0",)
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
    qwen.eval()
    
    pose = YOLO("yolo11m-pose.pt", 0.15)
    model = YOLO("yolov10m.pt")
    tracker = sv.ByteTrack(track_activation_threshold=0.1)
    palette = sv.ColorPalette.DEFAULT
    
    #cap = cv2.VideoCapture("input.mp4")
    #video_path = "/gpfs/projects/CascanteBonillaGroup/datasets/walking_tours/downloads/Iquitos_Peru - youtube video 0WpiskskL6Y [0WpiskskL6Y]/chunk_001.mp4"
    #video_path = "/gpfs/projects/CascanteBonillaGroup/datasets/walking_tours/downloads/London_United Kingdom - LONDON 4K Walking Tour (UK) - 4h Tour with Captions Immersive Sound [4K Ultra HD60fps] [8WlUiln-VeY]/chunk_002.mp4"
    video_path = "/gpfs/projects/CascanteBonillaGroup/datasets/walking_tours/downloads/Dubai_UAE - youtube video AjwcqYZ6cIw [AjwcqYZ6cIw]/chunk_001.mp4"
    
    country = video_path.split('/')[7].split('_')[0] + '_' + video_path.split('/')[8].split('_')[1][:-4] 
    video_name = video_path.split('/')[7]
    
    video_metadata['path'] = video_path
    
    cap = cv2.VideoCapture(video_path)#"/gpfs/projects/CascanteBonillaGroup/datasets/walking_tours/downloads/Dubai_UAE - youtube video AjwcqYZ6cIw [AjwcqYZ6cIw]/chunk_001.mp4")
    
    os.makedirs(video_name, exist_ok=True)
    dir_path = video_name+'/video'
    if os.path.exists(dir_path):
        shutil.rmtree(dir_path)
    os.makedirs(dir_path, exist_ok=True)
    
    width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps    = int(cap.get(cv2.CAP_PROP_FPS)) 

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    secs = total_frames // fps
    video_metadata['fps'] = fps
    video_metadata['original_total_frames'] = total_frames
    video_metadata['original_secs'] = secs
    total_frames = eval_frames#int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    video_metadata['total_frames'] = (total_frames // spacing) + 1
    video_metadata['spacing'] = spacing
    video_metadata['frames'] = []
    
    for count_frame in tqdm(range(total_frames), desc="Processing video"):
        ret, frame = cap.read()
    
        if count_frame % spacing != 0:
            continue
        
        if not ret:
            print("Video ended, closing...")
            cap.release()
            break
    
        cv2.imwrite(video_name+'/video/'+str(count_frame//spacing)+'.jpg', frame)
    
        results = model(frame, verbose=False)[0]  # Using predict() for latest ultralytics
        detections = sv.Detections.from_ultralytics(results)
        detections = detections[detections.confidence > 0.2]  # Filter low confidence
        detections = detections[detections.class_id == 0]
        tracks = tracker.update_with_detections(detections)#tracker.update(detections=detections)
    
        coord = np.zeros((1275, frame.shape[1],3)).astype(np.uint8)+255
        edges = []
        list_features = []
        
        c_frame = int(count_frame//spacing)
        current_frame_info = {}    
        current_frame_info['frame_id'] = c_frame
        current_frame_info['detections'] = []
    
        for int_id, track_i in enumerate(tracks):
            
            detection = {}
            track_id = int(track_i[4])
            o1_x1, o1_y1, o1_x2, o1_y2 = map(int, track_i[0])
            o1_mid = ((o1_x1+o1_x2)//2, (o1_y1+o1_y2)//2)
            
            opencv_frame = frame[int(o1_y1):int(o1_y2),int(o1_x1):int(o1_x2)]
            pil_frame = opencv_to_pil(opencv_frame)
            prob_male, answer_male = vqa_yes_prob(qwen, processor, pil_frame, 'Is the person a male?')
            prob_female, answer_female = vqa_yes_prob(qwen, processor, pil_frame, 'Is the person a female?')
            prob_child, answer_child = vqa_yes_prob(qwen, processor, pil_frame, 'Is the person a child?')
            prob_nbin, answer_nbin = vqa_yes_prob(qwen, processor, pil_frame, 'Is the person non-binary?')
    
            person = np.array([prob_male, prob_female, prob_child, prob_nbin])
            sex = 'unknown'
            if np.argmax(person)   == 0:
                sex='male'
            elif np.argmax(person) == 1:
                sex='female'
            elif np.argmax(person) == 2:
                sex='child'
            elif np.argmax(person) == 3:
                sex='non binary'
                
            pose_result = pose(opencv_frame)
            direction, visible   = 'unknown', 'unknown'
            
            detection['body_parts'] = None
            detection['conf_body_parts'] = None
            detection['sex'] = sex
            detection['track_id'] = track_id
            
            if len(pose_result[0].boxes) > 0:
    
                confs = pose_result[0].boxes.conf.cpu().numpy()
                best_idx = confs.argmax()
                
                highest_conf_result = pose_result[0][best_idx:best_idx+1]            
                confidence = highest_conf_result.keypoints.conf
                
                values = highest_conf_result.keypoints.conf>0.3
                parts  = ["Nose", "Left Eye", "Right Eye", "Left Ear", "Right Ear", "Left Shoulder", "Right Shoulder","Left Elbow","Right Elbow","Left Wrist","Right Wrist","Left Hip","Right Hip","Left Knee","Right Knee","Left Ankle","Right Ankle"]
                n_val = values.cpu().detach().numpy()[0]
                key_values = highest_conf_result.keypoints.conf.cpu().detach().numpy()[0]
                counts_points_body = np.sum(np.array(n_val[5:]))
    
                if counts_points_body:
                    visible = 'not occluded'
                else:
                    visible = 'occluded'

                if n_val[0] == True and n_val[1] == True and n_val[2] == True and n_val[3] == True and n_val[4] == True:
                    direction = 'front'
                elif n_val[0] == True and n_val[1] == True and n_val[2] == True and n_val[3] == True and n_val[4] == False:
                    direction = 'front right'
                elif n_val[0] == True and n_val[1] == True and n_val[2] == False and n_val[3] == True and n_val[4] == False:
                    direction = 'front rright'
                elif n_val[0] == True and n_val[1] == True and n_val[2] == True and n_val[3] == False and n_val[4] == True:
                    direction = 'front left'
                elif n_val[0] == True and n_val[1] == False and n_val[2] == True and n_val[3] == False and n_val[4] == True:
                    direction = 'front lleft'
                elif n_val[0] == False and n_val[1] == False and n_val[2] == False and n_val[3] == True and n_val[4] == True:
                    direction = 'back'
                elif n_val[0] == False and n_val[1] == False and n_val[2] == True and n_val[3] == True and n_val[4] == True:
                    direction = 'back right'
                elif n_val[0] == False and n_val[1] == True and n_val[2] == False and n_val[3] == True and n_val[4] == True:
                    direction = 'back left'

                detection['body_parts'] = parts
                detection['conf_body_parts'] = key_values.tolist()
            detection['direction'] = direction
            detection['visible'] = visible
            detection['bbox'] = [o1_x1, o1_y1, o1_x2, o1_y2]
            current_frame_info['detections'].append(detection)
        video_metadata['frames'].append(current_frame_info)
    
    with open(f"{video_name}/{country}.json", "w") as fp:
        json.dump(video_metadata, fp, indent=4)
    
    cap.release()
    
if __name__=='__main__':
    main()

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:19<00:00,  3.83s/it]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.



0: 640x256 1 person, 51.4ms
Speed: 1.1ms preprocess, 51.4ms inference, 2.6ms postprocess per image at shape (1, 3, 640, 256)

0: 640x192 4 persons, 51.6ms
Speed: 1.1ms preprocess, 51.6ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 192)

0: 640x256 1 person, 10.9ms
Speed: 1.1ms preprocess, 10.9ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 256)

0: 640x160 (no detections), 51.1ms
Speed: 0.6ms preprocess, 51.1ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 160)

0: 640x160 1 person, 10.2ms
Speed: 0.6ms preprocess, 10.2ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 160)

0: 640x192 2 persons, 10.9ms
Speed: 0.7ms preprocess, 10.9ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 192)

0: 640x160 1 person, 10.9ms
Speed: 0.9ms preprocess, 10.9ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 160)

0: 640x224 1 person, 50.9ms
Speed: 0.7ms preprocess, 50.9ms inference, 1.3ms postprocess per image at shape 

Processing video:   1%|█▎                                                                                                                               | 1/100 [00:17<29:22, 17.80s/it]


0: 640x192 2 persons, 11.1ms
Speed: 1.1ms preprocess, 11.1ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 192)

0: 640x256 1 person, 10.9ms
Speed: 1.2ms preprocess, 10.9ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 256)

0: 640x256 1 person, 10.5ms
Speed: 0.8ms preprocess, 10.5ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 256)

0: 640x224 1 person, 10.9ms
Speed: 0.7ms preprocess, 10.9ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 224)

0: 640x256 1 person, 10.6ms
Speed: 0.8ms preprocess, 10.6ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 256)

0: 640x128 (no detections), 51.2ms
Speed: 0.6ms preprocess, 51.2ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 128)

0: 640x192 2 persons, 10.9ms
Speed: 0.7ms preprocess, 10.9ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 192)


Processing video:  11%|██████████████                                                                                                                  | 11/100 [00:27<03:03,  2.06s/it]


0: 640x256 2 persons, 11.3ms
Speed: 1.4ms preprocess, 11.3ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 256)

0: 640x256 1 person, 11.0ms
Speed: 1.0ms preprocess, 11.0ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 256)

0: 640x224 1 person, 10.8ms
Speed: 2.0ms preprocess, 10.8ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 224)

0: 640x192 2 persons, 11.2ms
Speed: 1.1ms preprocess, 11.2ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 192)

0: 640x224 1 person, 10.9ms
Speed: 0.7ms preprocess, 10.9ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 224)

0: 640x320 1 person, 52.1ms
Speed: 1.0ms preprocess, 52.1ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 320)

0: 640x128 (no detections), 10.9ms
Speed: 0.6ms preprocess, 10.9ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 128)

0: 640x160 (no detections), 10.8ms
Speed: 0.6ms preprocess, 10.8ms inference, 0.6ms postprocess per image at

Processing video:  21%|██████████████████████████▉                                                                                                     | 21/100 [00:38<02:00,  1.52s/it]


0: 640x256 1 person, 11.3ms
Speed: 1.2ms preprocess, 11.3ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 256)

0: 640x320 5 persons, 11.4ms
Speed: 1.4ms preprocess, 11.4ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 320)

0: 640x224 1 person, 10.9ms
Speed: 0.8ms preprocess, 10.9ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 224)

0: 640x224 2 persons, 11.0ms
Speed: 1.0ms preprocess, 11.0ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 224)

0: 640x224 1 person, 10.3ms
Speed: 0.8ms preprocess, 10.3ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 224)

0: 640x256 1 person, 11.0ms
Speed: 0.8ms preprocess, 11.0ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 256)


Processing video:  31%|███████████████████████████████████████▋                                                                                        | 31/100 [00:47<01:23,  1.22s/it]


0: 640x288 3 persons, 11.7ms
Speed: 1.5ms preprocess, 11.7ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 288)

0: 640x224 1 person, 11.0ms
Speed: 1.1ms preprocess, 11.0ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 224)

0: 640x256 1 person, 10.8ms
Speed: 1.1ms preprocess, 10.8ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 256)

0: 640x192 2 persons, 11.3ms
Speed: 1.1ms preprocess, 11.3ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 192)

0: 640x224 1 person, 10.7ms
Speed: 0.7ms preprocess, 10.7ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 224)

0: 640x224 1 person, 10.1ms
Speed: 0.7ms preprocess, 10.1ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 224)


Processing video:  41%|████████████████████████████████████████████████████▍                                                                           | 41/100 [00:55<01:03,  1.07s/it]


0: 640x352 1 person, 52.7ms
Speed: 1.6ms preprocess, 52.7ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 352)

0: 640x256 1 person, 10.9ms
Speed: 1.0ms preprocess, 10.9ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 256)

0: 640x224 1 person, 11.1ms
Speed: 1.0ms preprocess, 11.1ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 224)

0: 640x224 1 person, 10.8ms
Speed: 2.2ms preprocess, 10.8ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 224)

0: 640x192 1 person, 10.8ms
Speed: 0.7ms preprocess, 10.8ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 192)

0: 640x192 1 person, 10.3ms
Speed: 0.7ms preprocess, 10.3ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 192)

0: 640x256 1 person, 10.7ms
Speed: 0.7ms preprocess, 10.7ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 256)


Processing video:  51%|█████████████████████████████████████████████████████████████████▎                                                              | 51/100 [01:05<00:51,  1.06s/it]


0: 640x224 1 person, 11.1ms
Speed: 1.2ms preprocess, 11.1ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 224)

0: 640x224 1 person, 10.9ms
Speed: 1.1ms preprocess, 10.9ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 224)

0: 640x288 1 person, 11.2ms
Speed: 1.5ms preprocess, 11.2ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 288)

0: 640x192 1 person, 11.0ms
Speed: 0.9ms preprocess, 11.0ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 192)

0: 640x224 1 person, 11.1ms
Speed: 0.8ms preprocess, 11.1ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 224)

0: 640x192 1 person, 10.8ms
Speed: 0.7ms preprocess, 10.8ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 192)

0: 640x288 1 person, 10.8ms
Speed: 0.9ms preprocess, 10.8ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 288)


Processing video:  61%|██████████████████████████████████████████████████████████████████████████████                                                  | 61/100 [01:15<00:40,  1.03s/it]


0: 640x224 1 person, 11.2ms
Speed: 1.2ms preprocess, 11.2ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 224)

0: 640x224 2 persons, 10.6ms
Speed: 1.3ms preprocess, 10.6ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 224)

0: 640x224 1 person, 10.7ms
Speed: 1.2ms preprocess, 10.7ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 224)

0: 640x192 2 persons, 10.9ms
Speed: 0.7ms preprocess, 10.9ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 192)

0: 640x192 1 person, 10.5ms
Speed: 0.7ms preprocess, 10.5ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 192)

0: 640x224 1 person, 10.7ms
Speed: 0.8ms preprocess, 10.7ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 224)

0: 640x192 1 person, 11.4ms
Speed: 0.7ms preprocess, 11.4ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 192)

0: 640x224 (no detections), 10.6ms
Speed: 0.7ms preprocess, 10.6ms inference, 0.6ms postprocess per image at shape 

Processing video:  71%|██████████████████████████████████████████████████████████████████████████████████████████▉                                     | 71/100 [01:26<00:30,  1.06s/it]


0: 640x224 2 persons, 10.7ms
Speed: 1.2ms preprocess, 10.7ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 224)

0: 640x160 1 person, 11.3ms
Speed: 0.7ms preprocess, 11.3ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 160)

0: 640x320 1 person, 11.3ms
Speed: 1.5ms preprocess, 11.3ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 320)

0: 640x224 3 persons, 11.5ms
Speed: 2.1ms preprocess, 11.5ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 224)

0: 640x192 1 person, 10.8ms
Speed: 0.7ms preprocess, 10.8ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 192)

0: 640x224 (no detections), 10.7ms
Speed: 0.8ms preprocess, 10.7ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 224)


Processing video:  81%|███████████████████████████████████████████████████████████████████████████████████████████████████████▋                        | 81/100 [01:35<00:18,  1.01it/s]


0: 640x160 1 person, 11.1ms
Speed: 0.9ms preprocess, 11.1ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 160)

0: 640x256 1 person, 11.2ms
Speed: 0.9ms preprocess, 11.2ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 256)

0: 640x256 2 persons, 10.9ms
Speed: 0.9ms preprocess, 10.9ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 256)

0: 640x224 1 person, 10.6ms
Speed: 0.8ms preprocess, 10.6ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 224)

0: 640x224 1 person, 10.8ms
Speed: 1.3ms preprocess, 10.8ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 224)


Processing video: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [01:42<00:00,  1.02s/it]
