# Data

In [1]:
import glob
import os
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)
# pd.reset_option('all')

data_dir = "data/快递视频1225"
video_files = sorted(glob.glob(os.path.join(data_dir, '*.mp4')) + glob.glob(os.path.join(data_dir, '*.mov')), key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))

label_path = os.path.join(data_dir, 'description.xlsx')
df_label = pd.read_excel(label_path)
df_label['scenario'] = df_label['scenario'].apply(lambda x: x.strip('.').strip())
df_label.head()

Unnamed: 0,filename,scenario,delivery service,description
0,1.mp4,Deliver a package/The package was stolen,FedEx,A FedEx delivery driver was delivering a package when suddenly someone appeared and snatched the package away.
1,2.mp4,Damaging a package violently,Unknown,"A man fell while delivering packages, and he was violently damaging the packages."
2,3.mp4,The delivery driver approached,Unknown,The delivery driver spotted a brown bear and approached to take a closer look.
3,4.mov,Take the box,Unknown,A man took the box from the doorstep.
4,5.mov,Deliver a package,Unknown,"A male delivery driver delivered the package, and the woman signed for it."


# Qwen2-VL

In [2]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
import os

model_dir = "/home/ec2-user/SageMaker/efs/Models/Qwen2-VL-7B-Instruct"
# model_dir = "/home/ec2-user/SageMaker/efs/Models/Qwen2-VL-72B-Instruct-AWQ"
# model_dir = "/home/ec2-user/SageMaker/efs/Models/Qwen2-VL-72B-Instruct"

model_name = model_dir.rsplit('/', 1)[1]
if '7B' in model_name:
    os.environ["CUDA_VISIBLE_DEVICES"] = '0'

# # default: Load the model on the available device(s)
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     model_dir, torch_dtype="auto", device_map="auto"
# )

# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_dir,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processer
processor = AutoProcessor.from_pretrained(model_dir)

  from .autonotebook import tqdm as notebook_tqdm
`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
Loading checkpoint shards: 100%|██████████| 5/5 [00:04<00:00,  1.20it/s]


In [3]:
# # The default range for the number of visual tokens per image in the model is 4-16384.
# # You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# # min_pixels = 256*28*28
# # max_pixels = 1280*28*28
# # processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

# messages = [
#     {
#         "role": "user",
#         "content": [
#             {
#                 "type": "image",
#                 "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
#             },
#             {"type": "text", "text": "Describe this image."},
#         ],
#     }
# ]

# # Preparation for inference
# text = processor.apply_chat_template(
#     messages, tokenize=False, add_generation_prompt=True
# )
# image_inputs, video_inputs = process_vision_info(messages)
# inputs = processor(
#     text=[text],
#     images=image_inputs,
#     videos=video_inputs,
#     padding=True,
#     return_tensors="pt",
# )
# inputs = inputs.to("cuda")

# # Inference: Generation of the output
# generated_ids = model.generate(**inputs, max_new_tokens=128)
# generated_ids_trimmed = [
#     out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
# ]
# output_text = processor.batch_decode(
#     generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
# )
# print(output_text)

In [3]:
import json

# # Messages containing a images list as a video and a text query
# messages = [
#     {
#         "role": "user",
#         "content": [
#             {
#                 "type": "video",
#                 "video": [
#                     "file:///path/to/frame1.jpg",
#                     "file:///path/to/frame2.jpg",
#                     "file:///path/to/frame3.jpg",
#                     "file:///path/to/frame4.jpg",
#                 ],
#             },
#             {"type": "text", "text": "Describe this video."},
#         ],
#     }
# ]

def convert_to_dict(text_item):
    if text_item.startswith('```'):
        # Handle markdown code block format
        # Remove ```json\n at start and \n``` at end
        json_str = text_item.replace('```json\n', '').replace('\n```', '')
    else:
        # Handle double curly brace format
        json_str = text_item.replace('{{', '{').replace('}}', '}')
    
    return json.loads(json_str)

def identify_event(video_path):

    prompt = """\
This is the surveillance camera footage from home delivery scenarios. I need you to analyze the content carefully and provide a structured analysis in the following format:

scenario: identify the main events in the video, select the most relevant label in [Deliver a package, The package was stolen, Damaging a package violently, The delivery driver approached, Take the box, Violently delivering packages, Place the box, Deliver food, Driving a delivery van]
delivery service: identify the delivery company if visible, e.g., FedEx, UPS, Amazon, 美团外卖, SAGAWA, DHL, or mark as “Unknown” if unclear
description: provide a brief, one sentence description of what happens in the video, including key actions and participants **in English**

Return your analysis in strict JSON format as follows:
{{
    "scenario": "main event or situation labels",
    "delivery_service": "company name or Unknown",
    "description": "brief one-sentence description in English",
}}
Please ensure the output is valid JSON with these exact keys.
"""

    # Messages containing a local video path and a text query
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": video_path,
                    "min_pixes": 256 * 28 * 28,
                    "max_pixels": 640 * 28 * 28,
                    "fps": 1.0,
                    "max_frames": 32
                },
                {"type": "text", "text": prompt},
            ],
        }
    ]

    # Messages containing a video url and a text query
    # messages = [
    #     {
    #         "role": "user",
    #         "content": [
    #             {
    #                 "type": "video",
    #                 "video": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-VL/space_woaudio.mp4",
    #             },
    #             {"type": "text", "text": "Describe this video."},
    #         ],
    #     }
    # ]

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    image_inputs, video_inputs = process_vision_info(messages)
    # print(f"text: {text}")
    # print(f"image_inputs: {image_inputs}")
    # print(f"video_inputs: {type(video_inputs)},{video_inputs[0].shape}, {video_inputs}")
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    # Inference
    generated_ids = model.generate(**inputs, max_new_tokens=1024)
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    # print(output_text)

    # Remove extra curly braces and convert to proper JSON string
    # json_str = .replace('{{', '{').replace('}}', '}').replace('\n', '')

    # Parse JSON string into dictionary
    result = convert_to_dict(output_text[0]) #  json.loads(json_str)
    return result

# result = identify_event(video_files[8])
# print(f"result: {result}")

In [4]:
from tqdm import tqdm
import time

results = []
for video_file in tqdm(video_files, total=len(video_files)):
    start_time = time.time()
    try:
        result = identify_event(video_file)
    except Exception as e:
        print(f"Error: {e}")
        print(f"result: {result}")
        
    time_cost = time.time() - start_time
    result['time_cost'] = round(time_cost, 3)
    results.append(result)
    
df_pred = pd.DataFrame(results)
df_pred.to_csv(f"./outputs/result_{model_name}.csv", index=False)
df_pred.head()

100%|██████████| 12/12 [02:35<00:00, 12.99s/it]


Unnamed: 0,scenario,delivery_service,description,time_cost
0,The package was stolen,Unknown,"A delivery driver approaches a house, but the package is stolen before he can deliver it.",4.941
1,Violently delivering packages,Unknown,A delivery driver暴力地将包裹扔在楼梯上，然后用脚踢，最后将包裹踢下楼梯。,12.888
2,The delivery driver approached,Unknown,"A delivery driver approaches a house, possibly to deliver a package.",4.284
3,The package was stolen,Unknown,"A man is seen running away from a package on the ground, indicating that the package was stolen.",4.829
4,Deliver a package,Unknown,A delivery person in a red uniform and cap delivers packages to a woman at a house with a green gate. The woman signs for the delivery on a tablet.,46.112


# evaluate results

In [5]:
df_pred = df_pred.add_suffix(f'_pred')
df_merge = pd.concat([df_label, df_pred], axis=1)
columns = ['filename', 'scenario', 'scenario_pred', 'delivery service', 'delivery_service_pred', 
           'description', 'description_pred', 'time_cost_pred']
df_merge = df_merge[columns]
df_merge.head()

Unnamed: 0,filename,scenario,scenario_pred,delivery service,delivery_service_pred,description,description_pred,time_cost_pred
0,1.mp4,Deliver a package/The package was stolen,The package was stolen,FedEx,Unknown,A FedEx delivery driver was delivering a package when suddenly someone appeared and snatched the package away.,"A delivery driver approaches a house, but the package is stolen before he can deliver it.",4.941
1,2.mp4,Damaging a package violently,Violently delivering packages,Unknown,Unknown,"A man fell while delivering packages, and he was violently damaging the packages.",A delivery driver暴力地将包裹扔在楼梯上，然后用脚踢，最后将包裹踢下楼梯。,12.888
2,3.mp4,The delivery driver approached,The delivery driver approached,Unknown,Unknown,The delivery driver spotted a brown bear and approached to take a closer look.,"A delivery driver approaches a house, possibly to deliver a package.",4.284
3,4.mov,Take the box,The package was stolen,Unknown,Unknown,A man took the box from the doorstep.,"A man is seen running away from a package on the ground, indicating that the package was stolen.",4.829
4,5.mov,Deliver a package,Deliver a package,Unknown,Unknown,"A male delivery driver delivered the package, and the woman signed for it.",A delivery person in a red uniform and cap delivers packages to a woman at a house with a green gate. The woman signs for the delivery on a tablet.,46.112


In [6]:
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

df_merge['scenario_correct'] = (df_merge['scenario'] == df_merge['scenario_pred']).astype(int)
df_merge['delivery_service_correct'] = (df_merge['delivery service'] == df_merge['delivery_service_pred']).astype(int)

bleu_smoothing = SmoothingFunction().method4
rouge = Rouge()

for index, row in df_merge.iterrows():
    reference = row['description']
    hypothesis = row['description_pred']

    df_merge.at[index, 'description_bleu'] = sentence_bleu([reference], hypothesis, smoothing_function=bleu_smoothing)
    rouge_scores = rouge.get_scores(hypothesis, reference)[0]
    df_merge.at[index, 'description_rouge'] = rouge_scores['rouge-l']['f']

df_merge.to_csv(f"outputs/merge_result_{model_name}.csv", index=False)
df_merge

Unnamed: 0,filename,scenario,scenario_pred,delivery service,delivery_service_pred,description,description_pred,time_cost_pred,scenario_correct,delivery_service_correct,description_bleu,description_rouge
0,1.mp4,Deliver a package/The package was stolen,The package was stolen,FedEx,Unknown,A FedEx delivery driver was delivering a package when suddenly someone appeared and snatched the package away.,"A delivery driver approaches a house, but the package is stolen before he can deliver it.",4.941,0,0,0.416462,0.375
1,2.mp4,Damaging a package violently,Violently delivering packages,Unknown,Unknown,"A man fell while delivering packages, and he was violently damaging the packages.",A delivery driver暴力地将包裹扔在楼梯上，然后用脚踢，最后将包裹踢下楼梯。,12.888,0,1,0.090075,0.125
2,3.mp4,The delivery driver approached,The delivery driver approached,Unknown,Unknown,The delivery driver spotted a brown bear and approached to take a closer look.,"A delivery driver approaches a house, possibly to deliver a package.",4.284,1,1,0.442178,0.347826
3,4.mov,Take the box,The package was stolen,Unknown,Unknown,A man took the box from the doorstep.,"A man is seen running away from a package on the ground, indicating that the package was stolen.",4.829,0,1,0.190022,0.347826
4,5.mov,Deliver a package,Deliver a package,Unknown,Unknown,"A male delivery driver delivered the package, and the woman signed for it.",A delivery person in a red uniform and cap delivers packages to a woman at a house with a green gate. The woman signs for the delivery on a tablet.,46.112,1,1,0.357847,0.277778
5,6.mov,Violently delivering packages,Deliver a package,Unknown,Unknown,Two delivery drivers are violently handling packages.,"A package is seen rolling on the ground, and a person wearing sandals approaches it.",4.227,0,1,0.209304,0.0
6,7.mov,Place the box,Violently delivering packages,Unknown,Unknown,A man placed a box and a potted plant at the doorstep.,"A man in a red shirt and khaki pants is seen placing a box and a potted plant on the porch, then using a broom to sweep the area.",6.96,0,1,0.320479,0.457143
7,8.mov,Deliver food,Deliver a package,美团外卖,Unknown,A Meituan food delivery driver left the order at the doorstep.,"A person is seen walking down a hallway, carrying a package and placing it on the floor.",5.914,0,0,0.113354,0.153846
8,9.mp4,Deliver a package,Deliver a package,SAGAWA,SAGAWA,A woman is delivering a package.,A delivery person from SAGAWA delivers a package to a customer and receives payment.,49.145,1,1,0.218649,0.315789
9,10.mp4,Driving a delivery van,Deliver a package,UPS,UPS,A man is driving a UPS delivery van.,"A man is seen riding a UPS electric tricycle in a parking lot, suggesting he is delivering a package.",5.576,0,1,0.245388,0.416667


In [7]:
df_eval = df_merge[['scenario_correct', 'delivery_service_correct', 'description_bleu', 'description_rouge']].mean().round(4)
df_eval.to_csv(f"outputs/eval_result_{model_name}.csv")
df_eval

scenario_correct            0.3333
delivery_service_correct    0.8333
description_bleu            0.2659
description_rouge           0.2820
dtype: float64