# Data Analysis

In [3]:
import os
import json
import pandas as pd

data_dir = "data/hualai_sft_data/"
video_dir = "data/hualai_sft_data/video"
input_file = os.path.join(data_dir, 'output_0314.json')

In [4]:
with open(input_file, 'r') as f:
    data = json.load(f)

data_list = []
for k, v in data.items():
    data_list.append(v)

df = pd.DataFrame(data_list)
df.head()

Unnamed: 0,scenario,delivery_service,description,video_id
0,Violent express delivery,Unknown,"A delivery driver drops boxes, falls, destroys...",anonymized-1.mp4
1,Encounter with wildlife,Unknown,A person is delivering a package when a deer s...,anonymized-2.mp4
2,Normal express delivery,Amazon,A delivery driver wearing an Amazon uniform de...,anonymized-3.mp4
3,Normal express delivery,Unknown,"The courier, wearing a black t-shirt with desi...",anonymized-4.mp4
4,Normal express delivery,Unknown,A person in a red jacket drops off a paper bag...,anonymized-5.mp4


# Split Train and Test set

In [7]:
import json
import random
random.seed(42)

data_path = "data/hualai_sft_data/output_0314.json"

# Load data
with open(data_path, 'r') as f:
    data = json.load(f)

# Get all keys
keys = list(data.keys())

# Randomly shuffle keys
random.shuffle(keys)

# Calculate split point
split_point = int(len(keys) * 0.8)

# Split keys into train and test sets
train_keys = keys[:split_point]
test_keys = keys[split_point:]

# Create train and test dictionaries
train_data = {k: data[k] for k in train_keys}
test_data = {k: data[k] for k in test_keys}

# Save train data
with open('data/hualai_sft_data/train.json', 'w', encoding='utf-8') as f:
    json.dump(train_data, f, ensure_ascii=False, indent=2)

# Save test data
with open('data/hualai_sft_data/test.json', 'w', encoding='utf-8') as f:
    json.dump(test_data, f, ensure_ascii=False, indent=2)

print(f"Total samples: {len(data)}")
print(f"Training samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")

Total samples: 101
Training samples: 80
Test samples: 21


# Data Preparation

In [None]:
import json
import os

def convert_json_format(input_file, output_file, data_source="delivery_dataset"):
    # Read the input JSON file
    with open(input_file, 'r', encoding='utf-8') as f:
        input_data = json.load(f)
    
    # Initialize the output list
    output_data = []

    prompt = """\
This is the surveillance camera footage from home delivery scenarios. I need you to analyze the content carefully and provide a structured analysis in the following format:

scenario: identify the main events in the video, select the most relevant label in [Violent express delivery, Normal express delivery, Express package theft, Encounter with wildlife, Accidental dropping of delivery items, Other abnormal behaviors]
delivery service: identify the delivery company if visible, e.g., FedEx, UPS, Amazon, 美团外卖, SAGAWA, DHL, or mark as “Unknown” if unclear
description: provide a brief, one-sentence description of what happens in the video, including key actions and participants ** in English **

Return your analysis in strict JSON format as follows:
{{
    "scenario": "main event or situation labels",
    "delivery_service": "company name or Unknown",
    "description": "brief one-sentence description in English",
}}
Please ensure the output is valid JSON with these exact keys.\
"""
    
    # Process each video entry
    for idx, (video_id, video_data) in enumerate(input_data.items(), 1):
        # Create conversation for scenario question
        answer = f"""\
{{"scenario": "{video_data["scenario"]}", \
"delivery_service": "{video_data["delivery_service"]}", \
"description": "{video_data["description"]}"\
}}\
"""
        entry = {
            "id": f"{os.path.splitext(video_id)[0]}",
            "conversations": [
                {
                    "from": "human",
                    "value": f"<image>\n{prompt}"
                },
                {
                    "from": "gpt",
                    "value": answer
                }
            ],
            "data_source": data_source,
            "video": f"video/{video_id}"
        }
        output_data.append(entry)
        
    # Write the output JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)
    
    print(f"Converted {len(input_data)} videos into {len(output_data)} conversation entries")
    return output_data

data_dir = "data/hualai_sft_data/"
input_file = os.path.join(data_dir, 'train.json')
output_file = os.path.join(data_dir, 'train_formatted.json')

converted_data = convert_json_format(input_file, output_file)

# Print first few entries to verify format
print("\nSample of converted data:")
for entry in converted_data[:2]:
    print(json.dumps(entry, indent=2))

Converted 21 videos into 21 conversation entries

Sample of converted data:
{
  "id": "anonymized-79",
  "conversations": [
    {
      "from": "human",
      "value": "<image>\nThis is the surveillance camera footage from home delivery scenarios. I need you to analyze the content carefully and provide a structured analysis in the following format:\n\nscenario: identify the main events in the video, select the most relevant label in [Violent express delivery, Normal express delivery, Express package theft, Encounter with wildlife, Accidental dropping of delivery items, Other abnormal behaviors]\ndelivery service: identify the delivery company if visible, e.g., FedEx, UPS, Amazon, \u7f8e\u56e2\u5916\u5356, SAGAWA, DHL, or mark as \u201cUnknown\u201d if unclear\ndescription: provide a brief, one-sentence description of what happens in the video, including key actions and participants ** in English **\n\nReturn your analysis in strict JSON format as follows:\n{{\n    \"scenario\": \"main 