We used the Car Accident Detection and Prediction (CADP) dataset from Carnegie Mellon University and the University of Tokyo.
From the manually annotated part of the dataset, we extracted the original CCTV videos.
Using the provided annotation file, we identified the time intervals where accidents occur.
For each video, we selected one frame per second within those accident intervals and saved them as accident images.
All extracted frames were saved into a structured folder for further model training and analysis.


In [4]:
%pip install opencv-python
!pip install scikit-learn

Collecting opencv-python
  Downloading opencv_python-4.12.0.88-cp37-abi3-macosx_13_0_arm64.whl.metadata (19 kB)
Collecting numpy<2.3.0,>=2 (from opencv-python)
  Downloading numpy-2.2.6.tar.gz (20.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 20.3/20.3 MB 22.0 MB/s  0:00:01 eta 0:00:01
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Installing backend dependencies ... done
  Preparing metadata (pyproject.toml) ... done
Downloading opencv_python-4.12.0.88-cp37-abi3-macosx_13_0_arm64.whl (37.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 37.9/37.9 MB 27.1 MB/s  0:00:01 eta 0:00:01
Building wheels for collected packages: numpy
  Building wheel for numpy (pyproject.toml) ... done
  Created wheel for numpy: filename=numpy-2.2.6-cp314-cp314-macosx_15_0_arm64.whl size=5145560

In [5]:
import json
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from pathlib import Path
import shutil

In [7]:
# Sanity check: compare the .mp4 files present on disk with the video names
# used as top-level keys in the annotation JSON, then report the counts.
videos_dir = "raw_data/manual/videos"
annotations_path = "raw_data/manual/annotations.json"

# Names of every .mp4 entry found directly inside the videos directory.
video_files = {
    os.path.basename(entry)
    for entry in os.listdir(videos_dir)
    if entry.endswith(".mp4")
}

# Top-level keys of the annotation file.
with open(annotations_path, "r") as f:
    annotations = json.load(f)
json_keys = set(annotations.keys())

# Overlap and the two one-sided differences between the name sets.
intersection = video_files & json_keys
missing_in_videos = json_keys.difference(video_files)
missing_in_json = video_files.difference(json_keys)

print(
    f"Total video files: {len(video_files)}",
    f"Total entries in JSON: {len(json_keys)}",
    f"Matched files: {len(intersection)}\n",
    sep="\n",
)


Total video files: 3
Total entries in JSON: 226
Matched files: 3



In [9]:
import os
import json
import cv2
from pathlib import Path
import math

# Extract one frame per second from the annotated accident interval of each
# video and save it as a JPEG under dataset_accident_frames/Accident.
videos_dir = Path("raw_data/manual/videos")
annotations_path = Path("raw_data/manual/annotations.json")
output_root = Path("dataset_accident_frames")

accident_dir = output_root / "Accident"
accident_dir.mkdir(parents=True, exist_ok=True)

# Explicit encoding so decoding does not depend on the platform locale.
with open(annotations_path, "r", encoding="utf-8") as f:
    annotations = json.load(f)

# Sorted so that videos are processed in a reproducible order.
for video_name in sorted(os.listdir(videos_dir)):
    if not video_name.endswith(".mp4"):
        continue

    video_path = videos_dir / video_name

    # Only the first annotation entry of a video is considered; videos without
    # an entry or without keyframes are skipped.
    ann = annotations.get(video_name, [])
    if not ann or not ann[0].get("keyframes"):
        continue

    keyframes = ann[0]["keyframes"]
    starts = [k["frame"] for k in keyframes if k.get("state") == "Start"]
    ends = [k["frame"] for k in keyframes if k.get("state") == "End"]

    if not starts or not ends:
        continue

    # Only the first Start/End pair is used.
    # NOTE(review): the "frame" value is treated as a timestamp in seconds
    # here -- confirm against the annotation format.
    start_sec = float(starts[0])
    end_sec = float(ends[0])

    cap = cv2.VideoCapture(str(video_path))
    try:
        if not cap.isOpened():
            print(f"Skipping {video_name}: could not open video")
            continue

        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Without a positive FPS every second would map to frame 0 and the same
        # image would be saved repeatedly (`not fps > 0` also rejects NaN).
        if not fps > 0:
            print(f"Skipping {video_name}: invalid FPS ({fps})")
            continue

        for sec in range(math.floor(start_sec), math.ceil(end_sec)):
            frame_idx = int(sec * fps)
            if frame_idx >= total_frames:
                break

            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            success, frame = cap.read()
            if not success:
                continue

            out_name = f"{Path(video_name).stem}_{sec:04d}s.jpg"
            out_path = accident_dir / out_name
            # imwrite reports failure through its return value, not an exception.
            if not cv2.imwrite(str(out_path), frame):
                print(f"Failed to write {out_path}")
    finally:
        # Release the capture even when a video is skipped or an error occurs.
        cap.release()
