In [1]:
import pandas as np

In [None]:
import os
import cv2
import numpy as np
from tqdm import tqdm

def extract_frames(video_path, output_dir, num_frames=16, size=(224, 224)):
    """Sample evenly spaced frames from a video and save them as resized JPEGs.

    Sampling positions are spread uniformly between the first and the last
    frame with ``np.linspace``. Every frame that decodes is resized to ``size``
    and written to ``output_dir`` as ``frame_000.jpg``, ``frame_001.jpg``, ...
    where the number is the sampling slot. A slot whose frame cannot be read
    is skipped, which leaves a gap in the numbering. If the video holds fewer
    than ``num_frames`` frames, some positions repeat and the same frame is
    saved under several slot numbers.

    Args:
        video_path: Path of the video file to read.
        output_dir: Directory that receives the JPEG files. It is created only
            after the video is known to be readable and non-empty, so skipped
            videos do not leave an empty folder behind.
        num_frames: Number of positions to sample.
        size: ``(width, height)`` handed to ``cv2.resize``.

    Returns:
        The number of frames actually written (0 when the video is skipped).
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print(f"⚠️ Skipping unreadable video: {video_path}")
            return 0

        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Treat a non-positive count as "nothing to sample" rather than
        # building indices from it.
        if total <= 0:
            print(f"⚠️ Skipping empty video: {video_path}")
            return 0

        # Create the folder only now: an empty folder would otherwise look
        # like a valid (frame-less) sample to later pipeline stages.
        os.makedirs(output_dir, exist_ok=True)

        indices = np.linspace(0, total - 1, num=num_frames, dtype=int)

        saved = 0
        for count, idx in enumerate(indices):
            # Plain int instead of a NumPy scalar for the OpenCV setter.
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ret, frame = cap.read()
            if not ret:
                continue
            frame = cv2.resize(frame, size)
            save_path = os.path.join(output_dir, f"frame_{count:03d}.jpg")
            if cv2.imwrite(save_path, frame):
                saved += 1
        return saved
    finally:
        # Always free the capture handle, including on the early returns above.
        cap.release()

def process_dataset(input_root, output_root, num_frames=16,
                    classes=("store_theft", "no_theft"),
                    extensions=(".mp4", ".avi", ".mov")):
    """Extract frames for every video found in each class folder.

    For each class name, videos are read from ``<input_root>/<class>`` and the
    frames of each one are written by ``extract_frames`` into
    ``<output_root>/<class>/<video file name without extension>``.

    Args:
        input_root: Directory holding one sub-folder of videos per class.
        output_root: Directory under which the per-class frame folders are
            created.
        num_frames: Number of frames to sample per video; forwarded to
            ``extract_frames``.
        classes: Names of the class sub-folders to process.
        extensions: File extensions treated as videos. Matching ignores case,
            so ``clip.MP4`` is picked up as well as ``clip.mp4``.

    Raises:
        FileNotFoundError: If a class folder is missing under ``input_root``
            (raised by ``os.listdir``).
    """
    suffixes = tuple(ext.lower() for ext in extensions)

    for cls in classes:
        input_dir = os.path.join(input_root, cls)
        output_dir = os.path.join(output_root, cls)

        # List the inputs first so a missing class folder fails before any
        # output folder is created. Sorting makes the processing order
        # reproducible, since os.listdir returns entries in arbitrary order.
        videos = sorted(
            f for f in os.listdir(input_dir) if f.lower().endswith(suffixes)
        )
        os.makedirs(output_dir, exist_ok=True)

        for v in tqdm(videos, desc=f"Processing {cls}"):
            video_path = os.path.join(input_dir, v)
            video_name = os.path.splitext(v)[0]
            video_out_dir = os.path.join(output_dir, video_name)
            extract_frames(video_path, video_out_dir, num_frames=num_frames)

# Run the extraction: read the class folders under "dataset" and write
# 16 sampled frames per video under "processed_data".
process_dataset("dataset", "processed_data", num_frames=16)


In [None]:
import shutil
import random
import os

def split_dataset(base_dir="processed_data", output_root="split_data", train_ratio=0.7, val_ratio=0.15,
                  classes=("store_theft", "no_theft"), seed=42):
    """Copy per-video frame folders into train/val/test splits, class by class.

    For each class, the video folders in ``<base_dir>/<class>`` are shuffled
    and cut into three consecutive slices: the first ``int(n * train_ratio)``
    go to ``train``, the next ``int(n * val_ratio)`` to ``val`` and the rest
    to ``test``. Each folder is copied to
    ``<output_root>/<split>/<class>/<video folder>``.

    The candidates are sorted and shuffled with a private, seeded generator,
    so repeated runs over the same input give the same assignment. Because
    existing destinations are merged (``dirs_exist_ok=True``), a different
    shuffle on a re-run could otherwise put one video into several splits.

    Args:
        base_dir: Directory holding one sub-folder per class, each containing
            one folder of frames per video.
        output_root: Directory under which the split folders are created.
        train_ratio: Fraction of each class assigned to ``train``.
        val_ratio: Fraction of each class assigned to ``val``.
        classes: Names of the class sub-folders to split.
        seed: Seed for the shuffle. Pass ``None`` for a different split on
            every call.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1.
        FileNotFoundError: If a class folder is missing under ``base_dir``
            (raised by ``os.listdir``).
    """
    # Small tolerance so float sums such as 0.85 + 0.15 are not rejected.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1, "
            f"got train_ratio={train_ratio}, val_ratio={val_ratio}"
        )

    # Private generator: reproducible without touching the global random state.
    rng = random.Random(seed)

    for cls in classes:
        input_dir = os.path.join(base_dir, cls)

        # Keep only non-empty directories. Stray files (e.g. .DS_Store) would
        # make copytree fail, and an empty folder holds no frames to learn from.
        videos = []
        for name in sorted(os.listdir(input_dir)):
            candidate = os.path.join(input_dir, name)
            if os.path.isdir(candidate) and os.listdir(candidate):
                videos.append(name)
        rng.shuffle(videos)

        n = len(videos)
        n_train = int(n * train_ratio)
        n_val = int(n * val_ratio)

        splits = {
            "train": videos[:n_train],
            "val": videos[n_train:n_train + n_val],
            "test": videos[n_train + n_val:]
        }

        for split_name, vids in splits.items():
            split_dir = os.path.join(output_root, split_name, cls)
            # Create the folder even when the slice is empty so every
            # split/class path exists after the call.
            os.makedirs(split_dir, exist_ok=True)
            for v in vids:
                src = os.path.join(input_dir, v)
                dst = os.path.join(split_dir, v)
                shutil.copytree(src, dst, dirs_exist_ok=True)

# Split the extracted frame folders in "processed_data" into
# train/val/test copies under "split_data", using the default ratios.
split_dataset(base_dir="processed_data", output_root="split_data")
