# Create the dataset from videos

In [None]:
import cv2
import os
import random
from pathlib import Path

In [None]:
def extract_random_frames(video_path, output_folder, num_frames=5):
    """Save ``num_frames`` randomly chosen frames of a video as JPEG files.

    Frames are written to ``output_folder`` as
    ``<video stem>_frame<index>.jpg``. Problems (video cannot be opened, too
    few frames, a frame cannot be written) are reported with ``print`` and
    do not raise.

    Args:
        video_path: ``pathlib.Path`` of the video; ``.stem`` is used to build
            the output file names.
        output_folder: Directory that receives the JPEG files. It is not
            created here.
        num_frames: Number of distinct frames to sample.

    Returns:
        None.
    """
    # Open the video file using OpenCV
    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        print(f"Failed to open video: {video_path}")
        return

    # try/finally releases the capture on every exit path, including errors
    # raised while sampling or writing.
    try:
        # Get total number of frames in the video
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames < num_frames:
            print(f"Not enough frames in {video_path} to extract {num_frames} random frames.")
            return

        # Sorted so that every target can be matched during one forward pass
        # over the video (the loop only ever compares against the next target).
        target_indices = sorted(random.sample(range(total_frames), num_frames))
        next_target = 0
        saved_frames = 0
        frame_idx = 0

        # Read frames sequentially until every target index has been visited
        success, frame = cap.read()
        while success and next_target < len(target_indices):
            if frame_idx == target_indices[next_target]:
                # Create an output filename using the video name and frame index
                output_file = os.path.join(output_folder, f"{video_path.stem}_frame{frame_idx}.jpg")
                # imwrite reports failure through its return value, so only
                # claim success when the file was actually written.
                if cv2.imwrite(output_file, frame):
                    print(f"Saved frame {frame_idx} to {output_file}")
                    saved_frames += 1
                else:
                    print(f"Failed to save frame {frame_idx} to {output_file}")
                # Move on to the next target even after a failed write so the
                # remaining targets are still matched.
                next_target += 1
            frame_idx += 1
            success, frame = cap.read()

        # Reading stopped before all targets were reached, i.e. fewer frames
        # could be read than the reported frame count suggested.
        if next_target < len(target_indices):
            print(f"Video {video_path} ended after {frame_idx} frames; saved {saved_frames} of {num_frames} frames.")
    finally:
        cap.release()

In [None]:
# Process videos from the given directory
def process_videos(input_folder, output_folder, num_videos=36, num_frames=5):
    """Extract random frames from up to ``num_videos`` videos in a folder.

    Only direct children of ``input_folder`` whose suffix is one of
    ``.mp4``, ``.avi``, ``.mov`` or ``.mkv`` (case-insensitive) are processed;
    every other entry is skipped and does not count towards ``num_videos``.

    Args:
        input_folder: Directory to scan (not recursive).
        output_folder: Directory that receives the extracted frames; created
            if it does not exist.
        num_videos: Maximum number of videos to process.
        num_frames: Number of frames to extract from each video.

    Returns:
        None.
    """
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    video_extensions = ('.mp4', '.avi', '.mov', '.mkv')

    # Count only the videos actually handed to the extractor, so that
    # non-video entries do not consume the budget and exactly ``num_videos``
    # videos (not ``num_videos - 1`` directory entries) are processed.
    processed = 0
    for video_file in Path(input_folder).glob("*"):
        if processed >= num_videos:
            return
        if video_file.suffix.lower() not in video_extensions:
            continue
        print(f"Processing video: {video_file}")
        extract_random_frames(video_file, output_folder, num_frames)
        processed += 1

In [None]:
# Source videos and the destination for the extracted raw frames
input_folder = Path("../datasets/DFL Bundesliga Data Shootout/train")
output_folder = Path("../datasets/data_player_detection-raw")
# Report whether the input directory exists *before* processing, so a wrong
# path is visible up front instead of after a run that extracted nothing.
print(input_folder.exists())
process_videos(input_folder, output_folder)

In [None]:
# Run the frame extraction once for every sub-directory of the input folder;
# plain files directly inside it are ignored here.
subdirectories = (entry for entry in input_folder.glob("*") if entry.is_dir())
for catalog in subdirectories:
    process_videos(catalog, output_folder)

# Split labeled data from Label Studio

In [None]:
import supervision as sv
from pathlib import Path

# Root of the labeled YOLO-format dataset (images/, labels/, data.yaml)
dataset_base_path = Path('../../kagg/')

# Sanity check: prints False when the relative path does not resolve
print(dataset_base_path.exists())

# Load images and annotations into a single detection dataset
ds = sv.DetectionDataset.from_yolo(
    images_directory_path=dataset_base_path.joinpath('images'),
    annotations_directory_path=dataset_base_path.joinpath('labels'),
    data_yaml_path=dataset_base_path.joinpath('data.yaml'),
)

print(ds.classes)

# Shuffled 70/30 split; the fixed random_state keeps the split repeatable
train_ds, test_ds = ds.split(split_ratio=0.7, random_state=42, shuffle=True)

# Sizes of the two splits (displayed as the cell output)
len(train_ds), len(test_ds)

In [None]:
import os
# Define base directory where final training data will be
base_dir = Path("../../datasets/player_detection/")
train_dir = base_dir / "train"
val_dir = base_dir / "valid"

# Save datasets: the larger split goes to train/, the remaining split
# (named test_ds above) is used as the validation set under valid/.
train_ds.as_yolo(images_directory_path=f"{train_dir}/images", annotations_directory_path=f"{train_dir}/labels")
test_ds.as_yolo(images_directory_path=f"{val_dir}/images", annotations_directory_path=f"{val_dir}/labels")

# Create data.yaml with absolute image paths, the class count and class names.
# The encoding is set explicitly so the file is written as UTF-8 regardless
# of the platform's default locale encoding.
data_yaml_path = os.path.join(base_dir, "data.yaml")
with open(data_yaml_path, "w", encoding="utf-8") as f:
    f.write(f"""\
train: {os.path.abspath(train_dir)}/images
val: {os.path.abspath(val_dir)}/images

nc: {len(ds.classes)}
names: {ds.classes}
""")