# Code used to generate the data set

In [None]:
import numpy as np
import os
import math
import shutil

In [27]:
# One-time setup: extract the raw video archive unless a 'videos' path already exists.
raw_videos_dir = 'videos'
if not os.path.exists(raw_videos_dir):
    shutil.unpack_archive('videos.zip', raw_videos_dir)


In [None]:
# Load the run metadata: one row per line of the file, fields separated by ';'.
with open("data_unprocessed/metadata.txt") as metadata_fh:
    rows = []
    for raw_line in metadata_fh:
        rows.append(raw_line.rstrip().split(";"))
lines = np.array(rows)

Ideally we would use a Python library to download the YouTube videos here. Unfortunately, a recent change to YouTube's cipher system has rendered all of these tools non-functional. Since our data set has a small number of long videos, we have opted to download them manually.

Currently the raw videos are not included in the repo; later I will include them in a zip archive if the GitHub file size limit allows it.

In [30]:
from moviepy.config import FFMPEG_BINARY
import subprocess

def ffmpeg_extract_subclip(
    inputfile, start_time, end_time, outputfile=None, logger="bar"
):
    """Cut the span between two timestamps out of a video file with ffmpeg.

    The video and audio streams are copied (``-vcodec copy`` / ``-acodec copy``)
    and an existing output file is overwritten (``-y``). The ffmpeg exit status
    is not checked.

    Parameters
    ----------

    inputfile : str
      Path to the file the subclip is taken from.

    start_time : float
      Start of the subclip; passed to ffmpeg as ``-ss`` with two decimals.

    end_time : float
      End of the subclip; passed to ffmpeg as ``-to`` with two decimals.

    outputfile : str, optional
      Path to the output file. When falsy, defaults to
      ``<inputfile_name>SUB<t1>_<t2><ext>`` where ``t1``/``t2`` are
      ``int(1000 * start_time)`` and ``int(1000 * end_time)``.

    logger : str, optional
      Accepted for signature compatibility; not used by this function.
    """
    if not outputfile:
        stem, extension = os.path.splitext(inputfile)
        start_tag = int(1000 * start_time)
        end_tag = int(1000 * end_time)
        outputfile = f"{stem}SUB{start_tag}_{end_tag}{extension}"

    # Assemble the command in three groups; the argument order matches what ffmpeg receives.
    seek_args = ["-ss", "%0.2f" % start_time, "-i", inputfile, "-to", "%0.2f" % end_time]
    stream_args = ["-map", "0", "-vcodec", "copy", "-acodec", "copy", "-copyts"]
    command = [FFMPEG_BINARY, "-y", *seek_args, *stream_args, outputfile]
    subprocess.run(command)

In [31]:
links = np.unique(lines.transpose()[4])

# ffmpeg does not create missing output folders, so make sure the target exists.
os.makedirs("cropped", exist_ok=True)

# Walk the metadata rows in order. Columns used here:
# 0 = run name, 4 = video URL, 5 = clip start, 6 = clip end.
i = 0
while i < len(lines):
    current = lines[i]
    # Look ahead only when a next row exists: the last row has no partner, and
    # indexing lines[i + 1] there would raise IndexError.
    following = lines[i + 1] if i + 1 < len(lines) else None

    shares_clip = (
        following is not None
        and float(current[5]) == float(following[5])
        and current[4] == following[4]
    )
    if shares_clip:
        # Consecutive rows with the same URL and start time are cut as a single
        # clip that runs until the later of the two end times.
        # End times are compared as numbers: the raw fields are strings, and a
        # string comparison would order "9.5" after "10.2".
        sample = current if float(current[6]) > float(following[6]) else following
        start = float(current[5])
        end = max(float(current[6]), float(following[6]))
        name = current[0] + "&" + following[0]
        i += 2
    else:
        sample = current
        start = float(sample[5])
        end = float(sample[6])
        name = sample[0]
        i += 1

    # The source file is named after whatever follows the last '=' in the URL.
    video = sample[4].split("=")[-1]
    ffmpeg_extract_subclip("videos/" + video + ".mp4", start, end, "cropped/" + name + ".mp4")
        
       



Create a zip archive of the output so the videos can be committed to GitHub.

In [32]:
# Rebuild the archive from scratch: drop any stale zip, then pack the 'cropped' folder.
stale_archive = "videos_processed.zip"
if os.path.isfile(stale_archive):
    os.remove(stale_archive)
shutil.make_archive("videos_processed", 'zip', "cropped")

'c:\\Users\\mikeG\\Documents\\school\\cisc-452\\CISC-452-Group-22\\videos_processed.zip'

In [None]:
# cropped\1_2018-07-12_1.mp4
# data_unprocessed\skeletons\11_2018-09-13_1.data
# run = "1_2018-09-13_1"

import cv2

# cv2.imwrite does not create missing folders (it just returns False), so make
# sure the output folder exists before any frame is written.
os.makedirs("img", exist_ok=True)

for file_name in os.listdir('data_unprocessed/skeletons'):
    # Anything that is not a skeleton file has no matching '<run>.data' to open.
    if not file_name.endswith('.data'):
        continue
    run = file_name.replace('.data', '')

    # Each non-blank line is '<frame number>#<payload>'; the payload is the
    # literal NULL when the frame has no skeleton.
    with open("data_unprocessed/skeletons/"+run+".data") as file:
        data = [line.rstrip().split("#") for line in file if line.strip()]
    if not data:
        # Nothing to align against; data[-1] below would raise IndexError.
        continue

    # os.path.join keeps the path portable; a hard-coded backslash only works on Windows.
    vidcap = cv2.VideoCapture(os.path.join('cropped', run + '.mp4'))
    try:
        if not vidcap.isOpened():
            print("Skipping %s: cropped video could not be opened" % run)
            continue

        # The first frame is read and discarded; frame_count is reduced by one
        # to account for it.
        success, image = vidcap.read()
        frame_count = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT)) - 1

        # Skip leading frames so that the number of frames left in the video
        # equals the frame number of the last skeleton row.
        offset = frame_count - int(data[-1][0])
        for _ in range(offset):
            success, image = vidcap.read()

        for row in data:
            # Consume one video frame per skeleton row, NULL rows included, so
            # that every later row keeps lining up with its own frame.
            # NOTE(review): assumes the .data file has one row per video frame,
            # which is what the tail alignment above relies on - confirm.
            success, image = vidcap.read()
            if not success:
                break
            if len(row) < 2 or row[1] == "NULL":
                continue
            cv2.imwrite("img/%s-%d.jpg" % (run, int(row[0])), image)     # save frame as JPEG file
    finally:
        # Release the capture for every run, including skipped ones.
        vidcap.release()

Number of frames:  242
Number of frames:  -1
Number of frames:  259
Number of frames:  175
Number of frames:  255
Number of frames:  173
Number of frames:  267
Number of frames:  268
Number of frames:  155
Number of frames:  -1
Number of frames:  253
Number of frames:  264
Number of frames:  270
Number of frames:  250
Number of frames:  158
Number of frames:  352
Number of frames:  -1
Number of frames:  466
Number of frames:  234
Number of frames:  -1
Number of frames:  216
Number of frames:  -1
Number of frames:  210
Number of frames:  15
Number of frames:  15
Number of frames:  15
Number of frames:  172
Number of frames:  189
Number of frames:  187
Number of frames:  248
Number of frames:  258
Number of frames:  -1
Number of frames:  213
Number of frames:  15
Number of frames:  15
Number of frames:  258
Number of frames:  303
Number of frames:  -1
Number of frames:  270
Number of frames:  263
Number of frames:  160
Number of frames:  185
Number of frames:  -1
Number of frames:  269
N

In [71]:
import json
import os
import random
import sys

def print_to_log(*args, sep=' ', end='\n', file=sys.stdout, flush=False):
    """
    Custom print function that appends the output to 'log.log' and writes to the console.

    The message is built exactly like ``print`` would build it (values converted
    with ``str`` and joined by ``sep``, followed by ``end``), appended to
    'log.log' in the current working directory, and then also printed to ``file``.

    Args:
        *args: Values to be printed.
        sep (str): Separator between values (default: ' ').
        end (str): End character (default: '\n').
        file: File-like object to write to (default: sys.stdout).
        flush (bool): Whether to forcibly flush the stream.
    """
    message = sep.join(map(str, args)) + end  # Construct the message

    # Append to the log file
    with open("log.log", "a") as log_file:
        log_file.write(message)

    # Honour the documented contract: the same text also goes to `file`
    # (previously `file` and `flush` were accepted but ignored).
    print(message, end='', file=file, flush=flush)


def parse_metadata(metadata_path):
    """Read the ';'-separated run metadata file into a dict keyed by run id.

    Each non-blank line must hold at least 12 ';'-separated fields. Field 0 is
    used as the dictionary key; fields 1-11 are stored under the names
    'id_climber', 'date', 'run_number', 'url', 'start', 'end', 'time_sec',
    'time_frames', 'finished', 'side' and 'fps'.

    Args:
        metadata_path (str): Path to the metadata text file.

    Returns:
        dict: Maps field 0 of each line to a dict of the remaining fields.
        'start', 'end', 'time_sec' and 'fps' are floats, 'time_frames' and
        'finished' are ints, everything else is kept as a string.

    Raises:
        IndexError: If a non-blank line has fewer than 12 fields.
        ValueError: If a numeric field cannot be converted.
    """
    metadata = {}
    with open(metadata_path, 'r') as f:
        for line in f:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no record and would otherwise fail with IndexError.
            if not line:
                continue
            parts = line.split(';')
            metadata[parts[0]] = {
                'id_climber': parts[1],
                'date': parts[2],
                'run_number': parts[3],
                'url': parts[4],
                'start': float(parts[5]),
                'end': float(parts[6]),
                'time_sec': float(parts[7]),
                'time_frames': int(parts[8]),
                'finished': int(parts[9]),
                'side': parts[10],
                'fps': float(parts[11])
            }
    return metadata


def parse_skeletons(skeletons_dir):
    """Parse every '<participant>_<date>_<run>.data' file in a directory.

    File names (with '.data' removed) must split into exactly three
    '_'-separated parts; other files are reported and skipped. Inside a file,
    each line is '<frame number>#<x,y;x,y;...>'. Blank lines and lines
    containing 'NULL' are skipped.

    Args:
        skeletons_dir (str): Directory containing the skeleton files.

    Returns:
        dict: Maps '<participant>_<date>_<run>' to a dict with the keys
        'participant_number', 'date', 'run_number' and 'frames'. 'frames' is a
        list of {'frame_number': int, 'keypoints': [x, y, 2, x, y, 2, ...]}
        in file order, where 2 is the visibility value appended to every joint.

    Raises:
        ValueError: If a frame number or coordinate cannot be converted.
        IndexError: If a line has no '#' or an odd number of coordinates.
    """
    skeleton_data = {}
    for file_name in os.listdir(skeletons_dir):
        # Split the file name to extract metadata
        name_parts = file_name.replace('.data', '').split('_')
        if len(name_parts) != 3:
            print(f"Skipping invalid file name: {file_name}")
            continue

        participant_number, date, run_number = name_parts  # date format: year-month-day
        run_id = f"{participant_number}_{date}_{run_number}"

        if run_id not in skeleton_data:
            skeleton_data[run_id] = {
                "participant_number": participant_number,
                "date": date,
                "run_number": run_number,
                "frames": []
            }

        # Parse skeleton data for each frame
        with open(os.path.join(skeletons_dir, file_name), 'r') as f:
            for line in f:
                # Blank lines (e.g. a trailing newline) hold no frame and would
                # otherwise fail on int('').
                if not line.strip():
                    continue
                if "NULL" in line:
                    continue
                fields = line.strip().split('#')
                frame_number = int(fields[0])
                # Flatten 'x,y;x,y;...' into [x, y, x, y, ...]
                coords = [float(coord) for kp in fields[1].split(';') for coord in kp.split(',')]
                # Add visibility (2) after each (x, y) pair
                keypoints_with_visibility = []
                for i in range(0, len(coords), 2):
                    keypoints_with_visibility.extend([coords[i], coords[i + 1], 2])
                skeleton_data[run_id]["frames"].append({
                    "frame_number": frame_number,
                    "keypoints": keypoints_with_visibility
                })
    return skeleton_data

def generate_coco_json(images_dir, skeletons, metadata, output_json,
                       image_height=1080, image_width=1920, num_keypoints=16):
    """Write a COCO-style keypoint JSON file for the given runs.

    One image entry and one annotation entry are emitted per frame of every
    run in ``skeletons``; ids start at 1 and increase in iteration order.

    Args:
        images_dir (str): Folder prefix joined onto each image file name
            ('<participant>_<date>_<run>-<frame>.jpg').
        skeletons (dict): Run id -> {'participant_number', 'date',
            'run_number', 'frames'}, where each frame has 'frame_number'
            and 'keypoints'.
        metadata (dict): Run id -> metadata dict, embedded unchanged in every
            image and annotation of that run ({} when the run id is missing).
        output_json (str): Path of the JSON file to write.
        image_height (int): Height recorded for every image (default: 1080).
        image_width (int): Width recorded for every image (default: 1920).
        num_keypoints (int): Number of joints; controls the category's
            'keypoints' names ('j1'..'jN'), its linear 'skeleton' chain and
            the 'num_keypoints' value of every annotation (default: 16).

    Returns:
        None. The result is written to ``output_json``.
    """
    coco_data = {
        'images': [],
        'annotations': [],
        'categories': [{
            'id': 1,
            'name': 'person',
            'keypoints': [f'j{i+1}' for i in range(num_keypoints)],
            # Simple linear skeleton: joint i is linked to joint i+1 (1-based)
            'skeleton': [[i, i+1] for i in range(1, num_keypoints)]
        }]
    }

    annotation_id = 1
    for run_id, run_data in skeletons.items():
        participant_number = run_data["participant_number"]
        date = run_data["date"]
        run_number = run_data["run_number"]
        run_metadata = metadata.get(run_id, {})

        for frame in run_data["frames"]:
            frame_number = frame["frame_number"]
            image_id = len(coco_data["images"]) + 1

            # Generate new image file name
            image_file = os.path.join(
                images_dir, f"{participant_number}_{date}_{run_number}-{frame_number}.jpg"
            )

            # Append image information
            coco_data["images"].append({
                "id": image_id,
                "file_name": image_file,
                "height": image_height,
                "width": image_width,
                "frame_number": frame_number,  # Track frame number
                "metadata": run_metadata  # Embed the full metadata
            })

            # Append annotation information
            coco_data["annotations"].append({
                "id": annotation_id,
                "image_id": image_id,
                "category_id": 1,
                "keypoints": frame["keypoints"],
                "num_keypoints": num_keypoints,
                "frame_number": frame_number,  # Track frame number
                "run_id": run_id,
                "metadata": run_metadata  # Embed the full metadata
            })
            annotation_id += 1

    # Save the final COCO JSON
    with open(output_json, 'w') as f:
        json.dump(coco_data, f, indent=4)


def split_data(metadata, skeletons, training_ratio=0.2, seed=None):
    """Randomly split runs into a training part and a testing part.

    Only run ids present in BOTH ``metadata`` and ``skeletons`` are split;
    a run listed in the metadata without skeleton data is left out instead of
    raising KeyError.

    Args:
        metadata (dict): Run id -> metadata dict.
        skeletons (dict): Run id -> skeleton dict.
        training_ratio (float): Fraction of the usable runs assigned to the
            training part; the count is truncated with ``int`` (default: 0.2).
        seed: When None (default) the module-level ``random`` state is used for
            the shuffle. Any other value seeds a private ``random.Random`` so
            the split is reproducible and the global state is left untouched.

    Returns:
        tuple: (train_metadata, test_metadata, train_skeletons, test_skeletons),
        each a dict keyed by run id.
    """
    # Keep metadata order; drop runs that have no skeleton entry.
    run_ids = [run_id for run_id in metadata if run_id in skeletons]

    if seed is None:
        random.shuffle(run_ids)
    else:
        random.Random(seed).shuffle(run_ids)

    split_index = int(len(run_ids) * training_ratio)
    train_ids = run_ids[:split_index]
    test_ids = run_ids[split_index:]

    # Split metadata and skeletons
    train_metadata = {run_id: metadata[run_id] for run_id in train_ids}
    test_metadata = {run_id: metadata[run_id] for run_id in test_ids}
    train_skeletons = {run_id: skeletons[run_id] for run_id in train_ids}
    test_skeletons = {run_id: skeletons[run_id] for run_id in test_ids}

    return train_metadata, test_metadata, train_skeletons, test_skeletons

# Paths
images_dir = 'img'
skeletons_dir = 'data_unprocessed/skeletons'
metadata_file = 'data_unprocessed/metadata.txt'
training_output_json = 'training_climbData.json'
testing_output_json = 'testing_climbData.json'

# Seed the global RNG before splitting: split_data shuffles the run ids with the
# `random` module, so without a fixed seed every run of this cell produces a
# different train/test split.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

# Process
metadata = parse_metadata(metadata_file)
skeletons = parse_skeletons(skeletons_dir)

# 80% of the runs go to training, the rest to testing
train_metadata, test_metadata, train_skeletons, test_skeletons = split_data(metadata, skeletons, training_ratio=0.8)

# Generate COCO JSON for training and testing
generate_coco_json(images_dir, train_skeletons, train_metadata, training_output_json)
generate_coco_json(images_dir, test_skeletons, test_metadata, testing_output_json)
