# Part 1: Data handling

### Data splitting

In [2]:
import json
import os
import shutil

def organize_data(json_file, videos_folder, output_folder):
    """Copy videos into training/validation/testing folders based on a JSON file.

    The JSON document is read as a list of entries, each with an
    ``instances`` list whose items provide a ``video_id`` and a ``split``
    (``"train"``, ``"val"`` or ``"test"``). For every instance the file
    ``<videos_folder>/<video_id>.mp4`` is copied into the matching
    sub-folder of ``output_folder``.

    Args:
        json_file: Path to the JSON metadata file.
        videos_folder: Folder containing the source ``<video_id>.mp4`` files.
        output_folder: Folder in which ``training_data``, ``validation_data``
            and ``testing_data`` are created (if missing) and filled.

    Returns:
        None. Progress, warnings and errors are reported with ``print``; a
        missing or malformed JSON file ends the call early.

    Raises:
        KeyError: If an entry lacks ``instances`` or an instance lacks
            ``video_id`` / ``split``.
    """
    try:
        # JSON text is UTF-8 by specification; do not depend on the locale.
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: JSON file '{json_file}' not found.")
        return
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in '{json_file}'.")
        return

    # Maps the split label used in the JSON to the output sub-folder name.
    split_folders = {
        "train": "training_data",
        "val": "validation_data",
        "test": "testing_data",
    }
    for folder in split_folders.values():
        os.makedirs(os.path.join(output_folder, folder), exist_ok=True)

    for entry in data:
        for instance in entry['instances']:
            video_id = instance['video_id']
            filename = f"{video_id}.mp4"
            video_path = os.path.join(videos_folder, filename)
            split = instance['split']

            if not os.path.exists(video_path):
                print(f"Warning: Video file '{video_path}' not found. Skipping.")
                continue

            folder = split_folders.get(split)
            if folder is None:
                print(f"Warning: Unknown split '{split}' for video '{video_id}'. Skipping.")
                continue

            destination = os.path.join(output_folder, folder, filename)
            try:
                shutil.copy2(video_path, destination)  # copy2 preserves metadata
            except OSError as e:
                # shutil.Error is a subclass of OSError, so this also covers
                # plain I/O failures (permissions, disk full, source is a
                # directory) that would otherwise abort the whole run.
                print(f"Error copying '{video_id}.mp4': {e}")
            else:
                print(f"Copied '{video_id}.mp4' to '{split}' folder.")

def main():
    """Run the data split using the notebook's default paths."""
    organize_data(
        json_file='WLASL_100.json',  # Path to your JSON file
        videos_folder='videos',  # Path to your videos folder
        output_folder='organized_data',  # Path to the output directory
    )

if __name__ == "__main__":
    main()

Copied '69241.mp4' to 'train' folder.
Copied '07069.mp4' to 'train' folder.
Copied '07068.mp4' to 'train' folder.
Copied '07070.mp4' to 'train' folder.
Copied '07074.mp4' to 'train' folder.
Copied '69302.mp4' to 'val' folder.
Copied '17710.mp4' to 'train' folder.
Copied '65540.mp4' to 'train' folder.
Copied '17711.mp4' to 'train' folder.
Copied '17712.mp4' to 'train' folder.
Copied '17713.mp4' to 'test' folder.
Copied '17709.mp4' to 'train' folder.
Copied '17720.mp4' to 'train' folder.
Copied '17721.mp4' to 'train' folder.
Copied '17722.mp4' to 'train' folder.
Copied '17723.mp4' to 'train' folder.
Copied '17724.mp4' to 'val' folder.
Copied '12328.mp4' to 'train' folder.
Copied '12312.mp4' to 'val' folder.
Copied '12311.mp4' to 'train' folder.
Copied '12313.mp4' to 'train' folder.
Copied '12314.mp4' to 'train' folder.
Copied '12315.mp4' to 'val' folder.
Copied '12316.mp4' to 'train' folder.
Copied '12317.mp4' to 'train' folder.
Copied '12318.mp4' to 'train' folder.
Copied '12319.mp4' to

### Data processing

In [6]:
def load_labels(json_file):
    """Build a mapping from each gloss to the list of its video IDs.

    Args:
        json_file: Path to a JSON file holding a list of items, each with a
            ``gloss`` and an ``instances`` list whose elements have a
            ``video_id``.

    Returns:
        A dict ``{gloss: [video_id, ...]}`` in file order, or ``None`` when
        the file is missing or is not valid JSON (an error is printed).
        A gloss with no instances does not appear in the result.
    """
    try:
        with open(json_file, 'r') as handle:
            entries = json.load(handle)
    except FileNotFoundError:
        print(f"Error: JSON file '{json_file}' not found.")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in '{json_file}'.")
        return None

    videos_by_gloss = {}
    for entry in entries:
        gloss = entry["gloss"]
        for occurrence in entry["instances"]:
            video_id = occurrence["video_id"]
            # setdefault creates the list the first time a gloss is seen.
            videos_by_gloss.setdefault(gloss, []).append(video_id)
    return videos_by_gloss

In [7]:
def rename_videos(data_dir, label_data):
    """Rename split videos to ``<gloss>_<video_id>_<n>.mp4``.

    Each of the ``training_data``, ``validation_data`` and ``testing_data``
    folders under ``data_dir`` is searched for ``<video_id>.mp4``. A file
    that is found is renamed in place; ``n`` is the 1-based position of the
    video within its gloss's list in ``label_data``.

    A video normally lives in only one split folder, so a miss in a single
    folder is expected and is not reported. One warning is printed per video
    only when it is found in none of the folders.

    Args:
        data_dir: Folder containing the three split sub-folders.
        label_data: Mapping ``{gloss: [video_id, ...]}``.

    Returns:
        None. Renames and warnings are reported with ``print``.

    Raises:
        OSError: Propagated from ``os.rename`` if a rename fails.
    """
    renamed = set()  # (gloss, position) pairs located in some split folder
    for subfolder in ["training_data", "validation_data", "testing_data"]:
        subfolder_path = os.path.join(data_dir, subfolder)
        for gloss, video_ids in label_data.items():
            for position, video_id in enumerate(video_ids, start=1):
                old_path = os.path.join(subfolder_path, f"{video_id}.mp4")
                if not os.path.exists(old_path):
                    continue  # most likely belongs to another split
                new_filename = f"{gloss}_{video_id}_{position}.mp4"  # Add instance counter
                new_path = os.path.join(subfolder_path, new_filename)
                os.rename(old_path, new_path)
                print(f"Renamed: {old_path} -> {new_path}")
                renamed.add((gloss, position))

    # Report only the videos that were absent from every split folder.
    for gloss, video_ids in label_data.items():
        for position, video_id in enumerate(video_ids, start=1):
            if (gloss, position) not in renamed:
                print(
                    f"Warning: Video file '{video_id}.mp4' not found in any "
                    f"split folder under '{data_dir}'."
                )

In [8]:
# Inputs for the renaming pass: the gloss metadata and the folder holding the splits.
data_dir = 'organized_data'
json_file_path = 'WLASL_100.json'

# load_labels returns None when the metadata cannot be read; skip renaming then.
label_data = load_labels(json_file_path)
if label_data is not None:
    rename_videos(data_dir=data_dir, label_data=label_data)

Renamed: organized_data\training_data\69241.mp4 -> organized_data\training_data\book_69241_1.mp4
Renamed: organized_data\training_data\07069.mp4 -> organized_data\training_data\book_07069_11.mp4
Renamed: organized_data\training_data\07068.mp4 -> organized_data\training_data\book_07068_18.mp4
Renamed: organized_data\training_data\07070.mp4 -> organized_data\training_data\book_07070_23.mp4
Renamed: organized_data\training_data\07074.mp4 -> organized_data\training_data\book_07074_30.mp4
Renamed: organized_data\training_data\17710.mp4 -> organized_data\training_data\drink_17710_12.mp4
Renamed: organized_data\training_data\65540.mp4 -> organized_data\training_data\drink_65540_18.mp4
Renamed: organized_data\training_data\17711.mp4 -> organized_data\training_data\drink_17711_20.mp4
Renamed: organized_data\training_data\17712.mp4 -> organized_data\training_data\drink_17712_21.mp4
Renamed: organized_data\training_data\17709.mp4 -> organized_data\training_data\drink_17709_28.mp4
Renamed: organiz