# Statistics for WLASL

In [72]:
import json
from configs import WLASL_ROOT, SPLIT_DIR, CLASSES_PATH
from pathlib import Path
from typing import List, Dict, TypedDict, TypeAlias, Literal

### Splits

In [73]:
# Directory holding the split definition files; keep only the JSON entries.
split_dir = Path(WLASL_ROOT).joinpath(SPLIT_DIR)
splits = [entry for entry in split_dir.iterdir() if entry.suffix == '.json']
print(f'Found splits: {[entry.name for entry in splits]}')

Found splits: ['asl1000.json', 'all.json', 'asl100.json', 'asl300.json', 'asl2000.json']


In [74]:
def _load_json(path):
    """Parse the JSON file at *path* and return the decoded object.

    The file is opened in a context manager so the handle is closed as soon
    as parsing finishes, rather than whenever the garbage collector runs.
    """
    with open(path, encoding='utf-8') as json_file:
        return json.load(json_file)


# One decoded split file per WLASL subset size.
wlasl100 = _load_json(split_dir / 'asl100.json')
wlasl300 = _load_json(split_dir / 'asl300.json')
wlasl1000 = _load_json(split_dir / 'asl1000.json')
wlasl2000 = _load_json(split_dir / 'asl2000.json')

# Decoded contents of the class list file.
classes = _load_json(CLASSES_PATH)

### Output

In [75]:
# Output directory for generated statistics; list what it already contains.
out_dir = Path('./info')
if out_dir.exists():
    print([*out_dir.iterdir()])

[PosixPath('info/wlasl_class_list.txt'), PosixPath('info/wlasl_class_list.json'), PosixPath('info/wlasl_100_stats.json')]


## From the WLASL GitHub page

Data Description
-----------------

* `gloss`: *str*, the sign gloss, i.e. the class label; the data file is structured/categorised by gloss.

* `bbox`: *[int]*, bounding box detected using YOLOv3, in (xmin, ymin, xmax, ymax) convention. Following the OpenCV convention, (0, 0) is the top-left corner.

* `fps`: *int*, frame rate (=25) used to decode the video as in the paper.

* `frame_start`: *int*, the starting frame of the gloss in the video (decoding with FPS=25), *indexed from 1*.

* `frame_end`: *int*, the ending frame of the gloss in the video (decoding with FPS=25). -1 indicates the gloss ends at the last frame of the video.

* `instance_id`: *int*, id of the instance in the same class/gloss.

* `signer_id`: *int*, id of the signer.

* `source`: *str*, a string identifier for the source site.

* `split`: *str*, indicates which subset the sample belongs to.

* `url`: *str*, used for video downloading.

* `variation_id`: *int*, id for dialect (indexed from 0).

* `video_id`: *str*, a unique video identifier.


## Statistical Methods


Naming convention (note: this might be better the other way round):
- set: one of train, test and val
- split: one of asl100, asl300, asl1000, asl2000


In [76]:
# Names of the data subsets ("sets") an instance can belong to.
AVAIL_SETS: TypeAlias = Literal['train', 'val', 'test']
# Names of the WLASL subset sizes ("splits").
AVAIL_SPLITS: TypeAlias = Literal['asl100', 'asl300', 'asl1000', 'asl2000']

class instance_dict(TypedDict):
    """Represents a single instance of a gloss in the dataset.

    Field meanings follow the WLASL data description. The `fps` field listed
    there is not declared here.
    """
    # Bounding box in (xmin, ymin, xmax, ymax) convention; (0, 0) is the top-left corner.
    bbox: List[int]
    # Ending frame of the gloss (decoding with FPS=25); -1 means the last frame of the video.
    frame_end: int
    # Starting frame of the gloss (decoding with FPS=25), indexed from 1.
    frame_start: int
    # Id of the instance within its class/gloss.
    instance_id: int
    # Id of the signer.
    signer_id: int
    # String identifier for the source site.
    source: str
    # Subset the sample belongs to; compared against the set name in get_set.
    split: str
    # URL used for video downloading.
    url: str
    # Id for dialect, indexed from 0.
    variation_id: int
    # Unique video identifier.
    video_id: str

class gloss_dict(TypedDict):
    """Represents a single gloss and its associated instances."""
    # The gloss, i.e. the class label.
    gloss: str
    # Every instance recorded for this gloss.
    instances: List[instance_dict]

class class_stats(TypedDict):
    """Represents statistics for a single gloss class."""
    # Number of instances counted for the class.
    num_instances: int
    # Number of distinct signer_id values among those instances.
    num_signers: int
    # Number of distinct variation_id values among those instances.
    num_variations: int

class set_stats(TypedDict):
    """Represents statistics for a data subset (train/val/test)."""
    # Total number of instances across all classes in the subset.
    num_instances: int
    # Number of distinct signers across all classes in the subset.
    num_signers: int
    per_class_stats: Dict[str, class_stats]  # key is the class (gloss) name
    

class split_stats(TypedDict):
    """Represents statistics for a single dataset split (asl100, asl300, asl1000, asl2000)."""
    # Number of gloss entries in the split.
    num_classes: int
    # Total number of instances across the whole split.
    num_instances: int
    # Number of distinct signers across the whole split.
    num_signers: int
    # Statistics broken down by set; key is the set name.
    per_set_stats: Dict[AVAIL_SETS, set_stats]

    

In [77]:
def get_class_stats(instances: List[instance_dict]) -> class_stats:
    """Summarise one gloss class from its instances.

    Returns the number of instances together with the number of distinct
    ``signer_id`` and ``variation_id`` values found among them.
    """
    seen_signers = set()
    seen_variations = set()
    total = 0
    # Counting from 1 leaves ``total`` equal to the number of items visited.
    for total, item in enumerate(instances, start=1):
        seen_signers.add(item["signer_id"])
        seen_variations.add(item["variation_id"])

    return {
        "num_instances": total,
        "num_signers": len(seen_signers),
        "num_variations": len(seen_variations),
    }


def get_set(instances: List[instance_dict], set_name: AVAIL_SETS) -> List[instance_dict]:
    """Return the instances whose ``split`` field equals ``set_name``, in their original order."""
    return [item for item in instances if item["split"] == set_name]

def seperate_by_set(glosses: List[gloss_dict]) -> Dict[AVAIL_SETS, List[gloss_dict]]:
    """Split every gloss into train/val/test versions.

    Each gloss gets one entry in every set, even when none of its instances
    belong to that set; in that case the entry's ``instances`` list is empty.
    """
    buckets = {name: [] for name in ("train", "val", "test")}
    for entry in glosses:
        members = entry["instances"]
        for name, bucket in buckets.items():
            subset = get_set(members, name)
            bucket.append({"gloss": entry["gloss"], "instances": subset})
    return buckets


def get_per_class_stats(glosses: List[gloss_dict]) -> Dict[str, class_stats]:
    """Map each gloss name to the statistics of its instances.

    Intended to be applied to one set at a time (separate into
    train/val/test first). If a gloss name occurs more than once, the last
    occurrence wins.
    """
    return {
        entry["gloss"]: get_class_stats(entry["instances"])
        for entry in glosses
    }


def get_unique_signers(dataset: List[gloss_dict]) -> set[int]:
    """Return the set of distinct ``signer_id`` values across all glosses in ``dataset``."""
    return {
        item["signer_id"]
        for entry in dataset
        for item in entry["instances"]
    }

def get_num_instances(dataset: List[gloss_dict]) -> int:
    """Return the total number of instances summed over every gloss in ``dataset``."""
    return sum(len(entry["instances"]) for entry in dataset)


def get_set_stats(subset: List[gloss_dict]) -> set_stats:
    """Compute statistics for one set (train, val or test).

    ``subset`` should already be restricted to a single set, e.g. one of the
    values returned by ``seperate_by_set``.
    """
    instance_total = get_num_instances(subset)
    signer_total = len(get_unique_signers(subset))
    class_breakdown = get_per_class_stats(subset)
    return set_stats(
        num_instances=instance_total,
        num_signers=signer_total,
        per_class_stats=class_breakdown,
    )



def get_per_set_stats(glosses: List[gloss_dict]) -> Dict[AVAIL_SETS, set_stats]:
    """Separate ``glosses`` into train/val/test and return the statistics of each set."""
    return {
        name: get_set_stats(members)
        for name, members in seperate_by_set(glosses).items()
    }


def get_split_stats(split: List[gloss_dict]) -> split_stats:
    """Compute statistics for a whole split.

    Reports the number of gloss entries, the total instance count and the
    number of distinct signers over the full split, plus a per-set
    (train/val/test) breakdown.
    """
    class_total = len(split)
    instance_total = get_num_instances(split)
    signer_total = len(get_unique_signers(split))
    set_breakdown = get_per_set_stats(split)
    return split_stats(
        num_classes=class_total,
        num_instances=instance_total,
        num_signers=signer_total,
        per_set_stats=set_breakdown,
    )


## Output methods

## Splits

### WLASL-100


In [78]:
# Destination file for the WLASL-100 statistics.
out_file = out_dir.joinpath('wlasl_100_stats.json')

In [88]:
# Compute the statistics once and reuse them for both the file and the printout.
wlasl100_stats = get_split_stats(wlasl100)

# Create the output directory if it is missing so the write cannot fail on it.
out_file.parent.mkdir(parents=True, exist_ok=True)

# The context manager flushes and closes the file even if serialisation raises.
with open(out_file, 'w') as stats_file:
    json.dump(wlasl100_stats, stats_file, indent=4)

print(json.dumps(wlasl100_stats, indent=4))

{
    "num_classes": 100,
    "num_instances": 2038,
    "num_signers": 97,
    "per_set_stats": {
        "train": {
            "num_instances": 1442,
            "num_signers": 91,
            "per_class_stats": {
                "book": {
                    "num_instances": 30,
                    "num_signers": 14,
                    "num_variations": 1
                },
                "drink": {
                    "num_instances": 25,
                    "num_signers": 15,
                    "num_variations": 1
                },
                "computer": {
                    "num_instances": 20,
                    "num_signers": 12,
                    "num_variations": 3
                },
                "before": {
                    "num_instances": 18,
                    "num_signers": 13,
                    "num_variations": 2
                },
                "chair": {
                    "num_instances": 19,
                    "num_signers": 11,
         