# Statistics for WLASL

In [None]:
import json
from pathlib import Path
from typing import List, Dict, TypedDict, TypeAlias, Literal, Optional
import matplotlib.pyplot as plt
#locals
from configs import WLASL_ROOT, SPLIT_DIR, CLASSES_PATH
from stats import get_split_stats, latex_set_summary_table, latex_class_stats_table, barplot_metric, histogram_metric

### Splits

In [None]:
# Folder that contains the split definition files.
split_dir = Path(WLASL_ROOT) / SPLIT_DIR
# Keep only the entries whose suffix is exactly '.json'.
splits = [entry for entry in split_dir.iterdir() if entry.suffix == '.json']
# Report the file names that were picked up.
print(f'Found splits: {[p.name for p in splits]}')

In [None]:
# Parse each split file. Path.read_text() opens, reads and closes the file in
# one call, so no file handle is left open for the garbage collector.
wlasl100 = json.loads((split_dir / 'asl100.json').read_text())
wlasl300 = json.loads((split_dir / 'asl300.json').read_text())
wlasl1000 = json.loads((split_dir / 'asl1000.json').read_text())
wlasl2000 = json.loads((split_dir / 'asl2000.json').read_text())

# Parse the JSON document stored at CLASSES_PATH the same way.
classes = json.loads(Path(CLASSES_PATH).read_text())

### Output

In [None]:
# Folder used for the generated statistics files.
out_dir = Path('./info')
# Show what is already in there; nothing is printed when the path is missing.
if out_dir.exists():
    print([*out_dir.iterdir()])

## From the WLASL GitHub page

Data Description
-----------------

* `gloss`: *str*, the data file is structured/categorised by sign gloss, i.e. by label.

* `bbox`: *[int]*, bounding box detected using YOLOv3, in (xmin, ymin, xmax, ymax) convention. Following the OpenCV convention, (0, 0) is the top-left corner.

* `fps`: *int*, frame rate (=25) used to decode the video as in the paper.

* `frame_start`: *int*, the starting frame of the gloss in the video (decoding with FPS=25), *indexed from 1*.

* `frame_end`: *int*, the ending frame of the gloss in the video (decoding with FPS=25). -1 indicates the gloss ends at the last frame of the video.

* `instance_id`: *int*, id of the instance in the same class/gloss.

* `signer_id`: *int*, id of the signer.

* `source`: *str*, a string identifier for the source site.

* `split`: *str*, indicates which subset the sample belongs to.

* `url`: *str*, used for video downloading.

* `variation_id`: *int*, id for dialect (indexed from 0).

* `video_id`: *str*, a unique video identifier.


## Splits


In [None]:
# Value handed to the bar plots as `top_n` and echoed in their titles.
top_n = 20

### WLASL-100

In [None]:
# Label of the split handled in this section; it is passed to the class-stats
# table and interpolated into the plot titles.
split_name = 'asl100'

In [None]:
# JSON file, inside out_dir, that the WLASL-100 statistics are written to.
out_file = out_dir / 'wlasl_100_stats.json'

In [None]:
# Compute the statistics object for the WLASL-100 split.
wlasl100_stats = get_split_stats(wlasl100)
# Create the output folder if needed; opening the file for writing would
# otherwise raise FileNotFoundError on a fresh checkout.
out_file.parent.mkdir(parents=True, exist_ok=True)
# Write inside a context manager so the handle is flushed and closed
# deterministically instead of being left open.
with open(out_file, 'w') as stats_file:
    json.dump(wlasl100_stats, stats_file, indent=4)

#### Train

In [None]:
# Subset examined by the following cells; used as the key into
# wlasl100_stats["per_set_stats"] and shown in the plot titles.
set_name = 'train'

In [None]:
# Print what latex_set_summary_table returns for the current subset
# (presumably LaTeX table markup, going by the helper's name — confirm in stats.py).
print(latex_set_summary_table(set_name, wlasl100_stats))

In [None]:
# Print the per-class table for the current split/subset combination.
print(
    latex_class_stats_table(
        split_name=split_name,
        set_name=set_name,
        set_stats_obj=wlasl100_stats["per_set_stats"][set_name],
    )
)

In [None]:
# One bar chart per metric; the two calls differ only in the metric name.
for metric in ("num_instances", "num_signers"):
    barplot_metric(
        per_class=wlasl100_stats["per_set_stats"][set_name]["per_class_stats"],
        metric=metric,
        top_n=top_n,
        title=f"Top {top_n} Glosses by {metric} ({split_name} {set_name})",
    )

In [None]:
# One histogram per metric; each metric is paired with its own title.
for metric, plot_title in (
    ("num_instances", f"Distribution of Instances per Gloss ({split_name} {set_name})"),
    ("num_signers", f"Distribution of Signers per Gloss ({split_name} {set_name})"),
):
    histogram_metric(
        per_class=wlasl100_stats["per_set_stats"][set_name]["per_class_stats"],
        metric=metric,
        bins=20,
        title=plot_title,
    )

#### Val

In [None]:
# Switch the following cells to the validation subset; used as the key into
# wlasl100_stats["per_set_stats"] and shown in the plot titles.
set_name = 'val'

In [None]:
# Print what latex_set_summary_table returns for the current subset
# (presumably LaTeX table markup, going by the helper's name — confirm in stats.py).
print(latex_set_summary_table(set_name, wlasl100_stats))

In [None]:
# Print the per-class table for the current split/subset combination.
print(
    latex_class_stats_table(
        split_name=split_name,
        set_name=set_name,
        set_stats_obj=wlasl100_stats["per_set_stats"][set_name],
    )
)

In [None]:
# One bar chart per metric; the two calls differ only in the metric name.
for metric in ("num_instances", "num_signers"):
    barplot_metric(
        per_class=wlasl100_stats["per_set_stats"][set_name]["per_class_stats"],
        metric=metric,
        top_n=top_n,
        title=f"Top {top_n} Glosses by {metric} ({split_name} {set_name})",
    )

In [None]:
# One histogram per metric; each metric is paired with its own title.
for metric, plot_title in (
    ("num_instances", f"Distribution of Instances per Gloss ({split_name} {set_name})"),
    ("num_signers", f"Distribution of Signers per Gloss ({split_name} {set_name})"),
):
    histogram_metric(
        per_class=wlasl100_stats["per_set_stats"][set_name]["per_class_stats"],
        metric=metric,
        bins=20,
        title=plot_title,
    )

#### Test

In [None]:
# Switch the following cells to the test subset; used as the key into
# wlasl100_stats["per_set_stats"] and shown in the plot titles.
set_name = 'test'

In [None]:
# Print what latex_set_summary_table returns for the current subset
# (presumably LaTeX table markup, going by the helper's name — confirm in stats.py).
print(latex_set_summary_table(set_name, wlasl100_stats))

In [None]:
# Print the per-class table for the current split/subset combination.
print(
    latex_class_stats_table(
        split_name=split_name,
        set_name=set_name,
        set_stats_obj=wlasl100_stats["per_set_stats"][set_name],
    )
)

In [None]:
# One bar chart per metric; the two calls differ only in the metric name.
for metric in ("num_instances", "num_signers"):
    barplot_metric(
        per_class=wlasl100_stats["per_set_stats"][set_name]["per_class_stats"],
        metric=metric,
        top_n=top_n,
        title=f"Top {top_n} Glosses by {metric} ({split_name} {set_name})",
    )

In [None]:
# One histogram per metric; each metric is paired with its own title.
for metric, plot_title in (
    ("num_instances", f"Distribution of Instances per Gloss ({split_name} {set_name})"),
    ("num_signers", f"Distribution of Signers per Gloss ({split_name} {set_name})"),
):
    histogram_metric(
        per_class=wlasl100_stats["per_set_stats"][set_name]["per_class_stats"],
        metric=metric,
        bins=20,
        title=plot_title,
    )