In [1]:
from typing import List, Set, Tuple
import pycantonese

def get_segment_intervals(segments: List[str]) -> Set[Tuple[int, int]]:
    """
    Map a segmentation to the set of character spans its segments occupy.

    Every segment becomes a half-open ``(start, end)`` pair of character
    offsets into the string obtained by concatenating all segments in order.

    >>> get_segment_intervals(['廣東話', '容', '唔', '容易', '學', '？']) == {(0, 3), (3, 4), (4, 5), (5, 7), (7, 8), (8, 9)}
    True
    """
    # Cumulative character offsets: boundaries[i] is where segment i starts
    # and boundaries[i + 1] is where it ends.
    boundaries = [0]
    for piece in segments:
        boundaries.append(boundaries[-1] + len(piece))
    # Pairing each boundary with its successor yields one span per segment.
    return set(zip(boundaries, boundaries[1:]))

def benchmark(utterances: List[List[str]], *, max_examples: int = 0) -> None:
    """
    Score ``pycantonese.segment`` against reference segmentations and print
    the results.

    Each reference utterance is joined into one unsegmented string,
    re-segmented with ``pycantonese.segment``, and compared with the
    reference both as a whole and segment by segment (a predicted segment
    counts as a hit when its character span matches a reference span).

    Printed metrics:

    * ``% correct``: percentage of scored utterances whose predicted
      segmentation equals the reference exactly.
    * ``macro recall``: mean of the per-utterance recall values.
    * ``micro recall``: matched reference spans divided by all reference
      spans, pooled over every scored utterance.

    Utterances with no reference segments are skipped and excluded from every
    denominator: recall is undefined for them and would otherwise raise
    ``ZeroDivisionError``. If nothing is left to score, a short notice is
    printed instead of the metrics.

    Args:
        utterances: Reference segmentations, one list of segment strings per
            utterance. Any iterable of such lists is accepted.
        max_examples: Maximum number of mismatching (reference, predicted)
            pairs to print after the metrics. The default of 0 prints none.
    """
    num_scored = 0
    num_correct = 0
    total_recall = 0
    total_true_positives = 0
    total_real_positives = 0
    incorrect_segments = []
    for reference_segments in utterances:
        if not reference_segments:
            # Nothing to segment; recall would divide by zero.
            continue
        num_scored += 1
        segments = pycantonese.segment("".join(reference_segments))
        if segments == reference_segments:
            num_correct += 1
        else:
            incorrect_segments.append((reference_segments, segments))
        reference_intervals = get_segment_intervals(reference_segments)
        intervals = get_segment_intervals(segments)
        true_positives = len(reference_intervals & intervals)
        real_positives = len(reference_intervals)
        total_true_positives += true_positives
        total_real_positives += real_positives
        total_recall += true_positives / real_positives

    if num_scored == 0:
        # Counting scored utterances (rather than len(utterances)) is what
        # makes this guard reliable for empty and all-empty inputs alike.
        print("no non-empty utterances to benchmark")
        return

    macro_recall = total_recall / num_scored
    micro_recall = total_true_positives / total_real_positives
    print(f"% correct: {round(num_correct / num_scored * 100, 1)}")
    print(f"macro recall: {round(macro_recall, 3)}")
    print(f"micro recall: {round(micro_recall, 3)}")

    if max_examples > 0:
        print("Incorrect examples")
        for reference_segments, segments in incorrect_segments[:max_examples]:
            print("reference:", reference_segments)
            print("incorrect:", segments)
            print()

# Run the doctest examples found in this module's docstrings before the
# benchmarks below are executed.
import doctest
doctest.testmod()


# Benchmark 1: the HKCanCor corpus, loaded via pycantonese.hkcancor().
# presumably words(by_utterances=True) returns one list of word strings per
# utterance, which is the shape benchmark() iterates over -- TODO confirm
# against the pycantonese documentation.
hkcancor = pycantonese.hkcancor()
print("hkcancor")
benchmark(hkcancor.words(by_utterances = True))

# Benchmark 2: CHAT data read from the LeeWongLeung.zip archive at the
# childes.talkbank.org URL. Note the label is printed before the read here,
# whereas the HKCanCor label above is printed after loading.
print("lee wong leung")
lee_wong_leung = pycantonese.read_chat("https://childes.talkbank.org/data/Chinese/Cantonese/LeeWongLeung.zip")
benchmark(lee_wong_leung.words(by_utterances = True))

# Benchmark 3: CHAT data read from the YipMatthews.zip archive under the
# "Biling" directory of childes.talkbank.org.
print("yip-matthews bilingual")
yip_matthews = pycantonese.read_chat("https://childes.talkbank.org/data/Biling/YipMatthews.zip")
benchmark(yip_matthews.words(by_utterances = True))

hkcancor
% correct: 60.2
macro recall: 0.893
micro recall: 0.871
lee wong leung
% correct: 56.8
macro recall: 0.817
micro recall: 0.788
yip-matthews bilingual
% correct: 44.4
macro recall: 0.675
micro recall: 0.6
