In [15]:
from typing import List, Set, Tuple
import pycantonese
# Load the HKCanCor corpus and fetch its words grouped per utterance; each
# utterance serves as the reference segmentation in the evaluation below.
hkcancor = pycantonese.hkcancor()
utterances = hkcancor.words(by_utterances=True)

# Count of utterances whose predicted segmentation equals the reference.
num_correct = 0

# Per-utterance recall summed over the corpus (numerator of macro recall).
total_recall = 0

# Corpus-wide tallies used for micro recall.
total_true_positives = 0
total_real_positives = 0

# (reference, predicted) pairs for every utterance that was segmented wrongly.
incorrect_segments = []

def get_segment_intervals(segments: List[str]) -> Set[Tuple[int, int]]:
    """
    Map a segmentation to the set of half-open character spans it covers.

    Each segment becomes a ``(start, end)`` pair of offsets into the string
    obtained by concatenating all segments, where ``end - start`` is the
    segment's length and each span begins where the previous one ended.

    >>> get_segment_intervals(['廣東話', '容', '唔', '容易', '學', '？']) == {(0, 3), (3, 4), (4, 5), (5, 7), (7, 8), (8, 9)}
    True
    """
    spans: Set[Tuple[int, int]] = set()
    cursor = 0
    for piece in segments:
        # Advance the cursor past this piece, remembering where it started.
        start, cursor = cursor, cursor + len(piece)
        spans.add((start, cursor))
    return spans

# Score the segmenter on every utterance: join the reference segments back
# into unsegmented text, re-segment it, and compare against the reference.
for reference_segments in utterances:
    segments = pycantonese.segment("".join(reference_segments))
    if segments == reference_segments:
        num_correct += 1
    else:
        incorrect_segments.append((reference_segments, segments))

    # A predicted segment is a true positive only when both of its boundaries
    # coincide with those of a reference segment.
    reference_intervals = get_segment_intervals(reference_segments)
    intervals = get_segment_intervals(segments)
    true_positives = len(reference_intervals & intervals)
    real_positives = len(reference_intervals)
    total_true_positives += true_positives
    total_real_positives += real_positives

    if real_positives:
        total_recall += true_positives / real_positives
    else:
        # An utterance with no reference segments has nothing to miss; count
        # it as perfect recall instead of raising ZeroDivisionError.
        total_recall += 1.0

# Run the doctests embedded in this module's docstrings.
import doctest
doctest.testmod()

# Macro recall averages the per-utterance recalls; micro recall pools all
# segments across the corpus before dividing.
num_utterances = len(utterances)
macro_recall = total_recall / num_utterances
micro_recall = total_true_positives / total_real_positives
percent_correct = round(num_correct / num_utterances * 100, 1)

print(f"% correct: {percent_correct}")
print(f"macro recall: {round(macro_recall, 3)}")
print(f"micro recall: {round(micro_recall, 3)}")

# Show at most the first 100 mis-segmented utterances for manual inspection.
print("Incorrect examples")
for expected, predicted in incorrect_segments[:100]:
    print("reference:", expected)
    print("incorrect:", predicted)
    print()

% correct: 60.2
macro recall: 0.893
micro recall: 0.871
Incorrect examples
reference: ['喂', '遲', '啲', '去', '唔', '去', '旅行', '啊', '?']
incorrect: ['喂', '遲啲', '去', '唔', '去', '旅行', '啊', '?']

reference: ['平', '機票', '要', '淡季', '先', '有得', '平', '𡃉', '喎', '.']
incorrect: ['平', '機票', '要', '淡季', '先有', '得', '平', '𡃉', '喎', '.']

reference: ['我', '聽', '朋友', '講', '話', '去', ',', '誒', ',', 'Orlando', '嗰個', '舊', '-', '嗰個', '迪士尼', '呢', ',', '廿五', '週年', '喎', '.']
incorrect: ['我', '聽', '朋友', '講話', '去', ',', '誒', ',', 'Orlando', '嗰個', '舊', '-', '嗰個', '迪士尼', '呢', ',', '廿五', '週年', '喎', '.']

reference: ['High', 'season', '去', '𡃉', '喎', '.']
incorrect: ['Highseason', '去', '𡃉', '喎', '.']

reference: ['都', '要', '成', '萬四', '蚊', ',', '四', '五', '日', '.']
incorrect: ['都', '要成', '萬四', '蚊', ',', '四', '五日', '.']

reference: ['淨係', '去', 'Orlando', '嗰度', '玩', '咋', '喎', '.']
incorrect: ['淨係', '去', 'Orlando', '嗰度', '玩', '咋喎', '.']

reference: ['冇', '嘢', '做', '.']
incorrect: ['冇嘢', '做', '.']

reference: ['噉', '咪', '食', '同'