In [7]:
import pandas as pd
import nltk
# Fetch the 'punkt_tab' tokenizer resource into the local nltk_data directory.
# Presumably required by the nltk.word_tokenize calls further down — confirm
# against the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\artjo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [10]:
import Levenshtein

def get_sid(reference_tokens, hypothesis_tokens):
    """
    Count the edit operations that turn the reference into the hypothesis.

    The operations come from Levenshtein.editops; each one is tallied by its
    tag (the first element of the operation).

    Returns:
        Tuple (S, I, D): the number of 'replace', 'insert' and 'delete'
        operations, in that order.
    """

    edit_ops = Levenshtein.editops(reference_tokens, hypothesis_tokens)

    # One pass over the operations; tags other than these three are ignored.
    tally = {'replace': 0, 'insert': 0, 'delete': 0}
    for operation in edit_ops:
        tag = operation[0]
        if tag in tally:
            tally[tag] += 1

    return tally['replace'], tally['insert'], tally['delete']

def prepare_table_for_wer(data_set_path):
    """
    Load a transcription table and annotate every row with edit counts.

    The workbook is read with pandas, NaN cells are replaced by empty strings,
    and the 'Original' (reference) and 'Sentence' (hypothesis) columns are
    tokenized with nltk.word_tokenize. For each row the substitutions,
    insertions and deletions are obtained from get_sid.

    Args:
        data_set_path: path of the Excel file; it must contain the columns
            'Original' and 'Sentence'.

    Returns:
        The DataFrame with the columns 'S', 'I', 'D', 'tokens_original' and
        'tokens_transcribed' added (or overwritten), one integer per row.
    """

    data = pd.read_excel(data_set_path)
    data = data.fillna('')

    # str() keeps the tokenizer from failing on cells that Excel stored as
    # numbers or dates instead of text.
    tokens_original = [nltk.word_tokenize(str(sent)) for sent in data['Original']]
    tokens_transcribed = [nltk.word_tokenize(str(sent)) for sent in data['Sentence']]

    sid_counts = [
        get_sid(orig, transc)
        for orig, transc in zip(tokens_original, tokens_transcribed)
    ]

    # Assign whole columns at once: row-by-row `.loc[i, col] = value` creates
    # the new columns through enlargement (float dtype, hence counts such as
    # 419.0), assumes the index labels are 0..n-1, and leaves the columns
    # missing altogether when the table is empty.
    data['S'] = [counts[0] for counts in sid_counts]
    data['I'] = [counts[1] for counts in sid_counts]
    data['D'] = [counts[2] for counts in sid_counts]

    data['tokens_original'] = [len(tokens) for tokens in tokens_original]
    data['tokens_transcribed'] = [len(tokens) for tokens in tokens_transcribed]

    return data

def get_wer(dataset):
    """
    Calculate the corpus-level Word Error Rate (WER) from per-row counts.

    WER = (S + I + D) / N, where N is the total number of reference tokens.

    Args:
        dataset: table exposing the columns 'S', 'I', 'D' and
            'tokens_original' (for example the DataFrame returned by
            prepare_table_for_wer).

    Returns:
        Tuple (WER, total_errors, tokens_original_total). If there are no
        reference tokens the rate is undefined: WER is nan when there are no
        errors either, and inf when there are.
    """

    tokens_original_total = sum(dataset['tokens_original'])
    total_errors = sum(sum(dataset[column]) for column in ('S', 'I', 'D'))

    # Guard the division explicitly: with plain ints a zero total raises
    # ZeroDivisionError, with NumPy scalars it only warns and yields inf/nan.
    if tokens_original_total == 0:
        WER = float('inf') if total_errors else float('nan')
    else:
        WER = total_errors / tokens_original_total

    return WER, total_errors, tokens_original_total


# Annotate the 'assembly' workbook with S/I/D and token counts, report the
# corpus-level WER, then write the annotated table to a new workbook.
data_assembly = prepare_table_for_wer('assembly_WER_50-100_with_original.xlsx')
WER, total_errors, tokens_original_total = get_wer(data_assembly)
print(
    f'Total errors: {total_errors}',
    f'Total tokens in original: {tokens_original_total}',
    f'WER: {WER}',
    sep='\n',
)
data_assembly.to_excel('assembly_WER_50-100_with_original_SID.xlsx', index=False)

Total errors: 5
Total tokens in original: 419.0
WER: 0.011933174224343675


In [11]:
# Same pipeline for the 'whisper' workbook. The table gets its own name so the
# 'assembly' table built in the previous cell is not overwritten by different
# data under a misleading name.
data_whisper = prepare_table_for_wer('whisper_WER_50-100_with_original.xlsx')
WER, total_errors, tokens_original_total = get_wer(data_whisper)
print(f'Total errors: {total_errors}')
print(f'Total tokens in original: {tokens_original_total}')
print(f'WER: {WER}')
data_whisper.to_excel('whisper_WER_50-100_with_original_SID.xlsx', index=False)

Total errors: 11
Total tokens in original: 414.0
WER: 0.026570048309178744
