In [1]:
import pandas as pd
import nltk
# Fetch the 'punkt' tokenizer data into the local nltk_data directory
# (reported as already up-to-date when it is present).
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead when
# tokenizing — confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nikitaorlov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import Levenshtein

def get_sid(reference_tokens, hypothesis_tokens):
    """
    Count the edit operations between a reference and a hypothesis token list.

    The operations come from Levenshtein.editops(reference_tokens,
    hypothesis_tokens); each one is tallied by its operation name.

    Return: a tuple (S, I, D) holding the number of 'replace', 'insert'
    and 'delete' operations, in that order.
    """
    tally = {'replace': 0, 'insert': 0, 'delete': 0}

    for edit in Levenshtein.editops(reference_tokens, hypothesis_tokens):
        kind = edit[0]  # first field of an edit op is the operation name
        if kind in tally:
            tally[kind] += 1

    return tally['replace'], tally['insert'], tally['delete']

def prepare_table_for_wer(data_set_path):
    """
    Load a transcription dataset and annotate every row with edit counts.

    The Excel file at ``data_set_path`` must provide the columns ``Original``
    (the reference sentence) and ``Sentence`` (the transcribed hypothesis).
    Missing cells anywhere in the sheet are replaced with empty strings, so an
    empty sentence simply tokenizes to zero tokens.

    Columns added (or overwritten), one value per row:
        S, I, D             -- substitutions, insertions, deletions (get_sid)
        tokens_original     -- number of tokens in the reference
        tokens_transcribed  -- number of tokens in the hypothesis

    Return: the DataFrame with the five count columns stored as int64.
    """
    data = pd.read_excel(data_set_path).fillna('')

    # Collect one (S, I, D, n_reference, n_hypothesis) record per row.
    rows = []
    for reference, hypothesis in zip(data['Original'], data['Sentence']):
        orig = nltk.word_tokenize(reference)
        transc = nltk.word_tokenize(hypothesis)
        rows.append((*get_sid(orig, transc), len(orig), len(transc)))

    # Build the columns in one shot instead of writing cell by cell with
    # data.loc[i, col]: the counts stay integers (enlarging a frame cell by
    # cell yields float columns, e.g. a total of "1402.0"), and the values are
    # aligned on the frame's own index rather than on assumed 0..n-1 labels.
    counts = pd.DataFrame(
        rows,
        columns=['S', 'I', 'D', 'tokens_original', 'tokens_transcribed'],
        index=data.index,
        dtype='int64',
    )
    for column in counts.columns:
        data[column] = counts[column]

    return data

def get_wer(dataset):
    """
    Compute the corpus-level Word Error Rate (WER) from per-row counts.

    ``dataset`` must expose the columns 'S', 'I', 'D' and 'tokens_original'.
    The error count is the sum of all three edit-count columns; the WER is
    that count divided by the total number of reference tokens.

    Return: a tuple (WER, total_errors, tokens_original_total).
    """
    reference_token_count = sum(dataset['tokens_original'])
    error_count = sum(sum(dataset[kind]) for kind in ('S', 'I', 'D'))

    return error_count / reference_token_count, error_count, reference_token_count


# Annotate the rows of 'assembly_WER.xlsx' with edit counts, report the
# corpus-level WER, and save the annotated table to 'assembly_WER_SID.xlsx'.
data_assembly = prepare_table_for_wer('assembly_WER.xlsx')
WER, total_errors, tokens_original_total = get_wer(data_assembly)
print(
    f'Total errors: {total_errors}',
    f'Total tokens in original: {tokens_original_total}',
    f'WER: {WER}',
    sep='\n',
)
data_assembly.to_excel('assembly_WER_SID.xlsx', index=False)

Total errors: 34
Total tokens in original: 1402.0
WER: 0.024251069900142655


In [3]:
# Same report for 'whisper_WER.xlsx'; the annotated table is saved to
# 'whisper_WER_SID.xlsx'.
# NOTE: this rebinds data_assembly, WER, total_errors and
# tokens_original_total, so from here on they hold the whisper results.
data_assembly = prepare_table_for_wer('whisper_WER.xlsx')
WER, total_errors, tokens_original_total = get_wer(data_assembly)
print(
    f'Total errors: {total_errors}',
    f'Total tokens in original: {tokens_original_total}',
    f'WER: {WER}',
    sep='\n',
)
data_assembly.to_excel('whisper_WER_SID.xlsx', index=False)

Total errors: 40
Total tokens in original: 1231.0
WER: 0.03249390739236393
