# WER, RIL, MER, WIL
https://www.researchgate.net/publication/221478089_From_WER_and_RIL_to_MER_and_WIL_improved_evaluation_measures_for_connected_speech_recognition


In [1]:
# https://github.com/jitsi/jiwer
import jiwer 
from pathlib import Path

In [11]:
# Example: score one reference sentence against one hypothesis.
ground_truth = "hello world"
hypothesis = "hello duck"

# One call per metric.
wer = jiwer.wer(ground_truth, hypothesis)
mer = jiwer.mer(ground_truth, hypothesis)
wil = jiwer.wil(ground_truth, hypothesis)
cer = jiwer.cer(ground_truth, hypothesis)

# Faster: `compute_measures` performs the heavy lifting only once and returns
# the word-level measures together in a single dict.
measures = jiwer.compute_measures(ground_truth, hypothesis)
wer, mer, wil = (measures[key] for key in ("wer", "mer", "wil"))
# `cer` is not included in jiwer.compute_measures, so the value from above is printed.
print(wer, mer, wil, cer)
print()

# Another example: lists of sentences can be passed as well.
two_sentence_truth = ["i can spell", "i hope"]
two_sentence_hypothesis = ["i kan cpell", "i hop"]
wer = jiwer.wer(two_sentence_truth, two_sentence_hypothesis)
cer = jiwer.cer(two_sentence_truth, two_sentence_hypothesis)
print("Word Error Rate", wer)  # 0.6 means 60% of the words were wrong
print("Characters Error Rate", cer)  # ~0.17 means 17% of all characters were wrong, missing or duplicated

0.5 0.5 0.75 0.45454545454545453

Word Error Rate 0.6
Characters Error Rate 0.17647058823529413


In [11]:
# Gather the Slovo text files into the "slovo" variable with pathlib,
# ordered by file name.
slovo = sorted(Path("./txt/Slovo").glob("*.txt"), key=lambda page: page.name)
slovo

[WindowsPath('txt/Slovo/slov1941n1_004.txt'),
 WindowsPath('txt/Slovo/slov1941n2_004.txt'),
 WindowsPath('txt/Slovo/slov1941n3_004.txt'),
 WindowsPath('txt/Slovo/slov1941n4_003.txt'),
 WindowsPath('txt/Slovo/slov1941n4_004.txt')]

In [14]:
# Same listing for the cleaned Slovo files, again ordered by file name.
slovo_clean = sorted(Path("./txt/Slovo..._clean").glob("*.txt"), key=lambda page: page.name)
slovo_clean

[WindowsPath('txt/Slovo..._clean/slov1941n1_004_clean.txt'),
 WindowsPath('txt/Slovo..._clean/slov1941n2_004_clean.txt'),
 WindowsPath('txt/Slovo..._clean/slov1941n3_003_clean.txt'),
 WindowsPath('txt/Slovo..._clean/slov1941n4_003_clean.txt'),
 WindowsPath('txt/Slovo..._clean/slov1941n4_004_clean.txt')]

In [15]:
# Concatenate all pages into one text (the goal is a general idea of how well OCR
# works for the journal as a whole, not for single pages), then split it into lines.
s_text = "".join(page.read_text(encoding="utf-8") + "\n" for page in slovo)
s_lines = s_text.split("\n")[:-1]  # the piece after the final "\n" is dropped
len(s_lines)

2496

In [16]:
# Concatenate the cleaned pages the same way and split the result into lines.
s_clean_text = "".join(page.read_text(encoding="utf-8") + "\n" for page in slovo_clean)
s_clean_lines = s_clean_text.split("\n")[:-1]  # the piece after the final "\n" is dropped
len(s_clean_lines)

2482

In [27]:
# Exclude lines that are empty or contain only whitespace (checked with strip()).
s_lines = [row for row in s_lines if row.strip()]
len(s_lines)

2274

In [28]:
# Exclude empty / whitespace-only lines from the cleaned text as well.
s_clean_lines = [row for row in s_clean_lines if row.strip()]
len(s_clean_lines)

2271

In [58]:
# s_lines and s_clean_lines must contain the same number of lines for this evaluation.
# To get there, the concatenated texts (already without empty lines) are saved to two
# new files, which are then edited by hand and loaded again further down.
# Missing lines are replaced with a placeholder (in this case ".").

s_text = "\n".join(s_lines)
s_clean_text = "\n".join(s_clean_lines)

for target, content in (
    ('./txt/slovo_text_must_replace_missing_lines.txt', s_text),
    ('./txt/slovo_clean_text_must_replace_missing_lines.txt', s_clean_text),
):
    with open(target, 'w', encoding='utf-8') as f:
        f.write(content)

In [3]:
# Load the manually aligned Slovo texts back in.
# The order in which glob() yields files depends on the file system, so the listing
# is sorted by file name before files are picked by position.
new_files = sorted(Path("./").glob("*.txt"), key=lambda elem: elem.name)
for f in new_files:
    print(f)
print()

# Last file in name order (in the recorded listing: the Slovo text with matching lines).
with open(new_files[-1], encoding="utf-8") as f:
    s_lines = f.readlines()
print(len(s_lines))

# Second-to-last file in name order (in the recorded listing: the cleaned Slovo text).
with open(new_files[-2], encoding="utf-8") as f:
    s_clean_lines = f.readlines()
print(len(s_clean_lines))

# The evaluation needs the same number of lines on both sides.
assert len(s_lines) == len(s_clean_lines), "line counts of the two texts must match"

daugava_clean_with_matching_lines..txt
daugava_with_matching_lines..txt
slovo_clean_with_matching_lines..txt
slovo_with_matching_lines.txt

2272
2272


In [10]:
# Now we are ready to compare the truth (clean text) with the original text.
# WER, WIL and MER are all taken from a single `compute_measures` call, so the
# heavy lifting is performed once instead of once per metric.
s_measures = jiwer.compute_measures(s_clean_lines, s_lines)
s_wer = s_measures['wer']
s_wil = s_measures['wil']
s_mer = s_measures['mer']
# CER is not included in `compute_measures`, so it is computed separately.
s_cer = jiwer.cer(s_clean_lines, s_lines)

print("Word Error Rate", s_wer)  # so 0.19 would mean 19% of words were wrong
print("Characters Error Rate", s_cer)  # so 4% of all characters were wrong, missing or duplicated
print("Word Information Lost", s_wil)
print("Match Error Rate", s_mer)

Word Error Rate 0.18634354019977076
Characters Error Rate 0.044660047566914565
Word Information Lost 0.3331552163919098
Match Error Rate 0.18565951545803083


In [None]:
# ↓ DAUGAVA ↓

In [2]:
# Gather the Daugava text files, ordered by file name.
daugava = sorted(Path("./txt/Daugava").glob("*.txt"), key=lambda page: page.name)
daugava

[WindowsPath('txt/Daugava/dauh1989n001_072.txt'),
 WindowsPath('txt/Daugava/dauh1989n009_089.txt'),
 WindowsPath('txt/Daugava/dauh1990n001_072.txt'),
 WindowsPath('txt/Daugava/dauh1990n010_094.txt'),
 WindowsPath('txt/Daugava/dauh1994n007-008_143.txt')]

In [3]:
# Gather the cleaned Daugava text files, ordered by file name.
daugava_clean = sorted(Path("txt/Daugava..._clean").glob("*.txt"), key=lambda page: page.name)
daugava_clean

[WindowsPath('txt/Daugava..._clean/dauh1989n001_072_clean.txt'),
 WindowsPath('txt/Daugava..._clean/dauh1989n009_089_clean.txt'),
 WindowsPath('txt/Daugava..._clean/dauh1990n001_072_clean.txt'),
 WindowsPath('txt/Daugava..._clean/dauh1990n010_094_clean.txt'),
 WindowsPath('txt/Daugava..._clean/dauh1994n007-008_143_clean.txt')]

In [4]:
# Concatenate all Daugava pages into one text and split it into lines.
d_text = "".join(page.read_text(encoding="utf-8") + "\n" for page in daugava)
d_lines = d_text.split("\n")[:-1]  # the piece after the final "\n" is dropped
len(d_lines)

465

In [5]:
# Concatenate the cleaned Daugava pages the same way and split the result into lines.
d_clean_text = "".join(page.read_text(encoding="utf-8") + "\n" for page in daugava_clean)
d_clean_lines = d_clean_text.split("\n")[:-1]  # the piece after the final "\n" is dropped
len(d_clean_lines)

452

In [6]:
# Exclude lines that are empty or contain only whitespace.
d_lines = [row for row in d_lines if row.strip()]
len(d_lines)

422

In [7]:
# Exclude empty / whitespace-only lines from the cleaned text as well.
d_clean_lines = [row for row in d_clean_lines if row.strip()]
len(d_clean_lines)

419

In [59]:
# Save both concatenated texts (without empty lines) to new files.
d_text = "\n".join(d_lines)
d_clean_text = "\n".join(d_clean_lines)

for target, content in (
    ('./txt/daugava_text_must_replace_missing_lines.txt', d_text),
    ('./txt/daugava_clean_text_must_replace_missing_lines.txt', d_clean_text),
):
    with open(target, 'w', encoding='utf-8') as f:
        f.write(content)

In [12]:
# Load the Daugava texts back in from the "txt" directory.
# The order in which glob() yields files depends on the file system, so the listing
# is sorted by file name before files are picked by position.
new_files = sorted(Path("txt").glob("*.txt"), key=lambda elem: elem.name)
for f in new_files:
    print(f)
print()

# Second file in name order (in the recorded listing: the Daugava text).
with open(new_files[1], encoding="utf-8") as f:
    d_lines = f.readlines()
print(len(d_lines))

# First file in name order (in the recorded listing: the cleaned Daugava text).
with open(new_files[0], encoding="utf-8") as f:
    d_clean_lines = f.readlines()
print(len(d_clean_lines))

# The evaluation needs the same number of lines on both sides.
assert len(d_lines) == len(d_clean_lines), "line counts of the two texts must match"

txt\daugava_clean_text.txt
txt\daugava_text.txt
txt\slovo_clean_text.txt
txt\slovo_text.txt

420
420


In [13]:
# Compare the truth (clean text) with the original text.
# WER, WIL and MER are all taken from a single `compute_measures` call, so the
# heavy lifting is performed once instead of once per metric.
d_measures = jiwer.compute_measures(d_clean_lines, d_lines)
d_wer = d_measures['wer']
d_wil = d_measures['wil']
d_mer = d_measures['mer']
# CER is not included in `compute_measures`, so it is computed separately.
d_cer = jiwer.cer(d_clean_lines, d_lines)

print("Word Error Rate", d_wer)  # so 0.03 would mean 3% of words were wrong
print("Characters Error Rate", d_cer)  # so 1% of all characters were wrong, missing or duplicated
print("Word Information Lost", d_wil)
print("Match Error Rate", d_mer)

Word Error Rate 0.031141868512110725
Characters Error Rate 0.010582799224921747
Word Information Lost 0.042355637894460396
Match Error Rate 0.03111111111111111


In [14]:
# Conclusion: both journals side by side, every score rounded to 5 decimal places.
print("\t\t\t Slovo... \t Daugava...")
summary_rows = (
    ("Word Error Rate\t\t", s_wer, d_wer),
    ("Characters Error Rate\t", s_cer, d_cer),
    ("Word Information Lost\t", s_wil, d_wil),
    ("Match Error Rate\t", s_mer, d_mer),
)
for label, slovo_score, daugava_score in summary_rows:
    print(label, round(slovo_score, 5), "\t", round(daugava_score, 5))

			 Slovo... 	 Daugava...
Word Error Rate		 0.18634 	 0.03114
Characters Error Rate	 0.04466 	 0.01058
Word Information Lost	 0.33316 	 0.04236
Match Error Rate	 0.18566 	 0.03111
