In [112]:
import pandas as pd
import jiwer
import re
import os

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# Used to convert numbers to words, e.g. 8 o'clock -> eight o'clock, 1944 -> nineteen forty four
import inflect
# Inflect is more flexible, but doesn't create ordinals as words - use num2words for that
from num2words import num2words

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tompickard/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
%cd '/home/tompickard/MiniProject/Legasee-Oral-History/'

from measures import compute_measures

/home/tompickard/MiniProject/Legasee-Oral-History


In [3]:
%cd '/home/tompickard/MiniProject/Legasee-Oral-History/evaluation'

/home/tompickard/MiniProject/Legasee-Oral-History/evaluation


In [4]:
# Separator line between videos in the system output: dashes, "NEW VIDEO", dashes.
# It is applied with fullmatch to each stripped line of the system output file.
patt_newvid = re.compile(r"\-+\s+NEW VIDEO\s+\-+")

In [5]:
# Split the wav2vec output into one string per video, using the "NEW VIDEO" separator lines.
sys_ts = []
segment_lines = []

with open('../system_outputs/baselines/wav2vec/harriet_20211019.txt','r') as sysin:
    for raw_line in sysin:
        line = raw_line.strip()
        if patt_newvid.fullmatch(line):
            # Separator: close the current video (stored even when empty) and start the next one
            sys_ts.append(" ".join(segment_lines).strip())
            segment_lines = []
        else:
            segment_lines.append(line)

# Store whatever follows the final separator, unless it is blank
sys_text = " ".join(segment_lines)
if sys_text.strip():
    sys_ts.append(sys_text.strip())
sys_ts

["harriet edith prosser was when i joined the friends that was my name harriet where are you from and why did you decide you want it to be a red well i have had other people ask me that and i think myself because i was open haired a pale skinned and freckle faced i didn't think that the e t s would suit me the uniform you know and i thought that the navy blue wood and i think that's the only reason because we didn't live by the sea we were lived in land you know farming stock where are you from aberdeenshire so you say what what what impact did the war have on you as a young girl living in abiding well am i was still at school of course i went to inverurie academy and i was still at the academy and i remember am there used to be a reconisence plan come over every night you could almost set your watch by it eight clockish and you i knew we all knew that the engine had a different noise to it you know so you could tell that it was a german plane and then one night it was much more noisy 

In [6]:
# Load the human transcript (tab-separated); the first column becomes the index.
human_df = pd.read_csv('../transcripts/ingested/Harriet Wright.tsv', delimiter='\t', index_col=0)

In [7]:
# Group the human transcript rows into one string per video; "New Film" rows mark the boundaries.
hum_ts = []
film_rows = []

for row in human_df.Transcript:
    row_text = row.strip()
    if row_text != "New Film":
        film_rows.append(row_text)
        continue

    # Boundary row: store the finished video (empty ones are skipped) and start afresh
    film_text = " ".join(film_rows).strip()
    if film_text:
        hum_ts.append(film_text)
    film_rows = []

# Store the last video, which has no boundary row after it
hum_text = " ".join(film_rows).strip()
if hum_text:
    hum_ts.append(hum_text)

hum_ts

["Harriet Edith Frosser - was, when I joined the wrens- that was my name. Harriet where-where are you from and why did you decide you wanted to be a WREN? oh, well I've had other people ask me that and i think myself because I was auburn haired, um, pale skinned and freckly faced, I didn't think that um the ATS would suit me- the uniform you know (laughs). And I though that the navy blue would. and I think that's the only reason. Really? Because we didn't live by the sea, we were- lived inland you know. Farming stock And where are you from? Aberdeenshire So just tell me what, what impact did the war have on you as a young girl living in Aberdeenshire? Well, um, I was still at school of course. I went to (unclear) academy and I was still at the academy and I remember, um, they used to be a reconnaissance plane come over every night you could almost set your watch by it, 8 o'clock ish and you knew, I, I knew, we all knew that the engine had a different noise to it you know. So you could 

In [8]:
# The two lists are zipped together video-by-video further down, so they must be the same length.
assert len(sys_ts) == len(hum_ts)

In [32]:
from typing import Union, List, Mapping

class DigitsToWords(jiwer.AbstractTransform):
    """jiwer transform that spells out numbers matched by a regex, using inflect."""

    def __init__(self, target=re.compile(r"\b[,.\d]+\b"), **ntw_opts):
        """
        Use inflect library's number_to_words functionality to substitute digits for corresponding strings.
        E.g. 8 o'clock -> eight o'clock.
        Note that some instances may warrant distinct treatment, e.g. $1,000 might have desired output "one thousand dollars", 
         "1944" may want to be "nineteen forty-four".
        To enable this, only substrings matching the `target` compiled regex pattern are processed. By default, this is any
         "word" (per standard RegEx word boundaries) consisting of digits, commas and decimal points.
         
        **kwargs are passed through to the number_to_words function - see docs at https://pypi.org/project/inflect/
        Note that passing the groups parameter introduces additional commas.
        """
        self.ie = inflect.engine()
        
        self.target = target
        self.ntw_opts = ntw_opts
        
    def process_string(self, s: str, **ntw_opts):
        """
        Return `s` with every match of `self.target` replaced by its spelled-out form.

        Any keyword arguments given here override the options supplied to the constructor for this call only.
        """
        opts = {**self.ntw_opts, **ntw_opts}

        def _to_words(m):
            # The default target can match a lone "," or "." sitting between two word characters
            # (e.g. "a,b"); leave such matches alone rather than asking inflect to convert them.
            if not re.search(r"\d", m[0]):
                return m[0]
            return self.ie.number_to_words(m[0], **opts)

        # re.sub replaces each match at its own position. A plain str.replace of the matched text
        # would hit the first occurrence of those characters, e.g. the "1" inside "a1" in "a1 and 1".
        return re.sub(self.target, _to_words, s)

    def process_list(self, inp: List[str]):
        """Apply process_string to every string in `inp`."""
        return [self.process_string(s) for s in inp]
    
    
    
class OrdinalsToWords(jiwer.AbstractTransform):
    """jiwer transform that spells out numeric ordinals (e.g. "22nd") using num2words."""

    def __init__(self, target=re.compile(r"\b(?P<numpart>[,.\d]+)(st|nd|rd|th)\b")):
        """
        Use num2words library's functionality to substitute ordinals for corresponding strings.
        E.g. 22nd -> twenty-second.
        https://pypi.org/project/num2words/
        
        If modifying the target regex, note that the group label 'numpart' is required 
         (for the numeric section which is retained and converted to ordinal words).
        """
        self.target = target
        
    def process_string(self, s: str):
        """Return `s` with every match of `self.target` replaced by the ordinal in words."""

        def _to_ordinal(m):
            # Thousands separators are dropped before conversion ("1,000th" -> "1000")
            numpart = m['numpart'].replace(',', '')
            # A numeric part made only of punctuation (e.g. ",th" in "a,th") is not a number
            if not re.search(r"\d", numpart):
                return m[0]
            return num2words(numpart, ordinal=True)

        # Replace the entire match, not just the digits (or we get "secondnd"), and do so at the
        # match position rather than at the first textual occurrence of the matched characters.
        return re.sub(self.target, _to_ordinal, s)
    
    def process_list(self, inp: List[str]):
        """Apply process_string to every string in `inp`."""
        return [self.process_string(s) for s in inp]

In [84]:
# Applied to both human and system transcripts


# Spelling variants of "et cetera" mapped to a single form
etcdict = {'etcetera' : 'et cetera',
           'etc' : 'et cetera',
          }

# Hesitations - removed
# (?i): these run before ToLowerCase, so they must also catch "Um" / "Er" at the start of a sentence.
# er+m*: covers "er" / "err" as well as "erm" / "errm".
hesdict = {r'(?i)\bum+\b' : '',
           r'(?i)\ber+m*\b' : '',
           r'(?i)\buh+\b' : '',
            }

transformation = jiwer.Compose([
    jiwer.Strip(),
    
    # Similar to jiwer.RemoveKaldiNonWords() but also removing parentheticals e.g. "(unclear)"
    # NB: need non-greedy match in the middle!
    jiwer.SubstituteRegexes({r"[\[{\(\<].*?[\]}\)\>]": r" ",}),
    
    # Year processing - convert 200x to "two thousand and x"
    DigitsToWords(target = re.compile(r"\b200\d{1}\b"), group=0),
    # Other years (from 1700 on) - convert e.g. 1984 to "nineteen, eighty-four"
    # NB: place prior to removal of punctuation as introduces new commas
    DigitsToWords(target = re.compile(r"\b(17|18|19|20)\d{2}\b"), group=2, zero='oh'),
    
    # Ordinals - "1st" -> "first"
    OrdinalsToWords(),
    
    # Other numbers (as standalone words)
    # NB: things like "£1000" unchanged - might want to revisit
    DigitsToWords(),

    # Extra fix for common typo
    jiwer.SubstituteWords({'im' : "i'm", "Im" : "I'm"}),
    # E.g. "I've" -> "I have"
    jiwer.ExpandCommonEnglishContractions(),

    # etcetera, etc. -> et cetera
    jiwer.SubstituteWords(etcdict),
    
    # Hesitations
    jiwer.SubstituteRegexes(hesdict),
    
    # Hyphens appear between words; sub for spaces rather than just deleting
    jiwer.SubstituteRegexes({r'[-_]':' '}),
    
    # jiwer.RemovePunctuation() uses string.punctuation, which doesn't include e.g. ’ -
    # so drop every RegEx non-word, non-space character instead.
    # Raw string: "\w" and "\s" are not valid escapes in an ordinary string literal.
    jiwer.SubstituteRegexes({r'[^\w\s]':''}),
    
    jiwer.RemoveWhiteSpace(replace_by_space=True),
    jiwer.RemoveMultipleSpaces(),
    jiwer.RemoveEmptyStrings(),
    jiwer.ToLowerCase(),
    jiwer.Strip(),
    jiwer.SentencesToListOfWords(word_delimiter=" "),
])

In [85]:
# Jiwer metrics calculated at 'sentence' level; here, each sentence is the content of a single video
# NB: if the list lengths differ, everything is concatenated and then compared - so for Frank Wilson we'd be 
#  applying to the complete interview.
# One measures dict per video: each human transcript is scored against its system counterpart.
mlist = [
    compute_measures(truth, hypothesis,
                     truth_transform=transformation,
                     hypothesis_transform=transformation)
    for truth, hypothesis in zip(hum_ts, sys_ts)
]

In [86]:
mlist

[{'wer': 0.2049335863377609,
  'mer': 0.18685121107266436,
  'wil': 0.2646226572122907,
  'wip': 0.7353773427877093,
  'hits': 470,
  'substitutions': 49,
  'deletions': 8,
  'insertions': 51},
 {'wer': 0.13333333333333333,
  'mer': 0.1273344651952462,
  'wil': 0.1922927295309066,
  'wip': 0.8077072704690934,
  'hits': 1028,
  'substitutions': 82,
  'deletions': 15,
  'insertions': 53},
 {'wer': 0.20953757225433525,
  'mer': 0.19781718963165076,
  'wil': 0.2902982133473463,
  'wip': 0.7097017866526537,
  'hits': 588,
  'substitutions': 75,
  'deletions': 29,
  'insertions': 41},
 {'wer': 0.1975609756097561,
  'mer': 0.1875,
  'wil': 0.28026288885643347,
  'wip': 0.7197371111435665,
  'hits': 702,
  'substitutions': 89,
  'deletions': 29,
  'insertions': 44},
 {'wer': 0.2971548998946259,
  'mer': 0.275390625,
  'wil': 0.41339561307149986,
  'wip': 0.5866043869285001,
  'hits': 742,
  'substitutions': 172,
  'deletions': 35,
  'insertions': 75},
 {'wer': 0.20489690721649484,
  'mer': 0.1

In [76]:
hum_ts[0]

"Harriet Edith Frosser - was, when I joined the wrens- that was my name. Harriet where-where are you from and why did you decide you wanted to be a WREN? oh, well I've had other people ask me that and i think myself because I was auburn haired, um, pale skinned and freckly faced, I didn't think that um the ATS would suit me- the uniform you know (laughs). And I though that the navy blue would. and I think that's the only reason. Really? Because we didn't live by the sea, we were- lived inland you know. Farming stock And where are you from? Aberdeenshire So just tell me what, what impact did the war have on you as a young girl living in Aberdeenshire? Well, um, I was still at school of course. I went to (unclear) academy and I was still at the academy and I remember, um, they used to be a reconnaissance plane come over every night you could almost set your watch by it, 8 o'clock ish and you knew, I, I knew, we all knew that the engine had a different noise to it you know. So you could t

In [77]:
transformation(hum_ts[0])

['harriet',
 'edith',
 'frosser',
 'was',
 'when',
 'i',
 'joined',
 'the',
 'wrens',
 'that',
 'was',
 'my',
 'name',
 'harriet',
 'where',
 'where',
 'are',
 'you',
 'from',
 'and',
 'why',
 'did',
 'you',
 'decide',
 'you',
 'wanted',
 'to',
 'be',
 'a',
 'wren',
 'oh',
 'well',
 'i',
 'have',
 'had',
 'other',
 'people',
 'ask',
 'me',
 'that',
 'and',
 'i',
 'think',
 'myself',
 'because',
 'i',
 'was',
 'auburn',
 'haired',
 'pale',
 'skinned',
 'and',
 'freckly',
 'faced',
 'i',
 'did',
 'not',
 'think',
 'that',
 'the',
 'ats',
 'would',
 'suit',
 'me',
 'the',
 'uniform',
 'you',
 'know',
 'and',
 'i',
 'though',
 'that',
 'the',
 'navy',
 'blue',
 'would',
 'and',
 'i',
 'think',
 'that',
 'is',
 'the',
 'only',
 'reason',
 'really',
 'because',
 'we',
 'did',
 'not',
 'live',
 'by',
 'the',
 'sea',
 'we',
 'were',
 'lived',
 'inland',
 'you',
 'know',
 'farming',
 'stock',
 'and',
 'where',
 'are',
 'you',
 'from',
 'aberdeenshire',
 'so',
 'just',
 'tell',
 'me',
 'what',
 

In [78]:
transformation(sys_ts[0])

['harriet',
 'edith',
 'prosser',
 'was',
 'when',
 'i',
 'joined',
 'the',
 'friends',
 'that',
 'was',
 'my',
 'name',
 'harriet',
 'where',
 'are',
 'you',
 'from',
 'and',
 'why',
 'did',
 'you',
 'decide',
 'you',
 'want',
 'it',
 'to',
 'be',
 'a',
 'red',
 'well',
 'i',
 'have',
 'had',
 'other',
 'people',
 'ask',
 'me',
 'that',
 'and',
 'i',
 'think',
 'myself',
 'because',
 'i',
 'was',
 'open',
 'haired',
 'a',
 'pale',
 'skinned',
 'and',
 'freckle',
 'faced',
 'i',
 'did',
 'not',
 'think',
 'that',
 'the',
 'e',
 't',
 's',
 'would',
 'suit',
 'me',
 'the',
 'uniform',
 'you',
 'know',
 'and',
 'i',
 'thought',
 'that',
 'the',
 'navy',
 'blue',
 'wood',
 'and',
 'i',
 'think',
 'that',
 'is',
 'the',
 'only',
 'reason',
 'because',
 'we',
 'did',
 'not',
 'live',
 'by',
 'the',
 'sea',
 'we',
 'were',
 'lived',
 'in',
 'land',
 'you',
 'know',
 'farming',
 'stock',
 'where',
 'are',
 'you',
 'from',
 'aberdeenshire',
 'so',
 'you',
 'say',
 'what',
 'what',
 'what',
 'i

Read in metadata to obtain tags.

Open question: do we want the tag words specific to this individual, as well as the general set of keywords pooled from all interviewees?

In [88]:
# Grab from updated metadata CSV

META_PATH = '/home/tompickard/H_Drive/srv/studat/cdt/team2/data/legasee/metadata/'

# The two word-list columns are stored as Python list literals and parsed back into lists on load.
# NOTE(review): eval executes whatever expression is in each CSV cell - only safe for a trusted
#  metadata file; consider ast.literal_eval if the cells are always plain list literals.
list_converters = dict.fromkeys(['Priority Words', 'Name Words'], eval)
meta_all = pd.read_csv(META_PATH+'master_metadata.csv', converters=list_converters)

# Remove Test items
meta_df = meta_all[~(meta_all.Allocation == "Test")]

In [89]:
meta_df['Priority Words']

0      [North Africa, First Lieutenant, Navigator, WW...
2      [Naval, Rum Ration, Operation, Neptune / Overl...
3      [Russia, Naval Convoy, JW53, Ship, Russian / A...
4      [Navy, Ordinary Seaman, Mil Camp Uk, RNB Chath...
5      [Naval, Action Stations, Naval Actions, Naval ...
                             ...                        
678                                                   []
679    [Description, Camp / Accom, Co, Short Brothers...
680    [RAF Brize Norton, RAF Halton, RAF Kenley, RAF...
681    [POW, SOE, Special Operations Executive, Train...
682    [Oral History, The Royal Military Police, Band...
Name: Priority Words, Length: 675, dtype: object

In [104]:
# Flatten every interview's priority tags into single words, splitting each tag on whitespace and "/".
# Raw string: "\s" is not a valid escape in an ordinary string literal.
all_words = [word
             for tag_list in meta_df['Priority Words']
             for tag in tag_list
             for word in re.split(r'[\s/]', tag)]

In [175]:
# For all_words, want to:
# - strip e.g. parentheses
# - drop stopwords
# - drop "St / St."
# - drop if all numeric?
# - drop ordinals (keep for individual, drop for large list)
# - drop empty string

# - do something with things like O'Brien, Women's, Linton-on-Ouse?
#  -- transform using the same transformation as used in evaluation; though could specify a different one

def kword_prep(inlist,transform,drop_ordinal=True):
    """
    Clean a list of tag words into a set of keywords.

    Each item has surrounding spaces, parentheses and square brackets stripped, and is then dropped if it is:
     - an English stopword (nltk list), or "st" / "st."
     - purely numeric (digits, commas, full stops), optionally followed by an ordinal suffix when
       `drop_ordinal` is True
     - empty
    Everything else is passed through `transform`, which must return a list of words. If that yields
     more than one word (e.g. a hyphenated place name), the words are cleaned again individually using
     the same `transform` and `drop_ordinal` settings.

    Returns the set of cleaned words.
    """
    if drop_ordinal:
        npatt = re.compile(r"[,.\d]+(st|nd|rd|th)?")
    else:
        npatt = re.compile(r"[,.\d]+")

    # Build the lookup once rather than re-reading the stopword list for every item
    skip_words = set(stopwords.words('english')) | {'st', 'st.'}

    cleaned = set()

    def _collect(items):
        for s in items:
            s = s.strip(' ()[]')

            if s == '' or s.lower() in skip_words or npatt.fullmatch(s):
                continue

            words = transform(s)
            if len(words) > 1:
                # Re-check each component word (one of them may itself be a stopword or number)
                _collect(words)
            else:
                cleaned.update(words)

    _collect(inlist)
    return cleaned
    

In [144]:
# Clean the pooled tag words with the same transformation used for evaluation (ordinals dropped by default).
all_words_clean = kword_prep(all_words,transformation)

In [187]:
# Weight assigned to every word in the pooled (all-interview) keyword set
weight = 3

wdict = {w : weight for w in all_words_clean}

In [156]:
# Do lookup in meta_df, confirm get 1 match
name = 'Harriet Wright'

name_meta = meta_df[meta_df.Title == name].reset_index()

# Compare the row count itself: len(name_meta == 1) would measure the boolean frame and so
# pass for any non-zero number of matches.
assert len(name_meta) == 1, "expected exactly 1 metadata row for {!r}, found {}".format(name, len(name_meta))

In [169]:
# This interviewee's priority tags followed by their name words, as a new list
# (the lists held in the metadata frame are left unmodified).
key_words = [*name_meta['Priority Words'][0], *name_meta['Name Words'][0]]

key_words_clean = kword_prep(key_words, transformation)

In [170]:
key_words_clean

{'halesworth',
 'harriet',
 'hatston',
 'naval',
 'raf',
 'rnas',
 'royal',
 'scotland',
 'service',
 'womens',
 'wren',
 'wright',
 'wwii'}

In [201]:
# Weight assigned to this interviewee's own keywords
key_weight = 7

# kdict: own keywords at key_weight; k1dict: the same keywords at weight 1
kdict = {k : key_weight for k in key_words_clean}
k1dict = {k : 1 for k in key_words_clean}

In [188]:
comb_dict = wdict.copy()
comb_dict.update(kdict)

In [174]:
# Join the per-video transcripts into one string each, so the measures below cover the complete interview
sys_text = ' '.join(sys_ts)

hum_text = ' '.join(hum_ts)

In [None]:
# want the following (with parameterised weights):

# - straight WER (complete xscript)
# - WWER (just own tagwords)
# - WWER (own + all tagwords)
# - KWER (just own tagwords)

# all of the above but for WIP

In [177]:
# Straight measures over the complete transcript: no weights are passed
measures = compute_measures(hum_text, sys_text,
            truth_transform=transformation, 
            hypothesis_transform=transformation)

In [178]:
measures

{'wer': 0.20471231005634283,
 'mer': 0.19227068633739577,
 'wil': 0.28682864667956964,
 'wip': 0.7131713533204304,
 'hits': 5037,
 'substitutions': 658,
 'deletions': 162,
 'insertions': 379}

In [180]:
# Weighted measures using only this interviewee's own keyword weights (kdict)
# NOTE(review): how words absent from `weights` are treated is decided inside compute_measures - confirm in measures.py
wwer_self_measures = compute_measures(hum_text, sys_text,
                truth_transform=transformation, 
                hypothesis_transform=transformation,
                weights = kdict)

In [181]:
wwer_self_measures

{'wer': 0.21770850605206432,
 'mer': 0.2048361934477379,
 'wil': 0.3098934103734359,
 'wip': 0.6901065896265641,
 'hits': 5097,
 'substitutions': 766,
 'deletions': 168,
 'insertions': 379}

In [189]:
# Weighted measures using the combined weights: pooled keywords plus this interviewee's own (comb_dict)
wwer_all_measures = compute_measures(hum_text, sys_text,
                truth_transform=transformation, 
                hypothesis_transform=transformation,
                weights = comb_dict)

In [190]:
wwer_all_measures

{'wer': 0.20887765135552747,
 'mer': 0.19831955188050146,
 'wil': 0.3035882355299412,
 'wip': 0.6964117644700588,
 'hits': 6011,
 'substitutions': 898,
 'deletions': 210,
 'insertions': 379}

In [202]:
# Keyword measures: own keywords at weight 1 with default_weight = 0
# NOTE(review): presumably this makes every non-keyword count for nothing - confirm in measures.py
kwer_measures = compute_measures(hum_text, sys_text,
                truth_transform=transformation, 
                hypothesis_transform=transformation,
                weights = k1dict,
                default_weight = 0)

In [203]:
kwer_measures

{'wer': 0.6551724137931034,
 'mer': 0.6551724137931034,
 'wil': 0.8768472906403941,
 'wip': 0.12315270935960593,
 'hits': 10,
 'substitutions': 18,
 'deletions': 1,
 'insertions': 0}

In [204]:
# Count how often each of this interviewee's keywords occurs in the normalised human transcript.
# The transcript is normalised once up front instead of once per keyword.
hum_words = transformation(hum_text)

kw_incidence = [(kw, hum_words.count(kw)) for kw in kdict]

In [205]:
kw_incidence

[('womens', 0),
 ('rnas', 0),
 ('service', 2),
 ('hatston', 9),
 ('wren', 5),
 ('wright', 0),
 ('halesworth', 2),
 ('royal', 0),
 ('raf', 5),
 ('scotland', 4),
 ('wwii', 0),
 ('harriet', 2),
 ('naval', 0)]