In [1]:
import pandas as pd
import jiwer
import re
import os

import nltk
from nltk.corpus import stopwords
# Make sure the NLTK 'stopwords' corpus is available locally; when it is already
# present the call just reports that the package is up to date.
nltk.download('stopwords')

# Used to convert numbers to words, e.g. 8 o'clock -> eight o'clock, 1944 -> nineteen forty four
import inflect
# Inflect is more flexible, but doesn't create ordinals as words - use num2words for that
from num2words import num2words

import pyperclip as ppc

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tompickard/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Switch the working directory to the project checkout before importing from it.
# NOTE(review): presumably `measures` and `evaluation` are modules living in this
# directory and only resolve because of the %cd - confirm. The absolute path is
# machine-specific and must be edited to run the notebook elsewhere.
%cd '/home/tompickard/MiniProject/Legasee-Oral-History/'

from measures import compute_measures

from evaluation import kword_prep, score_name

from evaluation import transform_baseline

/home/tompickard/MiniProject/Legasee-Oral-History


In [3]:
# --- Run configuration -------------------------------------------------------

# Root of the shared team area, located under the current user's home directory.
SYSPATH = f"{os.path.expanduser('~')}/H_Drive/srv/studat/cdt/team2"
# Data split label handed to score_name.
TEST_TRAIN = 'test'
# Folder under <SYSPATH>/system_outputs/ holding the system outputs to score.
M_FOLDER = 'final_output_001'

# True: one sub-folder per speaker; False: one '<name>.txt' file per speaker.
SUB_FOLDERS = False

# Weight given to every keyword shared across the training metadata.
SHARED_WORD_WEIGHT = 3
# Weight handed to score_name (presumably for a speaker's own keywords - confirm).
KEY_WEIGHT = 7
# Text normalisation function handed to score_name.
TRANSFORM = transform_baseline

In [4]:
# Read metadata from CSV

# SYSPATH already begins with the expanded home directory, so it must not be
# prefixed with os.path.expanduser("~") again (that would double the home path).
META_PATH = SYSPATH+'/data/legasee/metadata/'

# The two keyword columns hold Python list literals stored as text; the
# converters turn them back into lists.
# NOTE(review): `eval` executes whatever code the CSV cells contain. This is only
# acceptable while master_metadata.csv is a trusted, project-maintained file;
# ast.literal_eval would be the safe replacement for plain list literals.
meta_df = pd.read_csv(META_PATH+'master_metadata.csv', converters={'Priority Words': eval, 'Name Words' : eval})

# Remove Test items
train_meta = meta_df[meta_df.Allocation != "Test"]

In [5]:
# Shared keywords (from training data)

# Flatten every priority phrase of every training row into single words by
# splitting on whitespace and '/'. The pattern is a raw string because '\s' in a
# normal literal is an invalid escape sequence (warning on current Pythons).
all_words = [
    word
    for phrases in train_meta['Priority Words']
    for phrase in phrases
    for word in re.split(r'[\s/]', phrase)
]
# Prepare the shared keywords with the configured TRANSFORM (the same function
# that is handed to score_name) rather than a hard-coded transform, so changing
# the configuration cannot leave the two out of step.
all_words_clean = kword_prep(all_words, TRANSFORM)

# Every shared keyword carries the same weight.
wdict = {w : SHARED_WORD_WEIGHT for w in all_words_clean}

In [6]:
# Score every system output found in the model folder and stack the per-name
# score frames into a single table.
_output_dir = SYSPATH+'/system_outputs/'+M_FOLDER

# Work out which names to score. Hidden entries (leading '.') are always ignored.
# With SUB_FOLDERS the names are the sub-folder names; otherwise they are the
# stems of the '.txt' files.
_names = []
for _fn in os.scandir(_output_dir):
    if _fn.name.startswith('.'):
        continue
    if SUB_FOLDERS:
        if _fn.is_dir():
            _names.append(_fn.name)
    elif _fn.is_file() and _fn.name.endswith('.txt'):
        _names.append(_fn.name[:-4])

# os.scandir yields entries in arbitrary order; sorting keeps the row order of
# the result (and of the saved TSV) reproducible between runs.
_frames = []
for _name in sorted(_names):
    # score_name returns three values; only the first (the score frame) is used.
    # Unpacking into *_ignored avoids overwriting IPython's `_` variable.
    name_df, *_ignored = score_name(_name,
                                    meta_df,
                                    SYSPATH,
                                    M_FOLDER,
                                    TEST_TRAIN,
                                    KEY_WEIGHT,
                                    wdict,
                                    TRANSFORM,
                                    in_folders = SUB_FOLDERS,
                                   )
    # A None frame means nothing was scored for this name; skip it.
    if name_df is not None:
        _frames.append(name_df)

# Fail with a clear message instead of a NameError on `scores_df` further down.
if not _frames:
    raise RuntimeError('No system outputs could be scored in ' + _output_dir)

# Concatenate once: DataFrame.append no longer exists in pandas 2.x, and
# growing a frame inside the loop is quadratic.
scores_df = pd.concat(_frames)

scores_df

Unnamed: 0_level_0,Keywords,Unweighted,Unweighted,Unweighted,Unweighted,Unweighted,Unweighted,Unweighted,Weighted (own keywords),Weighted (own keywords),...,Weighted (own + shared keywords),Weighted (own + shared keywords),Weighted (own + shared keywords),Keywords (own) only,Keywords (own) only,Keywords (own) only,Keywords (own) only,Keywords (own) only,Keywords (own) only,Keywords (own) only
Unnamed: 0_level_1,Incidence,deletions,hits,insertions,mer,substitutions,wer,wip,wer,mer,...,substitutions,deletions,insertions,wer,mer,wip,hits,substitutions,deletions,insertions
Gordon Hooton,"[(east, 2), (home, 16), (description, 0), (cha...",148.0,4911.0,433.0,0.1789,489.0,0.192862,0.745266,0.178778,0.167143,...,589.0,172.0,433.0,0.0625,0.0625,0.894886,105.0,5.0,2.0,0.0
Irene Bellamy,"[(dockyard, 0), (initial, 0), (childhood, 0), ...",234.0,7090.0,188.0,0.107165,429.0,0.109764,0.841274,0.111423,0.109205,...,673.0,298.0,188.0,0.12,0.12,0.790204,220.0,25.0,5.0,0.0
Joan Field,"[(initial, 0), (description, 0), (field, 1), (...",208.0,5603.0,539.0,0.191953,584.0,0.208131,0.729867,0.220416,0.204808,...,872.0,258.0,539.0,0.336283,0.336283,0.469611,75.0,31.0,7.0,0.0
Joe Pitcher,"[(initial, 0), (chatham, 7), (training, 0), (g...",224.0,5218.0,436.0,0.210709,733.0,0.225587,0.690358,0.220319,0.207312,...,997.0,242.0,436.0,0.178295,0.178295,0.6752,106.0,23.0,0.0,0.0
John Woodward,"[(leave, 4), (acoustic, 1), (description, 0), ...",71.0,2727.0,528.0,0.221524,177.0,0.26084,0.728343,0.23279,0.203136,...,259.0,81.0,528.0,0.102804,0.102804,0.812555,96.0,10.0,1.0,0.0
Rodney Newham,"[(dockyard, 11), (casualty, 0), (repair, 0), (...",89.0,2392.0,162.0,0.165096,222.0,0.174991,0.76253,0.157328,0.149654,...,266.0,109.0,162.0,0.052632,0.052632,0.909474,72.0,3.0,1.0,0.0
Vic Ould,"[(dockyard, 0), (initial, 0), (carron, 2), (ch...",322.0,11035.0,816.0,0.168112,1092.0,0.179131,0.755745,0.177066,0.167771,...,1532.0,472.0,816.0,0.165789,0.165789,0.73051,317.0,45.0,18.0,0.0


In [7]:
# Write the score table as a tab-separated file next to the model's output folder.
_scores_path = f"{SYSPATH}/system_outputs/{M_FOLDER}_evaluation_scores.tsv"
scores_df.to_csv(_scores_path, sep='\t')

In [8]:
# Average the 'wer' column of each scoring scheme over all rows of scores_df:
# plain WER, WER weighted by own + shared keywords, and WER on own keywords only.
wer, wwer, kwer = (
    scores_df[(scheme, 'wer')].mean()
    for scheme in (
        'Unweighted',
        'Weighted (own + shared keywords)',
        'Keywords (own) only',
    )
)

In [9]:
# Print the three averages, one per line (fields are joined by print's default space).
_report = ('WER: ', wer, '\n', 'WWER: ', wwer, '\n', 'KWER: ', kwer)
print(*_report)

WER:  0.1930437990349416 
 WWER:  0.171211773885281 
 KWER:  0.14547179291906495
