Scripts for converting output from speech-to-text tools into plain text for assessing word error rate (WER). (Mozilla Deep Speech and Kaldi output plain text by default.)

## AWS JSON to plain text

In [4]:
import json

# change inputfile for each sample transcript
inputfile = 'aws-transcribe/women-and-aids/mao_204732_v01_high2.json'

# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding
with open(inputfile, 'r', encoding='utf-8') as f:
    data = json.load(f)

# create a filename stem for the output file by dropping the '.json' suffix
sourcefile = inputfile[0:-5]

# the transcript text is read from the first entry of results -> transcripts
transcript = data['results']['transcripts'][0]['transcript']

# output txt file; the with-block closes the file even if the write fails
zfile = sourcefile + '_plaintxt.txt'
with open(zfile, 'w', encoding='utf-8') as z:
    z.write(transcript)

## Google STT to plain text

In [0]:
import json

# change inputfile for each sample transcript
inputfile = 'google-speech-to-text/astin-patten/MDPI_40000002758839_01_access_mono_speakers.json'

# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding
with open(inputfile, 'r', encoding='utf-8') as f:
    data = json.load(f)

# create a filename stem for the output file by dropping the '.json' suffix
sourcefile = inputfile[0:-5]

# take the first alternative's transcript from every result and join the
# pieces with single spaces
words = [d['alternatives'][0]['transcript'] for d in data['response']['results']]
transcript = ' '.join(words)

# output txt file; the with-block closes the file even if the write fails
zfile = sourcefile + '_plaintxt.txt'
with open(zfile, 'w', encoding='utf-8') as z:
    z.write(transcript)

## CMU Sphinx4 to plain text

In [3]:
import re

inputfile = 'cmu-sphinx/women-and-aids/mao_204732_v01_high_mono.txt'

# create a filename stem for the output file by dropping the '.txt' suffix
sourcefile = inputfile[0:-4]

# regex for the timestamp on non-transcript lines; written as a raw string so
# the \d sequences reach the regex engine instead of being parsed as (invalid)
# string escapes. The pattern itself is unchanged.
pattern = re.compile(r"\d{2}:\d{2}:\d{2}.\d{3}.*")

words = []
with open(inputfile, 'r') as f:
    for line in f:
        if 'trieNgramModel       LM' not in line:
            continue
        # the candidate transcript text is the line right after the marker line
        text = next(f, None)
        if text is None:
            # the marker was the last line of the file, so nothing follows it
            break
        # keep the line only if it is neither a timestamped line nor blank
        if not pattern.match(text.strip()) and not text.isspace():
            words.append(text.replace('\n', ''))
transcript = ' '.join(words)

# output txt file; the with-block closes the file even if the write fails
zfile = sourcefile + '_plaintxt.txt'
with open(zfile, 'w') as z:
    z.write(transcript)


## AWS to VTT with speakers

In [25]:
#still needs a little work getting the timecodes formatted right

import json
from datetime import timedelta
from decimal import Decimal

#format time to 3 decimal places
def format_time(t):
    """Format a number of seconds as a VTT-style timestamp, HH:MM:SS.mmm.

    Args:
        t: Non-negative offset in seconds; anything float() accepts
            (int, float, Decimal, numeric string).

    Returns:
        A string with zero-padded hours (two or more digits), two-digit
        minutes and seconds, and exactly three millisecond digits. Digits
        below one millisecond are dropped, not rounded.

    Raises:
        ValueError: If t is negative.
    """
    # timedelta normalises the value to whole microseconds
    td = timedelta(seconds=float(t))
    if td < timedelta(0):
        raise ValueError('timestamp must not be negative: {!r}'.format(t))
    # Build the fields arithmetically instead of slicing str(timedelta): the
    # string form switches to "N day(s), H:MM:SS" once a full day is reached,
    # which is not a usable timestamp.
    total_ms = (td.days * 86400 + td.seconds) * 1000 + td.microseconds // 1000
    hours, rem = divmod(total_ms, 3600000)
    minutes, rem = divmod(rem, 60000)
    seconds, millis = divmod(rem, 1000)
    return '{:02d}:{:02d}:{:02d}.{:03d}'.format(hours, minutes, seconds, millis)

# change inputfile for each sample transcript
inputfile = 'aws-transcribe/women-and-aids/mao_204732_v01_high.json'

# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding
with open(inputfile, 'r', encoding='utf-8') as f:
    data = json.load(f)

# create a filename stem for the output file by dropping the '.json' suffix
sourcefile = inputfile[0:-5]

segments = data['results']['speaker_labels']['segments']
timedwords = data['results']['items']

# Pair every item with the time used to place it in a segment. Items without
# their own start_time (punctuation marks in AWS Transcribe output) inherit the
# start time of the word before them, so they land in the same segment as that
# word. Converting to Decimal once here avoids redoing it for every segment.
timeditems = []
wordtime = None
for t in timedwords:
    is_word = 'start_time' in t
    if is_word:
        wordtime = Decimal(t['start_time'])
    if wordtime is None:
        # punctuation before any timed word cannot be placed in a segment
        continue
    timeditems.append((wordtime, is_word, t['alternatives'][0]['content']))

parts = []
for s in segments:
    start_time = Decimal(s['start_time'])
    end_time = Decimal(s['end_time'])
    words = []
    for itemtime, is_word, content in timeditems:
        # a segment owns the items whose time falls in [start_time, end_time)
        if not (start_time <= itemtime < end_time):
            continue
        if is_word or not words:
            words.append(content)
        else:
            # attach punctuation directly to the preceding word (no space)
            words[-1] += content
    parts.append({
        'speaker': s['speaker_label'],
        'start_time': format_time(start_time),
        'end_time': format_time(end_time),
        'words': ' '.join(words),
    })

# write one cue per speaker segment; WebVTT files are required to be UTF-8
zfile = sourcefile + '_speakers.vtt'
with open(zfile, 'w', encoding='utf-8') as v:
    v.write('WEBVTT\n\n')
    for p in parts:
        cue = p['start_time'] + ' --> ' + p['end_time'] + '\n'
        text = '<v ' + p['speaker'] + '> ' + p['words'] + '\n'
        v.write(cue)
        v.write(text)
        v.write('\n')

### Kaldi to VTT

In [18]:
#Kaldi doesn't have segments, so group words by connecting words that are less that two seconds
#apart and limit length of word strings by x seconds

import json
from datetime import timedelta
from decimal import Decimal

#format time to 3 decimal places
def format_time(t):
    """Format a number of seconds as a VTT-style timestamp, HH:MM:SS.mmm.

    Args:
        t: Non-negative offset in seconds; anything float() accepts
            (int, float, Decimal, numeric string).

    Returns:
        A string with zero-padded hours (two or more digits), two-digit
        minutes and seconds, and exactly three millisecond digits. Digits
        below one millisecond are dropped, not rounded.

    Raises:
        ValueError: If t is negative.
    """
    # timedelta normalises the value to whole microseconds
    td = timedelta(seconds=float(t))
    if td < timedelta(0):
        raise ValueError('timestamp must not be negative: {!r}'.format(t))
    # Build the fields arithmetically instead of slicing str(timedelta): the
    # string form switches to "N day(s), H:MM:SS" once a full day is reached,
    # which is not a usable timestamp.
    total_ms = (td.days * 86400 + td.seconds) * 1000 + td.microseconds // 1000
    hours, rem = divmod(total_ms, 3600000)
    minutes, rem = divmod(rem, 60000)
    seconds, millis = divmod(rem, 1000)
    return '{:02d}:{:02d}:{:02d}.{:03d}'.format(hours, minutes, seconds, millis)

# change inputfile for each sample transcript
inputfile = 'kaldi/student-admin/MDPI_40000002604629_01_access-trim_16kHz.json'

# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding
with open(inputfile, 'r', encoding='utf-8') as f:
    data = json.load(f)

# create a filename stem for the output file by dropping the '.json' suffix
sourcefile = inputfile[0:-5]

parts = []
part = {'start': 0, 'words': [], 'end': 0}
wordtime = 0
#group words into parts
for w in data['words']:
    # start a new part when this word's time is more than 2 seconds after the
    # previous word's time, or when the current part already spans more than
    # 10 seconds
    if (w['time'] - wordtime > 2) or (part['end'] - part['start'] > 10):
        if part['words']:
            parts.append(part)
        part = {'start': w['time'], 'words': [w['word']], 'end': w['time'] + float(w['duration'])}
    else:
        part['words'].append(w['word'])
        part['end'] = w['time'] + float(w['duration'])
    wordtime = w['time']
# The loop only stores a part at the moment the next one begins, so the part
# still being built when the words run out must be stored here; otherwise the
# final cue is silently dropped.
if part['words']:
    parts.append(part)

#format parts as vtt; WebVTT files are required to be UTF-8
zfile = sourcefile + '.vtt'
with open(zfile, 'w', encoding='utf-8') as v:
    v.write('WEBVTT\n\n')
    for p in parts:
        # format into locals so the numeric times kept in parts stay intact
        cue = format_time(p['start']) + ' --> ' + format_time(p['end']) + '\n'
        text = ' '.join(p['words']) + '\n'
        v.write(cue)
        v.write(text)
        v.write('\n')