In [53]:
import glob
import re
from pprint import pprint
from traceback import print_tb
# import spacy
# from spacy.lang.en import English

root_dir = 'data/swb_ms98_transcriptions/'
ANNOTATIONS = [
    #     r'\[silence\]', # may be also a sign of hesitation
    r'\[noise\]',
    r'\[laughter\]',
    r'\[vocalized-noise\]'
]

SILENCE = '<silence>'

nlp = English()
nlp.add_pipe("sentencizer")

def cleanse_utterance(utterance: str):
    utterance = utterance.rstrip().split(' ', maxsplit=3)[-1]

    # remove annotations
    utterance = re.sub(fr'({"|".join(ANNOTATIONS)})', '', utterance)

    # replace anomalous words.
    # E.g.: "... [bettle/better] ..." -> "... better ...".
    # Also prevent duplications: "... [bettle/better] better ..." -> "... better ..."
    utterance = re.sub(r"(^| )\[(.*?)\/(?P<replace>.*?)\]( (?P=replace))?( |$|-)", lambda x: f' {x.group(3)} ', utterance)

    # replace words containing laughter.
    # E.g.: "... [laughter-alone] ..." -> "... alone ..."
    utterance = re.sub(r"(^| )\[laughter-(.*?)\]( |$|-)", lambda x: f' {x.group(2)} ', utterance)

    # to complicated annotations to replace automatically
    if utterance.find(' [') > -1:
        return ''
    
    # replace partial word pronounciations
    # E.g. "... pla[stic]- ..." -> "... plastic- ..."
    utterance = re.sub(r'\[silence\]', SILENCE, utterance)
    utterance = re.sub(r'(\[|\])', '', utterance)

    # remove duplicate blanks
    utterance = re.sub(r' +', ' ', utterance).rstrip()

    return utterance

utterances = []
for file_index, filename in enumerate(glob.iglob(root_dir + '**/*trans*', recursive=True)):
    folders = filename.split('/')
    dialogue_id = folders[-2]
    dialogue_partner = folders[-1][folders[-1].find(dialogue_id) + len(dialogue_id)]

    with open(filename, 'r') as f:
        saved_utterances = []
        for index, line in enumerate(f):
            utterance = cleanse_utterance(line)

            if utterance == SILENCE and len(saved_utterances) != 0:
                # doc = nlp(' '.join(saved_utterances))
                # for sentence in doc.sents:
                #     utterances.append({
                #         'dialogue_id': dialogue_id,
                #         'dialoge_partner': dialogue_partner,
                #         'utterance': sentence.text
                #     })
                utterances.append({
                        'dialogue_id': dialogue_id,
                        'dialoge_partner': dialogue_partner,
                        'utterance': ' '.join(saved_utterances)
                    })
                saved_utterances = []

            elif utterance != '' and utterance != SILENCE:
                saved_utterances.append(utterance)


In [54]:
NLS = [
    'ah',
    'eh', # pronouned 'eh'
    'eh', # pronouned 'ey'
    'hm',
    'huh',
    'huh-uh',
    'hum-um',
    'ooh',
    'uh',
    'uh-huh',
    'uh-hum',
    'uh-oh',
    'um',
    'um-hum',
    SILENCE
]


In [55]:
def contains_nls(utterance: str):
    return any(nls in utterance['utterance'] for nls in NLS)

def contains_repetition(utterance: str, ngram=1):
    split_utterance = utterance['utterance'].split(' ')
    # include partial word pronounciations
    split_utterance = [word.rstrip('-') for word in split_utterance]
    zipped = list(zip(*[split_utterance[i:] for i in range(ngram)]))
    return any(zipped[index] == zipped[index - ngram] for index in range(ngram, len(zipped)))

In [56]:
import random


print('Total:', len(utterances))

contain_nls = list(filter(lambda x: contains_nls(x), utterances))
print('contain nls:', len(contain_nls))

for repetitions in range(1, 10):
    contain_repetition = list(filter(lambda x: contains_repetition(x, repetitions), utterances))
    print(f'contain {repetitions}-gram repetitions:', len(contain_repetition))
    print(random.choice(contain_repetition)['utterance'])

print('Lengths:')
lengths = {}
for utterance in utterances:
    utterance_length = len(utterance['utterance'].split(' '))
    lengths.setdefault(utterance_length, []).append(utterance)
for length, utts in sorted(lengths.items()):
    contain_nls = list(filter(lambda x: contains_nls(x), utts))
    contain_repetition = list(filter(lambda x: contains_repetition(x), utts))
    print(f'{length}:', len(utts), len(contain_nls) + len(contain_repetition), len(contain_nls), len(contain_repetition))


Total: 121098
contain nls: 84262
contain 1-gram repetitions: 31377
yeah isn't isn't Maryland where the famous Willy Horton case was
contain 2-gram repetitions: 8943
yeah that'd been weird  seems like a that seems like a long time ago i'm i'm kind of i don't know i'm into a lot of different kinds of music too i like that- kind- that kind of jazz and um a lot of different kinds of rock music not really pop type music you know like radio- a lot of radio play like the stuff i listen to but it's not you know strictly alternative either it's kind of i like um let's see there's some local bands from Atlanta like have you ever heard of uh Fuller for Now they're they're real new they've they've they've gotten in Rolling Stone a couple times but uh they just put an album out um the Red Hot Chili Peppers uh Fishbone they've got- they've got some horns i i like- i like bands with horns that rock with horns it sounds really neat
contain 3-gram repetitions: 1896
yeah i feel that you should be you kn