In [2]:
import glob
import re
from pprint import pprint
import spacy
from spacy.lang.en import English
import random
import os

import torch
from torch.utils.data import Dataset, DataLoader
import nltk

In [3]:
DATA_DIR = 'data/'
SWB_DIR = os.path.join(DATA_DIR, 'swb_ms98_transcriptions/')
TRAIN_PATH = os.path.join(DATA_DIR, 'train.tsv')
TEST_PATH = os.path.join(DATA_DIR, 'test.tsv')


SILENCE = '<silence>'
NLS = [
    'ah',
    'eh', # pronouned 'eh'
    'eh', # pronouned 'ey'
    'hm',
    'huh',
    'huh-uh',
    'hum-um',
    'ooh',
    'uh',
    'uh-huh',
    'uh-hum',
    'uh-oh',
    'um',
    'um-hum',
]

PADDING_TOKEN = '<PAD>'
UNKNOWN_TOKEN = '<UNK>'
START_TOKEN = '<SOS>'
END_TOKEN = '<EOS>'

device = torch.device('cpu')

In [4]:
hyperparameters = {
    'batch_size': 16,
    'embedding_dim': 256,
    'lstm_out_dim': 512,
    'epochs': 150,
    'learning_rate': 0.001
}

In [5]:
class UtteranceCollector():
    ANNOTATIONS = [
        #     r'\[silence\]', # may be also a sign of hesitation
        r'\[noise\]',
        r'\[laughter\]',
        r'\[vocalized-noise\]'
    ]

    def __init__(self, path, max_number_files=-1) -> None:
        # nlp = English()
        # nlp.add_pipe("sentencizer")

        self.utterances = []
        for file_index, filename in enumerate(glob.iglob(os.path.join(path + '**/*trans*'), recursive=True)):
            if file_index == max_number_files:
                break

            folders = filename.split('/')
            dialogue_id = folders[-2]
            dialogue_partner = folders[-1][folders[-1].find(dialogue_id) + len(dialogue_id)]

            with open(filename, 'r') as f:
                saved_utterances = []
                for line in f:
                    utterance = self.cleanse_utterance(line)
                    self.utterances.append({
                            'dialogue_id': dialogue_id,
                            'dialogue_partner': dialogue_partner,
                            'utterance': utterance
                        })
                    # if utterance == SILENCE and len(saved_utterances) != 0:
                    #     # doc = nlp(' '.join(saved_utterances))
                    #     # for sentence in doc.sents:
                    #     #     utterances.append({
                    #     #         'dialogue_id': dialogue_id,
                    #     #         'dialoge_partner': dialogue_partner,
                    #     #         'utterance': sentence.text
                    #     #     })
                    #     self.utterances.append({
                    #             'dialogue_id': dialogue_id,
                    #             'dialogue_partner': dialogue_partner,
                    #             'utterance': ' '.join(saved_utterances)
                    #         })
                    #     saved_utterances = []

                    # elif utterance != '' and utterance != SILENCE:
                    #     saved_utterances.append(utterance)

    def cleanse_utterance(self, utterance: str):
        utterance = utterance.rstrip().split(' ', maxsplit=3)[-1]

        # remove annotations
        utterance = re.sub(fr'({"|".join(self.ANNOTATIONS)})', '', utterance)

        # replace anomalous words.
        # E.g.: "... [bettle/better] ..." -> "... better ...".
        # Also prevent duplications: "... [bettle/better] better ..." -> "... better ..."
        utterance = re.sub(r"(^| )\[(.*?)\/(?P<replace>.*?)\]( (?P=replace))?( |$|-)", lambda x: f' {x.group(3)} ', utterance)

        # replace words containing laughter.
        # E.g.: "... [laughter-alone] ..." -> "... alone ..."
        utterance = re.sub(r"(^| )\[laughter-(.*?)\]( |$|-)", lambda x: f' {x.group(2)} ', utterance)

        # exclude too complicated annotations to replace automatically
        if utterance.find(' [') > -1:
            return ''
        
        # replace partial word pronounciations
        # E.g. "... pla[stic]- ..." -> "... plastic- ..."
        utterance = re.sub(r'\[silence\]', SILENCE, utterance)
        utterance = re.sub(r'(\[|\])', '', utterance)

        # remove duplicate blanks
        utterance = re.sub(r' +', ' ', utterance).rstrip()

        return utterance   


In [6]:
def contains_nls(utterance: str):
    return any(nls in utterance['utterance'] for nls in [*NLS, SILENCE])

def contains_repetition(utterance: str, ngram=1):
    split_utterance = utterance['utterance'].split(' ')
    # include partial word pronounciations
    split_utterance = [word.rstrip('-') for word in split_utterance]
    zipped = list(zip(*[split_utterance[i:] for i in range(ngram)]))
    return any(zipped[index] == zipped[index - ngram] for index in range(ngram, len(zipped)))

In [7]:
utterances = UtteranceCollector(SWB_DIR).utterances

print('Total:', len(utterances))

contain_nls = list(filter(lambda x: contains_nls(x), utterances))
print('contain nls:', len(contain_nls))

for repetitions in range(1, 10):
    contain_repetition = list(filter(lambda x: contains_repetition(x, repetitions), utterances))
    print(f'contain {repetitions}-gram repetitions:', len(contain_repetition))
    print(random.choice(contain_repetition)['utterance'])

print('Lengths:')
lengths = {}
for utterance in utterances:
    utterance_length = len(utterance['utterance'].split(' '))
    lengths.setdefault(utterance_length, []).append(utterance)
for length, utts in sorted(lengths.items()):
    contain_nls = list(filter(lambda x: contains_nls(x), utts))
    contain_repetition = list(filter(lambda x: contains_repetition(x), utts))
    print(f'{length}:', len(utts), len(contain_nls) + len(contain_repetition), len(contain_nls), len(contain_repetition))


Total: 391593
contain nls: 262547
contain 1-gram repetitions: 47199
that's that's wonderful
contain 2-gram repetitions: 9994
how they react to that and everything but so i think i think you're right though they're getting little better and better sooner- sooner- you know sooner or later they're going to make some major breakthroughs
contain 3-gram repetitions: 1823
yeah well- we didn't have- we didn't have any that were that
contain 4-gram repetitions: 428
can't stand Jimmy Johnson or Jerry well uh you know i was just trying i was just trying to think of uh who who our coaches of Houston Oilers you probably know
contain 5-gram repetitions: 100
just to see the show just to see the show right
contain 6-gram repetitions: 21
oh yeah i lost twenty five pounds yeah i lost twenty five pounds now you can do the same oh yeah
contain 7-gram repetitions: 9
oh gosh yeah how do you usually cook your deer how do you usually cook your deer
contain 8-gram repetitions: 5
how long would it be in the mic

In [8]:
def split_data(source_path, target_path_train, target_path_test, number_files=-1, train_split=0.8):
    max_length_utterance = -1
    
    utterances = UtteranceCollector(source_path, number_files).utterances

    
    delimiter = int(len(utterances) * train_split)

    with open(target_path_train, 'w') as target_train:
        for utterance in utterances[:delimiter]:
            tokenized = nltk.word_tokenize(utterance['utterance'])
            max_length_utterance = max(max_length_utterance, len(tokenized))
            target_train.write(f"{utterance['utterance']}\t{utterance['dialogue_id']}\t{utterance['dialogue_partner']}\n")
    with open(target_path_test, 'w') as target_test:
        for utterance in utterances[delimiter:]:
            tokenized = nltk.word_tokenize(utterance['utterance'])
            max_length_utterance = max(max_length_utterance, len(tokenized))
            target_test.write(f"{utterance['utterance']}\t{utterance['dialogue_id']}\t{utterance['dialogue_partner']}\n")

    return max_length_utterance

In [9]:
max_length_utterance = split_data(SWB_DIR, TRAIN_PATH, TEST_PATH)
max_length_utterance

88

In [10]:
class NLSDataset(Dataset):
    def __init__(self, path, max_length_utterance=-1, dataset=None) -> None:
        super().__init__()
        utterances = self._read_file(path)

        if dataset is None:
            self.max_length_source = max_length_utterance + 2
            self.max_length_target = max_length_utterance + 2
            vocab = {PADDING_TOKEN, UNKNOWN_TOKEN, START_TOKEN, END_TOKEN, *NLS}

            for utterance in utterances:
                target = nltk.word_tokenize(utterance['utterance'])
                vocab.update(target)

                source = [word for word in target if word not in NLS]

                self.max_length_source = max(self.max_length_source, len(source))
                self.max_length_target = max(self.max_length_target, len(target))

            self.vocab = {word: index for index, word in enumerate(list(vocab))}
        else:
            self.vocab = dataset.vocab
            self.max_length_source = dataset.max_length_source
            self.max_length_target = dataset.max_length_target

        self.samples = []
        for utterance in utterances:
            tokenized = [START_TOKEN, *nltk.word_tokenize(utterance['utterance']), END_TOKEN]

            target = [self.get_encoded_word(word) for word in tokenized]
            source = [self.get_encoded_word(word) for word in tokenized if word not in NLS]
            
            target.extend([self.get_encoded_word(PADDING_TOKEN)] * (self.max_length_target - len(target)))
            source.extend([self.get_encoded_word(PADDING_TOKEN)] * (self.max_length_source - len(source)))

            try:
                self.samples.append({
                    'dialogue_id': utterance['dialogue_id'],
                    'dialogue_partner': utterance['dialogue_partner'],
                    'source': torch.tensor(source),
                    'target': torch.tensor(target),
                })
            except:
                print(utterance) 

    def _read_file(self, path):
        utterances = []
        with open(path, 'r') as f:
            for line in f:
                utterance, dialogue_id, dialogue_partner = line.rstrip().split('\t')
                utterances.append({
                    'utterance': utterance,
                    'dialogue_id': dialogue_id,
                    'dialogue_partner': dialogue_partner
                })
        
        return utterances

    def get_encoded_word(self, word) -> int:
        if word in self.vocab:
            return self.vocab[word]
        else:
            return self.vocab[UNKNOWN_TOKEN]

    def __getitem__(self, item) -> dict:
        return self.samples[item]

    def __len__(self) -> int:
        return len(self.samples)
    

    

In [11]:
dataset = NLSDataset(TRAIN_PATH, max_length_utterance)
len(dataset)

In [None]:
dataset[2]

{'dialogue_id': '2121',
 'dialogue_partner': 'A',
 'source': tensor([319, 469, 472, 331, 331, 162, 211, 401, 524, 424, 524, 424, 330, 474,
          80, 210,  78, 274, 524, 424, 346, 166, 218, 452, 342, 312,  82, 573,
         330, 474,  80,  70, 337, 405,  28,  28,  28,  28,  28,  28,  28,  28,
          28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,
          28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,
          28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,
          28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,
          28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,
          28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,
          28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,
          28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,
          28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  2

In [None]:
def dataloader(path_train, path_test, batch_size):
    train_dataset = NLSDataset(path_train, max_length_utterance)
    test_dataset = NLSDataset(path_test, max_length_utterance, train_dataset)

    train_dataloader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=batch_size,
                                 shuffle=True)

    return train_dataloader, test_dataloader

In [None]:
train_dataloader, test_dataloader = dataloader(TRAIN_PATH, TEST_PATH, hyperparameters['batch_size'])

In [None]:
train_dataloader.dataset[2]

{'dialogue_id': '2121',
 'dialogue_partner': 'A',
 'source': tensor([319, 469, 472, 331, 331, 162, 211, 401, 524, 424, 524, 424, 330, 474,
          80, 210,  78, 274, 524, 424, 346, 166, 218, 452, 342, 312,  82, 573,
         330, 474,  80,  70, 337, 405,  28,  28,  28,  28,  28,  28,  28,  28,
          28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,
          28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,
          28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,
          28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,
          28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,
          28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,
          28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,
          28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,
          28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  2