# 0 - Imports/Constants

In [59]:
import glob
import re
import random
import os
import math
import sys
from pprint import pprint

import numpy as np

import spacy
from spacy.lang.en import English

import torch
import torch.nn as nn
from torch import optim
from torch.nn import CrossEntropyLoss, KLDivLoss, NLLLoss
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import GloVe

import nltk

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
DATA_DIR = 'data/'
SWB_DIR = os.path.join(DATA_DIR, 'swb_ms98_transcriptions/')

SILENCE = '<silence>'
NLS = [
    'ah',
    'eh', # pronouned 'eh'
    'eh', # pronouned 'ey'
    'hm',
    'huh',
    'huh-uh',
    'hum-um',
    'ooh',
    'uh',
    'uh-huh',
    'uh-hum',
    'uh-oh',
    'um',
    'um-hum',
]

PADDING_TOKEN = '<PAD>'
UNKNOWN_TOKEN = '<UNK>'
START_TOKEN = '<SOS>'
END_TOKEN = '<EOS>'
NO_NLS_TOKEN = '<NO-NLS>'

device = torch.device('cuda:0')

In [3]:
glove_vectors = GloVe(name='6B', dim=300)

In [4]:
glove_vectors.vectors = torch.cat((glove_vectors.vectors, torch.rand(4, 300)))

glove_vectors.itos.append(START_TOKEN)
glove_vectors.stoi[START_TOKEN] = 400000
glove_vectors.itos.append(END_TOKEN)
glove_vectors.stoi[END_TOKEN] = 400001
glove_vectors.itos.append(PADDING_TOKEN)
glove_vectors.stoi[PADDING_TOKEN] = 400002
glove_vectors.itos.append(UNKNOWN_TOKEN)
glove_vectors.stoi[UNKNOWN_TOKEN] = 400002

# for better performance
glove_vocab = set(glove_vectors.itos)
glove_vectors.vectors.size()

torch.Size([400004, 300])

In [5]:
hyperparameters = {
    'batch_size': 64,
    'embedding_dim': 512,
    'lstm_out_dim': 350,
    'dropout_prob': 0.2,
    'epochs': 20,
    'glove_training_epoch': 10,
    'learning_rate': 0.002
}

# 1 - Preparing Data

In [2]:
class UtteranceCollector():
    ANNOTATIONS = [
        #     r'\[silence\]', # may be also a sign of hesitation
        r'\[noise\]',
        r'\[laughter\]',
        r'\[vocalized-noise\]'
    ]

    def __init__(self, path, max_number_files=-1) -> None:
        # nlp = English()
        # nlp.add_pipe("sentencizer")

        self.utterances = []
        for file_index, filename in enumerate(glob.iglob(os.path.join(path + '**/*trans*'), recursive=True)):
            if file_index == max_number_files:
                break

            folders = filename.split('/')
            dialogue_id = folders[-2]
            dialogue_partner = folders[-1][folders[-1].find(dialogue_id) + len(dialogue_id)]

            with open(filename, 'r') as f:
                saved_utterances = []
                for line in f:
                    utterance = self.cleanse_utterance(line)
                    if utterance != SILENCE and utterance != '':
                        self.utterances.append({
                                'dialogue_id': dialogue_id,
                                'dialogue_partner': dialogue_partner,
                                'utterance': utterance
                            })
                    # if utterance == SILENCE and len(saved_utterances) != 0:
                    #     # doc = nlp(' '.join(saved_utterances))
                    #     # for sentence in doc.sents:
                    #     #     utterances.append({
                    #     #         'dialogue_id': dialogue_id,
                    #     #         'dialoge_partner': dialogue_partner,
                    #     #         'utterance': sentence.text
                    #     #     })
                    #     self.utterances.append({
                    #             'dialogue_id': dialogue_id,
                    #             'dialogue_partner': dialogue_partner,
                    #             'utterance': ' '.join(saved_utterances)
                    #         })
                    #     saved_utterances = []

                    # elif utterance != '' and utterance != SILENCE:
                    #     saved_utterances.append(utterance)

    def cleanse_utterance(self, utterance: str):
        utterance = utterance.rstrip().split(' ', maxsplit=3)[-1]

        # remove annotations
        utterance = re.sub(fr'({"|".join(self.ANNOTATIONS)})', '', utterance)

        # replace anomalous words.
        # E.g.: "... [bettle/better] ..." -> "... better ...".
        # Also prevent duplications: "... [bettle/better] better ..." -> "... better ..."
        utterance = re.sub(r"(^| )\[(.*?)\/(?P<replace>.*?)\]( (?P=replace))?( |$|-)", lambda x: f' {x.group(3)} ', utterance)

        # replace words containing laughter.
        # E.g.: "... [laughter-alone] ..." -> "... alone ..."
        utterance = re.sub(r"(^| )\[laughter-(.*?)\]( |$|-)", lambda x: f' {x.group(2)} ', utterance)

        # exclude too complicated annotations to replace automatically
        if utterance.find(' [') > -1:
            return ''
        
        # replace partial word pronounciations
        # E.g. "... pla[stic]- ..." -> "... plastic- ..."
        utterance = re.sub(r'\[silence\]', SILENCE, utterance)
        utterance = re.sub(r'(\[|\])', '', utterance)

        # remove duplicate blanks
        utterance = re.sub(r' +', ' ', utterance).strip()

        return utterance   


In [3]:
def contains_nls(utterance: str):
    return any(nls in utterance['utterance'] for nls in [*NLS, SILENCE])

def contains_repetition(utterance: str, ngram=1):
    split_utterance = utterance['utterance'].split(' ')
    # include partial word pronounciations
    split_utterance = [word.rstrip('-') for word in split_utterance]
    zipped = list(zip(*[split_utterance[i:] for i in range(ngram)]))
    return any(zipped[index] == zipped[index - ngram] for index in range(ngram, len(zipped)))

In [4]:
utterances = UtteranceCollector(SWB_DIR).utterances

print('Total:', len(utterances))

contain_nls = list(filter(lambda x: contains_nls(x), utterances))
print('contain nls:', len(contain_nls))

for repetitions in range(1, 10):
    contain_repetition = list(filter(lambda x: contains_repetition(x, repetitions), utterances))
    print(f'contain {repetitions}-gram repetitions:', len(contain_repetition))
    print('\t->', random.choice(contain_repetition)['utterance'])

print('Lengths:')
lengths = {}
for utterance in utterances:
    utterance_length = len(utterance['utterance'].split(' '))
    lengths.setdefault(utterance_length, []).append(utterance)
for length, utts in sorted(lengths.items()):
    contain_nls = list(filter(lambda x: contains_nls(x), utts))
    contain_repetition = list(filter(lambda x: contains_repetition(x), utts))
    print(f'{length}:', len(utts), len(contain_nls) + len(contain_repetition), len(contain_nls), len(contain_repetition))


NameError: name 'SWB_DIR' is not defined

In [None]:
def split_data(source_path, target_path_train, target_path_test, number_files=-1, train_split=0.8):
    max_length_utterance = -1
    
    utterances = UtteranceCollector(source_path, number_files).utterances

    delimiter = int(len(utterances) * train_split)

    with open(target_path_train, 'w') as target_train:
        for utterance in utterances[:delimiter]:
            tokenized = nltk.word_tokenize(utterance['utterance'])
            max_length_utterance = max(max_length_utterance, len(tokenized))
            target_train.write(f"{utterance['utterance']}\t{utterance['dialogue_id']}\t{utterance['dialogue_partner']}\n")
    with open(target_path_test, 'w') as target_test:
        for utterance in utterances[delimiter:]:
            tokenized = nltk.word_tokenize(utterance['utterance'])
            max_length_utterance = max(max_length_utterance, len(tokenized))
            target_test.write(f"{utterance['utterance']}\t{utterance['dialogue_id']}\t{utterance['dialogue_partner']}\n")

    return max_length_utterance

In [None]:
split_data(SWB_DIR, os.path.join(DATA_DIR, 'train_20.tsv'), os.path.join(DATA_DIR, 'test_20.tsv'), number_files=20)
split_data(SWB_DIR, os.path.join(DATA_DIR, 'train_100.tsv'), os.path.join(DATA_DIR, 'test_100.tsv'), number_files=100)
split_data(SWB_DIR, os.path.join(DATA_DIR, 'train_500.tsv'), os.path.join(DATA_DIR, 'test_500.tsv'), number_files=500)
split_data(SWB_DIR, os.path.join(DATA_DIR, 'train_1000.tsv'), os.path.join(DATA_DIR, 'test_1000.tsv'), number_files=1000)
split_data(SWB_DIR, os.path.join(DATA_DIR, 'train_all.tsv'), os.path.join(DATA_DIR, 'test_all.tsv'))

88

# 2 - Loading Data

In [33]:
class NLSDataset(Dataset):
    def __init__(self, path, min_length_utterance=1, dataset=None) -> None:
        super().__init__()
        utterances = self._read_file(path)

        if dataset is None:
            source_vocab = {UNKNOWN_TOKEN, START_TOKEN, END_TOKEN}
            pos_vocab = {UNKNOWN_TOKEN, START_TOKEN, END_TOKEN}
            for utterance in utterances:
                tokenized = nltk.word_tokenize(utterance['utterance'])
                filtered = [word for word in tokenized if word not in NLS]
                pos_tagged = nltk.pos_tag(filtered)

                source_vocab.update(filtered)
                pos_vocab.update([tag[1] for tag in pos_tagged])

            # PADDING_TOKEN will have index 0
            self.vocab = {word: index for index, word in enumerate([PADDING_TOKEN, *list(source_vocab)])}
            self.nls_vocab = {nls: index for index, nls in enumerate([PADDING_TOKEN, *list({*NLS, NO_NLS_TOKEN})])}
            self.pos_vocab = {pos: index for index, pos in enumerate([PADDING_TOKEN, *list(pos_vocab)])}
        else:
            self.vocab = dataset.vocab
            self.nls_vocab = dataset.nls_vocab
            self.pos_vocab = dataset.pos_vocab

        self.samples = []
        for utterance in utterances:
            tokenized = nltk.word_tokenize(utterance['utterance'])

            if len(tokenized) >= min_length_utterance and any(nls in tokenized for nls in NLS):
                tokenized.insert(0, START_TOKEN)
                tokenized.append(END_TOKEN)

                source_utterance = []
                nls_predictions = []
                for word in tokenized:
                    if word not in NLS:
                        source_utterance.append(word)
                        nls_predictions.append(NO_NLS_TOKEN)
                    else:
                        nls_predictions[-1] = word

                pos_tagged = nltk.pos_tag(source_utterance[1:-1])
                pos_tags = [START_TOKEN, *[tag[1] for tag in pos_tagged], END_TOKEN]
                
                encoded_glove = [glove_vectors.stoi[word] if word in glove_vocab else glove_vectors.stoi[UNKNOWN_TOKEN] for word in source_utterance]
                encoded_pos = [self.get_encoded_pos(pos) for pos in pos_tags]
                encoded_source_utterance = [self.get_encoded_word(word) for word in source_utterance]
                encoded_nls_predictions = [self.get_encoded_nls(nls) for nls in nls_predictions]
                
                self.samples.append({
                    'dialogue_id': utterance['dialogue_id'],
                    'dialogue_partner': utterance['dialogue_partner'],
                    'utterance': utterance['utterance'],
                    'tokenized_utterance': tokenized,
                    'tokenized_utterance_without_nls': source_utterance,
                    'pos_tags': pos_tags,
                    'glove': torch.tensor(encoded_glove),
                    'pos': torch.tensor(encoded_pos),
                    'source': torch.tensor(encoded_source_utterance),
                    'nls': torch.tensor(encoded_nls_predictions),
                })

    def _read_file(self, path):
        utterances = []
        with open(path, 'r') as f:
            for line in f:
                utterance, dialogue_id, dialogue_partner = line.rstrip().split('\t')
                utterances.append({
                    'utterance': utterance,
                    'dialogue_id': dialogue_id,
                    'dialogue_partner': dialogue_partner
                })
        
        return utterances

    def get_encoded_word(self, word) -> int:
        if word in self.vocab:
            return self.vocab[word]
        else:
            return self.vocab[UNKNOWN_TOKEN]
    
    def get_encoded_pos(self, pos) -> int:
        if pos in self.pos_vocab:
            return self.pos_vocab[pos]
        else:
            return self.pos_vocab[UNKNOWN_TOKEN]

    def get_encoded_nls(self, nls) -> int:
        return self.nls_vocab[nls]
    
    def get_nls_by_encoding(self, encoding) -> int:
        mirrored_nls_vocab = {encoding: nls for nls, encoding in self.nls_vocab.items()}
        return mirrored_nls_vocab[encoding]

    def __getitem__(self, item) -> dict:
        return self.samples[item]

    def __len__(self) -> int:
        return len(self.samples)    

In [34]:
def padding_collate(data):
    dialogue_ids = []
    dialogue_partners = []
    utterances = []
    tokenized_utterances = []
    tokenized_utterance_without_nls = []
    pos_tags = []
    glove = []
    pos = []
    source = []
    nls = []
    for sample in data:
        dialogue_ids.append(sample['dialogue_id'])
        dialogue_partners.append(sample['dialogue_id'])
        utterances.append(sample['utterance'])
        tokenized_utterances.append(sample['tokenized_utterance'])
        tokenized_utterance_without_nls.append(sample['tokenized_utterance_without_nls'])
        pos_tags.append(sample['pos_tags'])
        glove.append(sample['glove'])
        pos.append(sample['pos'])
        source.append(sample['source'])
        nls.append(sample['nls'])
        
    return {
        'dialogue_ids': dialogue_ids,
        'dialogue_partners': dialogue_partners,
        'utterances': utterances,
        'tokenized_utterances': tokenized_utterances,
        'tokenized_utterance_without_nls': tokenized_utterance_without_nls,
        'pos_tags': pos_tags,
        'glove': pad_sequence(glove, batch_first=True, padding_value=glove_vectors.stoi[PADDING_TOKEN]),
        'pos': pad_sequence(pos, batch_first=True),
        'source': pad_sequence(source, batch_first=True),
        'nls': pad_sequence(nls, batch_first=True)
    }
    

In [35]:
def dataloader(path_train, path_test, batch_size):
    train_dataset = NLSDataset(path_train, min_length_utterance=3)
    test_dataset = NLSDataset(path_test, min_length_utterance=3, dataset=train_dataset)

    train_dataloader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  collate_fn=padding_collate)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=batch_size,
                                 shuffle=True,
                                 collate_fn=padding_collate)

    return train_dataloader, test_dataloader

In [36]:
train_dataloader, test_dataloader = dataloader(os.path.join(DATA_DIR, 'train_all.tsv'), os.path.join(DATA_DIR, 'test_all.tsv'), hyperparameters['batch_size'])

In [37]:
len(train_dataloader.dataset)

49780

In [38]:
train_dataloader.dataset.get_encoded_word(UNKNOWN_TOKEN)

19784

In [39]:
train_dataloader.dataset[33]

{'dialogue_id': '3724',
 'dialogue_partner': 'A',
 'utterance': 'and were those good um considering the the size of that kind of organization did you feel like that you had good benefits',
 'tokenized_utterance': ['<SOS>',
  'and',
  'were',
  'those',
  'good',
  'um',
  'considering',
  'the',
  'the',
  'size',
  'of',
  'that',
  'kind',
  'of',
  'organization',
  'did',
  'you',
  'feel',
  'like',
  'that',
  'you',
  'had',
  'good',
  'benefits',
  '<EOS>'],
 'tokenized_utterance_without_nls': ['<SOS>',
  'and',
  'were',
  'those',
  'good',
  'considering',
  'the',
  'the',
  'size',
  'of',
  'that',
  'kind',
  'of',
  'organization',
  'did',
  'you',
  'feel',
  'like',
  'that',
  'you',
  'had',
  'good',
  'benefits',
  '<EOS>'],
 'pos_tags': ['<SOS>',
  'CC',
  'VBD',
  'DT',
  'JJ',
  'VBG',
  'DT',
  'DT',
  'NN',
  'IN',
  'DT',
  'NN',
  'IN',
  'NN',
  'VBD',
  'PRP',
  'VB',
  'IN',
  'DT',
  'PRP',
  'VBD',
  'JJ',
  'NNS',
  '<EOS>'],
 'glove': tensor([40000

# 3 - Model

In [78]:
class NLSModel(nn.Module):
    def __init__(self, vocab_size, pos_vocab_size, nls_vocab_size, embedding_dim, lstm_out_dim, padding_idx, dropout_prob, glove_vectors=None):
        super(NLSModel, self).__init__()

        if glove_vectors != None:
            embedding_dim = glove_vectors.dim
            self.word_embeddings = nn.Embedding.from_pretrained(glove_vectors.vectors, freeze=True, padding_idx=glove_vectors.stoi[PADDING_TOKEN])
        else:
            self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)

        self.word_lstm = nn.LSTM(embedding_dim, lstm_out_dim, batch_first=True)

        self.pos_embeddings = nn.Embedding(pos_vocab_size, embedding_dim, padding_idx=padding_idx)
        self.pos_lstm = nn.LSTM(embedding_dim, lstm_out_dim, batch_first=True)

        self.transformer_linear = nn.Linear(embedding_dim, 512)
        self.transformer = nn.TransformerEncoderLayer(d_model=512, nhead=8)

        self.classifier = nn.Sequential(
            nn.Linear(lstm_out_dim * 2 + 512, lstm_out_dim),
            nn.Dropout(dropout_prob),
            nn.Tanh(),
            nn.Linear(lstm_out_dim, int(lstm_out_dim/2)),
            nn.Dropout(dropout_prob),
            nn.Tanh(),
            nn.Linear(int(lstm_out_dim/2), nls_vocab_size),
            # nn.LogSoftmax(dim=2)
            nn.Softmax(dim=2)
        )
        
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, source_sentence, source_pos):
        word_embedding = self.word_embeddings(source_sentence)
        word_dropped_out = self.dropout(word_embedding)
        word_output, _ = self.word_lstm(word_dropped_out)

        pos_embedding = self.pos_embeddings(source_pos)
        pos_dropped_out = self.dropout(pos_embedding)
        pos_output, _ = self.pos_lstm(pos_dropped_out)

        transformed_linear = self.transformer_linear(word_embedding)
        transformed = self.transformer(transformed_linear)

        # predictions = self.classifier(torch.add(word_output, pos_output, alpha=1))
        predictions = self.classifier(torch.cat((word_output, pos_output, transformed), dim=2))
        # predictions = self.classifier(transformed)
        return predictions

# 4 - Training

In [79]:

# loss_function = NLLLoss()
loss_function = CrossEntropyLoss(ignore_index=train_dataloader.dataset.get_encoded_nls(PADDING_TOKEN))
nls_model = NLSModel(len(train_dataloader.dataset.vocab),
                     len(train_dataloader.dataset.pos_vocab),
                     len(train_dataloader.dataset.nls_vocab),
                     hyperparameters['embedding_dim'],
                     hyperparameters['lstm_out_dim'],
                     train_dataloader.dataset.get_encoded_word(PADDING_TOKEN),
                     hyperparameters['dropout_prob'],
                     glove_vectors=glove_vectors)
nls_model.to(device)

optimizer = optim.Adam(nls_model.parameters(), lr=hyperparameters['learning_rate'])

In [80]:
def predict_nls(source_sentence):
    predicted = nls_model(source_sentence['glove'].to(device).unsqueeze(0), 
                          source_sentence['pos'].to(device).unsqueeze(0))
    argmax = torch.argmax(predicted.squeeze(0), dim=1)
    decoded = [train_dataloader.dataset.get_nls_by_encoding(int(encoding)) for encoding in argmax]

    target_sentence = []
    for index, word in enumerate(source_sentence['tokenized_utterance_without_nls']):
        target_sentence.append(word)
        if decoded[index] != NO_NLS_TOKEN:
            target_sentence.append(decoded[index])
            
    return target_sentence

In [81]:
source_sentence = train_dataloader.dataset[3]
print('source:     ', source_sentence['tokenized_utterance_without_nls'])
print('source_gold:', source_sentence['tokenized_utterance'])
print('predicted:  ', predict_nls(source_sentence))

source:      ['<SOS>', 'well', 'i', 'think', 'that', '<EOS>']
source_gold: ['<SOS>', 'well', 'i', 'think', 'that', 'uh', '<EOS>']
predicted:   ['<SOS>', 'uh', 'well', 'ooh', 'i', 'ooh', 'think', 'ooh', 'that', 'uh-huh', '<EOS>', 'uh-oh']


In [82]:
print(f'{hyperparameters["epochs"]} EPOCHS - {math.floor(len(train_dataloader.dataset) / train_dataloader.batch_size)} BATCHES PER EPOCH')

for epoch in range(hyperparameters['epochs']):
    total_loss = 0

    if epoch == hyperparameters['glove_training_epoch']:
            nls_model.word_embeddings.weight.requires_grad = True

    for i, batch in enumerate(train_dataloader):
        source = batch['glove'].to(device)
        pos = batch['pos'].to(device)
        nls = batch['nls'].to(device)

        output = nls_model(source, pos)

        loss = loss_function(output.permute(0, 2, 1), nls)
        total_loss += loss.item()

        # print average loss for the epoch
        sys.stdout.write(f'\repoch {epoch}, batch {i}: {np.round(total_loss / (i + 1), 4)}')

        # compute gradients
        loss.backward()

        # update parameters
        optimizer.step()

        # reset gradients
        optimizer.zero_grad()
    print()
    print('source:     ', source_sentence['tokenized_utterance_without_nls'])
    print('source_gold:', source_sentence['tokenized_utterance'])
    print('predicted:  ', predict_nls(source_sentence))


20 EPOCHS - 777 BATCHES PER EPOCH
epoch 0, batch 777: 1.8788
source:      ['<SOS>', 'well', 'i', 'think', 'that', '<EOS>']
source_gold: ['<SOS>', 'well', 'i', 'think', 'that', 'uh', '<EOS>']
predicted:   ['<SOS>', 'well', 'i', 'think', 'that', '<EOS>']
epoch 1, batch 777: 1.8766
source:      ['<SOS>', 'well', 'i', 'think', 'that', '<EOS>']
source_gold: ['<SOS>', 'well', 'i', 'think', 'that', 'uh', '<EOS>']
predicted:   ['<SOS>', 'well', 'i', 'think', 'that', '<EOS>']
epoch 2, batch 777: 1.8766
source:      ['<SOS>', 'well', 'i', 'think', 'that', '<EOS>']
source_gold: ['<SOS>', 'well', 'i', 'think', 'that', 'uh', '<EOS>']
predicted:   ['<SOS>', 'well', 'i', 'think', 'that', '<EOS>']
epoch 3, batch 777: 1.8766
source:      ['<SOS>', 'well', 'i', 'think', 'that', '<EOS>']
source_gold: ['<SOS>', 'well', 'i', 'think', 'that', 'uh', '<EOS>']
predicted:   ['<SOS>', 'well', 'i', 'think', 'that', '<EOS>']
epoch 4, batch 777: 1.8766
source:      ['<SOS>', 'well', 'i', 'think', 'that', '<EOS>']
s

KeyboardInterrupt: 

# 5 - Testing

In [None]:
nls_model.eval()

NLSModel(
  (word_embeddings): Embedding(400004, 300, padding_idx=400002)
  (word_lstm): LSTM(300, 350, batch_first=True)
  (pos_embeddings): Embedding(44, 300, padding_idx=0)
  (pos_lstm): LSTM(300, 350, batch_first=True)
  (transformer_linear): Linear(in_features=300, out_features=512, bias=True)
  (transformer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
    )
    (linear1): Linear(in_features=512, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=512, bias=True)
    (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (classifier): Sequential(
    (0): Linear(in_features=1212, out_features=350, bias=True)
    (1): Dropou

In [None]:
gold_nls = []
predicted_nls = []

for batch in test_dataloader:
    source = batch['glove'].to(device)
    pos = batch['pos'].to(device)

    output = nls_model(source, pos)
    argmax = torch.argmax(output, dim=2)
    
    predicted_nls.extend(argmax.tolist())
    gold_nls.extend(batch['nls'].tolist())
    

In [None]:
print(gold_nls[0])
print(predicted_nls[0])

[4, 4, 14, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 14, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
def decode(sequences):
    return [[test_dataloader.dataset.get_nls_by_encoding(encoding) for encoding in sequence] for sequence in sequences]

In [None]:
def remove_padding(gold, predicted):
    unpadded_gold = [[token for token in sample if token != PADDING_TOKEN] for sample in gold]
    unpadded_predicted = [sample[:len(unpadded_gold[index])] for index, sample in enumerate(predicted)]

    return unpadded_gold, unpadded_predicted

unpadded_gold_nls, unpadded_predicted_nls = remove_padding(decode(gold_nls), decode(predicted_nls))

In [None]:
print(unpadded_gold_nls[0])
print(unpadded_predicted_nls[0])

['<NO-NLS>', '<NO-NLS>', 'uh', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', 'uh', '<NO-NLS>']
['<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>']


In [None]:
print('Proportion of utterances, containing NLS:')
print('gold:     ', sum([any(nls in sequence for nls in NLS) for sequence in unpadded_gold_nls]) / len(unpadded_gold_nls))
print('predicted:', sum([any(nls in sequence for nls in NLS) for sequence in unpadded_predicted_nls]) / len(unpadded_predicted_nls))

Proportion of utterances, containing NLS:
gold:      1.0
predicted: 0.2291274838498651


In [None]:
accuracies = []
precisions = []
recalls = []
f1s = []
for index, sample in enumerate(unpadded_gold_nls):
    accuracies.append(accuracy_score(sample, unpadded_predicted_nls[index]))
    precisions.append(precision_score(sample, unpadded_predicted_nls[index], average='macro', zero_division=0))
    recalls.append(recall_score(sample, unpadded_predicted_nls[index], average='macro', zero_division=0))
    f1s.append(f1_score(sample, unpadded_predicted_nls[index], average='macro', zero_division=0))

print('Average accuracy:', sum(accuracies) / len(accuracies))
print('Average precision:', sum(precisions) / len(precisions))
print('Average recall:', sum(recalls) / len(recalls))
print('Average f1:', sum(f1s) / len(f1s))


Average accuracy: 0.9066916740877065
Average precision: 0.4477906353017172
Average recall: 0.48073326093188734
Average f1: 0.46210365485961524


In [None]:
def generate_nls_sentence(sentence):
    tokenized = [START_TOKEN, *nltk.word_tokenize(sentence), END_TOKEN]
    tagged = nltk.pos_tag(tokenized[1:-1])
    pos_tagged = [START_TOKEN, *[tag[1] for tag in tagged], END_TOKEN]

    print(pos_tagged)
    pos_encoded = torch.Tensor([train_dataloader.dataset.get_encoded_pos(pos) for pos in pos_tagged]).type(torch.LongTensor)
    glove_encoded = torch.Tensor([glove_vectors.stoi[word] if word in glove_vocab else glove_vectors.stoi[UNKNOWN_TOKEN] for word in tokenized]).type(torch.LongTensor)
    word_encoded = torch.Tensor([train_dataloader.dataset.get_encoded_word(word) for word in tokenized]).type(torch.LongTensor)
    
    return predict_nls({
        'pos': pos_encoded,
        'glove': glove_encoded,
        'source': word_encoded,
        'tokenized_utterance': [],
        'tokenized_utterance_without_nls': tokenized
    })

sentence = 'well thank you very much bye-bye'
print(generate_nls_sentence(sentence))
    

['<SOS>', 'RB', 'IN', 'PRP', 'RB', 'JJ', 'NN', '<EOS>']
['<SOS>', 'well', 'thank', 'you', 'very', 'much', 'bye-bye', '<EOS>']


In [None]:
[sequence for sequence in unpadded_predicted_nls if any(nls in sequence for nls in NLS)]

[['<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  'uh',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>'],
 ['<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  'uh',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '<NO-NLS>',
  '