# 0 - Imports/Constants

In [1]:
import glob
import re
import random
import os
import math
import sys
from pprint import pprint

import numpy as np

import spacy
from spacy.lang.en import English

import torch
import torch.nn as nn
from torch import optim
from torch.nn import CrossEntropyLoss
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import GloVe

import nltk
from nltk.translate.meteor_score import single_meteor_score

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
DATA_DIR = 'data/'
SWB_DIR = os.path.join(DATA_DIR, 'swb_ms98_transcriptions/')

SILENCE = '<silence>'
NLS = [
    'ah',
    'eh', # pronouned 'eh'
    'eh', # pronouned 'ey'
    'hm',
    'huh',
    'huh-uh',
    'hum-um',
    'ooh',
    'uh',
    'uh-huh',
    'uh-hum',
    'uh-oh',
    'um',
    'um-hum',
]

PADDING_TOKEN = '<PAD>'
UNKNOWN_TOKEN = '<UNK>'
START_TOKEN = '<SOS>'
END_TOKEN = '<EOS>'
NO_NLS_TOKEN = '<NO-NLS>'

device = torch.device('cuda:0')

In [3]:
glove_vectors = GloVe(name='6B', dim=300)

In [4]:
glove_vectors.vectors = torch.cat((glove_vectors.vectors, torch.rand(4, 300)))

glove_vectors.itos.append(START_TOKEN)
glove_vectors.stoi[START_TOKEN] = 400000
glove_vectors.itos.append(END_TOKEN)
glove_vectors.stoi[END_TOKEN] = 400001
glove_vectors.itos.append(PADDING_TOKEN)
glove_vectors.stoi[PADDING_TOKEN] = 400002
glove_vectors.itos.append(UNKNOWN_TOKEN)
glove_vectors.stoi[UNKNOWN_TOKEN] = 400002

# for better performance
glove_vocab = set(glove_vectors.itos)
glove_vectors.vectors.size()

torch.Size([400004, 300])

In [5]:
hyperparameters = {
    'batch_size': 64,
    'embedding_dim': 512,
    'lstm_out_dim': 350,
    'dropout_prob': 0.2,
    'epochs': 20,
    'glove_training_epoch': 10,
    'learning_rate': 0.002
}

# 1 - Preparing Data

In [6]:
class UtteranceCollector():
    ANNOTATIONS = [
        #     r'\[silence\]', # may be also a sign of hesitation
        r'\[noise\]',
        r'\[laughter\]',
        r'\[vocalized-noise\]'
    ]

    def __init__(self, path, max_number_files=-1) -> None:
        # nlp = English()
        # nlp.add_pipe("sentencizer")

        self.utterances = []
        for file_index, filename in enumerate(glob.iglob(os.path.join(path + '**/*trans*'), recursive=True)):
            if file_index == max_number_files:
                break

            folders = filename.split('/')
            dialogue_id = folders[-2]
            dialogue_partner = folders[-1][folders[-1].find(dialogue_id) + len(dialogue_id)]

            with open(filename, 'r') as f:
                saved_utterances = []
                for line in f:
                    utterance = self.cleanse_utterance(line)
                    if utterance != SILENCE and utterance != '':
                        self.utterances.append({
                                'dialogue_id': dialogue_id,
                                'dialogue_partner': dialogue_partner,
                                'utterance': utterance
                            })
                    # if utterance == SILENCE and len(saved_utterances) != 0:
                    #     # doc = nlp(' '.join(saved_utterances))
                    #     # for sentence in doc.sents:
                    #     #     utterances.append({
                    #     #         'dialogue_id': dialogue_id,
                    #     #         'dialoge_partner': dialogue_partner,
                    #     #         'utterance': sentence.text
                    #     #     })
                    #     self.utterances.append({
                    #             'dialogue_id': dialogue_id,
                    #             'dialogue_partner': dialogue_partner,
                    #             'utterance': ' '.join(saved_utterances)
                    #         })
                    #     saved_utterances = []

                    # elif utterance != '' and utterance != SILENCE:
                    #     saved_utterances.append(utterance)

    def cleanse_utterance(self, utterance: str):
        utterance = utterance.rstrip().split(' ', maxsplit=3)[-1]

        # remove annotations
        utterance = re.sub(fr'({"|".join(self.ANNOTATIONS)})', '', utterance)

        # replace anomalous words.
        # E.g.: "... [bettle/better] ..." -> "... better ...".
        # Also prevent duplications: "... [bettle/better] better ..." -> "... better ..."
        utterance = re.sub(r"(^| )\[(.*?)\/(?P<replace>.*?)\]( (?P=replace))?( |$|-)", lambda x: f' {x.group(3)} ', utterance)

        # replace words containing laughter.
        # E.g.: "... [laughter-alone] ..." -> "... alone ..."
        utterance = re.sub(r"(^| )\[laughter-(.*?)\]( |$|-)", lambda x: f' {x.group(2)} ', utterance)

        # exclude too complicated annotations to replace automatically
        if utterance.find(' [') > -1:
            return ''
        
        # replace partial word pronounciations
        # E.g. "... pla[stic]- ..." -> "... plastic- ..."
        utterance = re.sub(r'\[silence\]', SILENCE, utterance)
        utterance = re.sub(r'(\[|\])', '', utterance)

        # remove duplicate blanks
        utterance = re.sub(r' +', ' ', utterance).strip()

        return utterance   


In [7]:
def contains_nls(utterance: str):
    return any(nls in utterance['utterance'] for nls in [*NLS, SILENCE])

def contains_repetition(utterance: str, ngram=1):
    split_utterance = utterance['utterance'].split(' ')
    # include partial word pronounciations
    split_utterance = [word.rstrip('-') for word in split_utterance]
    zipped = list(zip(*[split_utterance[i:] for i in range(ngram)]))
    return any(zipped[index] == zipped[index - ngram] for index in range(ngram, len(zipped)))

In [8]:
utterances = UtteranceCollector(SWB_DIR).utterances

print('Total:', len(utterances))

contain_nls = list(filter(lambda x: contains_nls(x), utterances))
print('contain nls:', len(contain_nls))

for repetitions in range(1, 10):
    contain_repetition = list(filter(lambda x: contains_repetition(x, repetitions), utterances))
    print(f'contain {repetitions}-gram repetitions:', len(contain_repetition))
    print('\t->', random.choice(contain_repetition)['utterance'])

print('Lengths:')
lengths = {}
for utterance in utterances:
    utterance_length = len(utterance['utterance'].split(' '))
    lengths.setdefault(utterance_length, []).append(utterance)
for length, utts in sorted(lengths.items()):
    contain_nls = list(filter(lambda x: contains_nls(x), utts))
    contain_repetition = list(filter(lambda x: contains_repetition(x), utts))
    print(f'{length}:', len(utts), len(contain_nls) + len(contain_repetition), len(contain_nls), len(contain_repetition))


Total: 247123
contain nls: 135287
contain 1-gram repetitions: 47199
	-> well tell me what what do you think of uh of the current trends of how other people spend time with their children and so forth
contain 2-gram repetitions: 9994
	-> you know because they don't they don't wanna send them to day care
contain 3-gram repetitions: 1823
	-> so well keep up the good work and i'm going to i'm going to
contain 4-gram repetitions: 428
	-> i guess in a perfect world in a perfect world uh but we aren't in a perfect world so i don't know
contain 5-gram repetitions: 100
	-> upper body strength the easier it is to play you you have to do a you have to do a whole lot of sit ups and uh
contain 6-gram repetitions: 21
	-> that uh that's a powerful force once you that's a powerful force once you learn something you you tend to wanna stick with it
contain 7-gram repetitions: 9
	-> oh gosh yeah how do you usually cook your deer how do you usually cook your deer
contain 8-gram repetitions: 5
	-> yeah i h

In [None]:
def split_data(source_path, target_path_train, target_path_test, number_files=-1, train_split=0.8):
    max_length_utterance = -1
    
    utterances = UtteranceCollector(source_path, number_files).utterances

    delimiter = int(len(utterances) * train_split)

    with open(target_path_train, 'w') as target_train:
        for utterance in utterances[:delimiter]:
            tokenized = nltk.word_tokenize(utterance['utterance'])
            max_length_utterance = max(max_length_utterance, len(tokenized))
            target_train.write(f"{utterance['utterance']}\t{utterance['dialogue_id']}\t{utterance['dialogue_partner']}\n")
    with open(target_path_test, 'w') as target_test:
        for utterance in utterances[delimiter:]:
            tokenized = nltk.word_tokenize(utterance['utterance'])
            max_length_utterance = max(max_length_utterance, len(tokenized))
            target_test.write(f"{utterance['utterance']}\t{utterance['dialogue_id']}\t{utterance['dialogue_partner']}\n")

    return max_length_utterance

In [None]:
split_data(SWB_DIR, os.path.join(DATA_DIR, 'train_20.tsv'), os.path.join(DATA_DIR, 'test_20.tsv'), number_files=20)
split_data(SWB_DIR, os.path.join(DATA_DIR, 'train_100.tsv'), os.path.join(DATA_DIR, 'test_100.tsv'), number_files=100)
split_data(SWB_DIR, os.path.join(DATA_DIR, 'train_500.tsv'), os.path.join(DATA_DIR, 'test_500.tsv'), number_files=500)
split_data(SWB_DIR, os.path.join(DATA_DIR, 'train_1000.tsv'), os.path.join(DATA_DIR, 'test_1000.tsv'), number_files=1000)
split_data(SWB_DIR, os.path.join(DATA_DIR, 'train_all.tsv'), os.path.join(DATA_DIR, 'test_all.tsv'))

88

# 2 - Loading Data

In [9]:
class NLSDataset(Dataset):
    def __init__(self, path, min_length_utterance=1, dataset=None, exclude_no_nls=False) -> None:
        super().__init__()
        utterances = self._read_file(path)

        if dataset is None:
            source_vocab = {UNKNOWN_TOKEN, START_TOKEN, END_TOKEN}
            pos_vocab = {UNKNOWN_TOKEN, START_TOKEN, END_TOKEN}
            for utterance in utterances:
                tokenized = nltk.word_tokenize(utterance['utterance'])
                filtered = [word for word in tokenized if word not in NLS]
                pos_tagged = nltk.pos_tag(filtered)

                source_vocab.update(filtered)
                pos_vocab.update([tag[1] for tag in pos_tagged])

            # PADDING_TOKEN will have index 0
            self.vocab = {word: index for index, word in enumerate([PADDING_TOKEN, *list(source_vocab)])}
            self.nls_vocab = {nls: index for index, nls in enumerate([PADDING_TOKEN, *list({*NLS, NO_NLS_TOKEN})])}
            self.pos_vocab = {pos: index for index, pos in enumerate([PADDING_TOKEN, *list(pos_vocab)])}
        else:
            self.vocab = dataset.vocab
            self.nls_vocab = dataset.nls_vocab
            self.pos_vocab = dataset.pos_vocab

        self.samples = []
        for utterance in utterances:
            tokenized = nltk.word_tokenize(utterance['utterance'])

            if len(tokenized) >= min_length_utterance:
                if exclude_no_nls and not any(nls in tokenized for nls in NLS):
                    continue

                tokenized.insert(0, START_TOKEN)
                tokenized.append(END_TOKEN)

                source_utterance = []
                nls_predictions = []
                for word in tokenized:
                    if word not in NLS:
                        source_utterance.append(word)
                        nls_predictions.append(NO_NLS_TOKEN)
                    else:
                        nls_predictions[-1] = word

                pos_tagged = nltk.pos_tag(source_utterance[1:-1])
                pos_tags = [START_TOKEN, *[tag[1] for tag in pos_tagged], END_TOKEN]
                
                encoded_glove = [glove_vectors.stoi[word] if word in glove_vocab else glove_vectors.stoi[UNKNOWN_TOKEN] for word in source_utterance]
                encoded_pos = [self.get_encoded_pos(pos) for pos in pos_tags]
                encoded_source_utterance = [self.get_encoded_word(word) for word in source_utterance]
                encoded_nls_predictions = [self.get_encoded_nls(nls) for nls in nls_predictions]
                
                self.samples.append({
                    'dialogue_id': utterance['dialogue_id'],
                    'dialogue_partner': utterance['dialogue_partner'],
                    'utterance': utterance['utterance'],
                    'tokenized_utterance': tokenized,
                    'tokenized_utterance_without_nls': source_utterance,
                    'pos_tags': pos_tags,
                    'glove': torch.tensor(encoded_glove),
                    'pos': torch.tensor(encoded_pos),
                    'source': torch.tensor(encoded_source_utterance),
                    'nls': torch.tensor(encoded_nls_predictions),
                })

    def _read_file(self, path):
        utterances = []
        with open(path, 'r') as f:
            for line in f:
                utterance, dialogue_id, dialogue_partner = line.rstrip().split('\t')
                utterances.append({
                    'utterance': utterance,
                    'dialogue_id': dialogue_id,
                    'dialogue_partner': dialogue_partner
                })
        
        return utterances

    def get_encoded_word(self, word) -> int:
        if word in self.vocab:
            return self.vocab[word]
        else:
            return self.vocab[UNKNOWN_TOKEN]
    
    def get_encoded_pos(self, pos) -> int:
        if pos in self.pos_vocab:
            return self.pos_vocab[pos]
        else:
            return self.pos_vocab[UNKNOWN_TOKEN]

    def get_encoded_nls(self, nls) -> int:
        return self.nls_vocab[nls]
    
    def get_nls_by_encoding(self, encoding) -> int:
        mirrored_nls_vocab = {encoding: nls for nls, encoding in self.nls_vocab.items()}
        return mirrored_nls_vocab[encoding]

    def __getitem__(self, item) -> dict:
        return self.samples[item]

    def __len__(self) -> int:
        return len(self.samples)    

In [10]:
def padding_collate(data):
    dialogue_ids = []
    dialogue_partners = []
    utterances = []
    tokenized_utterances = []
    tokenized_utterance_without_nls = []
    pos_tags = []
    glove = []
    pos = []
    source = []
    nls = []
    for sample in data:
        dialogue_ids.append(sample['dialogue_id'])
        dialogue_partners.append(sample['dialogue_id'])
        utterances.append(sample['utterance'])
        tokenized_utterances.append(sample['tokenized_utterance'])
        tokenized_utterance_without_nls.append(sample['tokenized_utterance_without_nls'])
        pos_tags.append(sample['pos_tags'])
        glove.append(sample['glove'])
        pos.append(sample['pos'])
        source.append(sample['source'])
        nls.append(sample['nls'])
        
    return {
        'dialogue_ids': dialogue_ids,
        'dialogue_partners': dialogue_partners,
        'utterances': utterances,
        'tokenized_utterances': tokenized_utterances,
        'tokenized_utterance_without_nls': tokenized_utterance_without_nls,
        'pos_tags': pos_tags,
        'glove': pad_sequence(glove, batch_first=True, padding_value=glove_vectors.stoi[PADDING_TOKEN]),
        'pos': pad_sequence(pos, batch_first=True),
        'source': pad_sequence(source, batch_first=True),
        'nls': pad_sequence(nls, batch_first=True)
    }
    

In [11]:
def dataloader(path_train, path_test, batch_size):
    train_dataset = NLSDataset(path_train, min_length_utterance=3)
    test_dataset = NLSDataset(path_test, min_length_utterance=3, dataset=train_dataset)

    train_dataloader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  collate_fn=padding_collate)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=batch_size,
                                 shuffle=True,
                                 collate_fn=padding_collate)

    return train_dataloader, test_dataloader

In [12]:
train_dataloader, test_dataloader = dataloader(os.path.join(DATA_DIR, 'train_all.tsv'), os.path.join(DATA_DIR, 'test_all.tsv'), hyperparameters['batch_size'])

In [13]:
len(train_dataloader.dataset)

133637

In [14]:
train_dataloader.dataset.get_encoded_word(UNKNOWN_TOKEN)

2474

In [15]:
train_dataloader.dataset[33]

{'dialogue_id': '3763',
 'dialogue_partner': 'B',
 'utterance': "that's contrary to uh popular belief you know",
 'tokenized_utterance': ['<SOS>',
  'that',
  "'s",
  'contrary',
  'to',
  'uh',
  'popular',
  'belief',
  'you',
  'know',
  '<EOS>'],
 'tokenized_utterance_without_nls': ['<SOS>',
  'that',
  "'s",
  'contrary',
  'to',
  'popular',
  'belief',
  'you',
  'know',
  '<EOS>'],
 'pos_tags': ['<SOS>',
  'DT',
  'VBZ',
  'JJ',
  'TO',
  'JJ',
  'NN',
  'PRP',
  'VBP',
  '<EOS>'],
 'glove': tensor([400000,     12,      9,   6605,      4,    814,   4440,     81,    346,
         400001]),
 'pos': tensor([30, 15, 13, 31, 28, 31,  8, 10, 16, 20]),
 'source': tensor([ 8293, 26544, 15213, 11300,  4002, 12038,  1624, 27103,  3121,  9559]),
 'nls': tensor([ 5,  5,  5,  5, 12,  5,  5,  5,  5,  5])}

# 3 - Model

In [52]:
class NLSModel(nn.Module):
    def __init__(self, vocab_size, pos_vocab_size, nls_vocab_size, embedding_dim, lstm_out_dim, padding_idx, dropout_prob, glove_vectors=None):
        super(NLSModel, self).__init__()

        if glove_vectors != None:
            embedding_dim = glove_vectors.dim
            self.word_embeddings = nn.Embedding.from_pretrained(glove_vectors.vectors, freeze=True, padding_idx=glove_vectors.stoi[PADDING_TOKEN])
        else:
            self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)

        self.word_lstm = nn.LSTM(embedding_dim, lstm_out_dim, batch_first=True)

        # self.pos_embeddings = nn.Embedding(pos_vocab_size, embedding_dim, padding_idx=padding_idx)
        # self.pos_lstm = nn.LSTM(embedding_dim, lstm_out_dim, batch_first=True)

        # self.transformer_linear = nn.Linear(embedding_dim, 512)
        # self.transformer = nn.TransformerEncoderLayer(d_model=512, nhead=8)

        self.classifier = nn.Sequential(
            # nn.Linear(lstm_out_dim * 2, lstm_out_dim),
            # nn.Dropout(dropout_prob),
            # nn.Tanh(),
            nn.Linear(lstm_out_dim, int(lstm_out_dim/2)),
            nn.Dropout(dropout_prob),
            nn.Tanh(),
            nn.Linear(int(lstm_out_dim/2), nls_vocab_size),
            nn.ReLU()
        )
        
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, source_sentence, source_pos):
        word_embedding = self.word_embeddings(source_sentence)
        word_dropped_out = self.dropout(word_embedding)
        word_output, _ = self.word_lstm(word_dropped_out)

        # pos_embedding = self.pos_embeddings(source_pos)
        # pos_dropped_out = self.dropout(pos_embedding)
        # pos_output, _ = self.pos_lstm(pos_dropped_out)

        # transformed_linear = self.transformer_linear(word_embedding)
        # transformed = self.transformer(transformed_linear)

        predictions = self.classifier(word_output)
        # predictions = self.classifier(torch.add(word_output, pos_output, alpha=0.5))
        # predictions = self.classifier(torch.cat((word_output, pos_output), dim=2))
        return predictions

# 4 - Training

In [53]:
loss_function = CrossEntropyLoss(ignore_index=train_dataloader.dataset.get_encoded_nls(PADDING_TOKEN))
nls_model = NLSModel(len(train_dataloader.dataset.vocab),
                     len(train_dataloader.dataset.pos_vocab),
                     len(train_dataloader.dataset.nls_vocab),
                     hyperparameters['embedding_dim'],
                     hyperparameters['lstm_out_dim'],
                     train_dataloader.dataset.get_encoded_word(PADDING_TOKEN),
                     hyperparameters['dropout_prob'],
                     glove_vectors=glove_vectors)
nls_model.to(device)

optimizer = optim.Adam(nls_model.parameters(), lr=hyperparameters['learning_rate'])

In [54]:
def predict_nls(words, pos, tokenized_without_nls):
    predicted = nls_model(words.to(device).unsqueeze(0), 
                          pos.to(device).unsqueeze(0))
    argmax = torch.argmax(predicted.squeeze(0), dim=1)
    decoded = [train_dataloader.dataset.get_nls_by_encoding(int(encoding)) for encoding in argmax]

    target_sentence = []
    for index, word in enumerate(tokenized_without_nls):
        target_sentence.append(word)
        if decoded[index] != NO_NLS_TOKEN:
            target_sentence.append(decoded[index])
            
    return target_sentence

In [55]:
source_sentence = train_dataloader.dataset[3]
print('source:     ', source_sentence['tokenized_utterance_without_nls'])
print('source_gold:', source_sentence['tokenized_utterance'])
print('predicted:  ', predict_nls(source_sentence['glove'], source_sentence['pos'], source_sentence['tokenized_utterance_without_nls']))

source:      ['<SOS>', 'but', 'since-', 'you', 'know', 'since', 'i', 'live', 'here', 'so', 'close', 'to', 'Arlington', '<EOS>']
source_gold: ['<SOS>', 'but', 'uh', 'since-', 'you', 'know', 'since', 'i', 'live', 'here', 'so', 'close', 'to', 'Arlington', 'uh', '<EOS>']
predicted:   ['<SOS>', 'uh-hum', 'but', 'since-', 'uh-oh', 'you', 'uh', 'know', 'uh', 'since', 'hum-um', 'i', 'uh', 'live', 'uh', 'here', 'um', 'so', 'uh', 'close', 'uh', 'to', 'uh', 'Arlington', 'uh', '<EOS>', 'uh-oh']


In [56]:
print(f'{hyperparameters["epochs"]} EPOCHS - {math.floor(len(train_dataloader.dataset) / train_dataloader.batch_size)} BATCHES PER EPOCH')

for epoch in range(hyperparameters['epochs']):
    total_loss = 0

    if epoch == hyperparameters['glove_training_epoch']:
            nls_model.word_embeddings.weight.requires_grad = True

    for i, batch in enumerate(train_dataloader):
        source = batch['glove'].to(device)
        pos = batch['pos'].to(device)
        nls = batch['nls'].to(device)

        output = nls_model(source, pos)

        loss = loss_function(output.permute(0, 2, 1), nls)
        total_loss += loss.item()

        # print average loss for the epoch
        sys.stdout.write(f'\repoch {epoch}, batch {i}: {np.round(total_loss / (i + 1), 4)}')

        # compute gradients
        loss.backward()

        # update parameters
        optimizer.step()

        # reset gradients
        optimizer.zero_grad()
    print()
    print('source:     ', source_sentence['tokenized_utterance_without_nls'])
    print('source_gold:', source_sentence['tokenized_utterance'])
    print('predicted:  ', predict_nls(source_sentence['glove'], source_sentence['pos'], source_sentence['tokenized_utterance_without_nls']))

del source, pos, nls


20 EPOCHS - 2088 BATCHES PER EPOCH
epoch 0, batch 2088: 0.1769
source:      ['<SOS>', 'but', 'since-', 'you', 'know', 'since', 'i', 'live', 'here', 'so', 'close', 'to', 'Arlington', '<EOS>']
source_gold: ['<SOS>', 'but', 'uh', 'since-', 'you', 'know', 'since', 'i', 'live', 'here', 'so', 'close', 'to', 'Arlington', 'uh', '<EOS>']
predicted:   ['<SOS>', 'but', 'since-', 'you', 'know', 'since', 'i', 'live', 'here', 'so', 'close', 'to', 'Arlington', '<EOS>']
epoch 1, batch 2088: 0.1453
source:      ['<SOS>', 'but', 'since-', 'you', 'know', 'since', 'i', 'live', 'here', 'so', 'close', 'to', 'Arlington', '<EOS>']
source_gold: ['<SOS>', 'but', 'uh', 'since-', 'you', 'know', 'since', 'i', 'live', 'here', 'so', 'close', 'to', 'Arlington', 'uh', '<EOS>']
predicted:   ['<SOS>', 'but', 'since-', 'you', 'know', 'since', 'i', 'live', 'here', 'so', 'close', 'to', 'Arlington', '<EOS>']
epoch 2, batch 2088: 0.1443
source:      ['<SOS>', 'but', 'since-', 'you', 'know', 'since', 'i', 'live', 'here', 'so'

# 5 - Testing

In [57]:
nls_model.eval()

NLSModel(
  (word_embeddings): Embedding(400004, 300, padding_idx=400002)
  (word_lstm): LSTM(300, 350, batch_first=True)
  (classifier): Sequential(
    (0): Linear(in_features=350, out_features=175, bias=True)
    (1): Dropout(p=0.2, inplace=False)
    (2): Tanh()
    (3): Linear(in_features=175, out_features=15, bias=True)
    (4): ReLU()
  )
  (dropout): Dropout(p=0.2, inplace=False)
)

In [58]:
gold_nls = []
gold_sentences = []

predicted_nls = []
predicted_sentences = []

for batch in test_dataloader:
    with torch.no_grad():
        source = batch['glove'].to(device)
        pos = batch['pos'].to(device)

        output = nls_model(source, pos)
        argmax = torch.argmax(output, dim=2)
        
        predicted_nls.extend(argmax.tolist())
        
        
        predicted_sentences.extend([predict_nls(glove, batch['pos'][index], batch['tokenized_utterance_without_nls'][index]) for index, glove in enumerate(batch['glove'])])

        gold_nls.extend(batch['nls'].tolist())
        gold_sentences.extend(batch['tokenized_utterances'])

del source, pos    

In [59]:
print(gold_nls[0])
print(predicted_nls[0])
print(gold_sentences[0])
print(predicted_sentences[0])

[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
['<SOS>', 'you', 'know', 'and', 'once', 'a', 'week', 'we', 'drive', 'up', 'into', 'the', 'mountains', 'usually', 'you', 'know', 'usually', 'once', 'a', 'week', 'once', 'every', 'other', 'week', '<EOS>']
['<SOS>', 'you', 'know', 'and', 'once', 'a', 'week', 'we', 'drive', 'up', 'into', 'the', 'mountains', 'usually', 'you', 'know', 'usually', 'once', 'a', 'week', 'once', 'every', 'other', 'week', '<EOS>']


In [60]:
def decode(sequences):
    return [[test_dataloader.dataset.get_nls_by_encoding(encoding) for encoding in sequence] for sequence in sequences]

In [61]:
def remove_padding(gold, predicted):
    unpadded_gold = [[token for token in sample if token != PADDING_TOKEN] for sample in gold]
    unpadded_predicted = [sample[:len(unpadded_gold[index])] for index, sample in enumerate(predicted)]

    return unpadded_gold, unpadded_predicted

unpadded_gold_nls, unpadded_predicted_nls = remove_padding(decode(gold_nls), decode(predicted_nls))

In [62]:
print(unpadded_gold_nls[0])
print(unpadded_predicted_nls[0])

['<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>']
['<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>', '<NO-NLS>']


In [63]:
print('Proportion of utterances, containing NLS:')
print('gold:     ', sum([any(nls in sequence for nls in NLS) for sequence in unpadded_gold_nls]) / len(unpadded_gold_nls))
print('predicted:', sum([any(nls in sequence for nls in NLS) for sequence in unpadded_predicted_nls]) / len(unpadded_predicted_nls))

Proportion of utterances, containing NLS:
gold:      0.36797761261396805
predicted: 0.0485962747871092


In [64]:
print('Proportion of NLS to <NO-NLS> in test set:')

number_nls = 0
number_NO_NLS = 0
for sequence in unpadded_gold_nls:
    for word in sequence:
        if word == NO_NLS_TOKEN:
            number_NO_NLS += 1
        else:
            number_nls += 1

nls_ratio = number_nls / number_NO_NLS

print('NLS:', number_nls)
print('<NO-NLS>:', number_NO_NLS)
print('Ratio:', round(nls_ratio, 4))

Proportion of NLS to <NO-NLS> in test set:
NLS: 17167
<NO-NLS>: 635201
Ratio: 0.027


In [65]:
accuracies = []
precisions = []
recalls = []
f1s = []
weighted_accuracies = []

for index, sample in enumerate(unpadded_gold_nls):
    predicted_sample = unpadded_predicted_nls[index]
    accuracies.append(accuracy_score(sample, predicted_sample))
    precisions.append(precision_score(sample, predicted_sample, average='macro', zero_division=0))
    recalls.append(recall_score(sample, predicted_sample, average='macro', zero_division=0))
    f1s.append(f1_score(sample, predicted_sample, average='macro', zero_division=0))
    
    weighted_accuracy = 0
    for word_index, word in enumerate(sample):
        if word == NO_NLS_TOKEN:
            weight = nls_ratio
        else:
            weight = 1 - nls_ratio
        
        weighted_accuracy += weight * (word == predicted_sample[word_index])
    weighted_accuracies.append(weighted_accuracy / len(sample))



print('Average accuracy:', sum(accuracies) / len(accuracies))
print('Average precision:', sum(precisions) / len(precisions))
print('Average recall:', sum(recalls) / len(recalls))
print('Average f1:', sum(f1s) / len(f1s))
print('Average weighted accuracy:', sum(weighted_accuracies) / len(weighted_accuracies))
print('NLS score:', ((sum(weighted_accuracies) / len(weighted_accuracies)) + (sum(accuracies) / len(accuracies))) / 2)


Average accuracy: 0.9665976223602545
Average precision: 0.7846176930902309
Average recall: 0.7981514587260463
Average f1: 0.7907261100505186
Average weighted accuracy: 0.02625035422924703
NLS score: 0.49642398829475076


In [66]:
meteor_scores = [single_meteor_score(gold_sentence, predicted_sentences[index]) for index, gold_sentence in enumerate(gold_sentences)]
print('Average meteor score:', sum(meteor_scores) / len(meteor_scores))

Average meteor score: 0.9719289290194459


In [67]:
def generate_nls_sentence(sentence):
    tokenized = [START_TOKEN, *nltk.word_tokenize(sentence), END_TOKEN]
    tagged = nltk.pos_tag(tokenized[1:-1])
    pos_tagged = [START_TOKEN, *[tag[1] for tag in tagged], END_TOKEN]

    pos_encoded = torch.Tensor([train_dataloader.dataset.get_encoded_pos(pos) for pos in pos_tagged]).type(torch.LongTensor)
    glove_encoded = torch.Tensor([glove_vectors.stoi[word] if word in glove_vocab else glove_vectors.stoi[UNKNOWN_TOKEN] for word in tokenized]).type(torch.LongTensor)
    word_encoded = torch.Tensor([train_dataloader.dataset.get_encoded_word(word) for word in tokenized]).type(torch.LongTensor)
    
    return predict_nls(glove_encoded, pos_encoded, tokenized)

sentence = 'well thank you very much bye-bye'
print(generate_nls_sentence(sentence))
    

['<SOS>', 'well', 'thank', 'you', 'very', 'much', 'bye-bye', '<EOS>']
