In [None]:
import pandas as pd
import numpy as np

In [1]:
df_train = pd.read_csv('e2e-dataset/trainset.csv', encoding='latin-1')
df_test = pd.read_csv('e2e-dataset/devset.csv')

NameError: name 'pd' is not defined

In [3]:
df_train.head()

Unnamed: 0,mr,ref
0,"name[The Vaults], eatType[pub], priceRange[mor...",The Vaults pub near Café Adriatic has a 5 star...
1,"name[The Cambridge Blue], eatType[pub], food[E...","Close to Café Brazil, The Cambridge Blue pub s..."
2,"name[The Eagle], eatType[coffee shop], food[Ja...",The Eagle is a low rated coffee shop near Burg...
3,"name[The Mill], eatType[coffee shop], food[Fre...",Located near The Sorrento is a French Theme ea...
4,"name[Loch Fyne], food[French], customer rating...","For luxurious French food, the Loch Fyne is lo..."


In [4]:
df_train.iloc[0].mr

'name[The Vaults], eatType[pub], priceRange[more than £30], customer rating[5 out of 5], near[Café Adriatic]'

In [5]:
df_train.iloc[0].ref

'The Vaults pub near Café Adriatic has a 5 star rating.  Prices start at £30.'

- name[The Eagle]
- eatType[coffee shop]
- food[French]
- priceRange[moderate]
- customerRating[3/5]
- area[riverside]
- kidsFriendly[yes]
- near[Burger King]

# Preprocessing the input

In [6]:
d = {}

types = ['name', 'eatType', 'food', 'priceRange', 'customer rating', 'area', 'kidsFriendly', 'near']
for s in df_train.mr:
    comps = s.split(',')
    for c in comps:
        for t in types:
            c = c.strip()
            if c.startswith(t):
                if t not in d:
                    d[t] = set()
                
                val = c[len(t)+1:].replace(']', '')
                d[t].add(val)

In [7]:
d

{'area': {'city centre', 'riverside'},
 'customer rating': {'1 out of 5',
  '3 out of 5',
  '5 out of 5',
  'average',
  'high',
  'low'},
 'eatType': {'coffee shop', 'pub', 'restaurant'},
 'food': {'Chinese',
  'English',
  'Fast food',
  'French',
  'Indian',
  'Italian',
  'Japanese'},
 'name': {'Alimentum',
  'Aromi',
  'Bibimbap House',
  'Blue Spice',
  'Browns Cambridge',
  'Clowns',
  'Cocum',
  'Cotto',
  'Fitzbillies',
  'Giraffe',
  'Green Man',
  'Loch Fyne',
  'Midsummer House',
  'Strada',
  'Taste of Cambridge',
  'The Cambridge Blue',
  'The Cricketers',
  'The Dumpling Tree',
  'The Eagle',
  'The Golden Curry',
  'The Golden Palace',
  'The Mill',
  'The Olive Grove',
  'The Phoenix',
  'The Plough',
  'The Punter',
  'The Rice Boat',
  'The Twenty Two',
  'The Vaults',
  'The Waterman',
  'The Wrestlers',
  'Travellers Rest Beefeater',
  'Wildwood',
  'Zizzi'},
 'near': {'All Bar One',
  'Avalon',
  'Burger King',
  'Café Adriatic',
  'Café Brazil',
  'Café Rouge',
 

In [8]:
# Creates a mapping that converts the mr type to an Id for the feature vector
type2id = {'name':0, 'near':1}
i = 2
for k, v in d.items():
    if k not in ['name', 'near']:
        for a in v:
            type2id[(k,a)] = i
            i += 1

In [9]:
def process_mr(s):
    mr = []
    
    types = ['name', 'eatType', 'food', 'priceRange', 'customer rating', 'area', 'kidsFriendly', 'near']
    comps = s.split(',')
    for c in comps:
        for t in types:
            c = c.strip()
            if c.startswith(t):
                val = c[len(t)+1:].replace(']', '')
                mr.append((t, val))
    return mr

In [10]:
type2id

{('area', 'city centre'): 5,
 ('area', 'riverside'): 6,
 ('customer rating', '1 out of 5'): 17,
 ('customer rating', '3 out of 5'): 15,
 ('customer rating', '5 out of 5'): 16,
 ('customer rating', 'average'): 18,
 ('customer rating', 'high'): 13,
 ('customer rating', 'low'): 14,
 ('eatType', 'coffee shop'): 4,
 ('eatType', 'pub'): 2,
 ('eatType', 'restaurant'): 3,
 ('food', 'Chinese'): 19,
 ('food', 'English'): 25,
 ('food', 'Fast food'): 20,
 ('food', 'French'): 22,
 ('food', 'Indian'): 23,
 ('food', 'Italian'): 21,
 ('food', 'Japanese'): 24,
 ('priceRange', 'cheap'): 11,
 ('priceRange', 'high'): 7,
 ('priceRange', 'less than £20'): 8,
 ('priceRange', 'moderate'): 9,
 ('priceRange', 'more than £30'): 12,
 ('priceRange', '£20-25'): 10,
 'name': 0,
 'near': 1}

In [11]:
processed_mrs_train = [process_mr(s) for s in df_train.mr]

In [12]:
def to_feature_vector(mrs):
    vec = np.zeros(len(type2id))
    for k,v in mrs:
        if k in ['name', 'near']:
            vec[type2id[k]] = 1
        else:
            vec[type2id[(k,v)]] = 1
    
    return vec

In [13]:
X_train = np.array([to_feature_vector(x) for x in processed_mrs_train])

In [14]:
X_train.shape

(42061, 26)

In [15]:
X_train

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 1.],
       [1., 1., 0., ..., 0., 1., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 1.]])

# Preprocessing the output

In [16]:
df_train.head().ref.values

array(['The Vaults pub near Café Adriatic has a 5 star rating.  Prices start at £30.',
       'Close to Café Brazil, The Cambridge Blue pub serves delicious Tuscan Beef for the cheap price of £10.50. Delicious Pub food.',
       'The Eagle is a low rated coffee shop near Burger King and the riverside that is family friendly and is less than £20 for Japanese food.',
       'Located near The Sorrento is a French Theme eatery and coffee shop called The Mill, with a price range at £20-£25 it is in the riverside area.',
       'For luxurious French food, the Loch Fyne is located by the river next to The Rice Boat.'],
      dtype=object)

In [None]:
# Since tokens are naively split on whitespaces, there can still be periods and comma's in tokens
def process_token

In [73]:
def tokenize(mr, s):
    for k,v in mr:
        if k == 'name':
            s = s.replace(v, 'SLOT_NAME')
        elif k == 'near':
            s = s.replace(v, 'SLOT_NEAR')
    
    result = ['<bos>']
    
    tokens = s.split(' ')
    for t in tokens:
        if t == '':
            continue
            
        t = t.strip()
        if t.startswith('.'):
            result.append('.')
            t = t[1:].lower()
            
        if t.startswith(','):
            result.append(',')
            t = t[1:].lower()
        
        # If there's a period in the 'token' add it individually
        append_period = False
        if t.endswith('.'):
            append_period = True            
            t = t[:len(t)-1].lower()
            
        # If there's a comma in the 'token' add it individually
        append_comma = False
        if t.endswith(','):
            append_comma = True            
            t = t[:len(t)-1].lower()
            
        append_period = False
        if t.endswith('.'):
            append_period = True            
            t = t[:len(t)-1].lower()
        
        # Check if there's a comma
        if ',' in t:
            cms = t.split(',')
            for i in range(len(cms)-1):
                result.append(cms[i].lower())
            t = cms[-1]
                    
        result.append(t.lower())
        
        if append_period == True:
            result.append('.')
        
        if append_comma == True:
            result.append(',')
            
    result.append('<eos>')
    return result

In [74]:
tokenized_sents = [tokenize(processed_mrs_train[i], df_train.iloc[i].ref) for i in range(len(df_train))]

In [105]:
# Produce a vocabulary
vocab = ['<unk>']
for sent in tokenized_sents:
    for t in sent:
        if t not in vocab:
            vocab.append(t)

In [107]:
# Word 2 vocabulary id mapping
word2id = dict(zip(vocab, range(len(vocab))))

In [114]:
def sent_to_vec(s):
    res = []
    # t = token in s = sentence
    for t in s:
        res.append(word2id[t])
    return np.array(res)

In [115]:
X_train = np.array([sent_to_vec(s) for s in tokenized_sents])

In [116]:
X_train

array([array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]),
       array([ 1, 16, 17,  5, 18,  2,  3, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
       20,  3, 29, 15]),
       array([ 1,  2, 30,  7, 31, 32, 33, 34,  4,  5, 35, 24, 36, 37, 30, 38, 39,
       35, 30, 40, 41, 42, 23, 43, 29, 15]),
       ..., array([  1,   2,  30,  72,  95,  61,  49,  74, 124,  15]),
       array([  1,   2,  30,  72, 165,  33,  34,   4,   5,  35,  24,  57,  52,
        30,  79,  35,  19, 410,  15]),
       array([ 1,  2, 30, 72, 65, 61, 53, 24, 80, 81, 15])], dtype=object)

In [None]:
import sys
import os
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
from keras import backend as K
from keras.models import Sequential, Model
from keras.layers import Embedding, LSTM, RepeatVector, Dense, Activation, Input, Flatten, Reshape, Permute, Lambda
from keras.layers.merge import multiply, concatenate
from keras.layers.wrappers import TimeDistributed, Bidirectional
from keras.callbacks import ModelCheckpoint
#import seq2seq
#from seq2seq.models import AttentionSeq2Seq

import data_loader
import postprocessing


def main():
    path_to_data_dir = 'data/'
    path_to_embeddings_dir = 'embeddings/'

    use_pretrained_embeddings = True        # set to True to use a pre-trained word embedding model
    split_mrs = True                        # set to True to split the test MRs before predicting
    postprocess = True                      # set to False to skip the utterance post-processing
    max_input_seq_len = 30                  # number of words the MRs should be truncated/padded to
    max_output_seq_len = 50                 # number of words the utterances should be truncated/padded to
    vocab_size = 10000                      # maximum vocabulary size of the utterances
    num_variations = 3                      # number of MR permutations to consider for re-ranking
    depth_enc = 1                           # number of LSTM layers in the encoder
    depth_dec = 1                           # number of LSTM layers in the decoder
    hidden_layer_size = 500                 # number of neurons in a single LSTM layer


    # ---- WORD EMBEDDING ----
    print('\nLoading embedding model...')
    embedding_model = data_loader.load_embedding_model(path_to_data_dir, path_to_embeddings_dir, use_pretrained_embeddings)
    weights = embedding_model.syn0

    # DEBUG PRINT
    print('weights.shape =', weights.shape)
    #print(embedding_model.similarity('pizza', 'hamburger'))
    #print(embedding_model.similarity('pizza', 'furniture'))
    #print(embedding_model.similarity('coffee', 'tea'))


    # ---- LOAD DATA ----
    print('\nLoading data...')
    #word2idx, idx2word = data_loader.load_vocab(path_to_vocab)
    x_train, y_train, x_test, y_test, original_mrs, original_sents, test_groups, y_idx2word = \
            data_loader.load_data(path_to_data_dir, embedding_model, vocab_size, max_input_seq_len, max_output_seq_len, num_variations, split_mrs)

    # x_test, y_test = permute_input(original_mrs, original_sents)

    # DEBUG PRINT
    print('Utterance vocab size:', len(y_idx2word))
    print('x_train.shape =', x_train.shape)
    print('y_train.shape =', y_train.shape)
    print('x_test.shape =', x_test.shape)
    print('y_test.shape =', y_test.shape)


    # ---- BUILD THE MODEL ----
    print('\nBuilding language generation model...')
    #model = Sequential()

    #ret_seq_first_layer = False
    #if depth_enc > 1:
    #    ret_seq_first_layer = True

    ## -- ENCODER --
    ##model.add(Embedding(input_dim=weights.shape[0],
    ##                    output_dim=weights.shape[1],
    ##                    weights=[weights],
    ##                    input_length=max_seq_len,       # can be omitted to process sequences of heterogenous length
    ##                    trainable=False))
    #model.add(Bidirectional(LSTM(units=weights.shape[1],
    #                             dropout=0.2,
    #                             recurrent_dropout=0.2,
    #                             return_sequences=ret_seq_first_layer),
    #                        input_shape=(max_input_seq_len, weights.shape[1])))
    #if depth_enc > 2:
    #    for d in range(depth_enc - 2):
    #        model.add(Bidirectional(LSTM(units=weights.shape[1],
    #                                        dropout=0.2,
    #                                        recurrent_dropout=0.2,
    #                                     return_sequences=True)))
    #if depth_enc > 1:
    #    model.add(Bidirectional(LSTM(units=weights.shape[1],
    #                                    dropout=0.2,
    #                                    recurrent_dropout=0.2,
    #                                 return_sequences=False)))

    ## -- DECODER --
    #model.add(RepeatVector(max_output_seq_len))
    #for d in range(depth_dec):
    #    model.add(LSTM(units=weights.shape[1],
    #                   dropout=0.2,
    #                   recurrent_dropout=0.2,
    #                   return_sequences=True))
    #model.add(TimeDistributed(Dense(len(y_idx2word),
    #                                activation='softmax')))


    # ---- ATTENTION MODEL ----

    input = Input(shape=(max_input_seq_len, weights.shape[1]))

    # -- ENCODER --
    encoder = Bidirectional(LSTM(units=hidden_layer_size,
                                 dropout=0.2,
                                 recurrent_dropout=0.2,
                                 return_sequences=True),
                            merge_mode='concat')(input)

    # -- ATTENTION --
    flattened = Flatten()(encoder)

    attention = []
    for i in range(max_output_seq_len):
        weighted = Dense(max_input_seq_len, activation='softmax')(flattened)
        unfolded = Permute([2, 1])(RepeatVector(hidden_layer_size * 2)(weighted))
        multiplied = multiply([encoder, unfolded])
        summed = Lambda(lambda x: K.sum(x, axis=-2))(multiplied)
        attention.append(Reshape((1, hidden_layer_size * 2))(summed))

    attention_out = concatenate(attention, axis=-2)

    # -- DECODER --
    decoder = LSTM(units=hidden_layer_size,
                   dropout=0.2,
                   recurrent_dropout=0.2,
                   return_sequences=True)(attention_out)

    decoder = Dense(len(y_idx2word),
                    activation='softmax')(decoder)

    model = Model(inputs=input, outputs=decoder)


    # ---- Keras Seq2Seq attention model [https://github.com/farizrahman4u/seq2seq] (not working) ----
    #model = AttentionSeq2Seq(input_dim=weights.shape[1],
    #                         input_length=max_input_seq_len,
    #                         hidden_dim=hidden_layer_size,
    #                         output_length=max_output_seq_len,
    #                         output_dim=len(y_idx2word),
    #                         depth=1)


    # ---- COMPILE ----
    model.compile(loss='categorical_crossentropy',
                  optimizer='adadelta',
                  metrics=['accuracy'])

    model.summary()

    # -- Define Checkpoint--
    #filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
    filepath = 'trained_model.hdf5'
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [checkpoint]

    
    # ---- TRAIN ----
    print('\nTraining...')
    model.fit(x_train, y_train,
              batch_size=64,
              epochs=20,
              callbacks=callbacks_list)
    
    
    # ---- TEST ----
    #print('\nTesting...')
    #score, acc = model.evaluate(x_test, y_test)

    #print()
    #print('-> Test score:', score)
    #print('-> Test accuracy:', acc)


    # ---- PREDICT ----
    print('\nPredicting...')

    # -- SINGLE PREDICTION --
    #prediction_distr = model.predict(np.array([x_test[123]]))       # test MR: name[The Rice Boat], food[Japanese], area[city centre]
    #prediction = np.argmax(prediction_distr, axis=2)                # note: prediction_distr is a 3D array even for a single input to model.predict()
    #utterance = [y_idx2word[idx] for idx in prediction[0] if idx > 0]
    #print(' '.join(utterance))

    # -- BATCH PREDICTION --
    results = []
    prediction_distr = model.predict(np.array(x_test))
    predictions = np.argmax(prediction_distr, axis=2)

    for i, prediction in enumerate(predictions):
        utterance = ' '.join([y_idx2word[idx] for idx in prediction if idx > 0])
        results.append(utterance)

    # print(len(original_mrs))
    # print(len(results))
    print("Predictions have been processed. Now we are depermuting them: ")
    # x, y, p = postprocessing.depermute_input(original_mrs, original_sents, results, num_variations)
    # correct_preds = postprocessing.correct(x, p)
    # print(len(original_mrs))
    # print(len(results))
    if split_mrs:
        results_merged = postprocessing.merge_utterances(results, original_mrs, test_groups, num_variations)
    else:
        results_merged = []
        for i, prediction in enumerate(results):
            results_merged.append(postprocessing.relex_utterance(prediction, original_mrs[i]))

    #todo add this
    # if not split_mrs:
    #     utterance = postprocessing.relex_utterance(utterance, original_mrs[i])

    np.savetxt('results/results_raw.txt', list(results_merged), fmt='%s')
    # print('\n'.join(results_merged))


    # ---- POST-PROCESS ----
    if postprocess:
        print("Predictions have been processed. Now we are depermuting them: ")
        x, y, p = postprocessing.depermute_input(original_mrs, original_sents, results_merged, num_variations)
        print("Depermution is done, files written.")
        print("Writing depermute file.")
        cp = postprocessing.combo_print(p, results_merged, num_variations)
        correct_preds = postprocessing.correct(x, p)

        # for pp in p:
        #     print(pp)
        np.savetxt('results/results_pooling.txt', list(p), fmt='%s')
        np.savetxt('results/results_combo_pool.txt', list(cp), fmt='%s')
        np.savetxt('results/results_pooling_corrected.txt', list(correct_preds), fmt='%s')


if __name__ == "__main__":
    sys.exit(int(main() or 0))

    # t = "The Golden Currey serves Fast food food near near"
    # y = "The Golden Currey is rated a 3 3 of a of 5 5 5 5"
    # x = "The Golden Currey is near near the city centre"
    # blah = [t, y, x]
    # mrs = [0,0,0]
    # # t = "The Golden Currey is a family near near"
    # # t = score_grammar_spelling(t, True)
    # tool = language_check.LanguageTool('en-US')
    # for g in blah:
    #     print(score_grammar_spelling(False, g, tool))
    #     print(score_known_errors(g))
    # print(correct(mrs, blah))

In [117]:
import os
import json
import pickle
import copy
import random
import re
import pandas as pd
import numpy as np
from nltk import FreqDist
from gensim.models import Word2Vec, KeyedVectors
from keras.utils import to_categorical
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

import embedding

# TODO: rewrite into object-oriented

def load_embedding_model(path_to_data_dir, path_to_embeddings_dir, use_pretrained_embeddings):
    path_to_training = path_to_data_dir + 'trainset_perm_3_slot_mr.csv'
    path_to_test = path_to_data_dir + 'devset_3_slot_mr.csv'
    path_to_embeddings = path_to_embeddings_dir + 'embeddings.npy'
    path_to_vocab = path_to_embeddings_dir + 'vocab.json'
    path_to_model = path_to_embeddings_dir + 'embedding_model.bin'
    #path_to_pretrained_model = path_to_embeddings_dir + 'GoogleNews-vectors-negative300.bin'
    path_to_pretrained_model = path_to_embeddings_dir + 'glove.6B.300d.txt'


    if use_pretrained_embeddings:
        # load Google's word2vec pre-trained word embedding model
        #return KeyedVectors.load_word2vec_format(path_to_pretrained_model, binary=True)

        # load Stanford's GloVe pre-trained word embedding model
        return KeyedVectors.load_word2vec_format(path_to_pretrained_model, binary=False)
    else:
        # train custom embedding model, if necessary
        if (os.path.isdir(path_to_embeddings_dir) == False or os.path.isfile(path_to_embeddings) == False):
            embedding.create_embeddings([path_to_training, path_to_test],
                                        path_to_embeddings,
                                        path_to_vocab,
                                        path_to_model,
                                        size=100,
                                        min_count=2,
                                        window=5,
                                        iter=1)

        # load our trained word2vec model
        return KeyedVectors.load_word2vec_format(path_to_model, binary=False)


def load_data(path_to_data_dir, embedding_model, vocab_size, max_input_seq_len, max_output_seq_len, num_variations, split_mrs):
    path_to_training = path_to_data_dir + 'trainset.csv'
    #path_to_training = path_to_data_dir + 'trainset_perm_3_slot_mr.csv'
    path_to_test = path_to_data_dir + 'devset.csv'
    #path_to_test = path_to_data_dir + 'devset_3_slot_mr.csv'
    #path_to_data_embed = path_to_data_dir + 'data_embed.pkl'
    

    # store/load the data in the embedded form
    #if os.path.isfile(path_to_data_embed) == False:
    #    x_train, y_train, x_test, y_test = preprocess_data(path_to_training, path_to_test, embedding_model, max_seq_len)
    #    with open(path_to_data_embed, 'wb') as f:
    #        pickle.dump([x_train, y_train, x_test, y_test], f)
    #else:
    #    with open(path_to_data_embed, 'rb') as f:
    #        x_train, y_train, x_test, y_test = pickle.load(f)

    #return x_train, y_train, x_test, y_test

    return preprocess_data(path_to_training, path_to_test, embedding_model, vocab_size, max_input_seq_len, max_output_seq_len, num_variations, split_mrs)


def preprocess_data(path_to_training_data, path_to_test_data, embedding, vocab_size, max_input_seq_len, max_output_seq_len, num_variations, use_split_mrs):
    # read the training data from file
    data_frame_train = pd.read_csv(path_to_training_data, header=0, encoding='latin1')  # names=['mr', 'ref']
    x_train = data_frame_train.mr.tolist()
    y_train = data_frame_train.ref.tolist()

    # read the test data from file
    data_frame_test = pd.read_csv(path_to_test_data, header=0, encoding='latin1')       # names=['mr', 'ref']
    x_test = data_frame_test.mr.tolist()
    y_test = data_frame_test.ref.tolist()

    original_mrs = copy.deepcopy(x_test)
    original_sents = copy.deepcopy(y_test)

    if use_split_mrs:
        # split MRs into shorter ones
        x_test, y_test, test_groups = split_mrs(x_test, y_test, num_variations=num_variations)
    elif num_variations > 1:
        x_test, y_test = permute_input(x_test, y_test, num_permutes=num_variations)
        test_groups = []
    else:
        test_groups = []


    # parse the utterances into lists of words
    y_train = [preprocess_utterance(y) for y in y_train]
    y_test = [preprocess_utterance(y) for y in y_test]

    # create utterance vocabulary
    distr = FreqDist(np.concatenate(y_train + y_test))
    y_vocab = distr.most_common(min(len(distr), vocab_size))        # cap the vocabulary size
    y_idx2word = [word[0] for word in y_vocab]
    y_idx2word.insert(0, '-PADDING-')
    y_idx2word.extend(['&slot_val_name&', '&slot_val_food&', '&slot_val_near&'])
    y_idx2word.append('-PERIOD-')
    y_idx2word.append('-NA-')
    y_word2idx = {word: idx for idx, word in enumerate(y_idx2word)}

    delex_data(x_train, y_train, update_data_source=True)
    delex_data(x_test, y_test, update_data_source=True)
    

    padding_vec = np.zeros(embedding.syn0.shape[1])         # embedding vector for "padding" words

    # produce sequences of embedding vectors from the meaning representations (MRs) in the training set
    x_train_seq = []
    for mr in x_train:
        row_list = []
        for slot_value in mr.split(','):
            sep_idx = slot_value.find('[')
            # parse the slot and convert to embedding
            slot = slot_value[:sep_idx].strip()
            row_list.extend([embedding[slot_word] for slot_word in slot.split() if slot_word in embedding.vocab])
            # parse the value and convert to embedding
            value = slot_value[sep_idx + 1:-1].strip()
            row_list.extend([embedding[value_word] for value_word in value.split() if value_word in embedding.vocab])
        # add padding
        row_list = add_padding(row_list, padding_vec, max_input_seq_len)

        x_train_seq.append(row_list)

    # produce sequences of one-hot vectors from the reference utterances in the training set
    y_train_seq = np.zeros((len(y_train), max_output_seq_len, len(y_word2idx)), dtype=np.int8)
    for i, utterance in enumerate(y_train):
        for j, word in enumerate(utterance):
            # truncate long utterances
            if j >= max_output_seq_len:
                break

            # represent each word with a one-hot vector
            if word == '.':
                y_train_seq[i][j][y_word2idx['-PERIOD-']] = 1
            elif word in y_word2idx:
                y_train_seq[i][j][y_word2idx[word]] = 1
            else:
                y_train_seq[i][j][y_word2idx['-NA-']] = 1

        # add padding for short utterances
        for j in range(len(utterance), max_output_seq_len):
            y_train_seq[i][j][y_word2idx['-PADDING-']] = 1

    # produce sequences of embedding vectors from the meaning representations (MRs) in the test set
    x_test_seq = []
    for mr in x_test:
        row_list = []
        for slot_value in mr.split(','):
            sep_idx = slot_value.find('[')
            # parse the slot and convert to embedding
            slot = slot_value[:sep_idx].strip()
            row_list.extend([embedding[slot_word] for slot_word in slot.split() if slot_word in embedding.vocab])
            # parse the value and convert to embedding
            value = slot_value[sep_idx + 1:-1].strip()
            row_list.extend([embedding[value_word] for value_word in value.split() if value_word in embedding.vocab])
        # add padding
        row_list = add_padding(row_list, padding_vec, max_input_seq_len)

        x_test_seq.append(row_list)

    # produce sequences of one-hot vectors from the reference utterances in the test set
    y_test_seq = np.zeros((len(y_test), max_output_seq_len, len(y_word2idx)), dtype=np.int8)
    for i, utterance in enumerate(y_test):
        for j, word in enumerate(utterance):
            # truncate long utterances
            if j >= max_output_seq_len:
                break

            # represent each word with a one-hot vector
            if word in y_word2idx:
                y_test_seq[i][j][y_word2idx[word]] = 1
            else:
                y_test_seq[i][j][y_word2idx['-NA-']] = 1

        # add padding for short utterances
        for j in range(len(utterance), max_output_seq_len):
            y_test_seq[i][j][y_word2idx['-PADDING-']] = 1

    return (np.array(x_train_seq), np.array(y_train_seq), np.array(x_test_seq), np.array(y_test_seq), original_mrs, original_sents, test_groups, y_idx2word)


def permute_input(mrs, sents, num_permutes):
    new_mr = []
    new_sent = []
    for x, mr in enumerate(mrs):
        sentence = sents[x]
        temp = []
        for slot_value in mr.split(','):
            sep_idx = slot_value.find('[')
            slot = slot_value[:sep_idx].strip()
            value = slot_value[sep_idx + 1:-1].strip()
            temp.append(slot + '[' + value + ']')
        for t in range(0, num_permutes):
            temptemp = copy.deepcopy(temp)
            random.shuffle(temptemp)
            curr_mr = ', '.join(temptemp)
            new_mr.append(curr_mr)
            new_sent.append(sentence)
    return new_mr, new_sent


def split_mrs(mrs, utterances, num_variations):
    new_mrs = []
    new_utterances = []
    groups = []
    group_id = 0

    for idx, mr in enumerate(mrs):
        utterance = utterances[idx]
        # do not split short MRs
        if len(mr) < 4:
            new_mrs.append(mr)
            new_utterances.append(utterance)
            continue

        slot_value_list = []
        name_slot = ()

        # parse the slot-value pairs
        for slot_value in mr.split(','):
            sep_idx = slot_value.find('[')
            slot = slot_value[:sep_idx].strip()
            value = slot_value[sep_idx + 1:-1].strip()

            if slot == 'name':
                name_slot = (slot, value)
            else:
                slot_value_list.append((slot, value))

        for i in range(num_variations):
            slot_value_list_copy = slot_value_list[:]
            random.shuffle(slot_value_list_copy)

            # distribute the slot-value pairs as multiple shorter MRs
            while len(slot_value_list_copy) > 0:
                # include the name slot by default in each subset
                mr_subset = [name_slot]
                # add up to two other slots to the subset
                for i in range(min(2, len(slot_value_list_copy))):
                    mr_subset.append(slot_value_list_copy.pop())
            
                new_mr = [s + '[' + v + ']' for s, v in mr_subset]
                new_mrs.append(', '.join(new_mr))
                new_utterances.append(utterance)
                groups.append(group_id)
            
            group_id += 1

    return new_mrs, new_utterances, groups


def preprocess_utterance(utterance, keep_periods=False):
    if keep_periods:
        chars_to_filter = '!"#$%&()*+,-/:;<=>?@[\\]^_`{|}~\t\n'
    
        # add spaces before periods so they can be parsed as individual words
        utterance = utterance.replace('. ', ' . ')
        if utterance[-1] == '.':
            utterance = utterance[:-1] + ' ' + utterance[-1]

        return text_to_word_sequence(utterance, filters=chars_to_filter)
    else:
        chars_to_filter = '.!"#$%&()*+,-/:;<=>?@[\\]^_`{|}~\t\n'

        return text_to_word_sequence(utterance, filters=chars_to_filter)


def delex_data(mrs, sentences, update_data_source=False, specific_slots=None, split=True):
    if specific_slots is not None:
        delex_slots = specific_slots
    else:
        delex_slots = ['name', 'food', 'near']

    for x, mr in enumerate(mrs):
        if split:
            sentence = ' '.join(sentences[x])
        else:
            sentence = sentences[x].lower()
        for slot_value in mr.split(','):
            sep_idx = slot_value.find('[')
            # parse the slot
            slot = slot_value[:sep_idx].strip()
            if slot in delex_slots:
                value = slot_value[sep_idx + 1:-1].strip()
                sentence = sentence.replace(value.lower(), '&slot_val_{0}&'.format(slot))
                mr = mr.replace(value, '&slot_val_{0}&'.format(slot))
                # if not split:
                #     print("delex:")
                #     print('&slot_val_{0}&'.format(slot))
                #     print(value.lower())
                #     print(sentence)
        if update_data_source:
            if split:
                sentences[x] = sentence.split()
            else:
                sentences[x] = sentence
            mrs[x] = mr
        if not split:
            return sentence
        # new_sent = relex_sentences(mr, sentence)


def add_padding(seq, padding_vec, max_seq_len):
    diff = max_seq_len - len(seq)
    if diff > 0:
        # pad short sequences
        return seq + [padding_vec for i in range(diff)]
    else:
        # truncate long sequences
        return seq[:max_seq_len]


def load_vocab(path_to_vocab):
    with open(path_to_vocab, 'r') as f_vocab:
        data = json.loads(f_vocab.read())

    word2idx = data
    idx2word = {v: k for k, v in data.items()}

    return word2idx, idx2word

ImportError: No module named 'gensim'