## ATIS data preprocessing

Read the `*.iob` files from the `JointSLU` [GitHub repo](https://github.com/yvchen/JointSLU/blob/master/program/BasicModel.py).

Following the `JointSLU` experiment, we train on `atis-2.train`, validate on `atis-2.dev`, and test on `atis.test` (using the `*.w-intent.iob` version of each file).

Split each file into tokenized sentences, per-token slot (entity) tag lists, and a list of intent labels.

train a gensim w2v model on the training data

encode the data

In [1]:
from collections import Counter
from preprocessing import CharacterIndexer, SlotIndexer, IntentIndexer
from gensim.models import Word2Vec
import json
import numpy as np
import pandas as pd
import pickle

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


### read the file data

The files are already preprocessed: lower-cased and mostly stripped of punctuation (a few tokens such as `i'd` and `st.` keep theirs).

The only extra normalization we apply is replacing every digit with `#`.

In [2]:
def readatis(filename='data/atis/atis-2.train.w-intent.iob'):
    """
    Read an ATIS ``*.w-intent.iob`` file into sentences, slot tags and intents.

    Each row of the file is read as two tab-separated fields: the
    whitespace-tokenized sentence, and the whitespace-separated slot tags
    whose final item is the sentence-level intent label.

    Args:
        filename: path to the tab-separated ``.iob`` file (no header row).

    Returns:
        A ``(sents, ners, ints)`` tuple of equal-length lists:

        * ``sents`` -- token lists, with every digit ``0``-``9`` replaced
          by ``'#'``;
        * ``ners``  -- slot-tag lists, with the trailing intent label
          swapped for an ``'O'`` tag so each list stays aligned with its
          sentence;
        * ``ints``  -- the intent label of each sentence.

    Raises:
        AssertionError: if a row's token count and tag count disagree.
    """
    data = pd.read_csv(filename, sep='\t', header=None)

    # Mask digits in a single pass per sentence. Mapping a digit to '#'
    # never adds or removes whitespace, so translating the raw string and
    # then splitting yields the same tokens as masking token by token.
    digit_mask = str.maketrans('0123456789', '#' * 10)
    sents = [s.translate(digit_mask).split() for s in data[0].tolist()]
    tags = [s.split() for s in data[1].tolist()]

    # The intent label is the last item of each tag row: peel it off and
    # put a null 'O' tag in its place.
    ints = [row[-1] for row in tags]
    ners = [row[:-1] + ['O'] for row in tags]

    # sents, ners and ints are all built row-by-row from the same frame, so
    # they have the same length by construction; only the per-row
    # token/tag alignment can actually go wrong.
    for row_no, (sent, ner) in enumerate(zip(sents, ners)):
        assert len(sent) == len(ner), (
            'row %d of %s: %d tokens but %d slot tags'
            % (row_no, filename, len(sent), len(ner))
        )
    return sents, ners, ints

In [3]:
# training split: tokenized sentences, slot tags, intent labels
trn_texts, trn_slots, trn_ints = readatis(filename='data/atis/atis-2.train.w-intent.iob')

In [4]:
# validation split: tokenized sentences, slot tags, intent labels
dev_texts, dev_slots, dev_ints = readatis(filename='data/atis/atis-2.dev.w-intent.iob')

In [5]:
# test split: tokenized sentences, slot tags, intent labels
tst_texts, tst_slots, tst_ints = readatis(filename='data/atis/atis.test.w-intent.iob')

In [6]:
# number of sentences in the train / dev / test splits
tuple(map(len, (trn_texts, dev_texts, tst_texts)))

(4478, 500, 893)

### reduce slot names

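Optionally, we could keep only the macro-level slot names (the part before the first `.`). The cell below that does this is currently commented out, so the full slot names are used.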

In [7]:
# def slotsplitter(slotslist):
#     newlist = []
#     for s in slotslist:
#         newsent = [i.split('.')[0] for i in s]
#         newlist.append(newsent)
#     return newlist

# trn_slots = slotsplitter(trn_slots)
# dev_slots = slotsplitter(dev_slots)
# tst_slots = slotsplitter(tst_slots)

### visual test of data

In [8]:
# number of distinct slot tags that occur in the training split
len({tag for sent_tags in trn_slots for tag in sent_tags})

120

In [9]:
Counter([t for s in trn_slots for t in s]).most_common(10)

[('O', 41022),
 ('B-toloc.city_name', 3919),
 ('B-fromloc.city_name', 3892),
 ('I-toloc.city_name', 987),
 ('B-depart_date.day_name', 785),
 ('B-airline_name', 639),
 ('I-fromloc.city_name', 632),
 ('B-depart_time.period_of_day', 521),
 ('I-airline_name', 379),
 ('B-depart_date.day_number', 355)]

In [10]:
# number of distinct intent labels that occur in the training split
len(set(trn_ints))

21

In [11]:
# token count of every training sentence (BOS/EOS included), then the average
slens = list(map(len, trn_texts))
np.mean(slens)

13.276686020544886

In [12]:
# percentage of training sentences shorter than 22 tokens
swuts = [int(n_tokens < 22) for n_tokens in slens]
100 * sum(swuts) / len(swuts)

95.73470299240732

In [13]:
# the hashtagged elements seem to be for multi-label predictions
# we will treat them as a separate joint label for now
Counter(trn_ints).most_common(10)

[('atis_flight', 3309),
 ('atis_airfare', 385),
 ('atis_ground_service', 230),
 ('atis_airline', 139),
 ('atis_abbreviation', 130),
 ('atis_aircraft', 70),
 ('atis_flight_time', 45),
 ('atis_quantity', 41),
 ('atis_flight#atis_airfare', 19),
 ('atis_city', 18)]

In [14]:
# eyeball the last five training examples, newest first:
# tokens and slot tags (with their lengths), then the intent label
for i in range(1, 6):
    for label, seq in (('txt:', trn_texts[-i]), ('ent:', trn_slots[-i])):
        print(label, len(seq), seq)
    print('int:', trn_ints[-i])
    print()

txt: 12 ['BOS', 'is', 'there', 'a', 'delta', 'flight', 'from', 'denver', 'to', 'san', 'francisco', 'EOS']
ent: 12 ['O', 'O', 'O', 'O', 'B-airline_name', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-toloc.city_name', 'I-toloc.city_name', 'O']
int: atis_flight

txt: 14 ['BOS', "i'd", 'like', 'a', 'twa', 'flight', 'from', 'las', 'vegas', 'to', 'new', 'york', 'nonstop', 'EOS']
ent: 14 ['O', 'O', 'O', 'O', 'B-airline_code', 'O', 'O', 'B-fromloc.city_name', 'I-fromloc.city_name', 'O', 'B-toloc.city_name', 'I-toloc.city_name', 'B-flight_stop', 'O']
int: atis_flight

txt: 12 ['BOS', 'tell', 'me', 'about', 'ground', 'transportation', 'between', 'orlando', 'international', 'and', 'orlando', 'EOS']
ent: 12 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-fromloc.airport_name', 'I-fromloc.airport_name', 'O', 'B-toloc.city_name', 'O']
int: atis_ground_service

txt: 23 ['BOS', 'what', 'are', 'the', 'nonstop', 'flights', 'on', 'america', 'west', 'or', 'southwest', 'air', 'from', 'kansas', 'city', 'to', 'burbank', 

### train word2vec embeddings

using pretrained embeddings has been found to increase performance in various NLP tasks.

We could train on an external corpus such as the Brown corpus, but here we just use the ATIS training data.

In [15]:
# word2vec should not learn the sentence markers: drop the first (BOS)
# and last (EOS) token of every training sentence
w2v_text = [tokens[1:-1] for tokens in trn_texts]

In [16]:
# word2vec hyper-parameters, kept together so they are easy to tweak
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
# renamed them to `vector_size` and `epochs` -- confirm the pinned version.
w2v_params = dict(size=200, min_count=1, window=5, workers=3, iter=5)

# fit the embeddings on the training sentences and persist the model
model = Word2Vec(w2v_text, **w2v_params)
model.save('model/atis_w2v.gensimmodel')
print('training done!')

training done!


In [17]:
# map each word in the model vocabulary to its integer index
# NOTE(review): `wv.vocab` is the gensim 3.x API; gensim 4 exposes this
# mapping as `wv.key_to_index` -- confirm the pinned version.
vocab = {word: entry.index for word, entry in model.wv.vocab.items()}

In [18]:
# test
model.wv.most_similar('delta')

[('airline', 0.9996054172515869),
 ('september', 0.9995239973068237),
 ('into', 0.9994678497314453),
 ('flying', 0.9994544982910156),
 ('eastern', 0.9994508028030396),
 ('has', 0.9994401931762695),
 ('cities', 0.9994279146194458),
 ('other', 0.999421238899231),
 ('stopping', 0.9994186758995056),
 ('out', 0.999413013458252)]

In [19]:
model.wv.most_similar('dallas')

[('for', 0.9986646175384521),
 ('economy', 0.9985247850418091),
 ('schedule', 0.9984636902809143),
 ('service', 0.9984104037284851),
 ('flying', 0.9983426332473755),
 ('december', 0.9983159899711609),
 ('only', 0.9983118772506714),
 ('times', 0.9983005523681641),
 ('via', 0.9982900619506836),
 ('any', 0.9982751607894897)]

## index the sentences

we will use premade classes for this that will integer-index and pad sentences to fixed lengths.

We cap the maximum sentence and word lengths at mean + 2 standard deviations; longer sequences are truncated.

In [20]:
# instantiate a sentence indexer and fit it on the training sentences only
# NOTE(review): per the notebook text, 'std' mode presumably caps lengths at
# mean + 2*st.dev -- confirm in preprocessing.CharacterIndexer
sentindexer = CharacterIndexer(max_sent_mode='std')
sentindexer.fit(trn_texts, verbose=True)

fit(): splitting...
fit(): max sent len set to 22
fit(): max word len set to 9
fit(): creating conversion dictionaries...
fit(): tru word vocab: 728
fit(): tru char vocab: 35
fit(): done!


In [21]:
# encode every split with the fitted sentence indexer; each call returns a
# word-level index array and a char-level index array
trn_text_idx, trn_char_idx = sentindexer.transform(trn_texts)
dev_text_idx, dev_char_idx = sentindexer.transform(dev_texts)
tst_text_idx, tst_char_idx = sentindexer.transform(tst_texts)
# shapes of the three word-index arrays plus the training char-index array
tuple(arr.shape for arr in (trn_text_idx, dev_text_idx, tst_text_idx, trn_char_idx))

((4478, 22), (500, 22), (893, 22), (4478, 22, 9))

In [22]:
# instantiate a slot indexer and fit it on the training slot tags;
# max_len reuses the sentence indexer's fitted max_sent_len so slot
# sequences get the same maximum length as the token sequences
slotindexer = SlotIndexer(max_len=sentindexer.max_sent_len)
slotindexer.fit(trn_slots, verbose=True)

fit(): labels set to size: 121


In [23]:
# encode the slot tags of every split with the fitted slot indexer
trn_slot_idx, dev_slot_idx, tst_slot_idx = map(
    slotindexer.transform, (trn_slots, dev_slots, tst_slots)
)
# shapes of the encoded train / dev / test slot arrays
tuple(arr.shape for arr in (trn_slot_idx, dev_slot_idx, tst_slot_idx))

((4478, 22, 1), (500, 22, 1), (893, 22, 1))

In [24]:
# instantiate an intent indexer and fit it on the training intent labels
intindexer = IntentIndexer()
intindexer.fit(trn_ints, verbose=True)

fit(): labels set to size: 22


In [25]:
# encode the intent labels of every split with the fitted intent indexer
trn_int_idx, dev_int_idx, tst_int_idx = map(
    intindexer.transform, (trn_ints, dev_ints, tst_ints)
)
# shapes of the encoded train / dev / test intent arrays
tuple(arr.shape for arr in (trn_int_idx, dev_int_idx, tst_int_idx))

((4478, 1), (500, 1), (893, 1))

### test for NaNs, out-of-bounds indices

In [26]:
# NaN sanity check: print the distinct values of isnan() for every encoded
# array, in the order text/char per split, then slots, then intents
for encoded in (
    trn_text_idx, trn_char_idx,
    dev_text_idx, dev_char_idx,
    tst_text_idx, tst_char_idx,
    trn_slot_idx, dev_slot_idx, tst_slot_idx,
    trn_int_idx, dev_int_idx, tst_int_idx,
):
    print(np.unique(np.isnan(encoded)))

[False]
[False]
[False]
[False]
[False]
[False]
[False]
[False]
[False]
[False]
[False]
[False]


In [27]:
# bounds check: for each indexer print its vocabulary / label size, followed
# by the largest index found in the train, dev and test arrays it produced
for limit, encoded_splits in (
    (sentindexer.max_word_vocab, (trn_text_idx, dev_text_idx, tst_text_idx)),
    (sentindexer.max_char_vocab, (trn_char_idx, dev_char_idx, tst_char_idx)),
    (slotindexer.labelsize, (trn_slot_idx, dev_slot_idx, tst_slot_idx)),
    (intindexer.labelsize, (trn_int_idx, dev_int_idx, tst_int_idx)),
):
    print(limit)
    for encoded in encoded_splits:
        print(np.unique(np.max(encoded)))

728
[726]
[727]
[727]
35
[33]
[32]
[33]
121
[119]
[115]
[115]
22
[21]
[17]
[16]


### test decoding

In [28]:
sentindexer.inverse_transform(tst_text_idx[0:1])

[['BOS',
  'i',
  'would',
  'like',
  'to',
  'find',
  'a',
  'flight',
  'from',
  'charlotte',
  'to',
  'las',
  'vegas',
  'that',
  'makes',
  'a',
  'stop',
  'in',
  'st.',
  'louis',
  'EOS']]

In [29]:
slotindexer.inverse_transform(tst_slot_idx[0:1])

[['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-fromloc.city_name',
  'O',
  'B-toloc.city_name',
  'I-toloc.city_name',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-stoploc.city_name',
  'I-stoploc.city_name',
  'O']]

In [30]:
intindexer.inverse_transform(tst_int_idx[0:5])

['atis_flight', 'atis_airfare', 'atis_flight', 'atis_flight', 'atis_flight']

### save all items

In [31]:
def _dump_pickle(obj, path):
    """Pickle *obj* to *path*, closing the file as soon as the write is done."""
    # a bare `pickle.dump(obj, open(path, 'wb'))` never closes the handle,
    # so the data may not be flushed to disk until the kernel exits
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


# save transformers
_dump_pickle(sentindexer, 'encoded/atis_sentindexer.pkl')
_dump_pickle(slotindexer, 'encoded/atis_slotindexer.pkl')
_dump_pickle(intindexer, 'encoded/atis_intindexer.pkl')

# save word2vec vocab
_dump_pickle(vocab, 'model/atis_w2v_vocab.pkl')

# save text data
_dump_pickle(trn_texts, 'encoded/trn_texts_raw.pkl')
_dump_pickle(dev_texts, 'encoded/dev_texts_raw.pkl')
_dump_pickle(tst_texts, 'encoded/tst_texts_raw.pkl')

_dump_pickle(trn_slots, 'encoded/trn_slots_raw.pkl')
_dump_pickle(dev_slots, 'encoded/dev_slots_raw.pkl')
_dump_pickle(tst_slots, 'encoded/tst_slots_raw.pkl')

_dump_pickle(trn_ints, 'encoded/trn_ints_raw.pkl')
_dump_pickle(dev_ints, 'encoded/dev_ints_raw.pkl')
_dump_pickle(tst_ints, 'encoded/tst_ints_raw.pkl')

# save encoded data
np.save('encoded/trn_text_idx.npy', trn_text_idx)
np.save('encoded/dev_text_idx.npy', dev_text_idx)
np.save('encoded/tst_text_idx.npy', tst_text_idx)

np.save('encoded/trn_char_idx.npy', trn_char_idx)
np.save('encoded/dev_char_idx.npy', dev_char_idx)
np.save('encoded/tst_char_idx.npy', tst_char_idx)

np.save('encoded/trn_slot_idx.npy', trn_slot_idx)
np.save('encoded/dev_slot_idx.npy', dev_slot_idx)
np.save('encoded/tst_slot_idx.npy', tst_slot_idx)

np.save('encoded/trn_int_idx.npy', trn_int_idx)
np.save('encoded/dev_int_idx.npy', dev_int_idx)
np.save('encoded/tst_int_idx.npy', tst_int_idx)