In [1]:
#### Getting imports ####
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
### Getting data ####
def readatis(filename):
    """
    Read an ATIS "w-intent IOB" file into sentences, slot tags and intents.

    The file is parsed as headerless, tab-separated text. Column 0 holds the
    whitespace-separated tokens of a sentence; column 1 holds the
    whitespace-separated tags for that sentence, the last of which is the
    sentence's intent label.

    Parameters
    ----------
    filename : path (or anything else ``pd.read_csv`` accepts) of the file.

    Returns
    -------
    sents : list of token lists, with every digit character replaced by '#'.
    labels : list of tag lists; the trailing intent is swapped for an 'O'
        tag, so each tag list is as long as its sentence.
    ints : list holding one intent label per sentence.

    Raises
    ------
    AssertionError
        If the number of sentences, tag lists and intents differ, or if a
        sentence and its tag list have different lengths.
    """
    frame = pd.read_csv(filename, sep='\t', header=None)

    # replace every digit character in every token with '#'
    digit_mask = str.maketrans('0123456789', '#' * 10)
    sents = [
        [token.translate(digit_mask) for token in row.split()]
        for row in frame[0].tolist()
    ]
    tag_rows = [row.split() for row in frame[1].tolist()]
    assert len(sents) == len(tag_rows)

    # the last tag of each row is the intent: peel it off and put an 'O'
    # null tag in its place
    ints = [tags[-1] for tags in tag_rows]
    labels = [tags[:-1] + ['O'] for tags in tag_rows]

    # every sentence needs one intent and exactly one tag per token
    assert len(sents) == len(ints)
    for tokens, tags in zip(sents, labels):
        assert len(tokens) == len(tags)
    return sents, labels, ints

In [3]:
# Training split: token lists, per-token slot tags and one intent per sentence.
trn_texts, trn_slots, trn_ints = readatis('./atis-2.train.w-intent.iob.txt')


In [4]:
# Test split, read with the same loader so it has the same three-part layout.
tst_texts, tst_slots, tst_ints = readatis('./atis.test.w-intent.iob.txt')


In [5]:
# sizes of the three parallel training lists (sentences, slot tags, intents)
tuple(map(len, (trn_texts, trn_slots, trn_ints)))

(4478, 4478, 4478)

In [6]:
# number of sentences in the test split
len(tst_texts)

893

**Exploratory data analysis (EDA)**

In [48]:
# number of distinct slot tags in the training split
len({tag for tags in trn_slots for tag in tags})

120

In [10]:
# ten most frequent slot tags in the training split, with their token counts
# (Counter is imported in the notebook's top import cell)
Counter(tag for tags in trn_slots for tag in tags).most_common(10)

[('O', 41022),
 ('B-toloc.city_name', 3919),
 ('B-fromloc.city_name', 3892),
 ('I-toloc.city_name', 987),
 ('B-depart_date.day_name', 785),
 ('B-airline_name', 639),
 ('I-fromloc.city_name', 632),
 ('B-depart_time.period_of_day', 521),
 ('I-airline_name', 379),
 ('B-depart_date.day_number', 355)]

In [18]:
# number of distinct intent labels in the training split
len(set(trn_ints))

21

In [20]:
# five most frequent intent labels in the training split, with their counts
Counter(trn_ints).most_common(5)

[('atis_flight', 3309),
 ('atis_airfare', 385),
 ('atis_ground_service', 230),
 ('atis_airline', 139),
 ('atis_abbreviation', 130)]

In [26]:
# show the last three training examples: tokens, slot tags and intent

for idx in range(-1, -4, -1):
    print('text:    ', len(trn_texts[idx]), trn_texts[idx])
    print('encoding:', len(trn_slots[idx]), trn_slots[idx])
    print('intent:     ', trn_ints[idx])
    print()

text:     12 ['BOS', 'is', 'there', 'a', 'delta', 'flight', 'from', 'denver', 'to', 'san', 'francisco', 'EOS']
encoding: 12 ['O', 'O', 'O', 'O', 'B-airline_name', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-toloc.city_name', 'I-toloc.city_name', 'O']
intent:      atis_flight

text:     14 ['BOS', "i'd", 'like', 'a', 'twa', 'flight', 'from', 'las', 'vegas', 'to', 'new', 'york', 'nonstop', 'EOS']
encoding: 14 ['O', 'O', 'O', 'O', 'B-airline_code', 'O', 'O', 'B-fromloc.city_name', 'I-fromloc.city_name', 'O', 'B-toloc.city_name', 'I-toloc.city_name', 'B-flight_stop', 'O']
intent:      atis_flight

text:     12 ['BOS', 'tell', 'me', 'about', 'ground', 'transportation', 'between', 'orlando', 'international', 'and', 'orlando', 'EOS']
encoding: 12 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-fromloc.airport_name', 'I-fromloc.airport_name', 'O', 'B-toloc.city_name', 'O']
intent:      atis_ground_service



In [28]:
! pip install nltk 



distributed 1.21.8 requires msgpack, which is not installed.
