# ATIS Dataset Exploration

In [17]:
import json
import pandas as pd
import numpy as np
# Load the ATIS training split into a plain dict (later cells read its
# 'rasa_nlu_data' -> 'common_examples' entries).
# JSON text is UTF-8, so state the encoding explicitly instead of relying on the
# platform's default locale encoding.
with open('raw_datasets/ATIS/train.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [20]:
# Preview only the first few examples; displaying the whole list floods the output.
data['rasa_nlu_data']['common_examples'][:2]

[{'text': 'i want to fly from boston at 838 am and arrive in denver at 1110 in the morning',
  'intent': 'flight',
  'entities': [{'start': 19,
    'end': 25,
    'value': 'boston',
    'entity': 'fromloc.city_name'},
   {'start': 29, 'end': 35, 'value': '838 am', 'entity': 'depart_time.time'},
   {'start': 50, 'end': 56, 'value': 'denver', 'entity': 'toloc.city_name'},
   {'start': 60, 'end': 64, 'value': '1110', 'entity': 'arrive_time.time'},
   {'start': 72,
    'end': 79,
    'value': 'morning',
    'entity': 'arrive_time.period_of_day'}]},
 {'text': 'what flights are available from pittsburgh to baltimore on thursday morning',
  'intent': 'flight',
  'entities': [{'start': 32,
    'end': 42,
    'value': 'pittsburgh',
    'entity': 'fromloc.city_name'},
   {'start': 46, 'end': 55, 'value': 'baltimore', 'entity': 'toloc.city_name'},
   {'start': 59,
    'end': 67,
    'value': 'thursday',
    'entity': 'depart_date.day_name'},
   {'start': 68,
    'end': 75,
    'value': 'morning',

In [23]:
# Build one row per example with its utterance text, intent label and entity list.
# Plain Python lists are passed to pandas directly: wrapping the per-example
# `entities` lists (which have different lengths) in np.array() fails on recent
# NumPy versions because the nested sequence is ragged.
examples = data['rasa_nlu_data']['common_examples']
df = pd.DataFrame({
    "text": [sample['text'] for sample in examples],
    "intent": [sample['intent'] for sample in examples],
    "entities": [sample['entities'] for sample in examples],
})

In [33]:
def sep(x):
    """Split a '+'-joined intent label into a list of its parts.

    Parameters
    ----------
    x : str or any
        An intent label. Strings containing '+' are split on it.

    Returns
    -------
    list of str or the input unchanged
        ``x.split('+')`` when ``x`` is a string containing '+'; otherwise ``x``
        itself. Non-string values (for example a label that has already been
        split into a list) are returned as-is, so applying the function twice
        is safe.
    """
    if isinstance(x, str) and '+' in x:
        return x.split('+')
    return x
# Replace each intent label with the result of sep(): a list for '+'-joined
# labels, the label itself otherwise.
df['intent'] = df['intent'].map(sep)

In [37]:
# Collect the intent values that are lists, i.e. the labels that were split on '+'.
multi_intents = list(filter(lambda intent: type(intent) == list, df['intent']))

## 1. Document pooling embeddings

In [39]:
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence

# Token-level embedders: GloVe word vectors plus the forward and backward
# Flair 'news' models.
glove_embedding = WordEmbeddings('glove')
flair_embedding_forward = FlairEmbeddings('news-forward')
flair_embedding_backward = FlairEmbeddings('news-backward')

# Document-level embedder that pools the token embeddings (mode = mean).
token_embedders = [glove_embedding, flair_embedding_backward, flair_embedding_forward]
document_embeddings = DocumentPoolEmbeddings(token_embedders)

2019-09-30 22:28:30,102 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim.vectors.npy not found in cache, downloading to /var/folders/79/8vg87rnd3rj6bg7rtfq66cvr0000gn/T/tmpjhq_2uj0


100%|██████████| 160000128/160000128 [00:11<00:00, 13743566.33B/s]

2019-09-30 22:28:42,453 copying /var/folders/79/8vg87rnd3rj6bg7rtfq66cvr0000gn/T/tmpjhq_2uj0 to cache at /Users/waynewu/.flair/embeddings/glove.gensim.vectors.npy





2019-09-30 22:28:43,039 removing temp file /var/folders/79/8vg87rnd3rj6bg7rtfq66cvr0000gn/T/tmpjhq_2uj0
2019-09-30 22:28:43,578 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim not found in cache, downloading to /var/folders/79/8vg87rnd3rj6bg7rtfq66cvr0000gn/T/tmptwcn30ye


100%|██████████| 21494764/21494764 [00:06<00:00, 3345051.90B/s]

2019-09-30 22:28:50,648 copying /var/folders/79/8vg87rnd3rj6bg7rtfq66cvr0000gn/T/tmptwcn30ye to cache at /Users/waynewu/.flair/embeddings/glove.gensim





2019-09-30 22:28:50,696 removing temp file /var/folders/79/8vg87rnd3rj6bg7rtfq66cvr0000gn/T/tmptwcn30ye
2019-09-30 22:28:52,723 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4.1/big-news-forward--h2048-l1-d0.05-lr30-0.25-20/news-forward-0.4.1.pt not found in cache, downloading to /var/folders/79/8vg87rnd3rj6bg7rtfq66cvr0000gn/T/tmpm5eka57p


100%|██████████| 73034624/73034624 [00:06<00:00, 10579432.53B/s]

2019-09-30 22:29:00,215 copying /var/folders/79/8vg87rnd3rj6bg7rtfq66cvr0000gn/T/tmpm5eka57p to cache at /Users/waynewu/.flair/embeddings/news-forward-0.4.1.pt





2019-09-30 22:29:00,369 removing temp file /var/folders/79/8vg87rnd3rj6bg7rtfq66cvr0000gn/T/tmpm5eka57p
2019-09-30 22:29:01,324 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4.1/big-news-backward--h2048-l1-d0.05-lr30-0.25-20/news-backward-0.4.1.pt not found in cache, downloading to /var/folders/79/8vg87rnd3rj6bg7rtfq66cvr0000gn/T/tmp8ez4a8b3


100%|██████████| 73034575/73034575 [00:06<00:00, 10789968.03B/s]

2019-09-30 22:29:08,856 copying /var/folders/79/8vg87rnd3rj6bg7rtfq66cvr0000gn/T/tmp8ez4a8b3 to cache at /Users/waynewu/.flair/embeddings/news-backward-0.4.1.pt





2019-09-30 22:29:09,103 removing temp file /var/folders/79/8vg87rnd3rj6bg7rtfq66cvr0000gn/T/tmp8ez4a8b3


In [43]:
# Compute one pooled document embedding per row and store them as a new column.
#
# The vectors are collected in a list and assigned once: writing through
# `df.loc[i]['embeddings'] = ...` only modifies a temporary row object, so the
# DataFrame itself would never receive the values.
#
# NOTE(review): the original passed the 'intent' column to Sentence. Intent values
# can be lists after the '+' split, and the BERT cell further down works on the
# utterance text, so the utterance text is assumed to be the intended input — confirm.
embeddings = []
for text in df['text']:
    sentence = Sentence(text)

    # embed the sentence with our document embedding
    document_embeddings.embed(sentence)

    # keep the vector as a NumPy array (assumes get_embedding() returns a torch
    # tensor — confirm against the installed flair version)
    embeddings.append(sentence.get_embedding().detach().cpu().numpy())

df['embeddings'] = embeddings

KeyboardInterrupt: 

In [None]:
from sklearn.cluster import KMeans

# NOTE(review): `BERT_corpus` is not defined in any cell visible in this notebook;
# it has to hold the feature matrix to cluster before this cell can run — confirm
# which embeddings are meant here.
# The seed is fixed so that the k-means initialisation, and therefore the cluster
# labels, are reproducible across runs.
km = KMeans(n_clusters=40, random_state=0)
km.fit(BERT_corpus)
clusters = km.labels_.tolist()

## 2. RNN embeddings

## 3. BERT embeddings

In [None]:
from bert_serving.client import BertClient
from sklearn.cluster import KMeans

# Encode every utterance with BERT and cluster the resulting vectors.
# NOTE(review): BertClient() presumably needs a bert-serving server that is already
# running — confirm before executing this cell.
bc = BertClient()
# `text_list` was never defined in the visible cells; the utterances live in df['text'].
BERT_embedding = bc.encode(df['text'].tolist())

# Fit on the embeddings computed above (the original referenced an undefined
# `BERT_corpus`); the seed is fixed so the cluster labels are reproducible.
km = KMeans(n_clusters=40, random_state=0)
km.fit(BERT_embedding)
clusters = km.labels_.tolist()