<a href="https://colab.research.google.com/github/GabrielOlem/projetoPLN/blob/main/ATIS_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## IMPORTS

In [None]:
import tensorflow as tf
import pickle
import os
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding, RepeatVector, TimeDistributed, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


## LOAD DATASET AND GET TOKENS, SLOTS AND INTENTS, AND THEIR INDEXES

In [None]:
def load_ds(fname):
    """Load one pickled dataset split and print a short size summary.

    The pickle must unpack into a ``(ds, dicts)`` pair. ``ds`` is indexed with
    ``'query'`` and ``dicts`` with ``'token_ids'``, ``'slot_ids'`` and
    ``'intent_ids'``; only the lengths of those entries are reported.

    NOTE(review): ``pickle.load`` can execute arbitrary code while reading, so
    this should only be pointed at trusted dataset files.

    Args:
        fname: Path of the pickle file to read.

    Returns:
        The ``(ds, dicts)`` tuple exactly as stored in the file.
    """
    with open(fname, 'rb') as stream:
        ds, dicts = pickle.load(stream)

    print('Done  loading: ', fname)
    # One (template, container, key) row per summary line; each entry is
    # looked up only when its line is printed.
    report = (
        ('      samples: {:4d}', ds, 'query'),
        ('   vocab_size: {:4d}', dicts, 'token_ids'),
        ('   slot count: {:4d}', dicts, 'slot_ids'),
        (' intent count: {:4d}', dicts, 'intent_ids'),
    )
    for template, container, key in report:
        print(template.format(len(container[key])))
    return ds, dicts

In [None]:
# Load the training split; the lookup tables stored with it (`dicts`) are the
# ones used by the cells below.
train_ds, dicts = load_ds('atis.train.pkl')
# Load the test split; the lookup tables stored in that pickle are discarded (`_`).
test_ds, _ = load_ds('atis.test.pkl')

Done  loading:  atis.train.pkl
      samples: 4978
   vocab_size:  943
   slot count:  129
 intent count:   26
Done  loading:  atis.test.pkl
      samples:  893
   vocab_size:  943
   slot count:  129
 intent count:   26


In [None]:
# Forward lookup tables for tokens, slots and intents, as stored in `dicts`.
t2i, s2i, in2i = (
    dicts.get(table_name)
    for table_name in ('token_ids', 'slot_ids', 'intent_ids')
)
# Inverted tables: every value of a forward table becomes a key pointing back
# at its original key.
i2t, i2s, i2in = (
    {value: key for key, value in table.items()}
    for table in (t2i, s2i, in2i)
)
# Per-example fields of the training split.
query, slots, intent = (
    train_ds.get(field)
    for field in ('query', 'slot_labels', 'intent_labels')
)

In [None]:
# NOTE: these "test" tables are read from the same `dicts` object as the
# training tables, so they hold the same mappings under different names.
t2i_test, s2i_test, in2i_test = (
    dicts.get(table_name)
    for table_name in ('token_ids', 'slot_ids', 'intent_ids')
)
# Inverted copies of the three tables above (value -> key).
i2t_test, i2s_test, i2in_test = (
    {value: key for key, value in table.items()}
    for table in (t2i_test, s2i_test, in2i_test)
)
# Per-example fields of the test split.
query_test, slots_test, intent_test = (
    test_ds.get(field)
    for field in ('query', 'slot_labels', 'intent_labels')
)

## REMOVE WORDS FROM 'OTHER' SLOT CATEGORY

## GENERATE X AND Y PAIRS (BOS/EOS MARKERS REMOVED)

In [None]:
# Pool the train and test splits into one corpus.
X = query + query_test
y = slots + slots_test

# Turn token ids back into token text and drop the "BOS"/"EOS" sentence
# markers, removing the slot label that sits at the same position.
x_text, y_all = [], []
for token_ids, slot_ids in zip(X, y):
    kept = [
        (i2t[token_id], slot_id)
        for token_id, slot_id in zip(token_ids, slot_ids)
        if i2t[token_id] not in ("EOS", "BOS")
    ]
    x_text.append([word for word, _ in kept])
    y_all.append([slot_id for _, slot_id in kept])
print(x_text[0])
print(y_all[0])


['i', 'want', 'to', 'fly', 'from', 'boston', 'at', '838', 'am', 'and', 'arrive', 'in', 'denver', 'at', '1110', 'in', 'the', 'morning']
[128, 128, 128, 128, 128, 48, 128, 35, 100, 128, 128, 128, 78, 128, 14, 128, 128, 12]


In [None]:
def new_vocab(X, y):
    """Build a character-to-index table from a tokenised corpus.

    All tokens of all sentences are joined with single spaces, so the space
    character is part of the vocabulary whenever the corpus has more than one
    token. Characters are numbered in sorted order starting at 1; index 0 is
    never assigned to a character.

    Args:
        X: Iterable of sentences, each an iterable of token strings.
        y: Labels belonging to ``X``. They are not inspected or converted.

    Returns:
        A ``(char2idx, y)`` tuple: the character -> index dict, and ``y``
        handed back unchanged (the same object, not a label mapping).
    """
    all_text = " ".join(" ".join(tokens) for tokens in X)
    # Ids start at 1 so that 0 never denotes a real character.
    char2idx = {ch: idx for idx, ch in enumerate(sorted(set(all_text)), start=1)}
    return char2idx, y

In [None]:
# c2i: character -> index table. NOTE: new_vocab returns its second argument
# unchanged, so `l2i` is the `y_all` label list itself, not a label -> id map.
c2i, l2i = new_vocab(x_text,y_all)

In [None]:
def split_char_labels(eg, char2idx=None, sep_label=129):
    '''
    For a given input/output example, break tokens into characters while keeping
    the same label.

    Every character of a token receives that token's label. Tokens are joined
    by a single space character, and each of those separator positions is
    labelled with ``sep_label``. The separator after the last token is dropped.

    Args:
        eg: Pair ``(tokens, labels)`` holding the token strings of one sentence
            and one label per token.
        char2idx: Character -> integer id table. Defaults to the module-level
            ``c2i`` table when omitted.
        sep_label: Label given to the space between two tokens. Defaults to
            129, the value that was previously hard-coded here.

    Returns:
        A two-element list ``[char_ids, char_labels]``: a list of character ids
        and a numpy array of labels, both with one entry per character.

    Raises:
        KeyError: If a character is missing from ``char2idx``.
    '''
    tokens, labels = eg[0], eg[1]
    if char2idx is None:
        # Fall back to the notebook-level table so existing one-argument calls
        # keep working.
        char2idx = c2i

    input_chars = []
    output_char_labels = []
    for token, label in zip(tokens, labels):
        input_chars.extend(token)
        input_chars.append(' ')
        output_char_labels.extend([label] * len(token))
        output_char_labels.append(sep_label)

    # [:-1] removes the separator (and its label) appended after the last token.
    char_ids = [char2idx[ch] for ch in input_chars[:-1]]
    return [char_ids, np.array(output_char_labels[:-1])]

In [None]:
# Character-level version of every (tokens, labels) sentence pair.
formatted = list(map(split_char_labels, zip(x_text, y_all)))

## HYPERPARAMETERS

In [None]:
# Training settings.
BATCH_SIZE = 256
BUFFER_SIZE = 200
EPOCHS = 10

# Layer widths.
EMBEDDING_DIM = 128
UNITS = 128
OUTPUT_DIM = 64

# Number of output classes: one per slot id plus one extra class.
LABEL = len(dicts['slot_ids'])+1
VOCABULARY = len(dicts['token_ids'])+1

# Sequence length the model is built for.
# NOTE(review): presumably the padded width produced by pad_sequences; confirm
# whenever the corpus changes.
INPUT_LENGTH = 259

# Size of the embedding table. The model consumes character ids taken from
# `c2i`, which start at 1, so the table needs len(c2i) + 1 rows. This used to
# be len(x_text) + 1, i.e. the number of sentences, which is unrelated to the
# id range and only worked because it happened to be larger.
INPUT_DIM = len(c2i) + 1

## DATASET PADDING AND TRAIN/TEST SPLIT

In [None]:
# Character-id sequences, padded to a common length. The default padding value
# 0 is safe here because character ids start at 1.
X = pad_sequences([char_ids for char_ids, _ in formatted])

# Label sequences, padded to the same length. Padding must not use the default
# value 0, because 0 is itself a valid slot id and every padded position would
# then be trained and scored as that slot. LABEL - 1 is the extra non-slot
# class, so padded positions no longer count towards any real slot.
y = pad_sequences([char_labels for _, char_labels in formatted], value=LABEL - 1)

# One-hot encode each padded label sequence over the LABEL classes.
y = [to_categorical(label_row, num_classes=LABEL) for label_row in y]

In [None]:
# Hold out 20% of the examples for evaluation; the fixed random_state makes
# the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## MODEL DEFINITION AND COMPILING

In [None]:
# Character-level tagger: embedding -> bidirectional LSTM -> LSTM -> one
# class distribution per time step.
model = Sequential()
model.add(Embedding(input_dim=INPUT_DIM, output_dim=EMBEDDING_DIM, input_length=INPUT_LENGTH))
model.add(Bidirectional(LSTM(units=UNITS, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode='concat'))
model.add(LSTM(units=UNITS, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))
# The model is trained with categorical cross-entropy, which expects each time
# step's output to be a probability distribution over the classes. That needs
# softmax; the previous relu produced unnormalised, possibly all-zero outputs.
model.add(TimeDistributed(Dense(LABEL, activation="softmax")))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 259, 128)          751616    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 259, 256)          263168    
_________________________________________________________________
lstm_3 (LSTM)                (None, 259, 128)          197120    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 259, 130)          16770     
Total params: 1,228,674
Trainable params: 1,228,674
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Multi-class objective over one-hot targets, Adam optimiser, accuracy metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

## MODEL FITTING

In [None]:
# Train on the training portion, keeping 20% of it aside for validation.
# The epoch count comes from the EPOCHS constant instead of a repeated literal,
# so the hyperparameter cell is the single place to change it.
hist = model.fit(
    X_train,
    np.array(y_train),
    batch_size=BATCH_SIZE,
    verbose=1,
    epochs=EPOCHS,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Evaluate on the held-out split; `score` holds the loss followed by the
# compiled metric (accuracy).
score = model.evaluate(X_test, np.array(y_test), verbose=1)
test_loss, test_accuracy = score[0], score[1]
print(f'Test loss: {test_loss} / Test accuracy: {test_accuracy}')

Test loss: 0.6625925898551941 / Test accuracy: 0.8596960306167603
