In [1]:
import pandas as pd
import numpy as np
import json, re

# Torch, Sklearn imports
from sklearn.model_selection import train_test_split
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
print(torch.__version__)

## Embeddings
import allennlp
from allennlp.modules.elmo import Elmo, batch_to_ids
from gensim.models import KeyedVectors

## NLP libs
from nltk import download
import gensim
from nltk.corpus import stopwords
download('stopwords')

0.4.1
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rsilvei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Importing SNIPS intent dataset

In [2]:
dataset = pd.DataFrame(columns = ['phrase', 'intent'])

In [3]:
for intent in ['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic', 'RateBook', 'SearchCreativeWork',
               'SearchScreeningEvent']:
    with open("./2017-06-custom-intent-engines/" + intent + "/train_" + intent + ".json",
              encoding='cp1251') as data_file:
        data = json.load(data_file)
    print("Intent: {}, Length: {}".format(intent,len(data[intent])))
    texts = []
    for i in range(len(data[intent])):
        text = ''
        for j in range(len(data[intent][i]['data'])):
            text += data[intent][i]['data'][j]['text']
        dataset = dataset.append({'phrase': text, 'intent': intent}, ignore_index=True)

Intent: AddToPlaylist, Length: 300
Intent: BookRestaurant, Length: 300
Intent: GetWeather, Length: 300
Intent: PlayMusic, Length: 300
Intent: RateBook, Length: 300
Intent: SearchCreativeWork, Length: 300
Intent: SearchScreeningEvent, Length: 300


In [4]:
dataset.intent.unique()

array(['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic',
       'RateBook', 'SearchCreativeWork', 'SearchScreeningEvent'],
      dtype=object)

In [5]:
def transformText(text, do_stop=False, do_stem=False):
    stops = set(stopwords.words("english"))
    # Convert text to lower
    text = text.lower()
    
    # Cleaning input
    text = text.replace("'s","")
    text = text.replace("’s","")
    text = text.replace("?","")
    text = text.replace("-","")
    
    # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    # Removing all the stopwords
    if (do_stop==True):
        filtered_words = [word for word in text.split() if word not in stops]
    else:
        filtered_words = [word for word in text.split()]
    # Preprocessed text after stop words removal
    text = " ".join(filtered_words)
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    if (do_stem==True):
        # Stemming
        text = gensim.parsing.preprocessing.stem_text(text)
    return text

def strip_punctuation(s):
    return ''.join(c for c in s if c not in PUNCT)

## Lemmatization function based on Spacy Library
def lemmatizer_spacy(text):        
    sent = []
    doc = spacy_en(text)
    for word in doc:
        if word.lemma_ == "-PRON-":
            sent.append(word.text)
        else:
            sent.append(word.lemma_)
    return " ".join(sent)

def strip_punctuation(s):
    return ''.join(c for c in s if c not in punctuation)

In [6]:
dataset['preproc_text'] = dataset['phrase'].apply(lambda x: transformText(x))

In [7]:
dataset.tail(10)

Unnamed: 0,phrase,intent,preproc_text
2090,I want to see a list of the closest cinema's m...,SearchScreeningEvent,i want to see a list of the closest cinema movies
2091,What Are the showings for The Natural History ...,SearchScreeningEvent,what are the showings for the natural history ...
2092,Give me the schedule for Public Stenographer a...,SearchScreeningEvent,give me the schedule for public stenographer a...
2093,Is it possible to see Tube at the closest movi...,SearchScreeningEvent,is it possible to see tube at the closest movi...
2094,I want to see Wenn Lucy springt now at a movie...,SearchScreeningEvent,i want to see wenn lucy springt now at a movie...
2095,Is Across the Line playing at the closest movi...,SearchScreeningEvent,is across the line playing at the closest movi...
2096,Which animated movies are playing in the neigh...,SearchScreeningEvent,which animated movies are playing in the neigh...
2097,Where is They Always Return at Dawn playing,SearchScreeningEvent,where is they always return at dawn playing
2098,What is the movie schedule in the neighborhood,SearchScreeningEvent,what is the movie schedule in the neighborhood
2099,Tell me when Howling II: Your Sister Is a Were...,SearchScreeningEvent,tell me when howling ii your sister is a werew...


In [8]:
!ls ../../../vectors/

GoogleNews-vectors-negative300.bin
crawl-300d-2M.vec
elmo_2x1024_128_2048cnn_1xhighway_options.json
elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5
elmo_2x2048_256_2048cnn_1xhighway_options.json
elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5
elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json
elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5
elmo_2x4096_512_2048cnn_2xhighway_options.json
elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5
glove.42B.300d.txt
glove.840B.300d.txt
lid.176.ftz
wiki-news-300d-1M-subword.txt
wiki-news-300d-1M.txt


## Build Vocabulary

In [9]:
## Build word vocabulary
word_to_ix = {}
for sent in dataset.preproc_text:
    for word in sent.split():
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
print("Tamanho do dicionario: {}".format(len(word_to_ix)))

Tamanho do dicionario: 3440


In [10]:
## Build label vocabulary
label_to_ix = {}
for label in dataset.intent:
    for word in label.split():
        if word not in label_to_ix:
            label_to_ix[word]=len(label_to_ix)

In [11]:
label_to_ix

{'AddToPlaylist': 0,
 'BookRestaurant': 1,
 'GetWeather': 2,
 'PlayMusic': 3,
 'RateBook': 4,
 'SearchCreativeWork': 5,
 'SearchScreeningEvent': 6}

## Preparing PyTorch Dataset Loader

In [12]:
class Intents(Dataset):
    def __init__(self, dataframe, w2v_weights_path):
        self.len = len(dataframe)
        self.label_to_ix = {}
        self.data = dataframe
        self.w2v = w2v = KeyedVectors.load_word2vec_format(w2v_weights_path, binary = True)
        
    def __getitem__(self, index):
        phrase = self.data.preproc_text[index]
        X, _  = self.get_avg_sentence_vector(phrase)
        y = label_to_ix[self.data.intent[index]]
        #X.requires_grad = False
        #y.requires_grad = False
        #blin = X.detach()
        return X, y
    
    def __len__(self):
        return self.len

    def get_avg_sentence_vector(self, sentence):
        featureVec = np.zeros((self.w2v.vector_size), dtype="float32")
        nwords = 0
        not_found_words = []
        for word in sentence.split():
            if word in self.w2v.index2word:
                nwords = nwords+1
                featureVec = np.add(featureVec, self.w2v.get_vector(word))
            else:
                not_found_words.append(word)
        if nwords>0:
            featureVec = np.divide(featureVec, nwords)
        return featureVec, not_found_words

In [13]:
# Set data locations for embeddings
elmo_config_key_path = '../../../vectors/elmo_2x4096_512_2048cnn_2xhighway_options.json'
elmo_weights_key_path = '../../../vectors/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'
w2v_weights_path = '../../../vectors/GoogleNews-vectors-negative300.bin'

In [14]:
train_size = 0.8
train_dataset=dataset.sample(frac=train_size,random_state=200).reset_index(drop=True)
test_dataset=dataset.drop(train_dataset.index).reset_index(drop=True)

In [15]:
print("FULL Dataset: {}".format(dataset.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

FULL Dataset: (2100, 3)
TRAIN Dataset: (1680, 3)
TEST Dataset: (420, 3)


In [16]:
training_set = Intents(train_dataset,  w2v_weights_path)
testing_set = Intents(test_dataset, w2v_weights_path)

In [None]:
testing_set.__getitem__(3)

## Simple MLP Classifier

In [17]:
class SimpleMLP(nn.Module):
    def __init__(self, inputdim, 
                        nclasses, 
                        nhidden, 
                        dropout = 0,
                        cudaEfficient=True):
        super(SimpleMLP, self).__init__()
        """
        PARAMETERS:
        -dropout:    dropout for MLP
        """
        
        self.inputdim = inputdim
        self.hidden_dim = nhidden
        self.dropout = dropout
        self.nclasses = nclasses
        
        if cudaEfficient:
            self.model = nn.Sequential(
                nn.Linear(self.inputdim, nhidden),
                nn.Dropout(p=self.dropout),
                nn.ReLU(),
                nn.Linear(nhidden, self.nclasses),
                ).cuda()
        else:
            self.model = nn.Sequential(
                nn.Linear(self.inputdim, nhidden),
                nn.Dropout(p=self.dropout),
                nn.ReLU(),
                nn.Linear(nhidden, self.nclasses),
                )
    def forward(self, x):
        log_probs = self.model(x)
        return log_probs

In [18]:
INP_DIM = training_set.w2v.vector_size
NUM_LABELS = len(label_to_ix)
NHIDDEN = 512
DROPOUT = 0.3
model = SimpleMLP(inputdim = INP_DIM ,
          nhidden = NHIDDEN,
          nclasses = NUM_LABELS,
          dropout = DROPOUT, 
          cudaEfficient = False)

In [19]:
NUM_LABELS

7

## Training

In [20]:
# Parameters
params = {'batch_size': 64,
          'shuffle': True,
          'num_workers': 1}

In [21]:
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)

In [22]:
training_loader.batch_size

64

In [23]:
loss_function = nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)

In [24]:
max_epochs = 20
for epoch in range(max_epochs):
    print("EPOCH -- {}".format(epoch))
    for i, (sent, label) in enumerate(training_loader):
        ## Step 1 - Clear gradients w.r.t. parameters
        optimizer.zero_grad()
        
        sent = Variable(sent)
        label = Variable(label)
        
        ## Step 2 - Run forward pass
        output = model.forward(sent)
        
        # Get predictions from the maximum value
        _, predicted = torch.max(output.data, 1)
        
        ## Step 3 - Compute loss
        loss = loss_function(output, label)
        loss.backward()
        
        ## Step 4 = Update parameters
        optimizer.step()
        
        if i%100 == 0:
            
            # Calculate Accuracy         
            correct = 0
            total = 0
            
            for sent, label in testing_loader:
                sent = Variable(sent)
                label = Variable(label)
                
                # Forward pass only to get logits/output
                output = model.forward(sent)
                
                # Get predictions from the maximum value
                _, predicted = torch.max(output.data, 1)
                
                # Total number of labels
                total += label.size(0)

                # Total correct predictions
                correct += (predicted.cpu() == label.cpu()).sum()
            accuracy = 100.00 * correct.numpy() / total
            # Print Loss
            print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.data[0], accuracy))

EPOCH -- 0




Iteration: 0. Loss: 1.9529027938842773. Accuracy: 20.714285714285715%
EPOCH -- 1
Iteration: 0. Loss: 1.3044142723083496. Accuracy: 77.61904761904762%
EPOCH -- 2
Iteration: 0. Loss: 0.576018750667572. Accuracy: 87.38095238095238%
EPOCH -- 3
Iteration: 0. Loss: 0.25380849838256836. Accuracy: 91.9047619047619%
EPOCH -- 4
Iteration: 0. Loss: 0.13997235894203186. Accuracy: 91.66666666666667%
EPOCH -- 5
Iteration: 0. Loss: 0.0978354662656784. Accuracy: 96.19047619047619%
EPOCH -- 6
Iteration: 0. Loss: 0.09859947860240936. Accuracy: 96.42857142857143%
EPOCH -- 7
Iteration: 0. Loss: 0.055040061473846436. Accuracy: 97.38095238095238%
EPOCH -- 8
Iteration: 0. Loss: 0.11213701218366623. Accuracy: 98.33333333333333%
EPOCH -- 9
Iteration: 0. Loss: 0.09071548283100128. Accuracy: 98.0952380952381%
EPOCH -- 10
Iteration: 0. Loss: 0.041426051408052444. Accuracy: 98.57142857142857%
EPOCH -- 11
Iteration: 0. Loss: 0.07044611126184464. Accuracy: 99.04761904761905%
EPOCH -- 12
Iteration: 0. Loss: 0.0412219

In [31]:
input_phrase = "i need to book a restaurant today"

In [32]:
def get_reply(phrase):
    inp, _ = training_set.get_avg_sentence_vector(phrase)
    inp = Variable(torch.Tensor(inp))
    output = model.forward(inp)

    # Get predictions from the maximum value
    _, predicted = torch.max(output.data, 0)
    pred_label=list(label_to_ix.keys())[list(label_to_ix.values()).index(predicted.item())]
    return pred_label

In [33]:
get_reply(input_phrase)

'BookRestaurant'

In [34]:
label_to_ix

{'AddToPlaylist': 0,
 'BookRestaurant': 1,
 'GetWeather': 2,
 'PlayMusic': 3,
 'RateBook': 4,
 'SearchCreativeWork': 5,
 'SearchScreeningEvent': 6}