In [1]:
import pandas as pd
import numpy as np

from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import f1_score
from sklearn.model_selection import learning_curve,validation_curve
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


In [2]:
def load_data(file_name):
    """Read the semicolon-separated corpus and discard rows without an utterance.

    Parameters
    ----------
    file_name : path or buffer accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``answer_normalised`` value is not missing; the
        original row index is kept.
    """
    raw_frame = pd.read_csv(file_name, sep=';')
    # Row-wise dropping with how='any' is the dropna default; with a
    # single-column subset it removes exactly the rows where that column is NaN.
    return raw_frame.dropna(subset=['answer_normalised'])

## Let's take a quick look at our dataset

In [3]:
# Load the annotated corpus (rows without an utterance are dropped by load_data).
nlu_data_df = load_data('NLU-Data-Home-Domain-Annotated-All.csv')

# Intent labels: the distinct values and how many of them there are.
list_of_intents = nlu_data_df['intent'].unique()
number_of_intents = nlu_data_df['intent'].nunique()

# nunique() counts *distinct* utterance strings, not rows.
number_of_utterances = nlu_data_df['answer_normalised'].nunique()

print(f'From a total of {number_of_utterances} utterances, there are {number_of_intents} intents')
print(f'List of intents: {list_of_intents}')

From a total of 20685 utterances, there are 46 intents
List of intents: ['set' 'volume_mute' 'hue_lightchange' 'hue_lightoff' 'hue_lighton'
 'hue_lightdim' 'cleaning' 'query' 'music' 'quirky' 'greet' 'convert'
 'remove' 'likeness' 'hue_lightup' 'order' 'settings' 'volume_down' 'joke'
 'dislikeness' 'volume_other' 'coffee' 'volume_up' 'wemo_on' 'wemo_off'
 'stock' 'radio' 'post' 'locations' 'recipe' 'sendemail' 'factoid'
 'events' 'audiobook' 'podcasts' 'ticket' 'movies' 'game' 'traffic'
 'definition' 'querycontact' 'createoradd' 'addcontact' 'taxi' 'maths'
 'currency']


## For this example with Word2Vec (skip if you just want TFIDF), we will keep tokenization easy
(keep in mind, other languages might require more complex tokenization!)


In [None]:
def preprocess_lower(token):
    """Return ``token`` converted to lower case."""
    return token.lower()

def tokenize_utterances(dataframe):
    """Split every utterance into lower-cased, whitespace-separated tokens.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must expose an ``answer_normalised`` column of strings.

    Returns
    -------
    list[list[str]]
        One token list per row, in row order.
    """
    utterances = dataframe.answer_normalised.values
    # split() with no argument splits on runs of whitespace and ignores
    # leading/trailing whitespace, so doubled or trailing spaces no longer
    # produce empty-string tokens (split(' ') did).
    return [
        [preprocess_lower(token) for token in utterance.split()]
        for utterance in utterances
    ]

In [None]:
# Tokenize every utterance of the loaded corpus; the Word2Vec cells consume this list.
tokenized_utterances = tokenize_utterances(nlu_data_df)

## The target class labels (for the intents) require encoding to do machine learning stuff

In [4]:
# Shared encoder instance: fitted inside encode_labels and read by decode_labels.
le = preprocessing.LabelEncoder()
def encode_labels(target_class):
    """Fit the shared encoder ``le`` on ``target_class`` and return the integer codes.

    Every call refits ``le``, so the mapping always reflects the labels
    passed most recently.
    """
    labels = list(target_class)
    return le.fit_transform(labels)

In [5]:
def decode_labels(label_encoded_y):
    """Translate integer codes back into the original labels via the shared encoder ``le``."""
    decoded_labels = le.inverse_transform(label_encoded_y)
    return decoded_labels


In [None]:
# Round-trip sanity check: encode the intent column, then decode it again.
# Encoding here (instead of reading label_encoded_y, which is only assigned in
# a later cell) also fits `le`, so this cell works on a fresh top-to-bottom run.
decode_labels(encode_labels(nlu_data_df.intent.values))

In [6]:
# Encode the intent column as integers; the bare name on the last line
# makes the notebook display the resulting array.
intents = nlu_data_df.intent.values
label_encoded_y = encode_labels(intents)
label_encoded_y

array([34, 34, 34, ..., 27, 27, 27])

## We will try to use word2vec to create word vectors from the utterances for the classifiers
Open question: Is this the best word embedding system in terms of performance vs resource usage?

Reasons word2vec was chosen:
* implemented in several programming languages
* it is well known
* isn't too resource intensive (i.e. it could run in real time on a phone)

However, it might not perform the best: bag-of-words methods might work better, as word order isn't super important for utterances of a voice assistant (question for the class: why?)

# Skip all this if you just want the best results, go to TFIDF


In [None]:
def create_word2vec_model(tokenized_utterances):
    """Train and return a Word2Vec model on the given token lists.

    The settings are fixed: 128-dimensional vectors, a context window of 2,
    every token kept (``min_count=1``) and 4 worker threads.
    """
    return Word2Vec(
        tokenized_utterances,
        vector_size=128,
        window=2,
        min_count=1,
        workers=4,
    )

In [None]:
# Train the word vectors on the tokenized utterances.
word2vec_model = create_word2vec_model(tokenized_utterances)

In [None]:
def convert_utterances_to_vectors(model, tokenized_utterances):
    """Represent each utterance as the mean of its tokens' word vectors.

    Parameters
    ----------
    model : object exposing ``wv`` (indexable by token, with ``key_to_index``
        and ``vector_size``), e.g. the model built by create_word2vec_model.
    tokenized_utterances : iterable of token lists.

    Returns
    -------
    list[list[float]]
        One averaged vector per utterance. Tokens missing from the vocabulary
        are skipped; an utterance with no known token gets an all-zero vector.
    """
    word_vectors = model.wv
    vocabulary = word_vectors.key_to_index
    utterances_vectors = []
    for utterance in tokenized_utterances:
        token_vectors = [word_vectors[token] for token in utterance if token in vocabulary]
        if token_vectors:
            utterance_vector = np.mean(token_vectors, axis=0)
        else:
            # The mean of an empty list is a scalar nan, which cannot be
            # turned into a list; use a zero vector of the right width instead.
            utterance_vector = np.zeros(word_vectors.vector_size)
        utterances_vectors.append(list(utterance_vector))
    return utterances_vectors

In [None]:
# Averaged Word2Vec features; note the TFIDF cell assigns to this same name.
utterances_vectors = convert_utterances_to_vectors(word2vec_model, tokenized_utterances)

## TFIDF
Question for class: Why does it score better?
# Skip this if you are checking out Word2Vec!

In [7]:
# Fit TF-IDF on the raw utterance strings. This rebinds utterances_vectors,
# replacing the Word2Vec features if those cells were run earlier.
vectorizer = TfidfVectorizer()
utterances_vectors = vectorizer.fit_transform(nlu_data_df.answer_normalised.values)

# TODO: return_train_test_split requires len for word2vec and shape for TFIDF
# TODO: NB requires todense for TFIDF
# TODO: Add normal bag of words and compare all of them for all models(?)

## 'What is my purpose?'
## 'You split the data.'
*sad robot noises*

In [8]:
def _count_samples(vectors):
    """Return the number of rows: ``shape[0]`` when available, else ``len``."""
    shape = getattr(vectors, 'shape', None)
    if shape is not None:
        return shape[0]
    return len(vectors)


def return_train_test_split(utterances_vectors, label_encoded_y, random_state=None):
    """Split features and labels 80/20 and report the size of each part.

    Parameters
    ----------
    utterances_vectors : feature container; either an object with ``shape``
        (the TFIDF matrix) or a plain sequence (the Word2Vec vector list).
    label_encoded_y : labels aligned with ``utterances_vectors``.
    random_state : optional seed forwarded to ``train_test_split``. The
        default ``None`` keeps the previous unseeded behaviour; pass an int
        to make the split reproducible.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        utterances_vectors,
        label_encoded_y,
        train_size=0.8,
        test_size=0.2,
        random_state=random_state,
    )
    # Explicit shape/len check instead of a bare except, so unrelated errors
    # are no longer silently swallowed.
    print(f"Training set has {_count_samples(x_train)} samples.")
    print(f"Testing set has {_count_samples(x_test)} samples.")
    return x_train, x_test, y_train, y_test

In [9]:
# Split the current features and the encoded intent labels into train and test parts.
x_train, x_test, y_train, y_test = return_train_test_split(utterances_vectors, label_encoded_y)

Training set has 16588 samples.
Testing set has 4147 samples.


## Ohhhh, machine learning!

The classifiers are chosen because:
* Most of these algorithms exist in other languages
* They are pretty light (i.e. they can run on a phone, not just for inference but for TRAINING custom models!)
* Word order doesn't matter (bag of words style over here)

In [10]:
def train_classifier(classifier, x_train, y_train):
    """Fit ``classifier`` on the training data and return whatever ``fit`` returns."""
    fitted_model = classifier.fit(x_train, y_train)
    return fitted_model

def test_classifier(classifier_model, x_test, y_test):
    """Predict on ``x_test`` and print the micro-averaged F1 score against ``y_test``.

    Nothing is returned; the score is only printed.
    """
    predicted_labels = classifier_model.predict(x_test)
    print(f1_score(y_test, predicted_labels, average='micro'))

In [11]:
# Classifiers: I have no idea if these settings are good or not,
# might want to do some grid search based tuning
# NOTE(review): DT and RF are given a random_state; ADA is not.
NB = GaussianNB()
DT = DecisionTreeClassifier(random_state=42)
ADA = AdaBoostClassifier(n_estimators=100)
KN = KNeighborsClassifier(n_neighbors=100)
RF = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=2, random_state=0)
SVM = svm.SVC(gamma='scale')

# Order matters: later cells unpack the trained models in this same order.
classifiers = [NB, DT, ADA, KN, RF, SVM]


In [None]:
# If TFIDF: NB needs dense input, so convert the sparse matrices first.
# toarray() returns a plain ndarray; todense() returns np.matrix, which
# recent scikit-learn releases refuse as estimator input.
x_train_dense = x_train.toarray()
NB_model = train_classifier(NB, x_train_dense, y_train)
x_test_dense = x_test.toarray()
test_classifier(NB_model, x_test_dense, y_test)

In [12]:
# Train the random forest on the current split and print its micro-averaged F1.
RF_model = train_classifier(RF, x_train, y_train)
test_classifier(RF_model, x_test, y_test)

0.7933445864480348


In [None]:
# Train the SVM on the current split and print its micro-averaged F1.
SVM_model = train_classifier(SVM, x_train, y_train)
test_classifier(SVM_model, x_test, y_test)

In [None]:
# Train the k-nearest-neighbours model on the current split and print its micro-averaged F1.
KN_model = train_classifier(KN, x_train, y_train)
test_classifier(KN_model, x_test, y_test)

In [None]:
def train_all_classifiers(classifiers, x_train, y_train):
    """Fit every classifier in ``classifiers`` and return the fitted models in order.

    Parameters
    ----------
    classifiers : iterable of unfitted estimators.
    x_train : training features; a sparse TFIDF matrix or a plain sequence
        of Word2Vec vectors.
    y_train : training labels.

    Returns
    -------
    list
        One fitted model per input classifier, in the same order.

    Notes
    -----
    GaussianNB needs dense input, so it alone receives a dense copy of sparse
    features. ``x_train`` itself is never rebound, so every other estimator
    keeps training on the original (sparse) data rather than on a huge dense
    matrix.
    """
    classifier_models = []
    dense_x_train = None  # built lazily, at most once
    for classifier in classifiers:
        print(f'Training with {classifier}')
        needs_dense = isinstance(classifier, GaussianNB) and hasattr(x_train, 'toarray')
        if needs_dense:
            if dense_x_train is None:
                # toarray() gives an ndarray (todense() would give np.matrix).
                dense_x_train = x_train.toarray()
            features = dense_x_train
        else:
            features = x_train
        # Train *every* classifier, not only NB.
        classifier_models.append(train_classifier(classifier, features, y_train))
    return classifier_models


In [None]:
# Unpacks one fitted model per entry of `classifiers`, in the same order.
NB_model, DT_model, ADA_model, KN_model, RF_model, SVM_model = train_all_classifiers(classifiers, x_train, y_train)
# BUG: this takes far too long (about 60 minutes), although the SVM is quick when trained on its own.
# NOTE(review): check which feature matrix each estimator actually receives inside train_all_classifiers.

In [None]:
# Collect the fitted models in the same order as `classifiers`.
classifier_models = [NB_model, DT_model, ADA_model, KN_model, RF_model, SVM_model]

In [None]:
def test_all_classifiers(classifier_models, x_test, y_test):
    """Print each fitted model followed by its micro-averaged F1 on the test data.

    Parameters
    ----------
    classifier_models : iterable of fitted estimators.
    x_test : test features; a sparse TFIDF matrix or a plain sequence of
        Word2Vec vectors.
    y_test : test labels.

    Notes
    -----
    Only GaussianNB is handed a dense copy of sparse features. ``x_test`` is
    never rebound, so the remaining models still predict on the original
    data, and non-sparse features (plain lists) are passed through untouched.
    """
    dense_x_test = None  # built lazily, at most once
    for classifier_model in classifier_models:
        print(classifier_model)
        needs_dense = isinstance(classifier_model, GaussianNB) and hasattr(x_test, 'toarray')
        if needs_dense:
            if dense_x_test is None:
                # toarray() gives an ndarray (todense() would give np.matrix).
                dense_x_test = x_test.toarray()
            features = dense_x_test
        else:
            features = x_test
        test_classifier(classifier_model, features, y_test)

In [None]:
# Print every fitted model followed by its micro-averaged F1 on the test split.
test_all_classifiers(classifier_models, x_test, y_test)

In [None]:
# TODO:
# test all models (both word2vec and tfidf)
# BUG: when running train_all and test_all for classifiers it slows down massively for SVM, whereas if I run it separately it is normal: WTF?!
# select best
# discuss why all models seemingly perform poorly (how it can be improved)
# CRFs for entity tagging (type the entities?)


# Winner is RF with TFIDF!

In [13]:
def predict_label(classifier_model, utterance):
    """Return the decoded intent label that ``classifier_model`` predicts for one utterance.

    The utterance is lower-cased, vectorized with the fitted notebook-level
    ``vectorizer``, classified, and the predicted code is decoded to its label.
    """
    lowered_utterance = utterance.lower()
    features = vectorizer.transform([lowered_utterance])
    encoded_prediction = classifier_model.predict(features)
    decoded_prediction = decode_labels(encoded_prediction)
    return decoded_prediction[0]

In [None]:
# Try it out yourself with an utterance
utterance = 'Turn the livingroom lights off'
label = predict_label(RF_model, utterance)
# Bare name on the last line so the notebook displays the predicted intent.
label

# What does it get wrong and why?

In [41]:
def get_incorrectly_classified_utterances(classifier_model, x_test, y_test):
    """Print every test utterance whose predicted intent differs from the true intent.

    Parameters
    ----------
    classifier_model : fitted estimator exposing ``predict``.
    x_test : TFIDF features produced by the notebook-level ``vectorizer``.
    y_test : encoded true labels aligned with ``x_test``.

    Notes
    -----
    The utterance shown is rebuilt with ``vectorizer.inverse_transform``, so it
    lists the terms present in the row rather than the original sentence.
    """
    predicted_intents = decode_labels(classifier_model.predict(x_test))
    expected_intents = decode_labels(y_test)
    utterances = vectorizer.inverse_transform(x_test)
    for utterance, prediction, intent in zip(utterances, predicted_intents, expected_intents):
        # Compare for equality. A substring test would hide errors such as
        # 'query' predicted for 'querycontact' or 'set' for 'settings'.
        if prediction != intent:
            print(f'{utterance} has been classified as {prediction}, but it should be {intent}')

In [42]:
# When reading these, keep in mind that for now the utterance is recovered from
# the transformed (vectorized) features, so the word order is mixed up!
get_incorrectly_classified_utterances(RF_model, x_test, y_test)

['about' 'geography' 'london' 'of' 'say'] has been classified as quirky, but it should be factoid
['afternoon' 'from' 'hundred' 'main' 'me' 'meeting' 'my' 'of' 'please'
 'points' 'tell' 'the' 'this' 'two'] has been classified as set, but it should be query
['good' 'movie' 'my' 'on' 'play' 'television'] has been classified as music, but it should be radio
['find' 'girlfriend' 'me' 'perfect' 'the'] has been classified as query, but it should be factoid
['lamp' 'look' 'up'] has been classified as factoid, but it should be definition
['hey' 'today' 'what'] has been classified as query, but it should be greet
['asking' 'for' 'technology' 'this'] has been classified as query, but it should be cleaning
['can' 'itunes' 'my' 'open' 'you'] has been classified as query, but it should be music
['melody'] has been classified as music, but it should be likeness
['around' 'can' 'do' 'dollars' 'for' 'fun' 'me' 'under' 'what'] has been classified as quirky, but it should be events
['facebook' 'messenge

# Discussion
* It is clear to see that some of the utterances suck and some of the intents are overlapping
* I think on a real data set you could see at least 10% improved performance
* In addition
    * if one were to use 'input typing' (entity typing) with the entity tagger
    * then if the types of entities the entity tagger returns are not correct types, this could be used to improve classification
    * Also if required entities are missing, a reply could be given saying a certain entity is required (ie utterance: 'set a timer', response: 'how long of a timer' because expected required entity is duration)
* There are a lot of rules that can be added between the two classifiers (intent and entity tagging) that could boost the model, in addition to fine-tuning the model itself.

It is my opinion that a random forest with TFIDF and a CRF entity tagger would work fine for NLU tasks, even on under-powered devices, including TRAINING! Here's hoping someone comes along and bangs out a production level NLU engine in a super fast language! ;)

https://www.kaggle.com/shoumikgoswami/ner-using-random-forest-and-crf

https://towardsdatascience.com/named-entity-recognition-and-classification-with-scikit-learn-f05372f07ba2

For CRFs for the entity tagging

https://nanonets.com/blog/named-entity-recognition-with-nltk-and-spacy/

What about this?
https://www.mindmeld.com/docs/userguide/intent_classifier.html


DATA: 
https://github.com/xliuhw/NLU-Evaluation-Data/blob/master/AnnotatedData/NLU-Data-Home-Domain-Annotated-All.csv


NLU_data_df = load_data('NLU-Data-Home-Domain-Annotated-All.csv')  # one load is enough; the identical statement was repeated