In [None]:
import pandas as pd
import numpy as np
import re
import time

from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LinearRegression

from sklearn.metrics import f1_score
from sklearn.model_selection import learning_curve,validation_curve
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_classification_report

import nltk


# Aim of this notebook
1. I will probably use this when I have job interviews (as the interviewer) to discuss NLU engines and possibly as a reference if I am ever being interviewed for a position and NLU comes up. It can be hard for me to discuss this technical stuff very specifically that I have done working at companies due to NDAs. And honestly I can't always remember how to do all of this in detail. LOL
2. Generally, I hope it helps developers understand how to build and implement NLU engines. I noticed this deep understanding was missing in the FOSS voice assistant community. I also looked around for a notebook or git repo for basics in NLU engines and couldn't find one (do you know of something? Please feel free to share!). Perhaps some of the methods in here can be used to produce an NLU engine that is both powerful and light enough to run inference and even be trained on low powered devices. Maybe some of my friends who are keen on Java, C++, Rust, etc. want to build a much faster engine.

# NLU intent classification and entity extraction
Natural language understanding (NLU) in voice assistants focuses on two problems:
* intent classification
    Where should the utterance (command, question, etc.) go?
    ie the utterance 'turn off the living room lights' should be classified to 'turn off'
* entity extraction (also known as named entity recognition, NER)
    What are the important inputs (entities) that should be passed along
    ie the utterance 'turn off the living room lights', the important entity is the place: 'living room'. 

## That's great, but don't voice assistants already have this ability? 
Yes, mainstream voice assistants (Siri, Alexa, Bixby, etc.) do have this ability. However, the training and inference of these general models are controlled by those companies, so users can't easily add or change much, and some people have privacy concerns about running voice assistants since they don't run locally. 

## TinyML philosophy
The goal of tinyML is to train and run inference of models locally by users. If a user can customize their models, then the system can 'learn' and improve based on users' preferences, instead of a 'one-size-fits-all' way of doing machine learning. 

## Yes, but aren't there already open source voice assistants like Mycroft and Snips/Rhasspy?
Totally, but do you know how they work?

Mycroft has two NLU engines:
* Adapt
* Padatious

Adapt focuses on keyword matching, RegEx patterns, and hard coding to perform these actions. Padatious uses a library called FANN (fast artificial neural network) to classify intent based on all of the words in the utterance and uses the FANN for entity edge detection. However, I haven't found that these systems perform so well (I will run a comparative analysis to compare results in the future). They are low powered, so that users could run them on many devices, but they aren't very powerful. 

Rhasspy/Snips uses two intent parsers in tandem:
* deterministic (rule based)
* probabilistic

The probabilistic parser is only applied when the rule-based one fails to produce a result. The rule-based system uses RegEx, requiring the developer to write out these rules. The probabilistic system uses logistic regression for intent classification and conditional random fields (CRFs) for entity extraction. 

Wouldn't it be great to learn how to completely automate these tasks and do it with techniques light enough to run on phones or whatever? I think so. So let's do this!

# Methods
* We are going to use this data set: https://github.com/xliuhw/NLU-Evaluation-Data/blob/master/AnnotatedData/NLU-Data-Home-Domain-Annotated-All.csv
* Detour into Word2Vec method of classifying intent (spoiler alert: it doesn't work so well)
* TFIDF encoding (this works pretty well)
* Intent classification: A lot of classifiers to try
    * (Gaussian) Naive Bayes Classifier
    * Decision Tree Classifier
    * AdaBoost Classifier
    * K-Nearest Neighbors Classifier
    * Random Forest Classifier
    * Support Vector Machine Classifier
    * maybe more in the future (XGBoost?)
* Entity extraction: conditional random fields

And finally, we bring it all together to make our NLU engine. 


# FAQ
* Why didn't you use spaCy, BERT (or whatever)?
   * I wanted to choose simple stuff that could be easily found in other languages and is low powered for inference and training



# TODO: refactoring
* record training times and inference times for each model
* generate report for each, save as df
* concat all reports into df
* return the highest scoring classifier vs training (or inference) times
* do the same for word2vec
* XGBoost?
* clean up code
* CRF feature stemmer?
* domain classifier? (compare domain to intent classifier?)
* make proper classes out of this to form a python NLU engine?


In [None]:
def load_data(file_name):
    """Read the semicolon-separated NLU dataset and drop rows without an utterance.

    Args:
        file_name: Path of the CSV file to read (fields separated by ';').

    Returns:
        A DataFrame containing only the rows whose 'answer_normalised'
        value is present.
    """
    raw_df = pd.read_csv(file_name, sep=';')
    # Rows with a missing normalised utterance cannot be used downstream.
    usable_df = raw_df.dropna(subset=['answer_normalised'])
    return usable_df

# Intent classification

## Let's take a quick look at our dataset

In [None]:
# Load the annotated dataset and print a short overview of it.
nlu_data_df = load_data('NLU-Data-Home-Domain-Annotated-All.csv')
number_of_intents = nlu_data_df['intent'].nunique()
list_of_intents = nlu_data_df['intent'].unique()
# nunique() counts distinct utterances, so repeated utterances are counted once.
number_of_utterances = nlu_data_df['answer_normalised'].nunique()
print(f'From a total of {number_of_utterances} utterances, there are {number_of_intents} intents')
print(f'List of intents: {list_of_intents}')


## Word2Vec (skip this and the next cell if you just want TFIDF which performs better), we will keep tokenization easy 
(keep in mind, other languages might require more complex tokenization!)


In [None]:
def preprocess_lower(token):
    """Return the lower-cased form of a single token."""
    lowered = token.lower()
    return lowered

def tokenize_utterances(dataframe):
    """Split every utterance of the 'answer_normalised' column on single spaces.

    Each resulting token is lower-cased with ``preprocess_lower``.

    Returns:
        A list with one list of tokens per utterance.
    """
    tokenized = []
    for utterance in dataframe.answer_normalised.values:
        tokenized.append([preprocess_lower(token) for token in utterance.split(' ')])
    return tokenized

In [None]:
# Lower-cased token lists for every utterance in the dataset.
tokenized_utterances = tokenize_utterances(nlu_data_df)

## The target class labels (for the intents) require encoding to do machine learning stuff

In [None]:
# Shared encoder: encode_labels fits it and decode_labels reuses it.
le = preprocessing.LabelEncoder()
def encode_labels(target_class):
    """Fit the shared label encoder on ``target_class`` and return the encoded labels.

    Note that every call re-fits ``le``.
    """
    return le.fit_transform(list(target_class))

In [None]:
def decode_labels(label_encoded_y):
    """Map encoded labels back to the original labels using the shared ``le`` encoder."""
    original_labels = le.inverse_transform(label_encoded_y)
    return original_labels

If you want to predict using domains (skills), change intents to domains and use nlu_data_df.scenario.values

In [None]:
# Encode the intent column as the target used by the classifiers.
intents = nlu_data_df.intent.values
label_encoded_y = encode_labels(intents)
# Display the encoded targets.
label_encoded_y

In [None]:
# Round-trip check: decoding the encoded targets gives back the intent names.
decode_labels(label_encoded_y)

## word2vec to create word vectors from the utterances for the classifiers
Open question: Is this the best word embedding system in terms of performance vs resource usage?

Reasons word2vec was chosen:
* implemented in several programming languages
* it is well known
* isn't too resource intensive (i.e. it could run in real time on a phone)

However, it might not perform the best, bag of words methods might work better, as word order isn't super important for utterances of a voice assistant (question for the class: why?)

Skip the next 4 cells if you just want the best results, go to TFIDF


In [None]:
def create_word2vec_model(tokenized_utterances):
    """Train a Word2Vec model on the tokenized utterances.

    Uses 128-dimensional vectors, a context window of 2, keeps every token
    (min_count=1) and trains with 4 workers.
    """
    return Word2Vec(
        tokenized_utterances,
        vector_size=128,
        window=2,
        min_count=1,
        workers=4,
    )

In [None]:
# Train the Word2Vec model on the tokenized utterances.
word2vec_model = create_word2vec_model(tokenized_utterances)

In [None]:
def convert_utterances_to_vectors(model, tokenized_utterances):
    """Represent every utterance by the average of its tokens' word vectors.

    Args:
        model: Trained model exposing ``model.wv`` with item lookup,
            ``key_to_index`` and ``vector_size``.
        tokenized_utterances: Iterable of token lists.

    Returns:
        A list with one vector (list of floats) per utterance. Tokens missing
        from the vocabulary are ignored. An utterance without any known token
        is mapped to a zero vector instead of failing on the mean of an
        empty list.
    """
    keyed_vectors = model.wv
    vocabulary = keyed_vectors.key_to_index
    utterances_vectors = []
    for utterance in tokenized_utterances:
        token_vectors = [keyed_vectors[token] for token in utterance if token in vocabulary]
        if token_vectors:
            utterances_vectors.append(list(np.mean(token_vectors, axis=0)))
        else:
            # Nothing to average: keep the row so the output stays aligned with the labels.
            utterances_vectors.append([0.0] * keyed_vectors.vector_size)
    return utterances_vectors

In [None]:
# One averaged Word2Vec vector per utterance.
word2vec_utterances_vectors = convert_utterances_to_vectors(word2vec_model, tokenized_utterances)

## TFIDF
Question for class: Why does it score better?

Skip this if you are checking out Word2Vec!

In [None]:
# Fit the TF-IDF vectorizer on the raw utterances; predict_label reuses `vectorizer` at inference time.
vectorizer = TfidfVectorizer()
tfidf_utterances_vectors = vectorizer.fit_transform(nlu_data_df.answer_normalised.values)

# TODO: return_train_test_split requires len for word2vec and shape for TFIDF
# TODO: NB requires todense for TFIDF
# TODO: Add normal bag of words and compare all of them for all models(?)

## (optional) In case you want to do a train_test_split (for proper evaluation, we are using cross validation otherwise)
'What is my purpose?'

'You split the data.'

*sad robot noises*

In [None]:
def _count_samples(samples):
    """Return the number of rows of an array-like (``shape``) or a plain sequence (``len``)."""
    try:
        return samples.shape[0]
    except AttributeError:
        # Plain lists (the Word2Vec vectors) have no ``shape``.
        return len(samples)

def return_train_test_split(utterances_vectors, label_encoded_y):
    """Split features and labels into an 80% training and 20% test set.

    Args:
        utterances_vectors: Feature vectors, either a list (Word2Vec) or an
            object exposing ``shape`` (TFIDF).
        label_encoded_y: Encoded target labels aligned with the features.

    Returns:
        The tuple ``(x_train, x_test, y_train, y_test)``. The sizes of both
        sets are printed as a side effect.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        utterances_vectors, label_encoded_y, train_size=0.8, test_size=0.2
    )
    # (TODO: Add parameter for switching?)
    print(f"Training set has {_count_samples(x_train)} samples.")
    print(f"Testing set has {_count_samples(x_test)} samples.")
    return x_train, x_test, y_train, y_test

In [None]:
# Choose the features to split: tfidf_utterances_vectors or word2vec_utterances_vectors.
# The name must be bound explicitly, otherwise this cell fails with a NameError.
utterances_vectors = tfidf_utterances_vectors
x_train, x_test, y_train, y_test = return_train_test_split(utterances_vectors, label_encoded_y)

## Ohhhh, machine learning!

The classifiers are chosen because:
* Most of these algorithms exist in other languages
* They are pretty light (ie can run on a phone not just for inference but for TRAINING custom models!)
* Word order doesn't matter (bag of words style over here)

In [None]:
def train_classifier(classifier, x_train, y_train):
    """Fit ``classifier`` on the training data and return whatever ``fit`` returns."""
    # TODO: add in training time
    fitted_model = classifier.fit(x_train, y_train)
    return fitted_model

def test_classifier(classifier_model, x_test, y_test):
    """Print the micro-averaged F1 score of the model's predictions on the test set."""
    micro_f1 = f1_score(y_test, classifier_model.predict(x_test), average='micro')
    print(micro_f1)

I have no idea if these settings are good or not, might want to do some grid search based tuning or something..

In [None]:
# TODO: add more like LinearRegression
# One instance per candidate intent classifier.
DT = DecisionTreeClassifier(random_state=42)
ADA = AdaBoostClassifier(n_estimators=100)
KN = KNeighborsClassifier(n_neighbors=100)
RF = RandomForestClassifier()
SVM = svm.SVC(gamma='scale')
NB = GaussianNB()

# NB is kept last on purpose (see the note in cross_validate_classifier).
classifiers = [DT, ADA, KN, RF, SVM, NB]

## Evaluate all classifiers
Warning this could take a long time, it should only be used to reproduce the reports

Unless you do want to reproduce the reports, skip the next two cells 

In [None]:
def cross_validate_classifier(classifier, x_train, y_train):
    """Run 5-fold cross validated prediction and print how long it took.

    Args:
        classifier: Estimator passed to ``cross_val_predict``.
        x_train: Feature vectors (dense sequence or sparse matrix).
        y_train: Encoded target labels.

    Returns:
        The cross validated predictions for every sample.
    """
    classifier_name = str(classifier)
    start = time.time()
    print(f'Cross validating with {classifier_name}')
    # Gaussian naive Bayes needs dense input. Decide this from the arguments
    # themselves rather than from identity with module globals, and use
    # toarray() so a plain ndarray is produced instead of the numpy.matrix
    # that todense() returns. The caller's matrix is left untouched because
    # only the local name is rebound.
    if isinstance(classifier, GaussianNB) and hasattr(x_train, 'toarray'):
        x_train = x_train.toarray()
    prediction = cross_val_predict(estimator=classifier, X=x_train, y=y_train, cv=5)
    duration = time.time() - start
    print(f'Time it took to cross validate {classifier_name}: {duration}')
    return prediction

def generate_report(classifier, prediction, y_train):
    """Build a per-label classification report (as a dict) from encoded labels.

    Both the predictions and the true labels are decoded back to their label
    names first, so the report is keyed by label name.
    """
    predicted_names = decode_labels(prediction).tolist()
    true_names = decode_labels(y_train).tolist()
    report = classification_report(
        y_true=true_names,
        y_pred=predicted_names,
        output_dict=True,
    )
    print(f'Generating report for {classifier}')
    return report

def convert_report_to_df(classifier, report):
    """Turn a report dict into a DataFrame with one row per report entry.

    The report keys end up in an 'intent' column and a 'classifier' column
    holding ``str(classifier)`` is appended.
    """
    table = pd.DataFrame(report).T
    table['classifier'] = str(classifier)
    # Name the index before resetting it so the new column is called 'intent'.
    return table.rename_axis('intent').reset_index()

def evaluate_classifier(classifier, x_train, y_train):
    """Cross validate one classifier and return its report as a DataFrame."""
    cv_prediction = cross_validate_classifier(classifier, x_train, y_train)
    return convert_report_to_df(
        classifier,
        generate_report(classifier, cv_prediction, y_train),
    )

def evaluate_all_classifiers(classifiers, x_train, y_train):
    """Evaluate every classifier and stack the individual reports.

    Args:
        classifiers: Iterable of estimators to evaluate, in order.
        x_train: Feature vectors.
        y_train: Encoded target labels.

    Returns:
        One DataFrame holding the report rows of all classifiers, or an
        empty DataFrame when ``classifiers`` is empty.
    """
    reports = [
        evaluate_classifier(classifier, x_train, y_train)
        for classifier in classifiers
    ]
    if not reports:
        # pd.concat refuses an empty list; return an empty frame instead.
        return pd.DataFrame()
    return pd.concat(reports)



In [None]:
report_all_intent_classififiers_tfidf_df = evaluate_all_classifiers(classifiers, tfidf_utterances_vectors, label_encoded_y)
# Persist the report under the file name the analysis cell reads, so the slow
# cross validation does not have to be repeated to inspect the results.
report_all_intent_classififiers_tfidf_df.to_csv('analysis_of_all_intent_classifiers_with_tfidf.csv', index=False)

## Let's load up our report and take a look

It looks like SVM scores slightly higher than RF, but the trade-off for performance is worth it with RF.

In [None]:
# Load the stored report and show each classifier's 'accuracy' row, best first.
report_all_intent_classififiers_tfidf_df = pd.read_csv('analysis_of_all_intent_classifiers_with_tfidf.csv')
report_all_intent_classififiers_tfidf_df[report_all_intent_classififiers_tfidf_df['intent'].str.contains('accuracy')].sort_values(by=['f1-score'], ascending=False)

Taking a closer look at our RF, we can see that some intents score pretty poorly: 


In [None]:
# Rows of the random forest only, worst f1-score first.
# NOTE(review): str.contains treats str(RF) as a regular expression, so the
# parentheses in the repr are not matched literally - confirm this selects the intended rows.
report_all_intent_classififiers_tfidf_df[report_all_intent_classififiers_tfidf_df['classifier'].str.contains(str(RF))].sort_values(by=['f1-score'])

'dislikeness' scores pretty poorly with every classifier

In [None]:
# Scores of every classifier for intents containing 'dislikeness', best first.
report_all_intent_classififiers_tfidf_df[report_all_intent_classififiers_tfidf_df['intent'].str.contains('dislikeness')].sort_values(by=['f1-score'], ascending=False)

Same for 'quirky'

In [None]:
# Scores of every classifier for intents containing 'quirky', best first.
report_all_intent_classififiers_tfidf_df[report_all_intent_classififiers_tfidf_df['intent'].str.contains('quirky')].sort_values(by=['f1-score'], ascending=False)

'volume_other'

In [None]:
# Scores of every classifier for intents containing 'volume_other', best first.
report_all_intent_classififiers_tfidf_df[report_all_intent_classififiers_tfidf_df['intent'].str.contains('volume_other')].sort_values(by=['f1-score'], ascending=False)

'settings'

In [None]:
# Scores of every classifier for intents containing 'settings', best first.
report_all_intent_classififiers_tfidf_df[report_all_intent_classififiers_tfidf_df['intent'].str.contains('settings')].sort_values(by=['f1-score'], ascending=False)

In [None]:
# test all models: haven't tested ALL word2vec yet

# Discussion
* It is clear to see that some of the utterances suck and some of the intents are overlapping
* I think on a real data set you could see ~10% improved performance
* In addition
    * if one were to use 'input typing' (entity typing) with the entity tagger
    * then if the types of entities the entity tagger returns are not correct types, this could be used to improve classification
    * Also if required entities are missing, a reply could be given saying a certain entity is required (ie utterance: 'set a timer', response: 'how long of a timer' because expected required entity is duration)
* There are a lot of rules that can be added between the two classifiers (intent and entity tagging), that could boost the model, in addition to fine tuning the model itself.

It is my opinion that a random forest with TFIDF and a CRF entity tagger would work fine for NLU tasks, even on under-powered devices, including TRAINING! Here's hoping someone comes along and bangs out a production level NLU engine in a super fast language! ;)

# Winner is RF with TFIDF!
Unless pure performance is your goal, then SVM for the win. But it is nice to balance out performance vs speed (training and inference)

Train the RF model

In [None]:
# Fit the random forest on the complete TF-IDF matrix (no held-out split here).
RF_model = train_classifier(RF, tfidf_utterances_vectors, label_encoded_y)

Predict the intent label from an utterance

In [None]:
def predict_label(classifier_model, utterance):
    """Predict the label name for a single raw utterance.

    The utterance is lower-cased, vectorized with the module-level
    ``vectorizer`` and classified; the encoded prediction is decoded back to
    its label name.
    """
    features = vectorizer.transform([utterance.lower()])
    encoded_prediction = classifier_model.predict(features)
    decoded_prediction = decode_labels(encoded_prediction)
    return decoded_prediction[0]

In [None]:
# Try it out yourself with an utterance
utterance = 'Turn the livingroom lights off'
label = predict_label(RF_model, utterance)
label

## What does it get wrong and why?

In [None]:
def get_incorrectly_classified_utterances(classifier_model, utterances, tfidf_utterances_vectors, label_encoded_y):
    """Print every utterance whose predicted intent differs from its true intent.

    Args:
        classifier_model: Fitted classifier exposing ``predict``.
        utterances: Raw utterances aligned with the feature rows.
        tfidf_utterances_vectors: Feature vectors of the utterances.
        label_encoded_y: Encoded true labels.
    """
    y_prediction = classifier_model.predict(tfidf_utterances_vectors)
    decoded_predictions = decode_labels(y_prediction)
    decoded_intents = decode_labels(label_encoded_y)
    for utterance, prediction, intent in zip(utterances, decoded_predictions, decoded_intents):
        # Compare for equality: a substring test would hide mistakes where the
        # predicted label is contained in the true one (e.g. 'query' vs 'querycontact').
        if str(prediction) != str(intent):
            print(f'{utterance} has been classified as {prediction}, but it should be {intent}')

In [None]:
# Show all normalised utterances as a plain list.
nlu_data_df['answer_normalised'].tolist()

In [None]:
# Rows whose whole utterance is just 'a' and whose intent contains 'factoid'.
nlu_data_df[(nlu_data_df['answer_normalised'].str.fullmatch('a')) & (nlu_data_df['intent'].str.contains('factoid'))]

#remove: answer id: 19126.0, 21940.0, 21942.0, 25765.0, 4274.0
# go by user ID too? ie 981.0, 107.0

In [None]:
# Print the utterances the random forest gets wrong on the data it was trained on.
# Looking at these, it is easy to see there is a lot of garbage in this data set that hurts performance.
get_incorrectly_classified_utterances(RF_model, nlu_data_df['answer_normalised'].tolist(), tfidf_utterances_vectors, label_encoded_y)

# Entity Extraction

First we need to get the entities from the utterances with their taggings

In [None]:

def seperate_types_and_entities(entities):
    """Split annotation bodies such as 'time : five pm' into type and words.

    Args:
        entities: Iterable of strings, each the text found between the square
            brackets of an annotation.

    Returns:
        A list of ``{'type': <entity type>, 'words': [<word>, ...]}`` dicts, in
        input order. The separator may be written with or without surrounding
        spaces ('type : text' as well as 'type: text'). Entries without any
        ':' separator carry no type and are skipped.
    """
    entity_list = []
    for entity in entities:
        # Split on the first colon only, so colons inside the text are kept.
        entity_type, separator, entity_text = entity.partition(':')
        if not separator:
            continue
        entity_list.append({'type': entity_type.strip(), 'words': entity_text.split()})
    return entity_list

def extract_entities(utterance):
    """Extract the label and entity of every '[type : text]' annotation.

    Returns the result of ``seperate_types_and_entities`` for the bracketed
    spans found in ``utterance``.
    """
    # Non-greedy capture of everything between a '[' and the next ']'.
    bracketed_spans = re.findall(r'\[(.*?)\]', utterance)
    typed_entities = seperate_types_and_entities(bracketed_spans)
    return typed_entities

In [None]:
# Example annotation with two entities (a time and a date).
utterance_with_tagging = 'wake me up at [time : five pm] [date : this week]'

entities = extract_entities(utterance_with_tagging)
# Display the extracted entities.
entities

## POS tagging and entity labeling of utterances
Conditional random fields just love features. One of the most obvious features we could give it besides the words themselves are the part of speech (POS) tags of the words!

In [None]:
def pos_tag_utterance(utterance):
    """Tokenize the utterance with NLTK and return its part-of-speech tagged tokens."""
    return nltk.pos_tag(nltk.word_tokenize(utterance))

In [None]:
# The same example utterance without annotations, POS tagged.
utterance = 'wake me up at five pm this week'
utterance_pos = pos_tag_utterance(utterance)
# Display the tagged tokens.
utterance_pos

In [None]:
def combine_pos_and_entity_tags(entities, utterance_pos):
    """Attach an entity label to every POS-tagged token of an utterance.

    Args:
        entities: List of ``{'type': ..., 'words': [...]}`` dicts.
        utterance_pos: Sequence of ``(word, pos)`` pairs.

    Returns:
        Exactly one ``(word, pos, entity_type)`` tuple per token, in token
        order. Tokens belonging to no entity get the label '0'. This also
        holds when ``entities`` is empty, so utterances without annotations
        still yield features. When a word occurs in several entities, the
        first entity listing it wins, which keeps the output aligned with the
        tokens instead of emitting the token more than once.

    NOTE(review): words are matched by value, not by position, so a word that
    occurs both inside and outside an entity is labelled at every occurrence.
    """
    entity_type_by_word = {}
    for entity in entities:
        for word in entity['words']:
            entity_type_by_word.setdefault(word, entity['type'])

    return [
        (pair[0], pair[1], entity_type_by_word.get(pair[0], '0'))
        for pair in utterance_pos
    ]

In [None]:
# Combine the example's entities with its POS tags.
combine_pos_and_entity_tags(entities, utterance_pos)

## Now let's put it all together to rip the features out in the whole dataframe

In [None]:

def create_feature_dataset(nlu_data_df, verbose=True):
    """Build the (word, POS tag, entity label) sequences for every utterance.

    Args:
        nlu_data_df: DataFrame with the columns 'answer_normalised' (plain
            utterance) and 'answer_annotation' (utterance with '[type : text]'
            annotations).
        verbose: When true (the default, matching the previous behaviour) each
            utterance is printed while it is processed. Pass ``False`` to
            avoid printing one line per row.

    Returns:
        A list with one list of ``(word, pos, entity_type)`` tuples per row.
    """
    feature_dataset = []
    for utterance, utterance_with_tagging in zip(nlu_data_df['answer_normalised'], nlu_data_df['answer_annotation']):
        if verbose:
            print(utterance)
        entities = extract_entities(utterance_with_tagging)
        utterance_pos = pos_tag_utterance(utterance)
        feature_dataset.append(combine_pos_and_entity_tags(entities, utterance_pos))
    return feature_dataset

In [None]:
# Build the tagged token sequences for the whole dataset and display them.
feature_dataset = create_feature_dataset(nlu_data_df)
feature_dataset

In [None]:
# TODO: refactor code
# try removing the word-level slicing (I don't think prefixes and suffixes give more info in English)
# try add stemming (lemma?) or something as an extra feature?
def word2features(utterance, i):
    """Build the CRF feature dict for the token at position ``i``.

    Args:
        utterance: Sequence of tuples whose first two items are the word and
            its POS tag.
        i: Index of the token to describe.

    Returns:
        A dict with the token's own features (word, 3- and 2-character
        suffixes, POS tag and its 2-character prefix) plus the word and tag of
        the previous and next token. 'BOS' marks a token without predecessor
        and 'EOS' a token without successor.
    """
    token, tag = utterance[i][0], utterance[i][1]

    features = {
        'bias': 1.0,
        'word': token,
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'postag': tag,
        'postag[:2]': tag[:2],
    }

    # Left neighbour, or the beginning-of-sequence marker.
    if i > 0:
        previous = utterance[i - 1]
        features['-1:word'] = previous[0]
        features['-1:postag'] = previous[1]
        features['-1:postag[:2]'] = previous[1][:2]
    else:
        features['BOS'] = True

    # Right neighbour, or the end-of-sequence marker.
    if i < len(utterance) - 1:
        following = utterance[i + 1]
        features['+1:word'] = following[0]
        features['+1:postag'] = following[1]
        features['+1:postag[:2]'] = following[1][:2]
    else:
        features['EOS'] = True

    return features


def utterance2features(utterance):
    """Return the feature dict of every token position in the utterance."""
    return [word2features(utterance, position) for position, _ in enumerate(utterance)]

def utterance2labels(utterance):
    """Return the entity label of every (token, postag, label) triple."""
    labels = []
    for _token, _postag, label in utterance:
        labels.append(label)
    return labels

def utterance2tokens(utterance):
    """Return the token of every (token, postag, label) triple."""
    tokens = []
    for token, _postag, _label in utterance:
        tokens.append(token)
    return tokens

In [None]:
# Per-token feature dicts (X) and entity labels (y), one sequence per utterance.
X = [utterance2features(utterance) for utterance in feature_dataset]
y = [utterance2labels(utterance) for utterance in feature_dataset]

## Analysis: it is easy to see that for entities with few examples, the results are very poor.

Unless you want to reproduce the report, you can skip this next cell

In [None]:
# Create the estimator in this cell: when the notebook is run top to bottom
# `crf` has not been defined yet at this point. The settings are the same as
# in the model cell further down.
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=True)
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)
# TODO: Use this for every classifier for intent also
report = flat_classification_report(y_pred=pred, y_true=y, output_dict=True)

# One row per entity type, stored for later analysis.
df = pd.DataFrame(report).transpose()
df.index = df.index.set_names(['entity-type'])
df = df.reset_index()
df.to_csv('analysis_of_CRF_for_entity_extraction.csv', index=False)

# TODO: Load as CSV


We will remove the entities with the fewest examples

In [None]:
# TODO: get counts for each entity type, then drop these:
remove_strings = ['audiobook_author', 'audiobook_name', 'cooking_type', 'drink_type', 'email_address', 'email_folder', 'game_name', 'game_type', 'ingredient', 'movie_name', 'movie_type', 'music_album', 'music_descriptor', 'news_topic', 'personal_info', 'podcast_descriptor', 'podcast_name', 'query_detail', 'radio_name', 'song_name', 'sport_type', 'transport_descriptor', 'transport_name', 'transport_type']

# Only match the names where they are used as an annotation type ('[type : text]'
# or '[type: text]'), so an ordinary word of the utterance such as 'ingredient'
# does not remove the row.
remove_pattern = r'\[\s*(?:' + '|'.join(re.escape(name) for name in remove_strings) + r')\s*:'
# na=False keeps rows without annotation instead of failing on the negation below.
has_removed_entity = nlu_data_df['answer_annotation'].str.contains(remove_pattern, regex=True, na=False)
nlu_data_entities_cleaned_df = nlu_data_df[~has_removed_entity]
nlu_data_entities_cleaned_df
# TODO: fix the one entity type label with no space after entity type (ie type: thing -> type : thing)
# TODO: change nlu_data_df to cleaned for rest of code

In [None]:
# Rebuild the features and labels from the cleaned dataframe (overwrites feature_dataset, X and y).
feature_dataset = create_feature_dataset(nlu_data_entities_cleaned_df)
X = [utterance2features(utterance) for utterance in feature_dataset]
y = [utterance2labels(utterance) for utterance in feature_dataset]

## Let's make our model

In [None]:
# CRF tagger trained with L-BFGS for at most 100 iterations.
# NOTE(review): c1 / c2 are presumably the L1 / L2 regularisation coefficients -
# confirm against the sklearn-crfsuite documentation.
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=True)

In [None]:
# Fit the tagger on all feature sequences; get_entities uses crf_model for prediction.
crf_model = crf.fit(X, y)

## If we want to use this as an entity extraction engine, we will need to get the entities, their types, and the location of the entities in the utterance

In [None]:
def get_entities(utterance):
    """Predict one entity label per token of a raw utterance with ``crf_model``."""
    tagged_tokens = pos_tag_utterance(utterance)
    token_features = utterance2features(tagged_tokens)
    return crf_model.predict_single(token_features)

In [None]:
# Predicted label for every token of the current `utterance`.
get_entities(utterance)

In [None]:
def get_entity_types_and_locations(utterance):
    """Return ``(token position, entity type)`` for every token tagged as an entity.

    Tokens labelled '0' (no entity) are left out. The label is compared by
    value: an identity test against a string literal only works by accident
    and triggers a SyntaxWarning.
    """
    return [
        (location, entity)
        for location, entity in enumerate(get_entities(utterance))
        if entity != '0'
    ]

def get_entity_tags(utterance):
    """Return ``(entity type, word)`` for every token tagged as an entity.

    The entity locations refer to the NLTK tokens that the prediction was made
    on, so the words are looked up in that same tokenization. Splitting on
    spaces can produce a different number of tokens (e.g. for contractions or
    punctuation), which would return the wrong word or raise an IndexError.
    """
    tokens = nltk.word_tokenize(utterance)
    return [
        (entity_type, tokens[location])
        for location, entity_type in get_entity_types_and_locations(utterance)
    ]


In [None]:
# Example: extract the tagged entities of a single utterance.
utterance = 'set an alarm for five pm'
get_entity_tags(utterance)

# Now let's bring it all together, a full NLU engine!

In [None]:
# TODO: Maybe give this function a better name?
def get_NLU_stuff(utterance):
    """Run the full NLU pipeline on one utterance.

    Returns:
        ``[utterance, predicted intent, tagged entities]``, with the intent
        predicted by ``RF_model`` and the entities by ``get_entity_tags``.
    """
    entity_tags = get_entity_tags(utterance)
    intent = predict_label(RF_model, utterance)
    return [utterance, intent, entity_tags]



Random test utterances I could come up with, maybe add some of your own and see what happens

In [None]:
# Hand-written test utterances; add your own to probe the engine.
utterances = [
    'vacuum the bathroom',
    'clean the hall',
    'what is the weather like this weekend',
     'what is the weather like in munich tomorrow',
     'what is the temperature',
     'will it rain today',
     'turn off the kitchen lights',
     'turn on the living room lights',
     'set an alarm for five pm',
     'set an alarm for ten am',
     'what time is it in new york',
     'what time is it in berlin in two hours from now',
     'tell me a joke',
     'how are you',
     'when was biden born',
     'how long does it take to boil an egg',
     'how do you make a caesar salad',
     'how much is a euro in dollars'
]

# Print [utterance, predicted intent, tagged entities] for each one.
for utterance in utterances:
    print(get_NLU_stuff(utterance))