In [1]:
import pandas as pd
import numpy as np
import re
import time

from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
import sklearn_crfsuite
from sklearn_crfsuite.metrics import flat_classification_report

import nltk


# Aim of this notebook
1. I will probably use this when I have job interviews (as the interviewer) to discuss NLU engines and possibly as a reference if I am ever being interviewed for a position and NLU comes up. It can be hard for me to discuss this technical stuff very specifically that I have done working at companies due to NDAs. And honestly I can't always remember how to do all of this in detail. LOL
2. Generally, I hope it helps developers understand how to build and implement NLU engines. I noticed this deep understanding was missing in the FOSS voice assistant community. I also looked around for a notebook or git repo for basics in NLU engines and couldn't find one (do you know of something? Please feel free to share!). Perhaps some of the methods in here can be used to produce an NLU engine that is both powerful and light enough to run inference and even be trained on low powered devices. Maybe some of my friends who are keen on Java, C++, Rust, etc. want to build a much faster engine.
3. It is always good to have a notebook for benchmarking methods.

# NLU intent classification and entity extraction
Natural language understanding in voice assistants focuses on two problems:
* intent classification
    Where should the utterance (command, question, etc.) go?
    ie the utterance 'turn off the living room lights' should be classified to the intent 'turn off'
* entity extraction (also known as named entity recognition, NER)
    What are the important inputs (entities) that should be passed along
    ie the utterance 'turn off the living room lights', the important entity is the place: 'living room'. 

## TinyML philosopy
The goal of tinyML is to train and run inference of models locally by users. If a user can customize their models, then the system can 'learn' and improve based on users' preferences, instead of a 'one-size-fits-all' way of doing machine learning. 

## Yes, but aren't there already open source voice assistants like Mycroft and Snips/Rhasspy?
Mycroft has two NLU engines:
* Adapt
* Padatious

Adapt focuses on keyword word matching, RegEx patterns, and hard coding to perform these actions. Padatious uses a library called FANN (fast artifical neural network) to classify intent based on all of the words in the utterance and uses the FANN for entity edge detection. They are low powered, so that users could run them on many devices, but they aren't very powerful. 

Rhasspy/Snips uses two intent parsers in tandom:
* deterministic (rule based)
* probabilistic

The rule based approach is only applied when the first one fails to result. The rule based system uses RegEx, requiring the developer to write out these rules. The probablistic system uses logistic regression for intent and conditional random fields (CRFs) for entity extraction. 

Wouldn't it be great to learn how to completely automate these tasks and do it with techniques light enough to run on phones or whatever? I think so. So let's do this!

# Methods
* We are going to use this data set: https://github.com/xliuhw/NLU-Evaluation-Data/blob/master/AnnotatedData/NLU-Data-Home-Domain-Annotated-All.csv
* Detour into Word2Vec method of classifying intent (spolier alert: it doesn't work so well)
* TFIDF encoding (this works pretty well)
* Intent classification: A lot of classifiers to try
    * Logistic Regression
    * Decision Tree Classifier
    * AdaBoost Classifier
    * K-Nearest Neighbors Classifier
    * Random Forest Classifier
    * Support Vector Machine Classifier
    * (Gaussian) Naive Bayes Classifier
* Entity extraction: conditional random fields

And finally, we bring it all together to make our prototype NLU engine. 


# FAQ
* Why didn't use use SPaCy, BERT (or whatever)?
   * I wanted to choose simple stuff that could be easily found in other langauges and is low powered for inference and training, also I tried to write the code as simple as possible, so it would be easy to understand.



# TODO
* record training times and inference times for each model (on raspi4)
* CRF feature stemmer?
* domain classifier? (compare domain to intent classifer?)
* make proper classes out of this to form a generic python NLU engine?


In [None]:
def load_data(file_name):
    data_df = pd.read_csv(file_name, sep=';')
    return data_df.dropna(axis=0, how='any', subset=['answer_normalised'])

# Intent classification

## Let's take a quick look at our dataset

In [None]:
nlu_data_df = load_data('NLU-Data-Home-Domain-Annotated-All.csv')
number_of_intents = nlu_data_df['intent'].nunique()
list_of_intents = nlu_data_df['intent'].unique()
number_of_utterances = nlu_data_df['answer_normalised'].nunique()
print(f'From a total of {number_of_utterances} utterances, there are {number_of_intents} intents')
print(f'List of intents: {list_of_intents}')


Some utterances only have one word or even one letter! We should remove those. 

In [None]:
nlu_data_df = nlu_data_df[nlu_data_df['answer_normalised'].str.contains(' ')]

## Word2Vec (skip this and the next cell if you just want TFIDF which performs better), we will keep tokenization easy 
(keep in mind, other langauges might require more complex tokenization!)


In [None]:
def preprocess_lower(token):
    return token.lower()

def tokenize_utterances(dataframe):
    utterances = list(dataframe.answer_normalised.values)
    return [list(map(preprocess_lower, utterance.split(' '))) for utterance in utterances]

In [None]:
tokenized_utterances = tokenize_utterances(nlu_data_df)

In [None]:
tokenize_utterances

In [None]:
utterances = nlu_data_df['answer_normalised']
## create list of lists of unigrams
list_utterances = []
for utterance in utterances:
   list_words = utterance.split()
   list_grams = [" ".join(list_words[i:i+1]) 
               for i in range(0, len(list_words), 1)]
   list_utterances.append(list_grams)

In [None]:
list_utterances

## The target class labels (for the intents) require encoding to do machine learning stuff

In [None]:
le = preprocessing.LabelEncoder()
def encode_labels(target_class):
    label_encoded_y = le.fit_transform(target_class)
    return label_encoded_y

In [None]:
def decode_labels(label_encoded_y):
    return le.inverse_transform(label_encoded_y)

If you want to predict using domains (skills), change intents to domains and use nlu_data_df.scenario.values

In [None]:
intents = nlu_data_df.intent.values
label_encoded_y = encode_labels(intents)
label_encoded_y

In [None]:
decode_labels(label_encoded_y)

## word2vec to create word vectors from the utterances for the classifiers
Open question: Is this the best word embedding system in terms of performance vs resource usage?

Reasons word2vec was choosen:
* implemented in several programming langauges
* it is well known
* isn't too resource intensive (i.e. it could run in real time on a phone)

However, it might not perform the best, bag of words methods might work better, as word order isn't super important for utterances of a voice assistant (question for the class: why?)

Skip the next 4 cells if you just want the best results, go to TFIDF


In [None]:
def create_word2vec_model(tokenized_utterances):
    model = Word2Vec(tokenized_utterances, vector_size=128, window=2, min_count=1, workers=4)
    return model

In [None]:
word2vec_model = create_word2vec_model(tokenized_utterances)

In [None]:
def convert_utterances_to_vectors(model, tokenized_utterances):
    # get the utterances average vector
    utterances_vectors = list()
    for utterance in tokenized_utterances:
        utterance_vector = [list(model.wv[token]) for token in utterance if token in model.wv.key_to_index.keys()]
        utterances_vectors.append(list(np.mean(utterance_vector, axis=0)))
    return utterances_vectors

In [None]:
word2vec_utterances_vectors = convert_utterances_to_vectors(word2vec_model, tokenized_utterances)

## TFIDF
Question for class: Why does it score better?

Skip this if you are checking out Word2Vec!

In [None]:
vectorizer = TfidfVectorizer()
tfidf_utterances_vectors = vectorizer.fit_transform(nlu_data_df.answer_normalised.values)

## Ohhhh, machine learning!

The classifiers are chosen because:
* Most of these algorithmns exist in other langauges
* They are pretty light (ie can run on a phone not just for inference but for TRAINING custom models!)
* Word order doesn't matter (bag of words style over here)

In [None]:
def train_classifier(classifier, x_train, y_train):
    # TODO: add in training time
    return classifier.fit(x_train, y_train)

def test_classifier(classifier_model, x_test, y_test):
    y_prediction = classifier_model.predict(x_test)
    f1 = f1_score(y_test, y_prediction, average='micro')
    print(f1)

I have no idea if these settings are good or not, might want to do some grid search based tuning or something..

In [None]:
LR = LogisticRegression(solver='liblinear', random_state=0)
DT = DecisionTreeClassifier(random_state=42)
ADA = AdaBoostClassifier(n_estimators=100)
KN = KNeighborsClassifier(n_neighbors=100)
RF = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=2, random_state=0)
SVM = svm.SVC(gamma='scale')
NB = GaussianNB()

classifiers = [LR, DT, ADA, KN, RF, SVM, NB]

## Evaluate all classifiers
Warning this could take a long time, it should only be used to reproduce the reports

Unless you do want to reproduce the reports, skip the next 4 cells 

In [None]:
def cross_validate_classifier(classifier, x_train, y_train):
    start = time.time()
    print(f'Cross validating with {str(classifier)}')
    try:
        if x_train is tfidf_utterances_vectors and classifier is NB:
            # note: I threw NB at the end so it doesn't set them all to dense
                x_train = x_train.todense()
    except:
        pass
    prediction = cross_val_predict(estimator=classifier, X=x_train, y=y_train, cv=5)
    stop = time.time()
    duration = stop - start
    print(f'Time it took to cross validate {str(classifier)}: {duration}')
    return prediction

def generate_report(classifier, prediction, y_train):
    prediction_decoded = decode_labels(prediction).tolist()
    y_train_decoded = decode_labels(y_train).tolist()
    report = classification_report(y_pred=prediction_decoded, y_true=y_train_decoded, output_dict=True)
    print(f'Generating report for {classifier}')
    return report

def convert_report_to_df(classifier, report):
    df = pd.DataFrame(report).transpose()
    df['classifier'] = str(classifier)
    df.index = df.index.set_names(['intent'])
    df = df.reset_index()
    return df

def evaluate_classifier(classifier, x_train, y_train):
    prediction = cross_validate_classifier(classifier, x_train, y_train)
    report = generate_report(classifier, prediction, y_train)
    return convert_report_to_df(classifier, report)

def evaluate_all_classifiers(classifiers, x_train, y_train):
    for count, classifier in enumerate(classifiers):
        df = evaluate_classifier(classifier, x_train, y_train)
        if count is 0:
            concat_df = df
        else:
            concat_df = pd.concat([concat_df, df])
    return concat_df



In [None]:
report_all_intent_classifiers_tfidf_df = evaluate_all_classifiers(classifiers, tfidf_utterances_vectors, label_encoded_y)

Label the encoding type
`tfidf`, `word2vec`, or something else. 

In [None]:
report_all_intent_classifiers_tfidf_df['encoding'] = 'tfidf'

Format the dataframe and seperate overview

In [None]:
report_all_intent_classifiers_tfidf_df['classifier'] = report_all_intent_classifiers_tfidf_df['classifier'].str.replace(r"\([^()]*\)", "")
report_all_intent_classifiers_overview_df = report_all_intent_classifiers_tfidf_df[report_all_intent_classifiers_tfidf_df['intent'].str.contains('accuracy|avg')]
report_all_intent_classifiers_overview_df = report_all_intent_classifiers_overview_df.rename(columns={'intent': 'measure'})
report_all_intent_classifiers_overview_df = report_all_intent_classifiers_overview_df.to_csv('report_all_intent_classifiers_overview.csv', index=False)


report_all_intent_classifiers_tfidf_df = report_all_intent_classifiers_tfidf_df[~report_all_intent_classifiers_tfidf_df['intent'].str.contains('accuracy|avg')]
report_all_intent_classifiers_tfidf_df.to_csv('report_all_intent_classifiers_tfidf.csv', index=False)


## Let's load up our reports and take a look

It looks like SVM scores slightly higher than LR, but the trade-off for performance is worth it with LR.

In [4]:
report_all_intent_classifiers_overview_df = pd.read_csv('report_all_intent_classifiers_overview.csv')
report_all_intent_classifiers_overview_df[report_all_intent_classifiers_overview_df['measure'].str.contains('accuracy')].sort_values(by=['f1-score'], ascending=False)

Unnamed: 0,measure,precision,recall,f1-score,support,classifier,encoding
15,accuracy,0.763347,0.763347,0.763347,0.763347,SVC,tfidf
18,accuracy,0.75956,0.75956,0.75956,0.75956,LogisticRegression,tfidf
21,accuracy,0.754481,0.754481,0.754481,0.754481,XGBClassifier,tfidf
3,accuracy,0.75129,0.75129,0.75129,0.75129,RandomForestClassifier,tfidf
12,accuracy,0.669834,0.669834,0.669834,0.669834,KNeighborsClassifier,tfidf
6,accuracy,0.663709,0.663709,0.663709,0.663709,DecisionTreeClassifier,tfidf
0,accuracy,0.436701,0.436701,0.436701,0.436701,GaussianNB,tfidf
9,accuracy,0.408585,0.408585,0.408585,0.408585,AdaBoostClassifier,tfidf


In [5]:
report_all_intent_classifiers_tfidf_df = pd.read_csv('report_all_intent_classifiers_tfidf.csv')

Taking a closer look at our LR, we can see that some intents score pretty poorly (the ones with low support): 


In [6]:
report_all_intent_classifiers_tfidf_df[report_all_intent_classifiers_tfidf_df['classifier'].str.contains('LogisticRegression')].sort_values(by=['f1-score'])

Unnamed: 0,intent,precision,recall,f1-score,support,classifier,encoding
318,volume_other,0.0,0.0,0.0,24.0,LogisticRegression,tfidf
284,dislikeness,0.0,0.0,0.0,25.0,LogisticRegression,tfidf
288,greet,0.0,0.0,0.0,21.0,LogisticRegression,tfidf
280,convert,1.0,0.175258,0.298246,97.0,LogisticRegression,tfidf
305,quirky,0.425422,0.327345,0.369994,1002.0,LogisticRegression,tfidf
292,hue_lighton,0.916667,0.289474,0.44,38.0,LogisticRegression,tfidf
311,settings,0.866667,0.325,0.472727,80.0,LogisticRegression,tfidf
295,likeness,0.819149,0.385,0.52381,200.0,LogisticRegression,tfidf
304,querycontact,0.809917,0.447489,0.576471,219.0,LogisticRegression,tfidf
285,events,0.832402,0.468553,0.599598,318.0,LogisticRegression,tfidf


What about RF?

In some sparse cases it can fair better.

In [7]:
report_all_intent_classifiers_tfidf_df[report_all_intent_classifiers_tfidf_df['classifier'].str.contains('RandomForest')].sort_values(by=['f1-score'])

Unnamed: 0,intent,precision,recall,f1-score,support,classifier,encoding
54,dislikeness,0.375,0.12,0.181818,25.0,RandomForestClassifier,tfidf
75,quirky,0.482824,0.239357,0.320051,1057.0,RandomForestClassifier,tfidf
88,volume_other,0.75,0.25,0.375,24.0,RandomForestClassifier,tfidf
81,settings,0.733333,0.275,0.4,80.0,RandomForestClassifier,tfidf
58,greet,0.642857,0.375,0.473684,24.0,RandomForestClassifier,tfidf
55,events,0.730769,0.354037,0.476987,322.0,RandomForestClassifier,tfidf
74,querycontact,0.741379,0.38914,0.510386,221.0,RandomForestClassifier,tfidf
68,movies,0.796296,0.383929,0.518072,112.0,RandomForestClassifier,tfidf
65,likeness,0.733333,0.431373,0.54321,204.0,RandomForestClassifier,tfidf
66,locations,0.751634,0.449219,0.562347,256.0,RandomForestClassifier,tfidf


'dislikeness' scores pretty poorly with every classifier

In [8]:
report_all_intent_classifiers_tfidf_df[report_all_intent_classifiers_tfidf_df['intent'].str.contains('dislikeness')].sort_values(by=['f1-score'], ascending=False)

Unnamed: 0,intent,precision,recall,f1-score,support,classifier,encoding
238,dislikeness,0.857143,0.24,0.375,25.0,SVC,tfidf
330,dislikeness,0.555556,0.2,0.294118,25.0,XGBClassifier,tfidf
54,dislikeness,0.375,0.12,0.181818,25.0,RandomForestClassifier,tfidf
100,dislikeness,0.130435,0.12,0.125,25.0,DecisionTreeClassifier,tfidf
8,dislikeness,0.056,0.28,0.093333,25.0,GaussianNB,tfidf
146,dislikeness,0.025974,0.08,0.039216,25.0,AdaBoostClassifier,tfidf
192,dislikeness,0.0,0.0,0.0,25.0,KNeighborsClassifier,tfidf
284,dislikeness,0.0,0.0,0.0,25.0,LogisticRegression,tfidf


Same for 'quirky'

In [9]:
report_all_intent_classifiers_tfidf_df[report_all_intent_classifiers_tfidf_df['intent'].str.contains('quirky')].sort_values(by=['f1-score'], ascending=False)

Unnamed: 0,intent,precision,recall,f1-score,support,classifier,encoding
259,quirky,0.50731,0.328288,0.398621,1057.0,SVC,tfidf
351,quirky,0.463115,0.338323,0.391003,1002.0,XGBClassifier,tfidf
305,quirky,0.425422,0.327345,0.369994,1002.0,LogisticRegression,tfidf
75,quirky,0.482824,0.239357,0.320051,1057.0,RandomForestClassifier,tfidf
121,quirky,0.281407,0.264901,0.272904,1057.0,DecisionTreeClassifier,tfidf
29,quirky,0.191565,0.253548,0.218241,1057.0,GaussianNB,tfidf
213,quirky,0.458716,0.047304,0.085763,1057.0,KNeighborsClassifier,tfidf
167,quirky,0.0,0.0,0.0,1057.0,AdaBoostClassifier,tfidf


'volume_other'

In [10]:
report_all_intent_classifiers_tfidf_df[report_all_intent_classifiers_tfidf_df['intent'].str.contains('volume_other')].sort_values(by=['f1-score'], ascending=False)

Unnamed: 0,intent,precision,recall,f1-score,support,classifier,encoding
272,volume_other,0.75,0.375,0.5,24.0,SVC,tfidf
134,volume_other,0.615385,0.333333,0.432432,24.0,DecisionTreeClassifier,tfidf
364,volume_other,0.473684,0.375,0.418605,24.0,XGBClassifier,tfidf
88,volume_other,0.75,0.25,0.375,24.0,RandomForestClassifier,tfidf
42,volume_other,0.179487,0.291667,0.222222,24.0,GaussianNB,tfidf
180,volume_other,0.0,0.0,0.0,24.0,AdaBoostClassifier,tfidf
226,volume_other,0.0,0.0,0.0,24.0,KNeighborsClassifier,tfidf
318,volume_other,0.0,0.0,0.0,24.0,LogisticRegression,tfidf


'settings'

In [11]:
report_all_intent_classifiers_tfidf_df[report_all_intent_classifiers_tfidf_df['intent'].str.contains('settings')].sort_values(by=['f1-score'], ascending=False)

Unnamed: 0,intent,precision,recall,f1-score,support,classifier,encoding
357,settings,0.678571,0.475,0.558824,80.0,XGBClassifier,tfidf
265,settings,0.790698,0.425,0.552846,80.0,SVC,tfidf
311,settings,0.866667,0.325,0.472727,80.0,LogisticRegression,tfidf
81,settings,0.733333,0.275,0.4,80.0,RandomForestClassifier,tfidf
127,settings,0.37037,0.375,0.372671,80.0,DecisionTreeClassifier,tfidf
35,settings,0.131818,0.3625,0.193333,80.0,GaussianNB,tfidf
219,settings,0.5,0.0125,0.02439,80.0,KNeighborsClassifier,tfidf
173,settings,0.0,0.0,0.0,80.0,AdaBoostClassifier,tfidf


## Which classifiers score the best for each intent, arranged by support (how sparse the intent examples are)
Note: this excludes SVM (due to run time), but generally SVM would score the best.

Skip the next cell, it is there just to reproduce the report

In [12]:
report_best_classifier_per_intent_tfidf_df = report_all_intent_classifiers_tfidf_df[~report_all_intent_classifiers_tfidf_df['classifier'].str.contains('SVC')].groupby('intent').apply(lambda x: x.sort_values(by=['f1-score'], ascending=False).head(1)).sort_values(by=['support'])
report_best_classifier_per_intent_tfidf_df.to_csv('report_best_classifier_per_intent_tfidf.csv', index=False)

In [14]:
report_best_classifier_per_intent_tfidf_df = pd.read_csv('report_best_classifier_per_intent_tfidf.csv')
report_best_classifier_per_intent_tfidf_df

Unnamed: 0,intent,precision,recall,f1-score,support,classifier,encoding
0,volume_other,0.615385,0.333333,0.432432,24.0,DecisionTreeClassifier,tfidf
1,greet,0.555556,0.416667,0.47619,24.0,DecisionTreeClassifier,tfidf
2,dislikeness,0.555556,0.2,0.294118,25.0,XGBClassifier,tfidf
3,hue_lighton,0.714286,0.526316,0.606061,38.0,RandomForestClassifier,tfidf
4,volume_down,0.916667,0.60274,0.727273,73.0,LogisticRegression,tfidf
5,wemo_on,0.981481,0.6625,0.791045,80.0,LogisticRegression,tfidf
6,settings,0.678571,0.475,0.558824,80.0,XGBClassifier,tfidf
7,addcontact,0.760563,0.606742,0.675,89.0,RandomForestClassifier,tfidf
8,convert,0.794118,0.556701,0.654545,97.0,XGBClassifier,tfidf
9,wemo_off,0.84375,0.826531,0.835052,98.0,RandomForestClassifier,tfidf


In [15]:
report_best_classifier_per_intent_tfidf_df['classifier'].value_counts()

LogisticRegression        17
XGBClassifier             14
RandomForestClassifier    13
DecisionTreeClassifier     2
Name: classifier, dtype: int64

# Discussion
* It is clear to see that some of the utterances are poorly written or otherwise incorrect and some of the intents are overlapping
* I think on a real data set you could see at least ~10% improved performance
* There are a lot of rules that can be addeded between the two classifiers (intent and entity tagging), that could boost the model, in addition to fine tuning the model itself.

It is my opinion that a random forest or logistic regression with TFIDF and a CRF entity tagger would work fine for NLU tasks, even on under-powered devices, including TRAINING!

# For TFIDF the winner is LR (except for sparse data), followed by XGB and RF!
Unless pure performance is your goal, then SVM for the win. But it is nice to balance out performance vs speed (training and inference)

## We shall use RF as an example classifier.

In [None]:
RF_model = train_classifier(RF, tfidf_utterances_vectors, label_encoded_y)

Predict the intent label from an utterance

In [None]:
def predict_label(classifier_model, utterance):
    utterance = utterance.lower()
    transformed_utterance = vectorizer.transform([utterance])
    predicted_label = classifier_model.predict(transformed_utterance)
    return decode_labels(predicted_label)[0]

In [None]:
# Try it out yourself with an utterance
utterance = 'Turn the living room lights off'
label = predict_label(RF_model, utterance)
label

## What does it get wrong and why?

In [None]:
def get_incorrectly_classified_utterances(classifier_model, utterances, tfidf_utterances_vectors, label_encoded_y):
    y_prediction = classifier_model.predict(tfidf_utterances_vectors)
    for utterance, prediction, intent in zip(utterances, decode_labels(y_prediction), decode_labels(label_encoded_y)):
        if str(prediction) not in str(intent):
            print(f'{utterance} has been classified as {prediction}, but it should be {intent}')

In [None]:
nlu_data_df[(nlu_data_df['answer_normalised'].str.fullmatch('a')) & (nlu_data_df['intent'].str.contains('factoid'))]
# TODO: for future cleaning we can for sure get rid of the following:
#answer id: 19126.0, 21940.0, 21942.0, 25765.0, 4274.0
# go by user ID too? ie 981.0, 107.0?

In [None]:
get_incorrectly_classified_utterances(RF_model, nlu_data_df['answer_normalised'].tolist(), tfidf_utterances_vectors, label_encoded_y)

This makes it very clear that the data set is not clean enough to ensure good results. A future cleanup will be required (also perhaps seperating the intents a bit better)

# Entity Extraction

First we need to get the entities from the utterances with their taggings

In [None]:
def seperate_types_and_entities(entities):
    entity_list = []
    for entity in entities:
        split_entity = entity.split(' : ')
        entity_type = split_entity[0]
        entity_text = split_entity[1].split(' ')
        entity_list.append({'type':entity_type, 'words': entity_text})
    return entity_list

def extract_entities(utterance):
    entities = re.findall(r'\[(.*?)\]', utterance)
    return seperate_types_and_entities(entities)

In [None]:
utterance_with_tagging = 'wake me up at [time : five pm] [date : this week]'

entities = extract_entities(utterance_with_tagging)
entities

## POS tagging and entity labeling of utterances
Conditional random fields just love features. One of the most obvious features we could give it besides the words themselves are the part of speech (POS) tags of the words!

In [None]:
def pos_tag_utterance(utterance):
    tokenized_utterance = nltk.word_tokenize(utterance)
    utterance_pos = nltk.pos_tag(tokenized_utterance)
    return utterance_pos

In [None]:
utterance = 'wake me up at five pm this week'
utterance_pos = pos_tag_utterance(utterance)
utterance_pos

In [None]:
def combine_pos_and_entity_tags(entities, utterance_pos):
    output = []
    words = []

    for entity in entities:
        for word in entity['words']:
            words.append(word)

    for pair in utterance_pos:
        word = pair[0]
        pos = pair[1]
        for entity in entities:
            if word in entity['words']:
                entity_type = entity['type']
                output.append((word, pos, entity_type))
            elif word not in words and entity is entities[-1]:
                entity_type = '0'
                output.append((word, pos, entity_type))
    return output

In [None]:
combine_pos_and_entity_tags(entities, utterance_pos)

## Now let's put it all together to make our features

In [None]:
def create_feature_dataset(nlu_data_df):
    feature_dataset = []
    for utterance, utterance_with_tagging in zip(nlu_data_df['answer_normalised'], nlu_data_df['answer_annotation']):
        print(utterance)
        entities = extract_entities(utterance_with_tagging)
        utterance_pos = pos_tag_utterance(utterance)
        feature_dataset.append(combine_pos_and_entity_tags(entities, utterance_pos))
    return feature_dataset

In [None]:
# TODO: try removing the word-level slicing (I don't think prefixes and suffixes give more info in English)
# TODO: try add stemming (lemma?) or something as an extra feature?
def word2features(utterance, i):
    word = utterance[i][0]
    postag = utterance[i][1]

    features = {
        'bias': 1.0,
        'word': word,
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = utterance[i-1][0]
        postag1 = utterance[i-1][1]
        features.update({
            '-1:word': word1,
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(utterance)-1:
        word1 = utterance[i+1][0]
        postag1 = utterance[i+1][1]
        features.update({
            '+1:word': word1,
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def utterance2features(utterance):
    return [word2features(utterance, i) for i in range(len(utterance))]

def utterance2labels(utterance):
    return [label for token, postag, label in utterance]

def utterance2tokens(utterance):
    return [token for token, postag, label in utterance]

## Analysis: it is easy to see that for entities with few examples, the results are very poor.

Unless you want to reproduce the report, you can skip the next 3 cells.

In [None]:
feature_dataset = create_feature_dataset(nlu_data_df)
feature_dataset

In [None]:
X = [utterance2features(utterance) for utterance in feature_dataset]
y = [utterance2labels(utterance) for utterance in feature_dataset]

In [None]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

In [None]:
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)
report = flat_classification_report(y_pred=pred, y_true=y, output_dict=True)

df = pd.DataFrame(report).transpose()
df.index = df.index.set_names(['entity-type'])
df = df.reset_index()
df.to_csv('analysis_of_CRF_for_entity_extraction.csv', index=False)


In [2]:
analysis_CRF_entity_extraction_df = pd.read_csv('analysis_of_CRF_for_entity_extraction.csv')

Generally our CRF performs poorly for entities with few examples. This could be optimized by using additional features.

In [3]:
analysis_CRF_entity_extraction_df.sort_values(by=['f1-score'], ascending=False)

Unnamed: 0,entity-type,precision,recall,f1-score,support
0,0,0.889049,0.958075,0.922272,76803.0
12,currency_name,0.943467,0.876313,0.908651,857.0
51,timeofday,0.86037,0.91886,0.888653,456.0
57,accuracy,0.844292,0.844292,0.844292,0.844292
13,date,0.829982,0.846817,0.838315,4289.0
59,weighted avg,0.816216,0.844292,0.826035,106141.0
24,house_place,0.854592,0.793839,0.823096,422.0
15,device_type,0.810845,0.818066,0.81444,786.0
49,time,0.803602,0.790729,0.797114,2934.0
28,meal_type,0.754545,0.747748,0.751131,111.0


## Let's make our model

We will remove the entities with the fewest examples and the others that score 0

In [None]:
remove_strings = ['audiobook_author', 'audiobook_name', 'cooking_type', 'drink_type', 'email_address', 'email_folder', 'game_name', 'game_type', 'ingredient', 'movie_name', 'movie_type', 'music_album', 'music_descriptor', 'news_topic', 'personal_info', 'podcast_descriptor', 'podcast_name', 'query_detail', 'radio_name', 'song_name', 'sport_type', 'transport_descriptor', 'transport_name', 'transport_type']

nlu_data_entities_cleaned_df = nlu_data_df[~nlu_data_df['answer_annotation'].str.contains('|'.join(remove_strings))]
nlu_data_entities_cleaned_df
# TODO: fix the one entity type label with no space after entity type (ie type: thing -> type : thing)
# TODO: change nlu_data_df to cleaned for rest of code

In [None]:
feature_dataset = create_feature_dataset(nlu_data_entities_cleaned_df)
X = [utterance2features(utterance) for utterance in feature_dataset]
y = [utterance2labels(utterance) for utterance in feature_dataset]

In [None]:
crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=True)

In [None]:
crf_model = crf.fit(X, y)

## If we want to use this as an entity extraction engine, we will need to get the entities, their types, and the location of the entities in the utterance

In [None]:
def get_entities(utterance):
    utterance_pos = pos_tag_utterance(utterance)
    utterance_features = utterance2features(utterance_pos)
    label = crf_model.predict_single(utterance_features)
    return label

In [None]:
get_entities(utterance)

In [None]:
def get_entity_types_and_locations(utterance):
    entity_locations_and_types = []
    entities = get_entities(utterance)
    for location, entity in enumerate(entities):
        if entity is not '0':
            entity_locations_and_types.append((location, entity))
    return entity_locations_and_types

def get_entity_tags(utterance):
    entity_locations_and_types = get_entity_types_and_locations(utterance)
    split_utterance = utterance.split(' ')
    tagged_entities = [(entity_type, split_utterance[location]) for location, entity_type in entity_locations_and_types]
    return tagged_entities


In [None]:
utterance = 'set an alarm for five pm'
get_entity_tags(utterance)

# Now let's bring it all together, a full NLU engine!

In [None]:
# TODO: Maybe give this function a better name?
def get_NLU_results(utterance):
    tagged_entities  = get_entity_tags(utterance)
    return [utterance, predict_label(RF_model, utterance), tagged_entities]

Random test utterances I could come up with, maybe add some of your own and see what happens

In [None]:
utterances = [
    'vacuum the bathroom',
    'clean the hall',
    'what is the weather like this weekend',
     'what is the weather like in munich tomorrow',
     'what is the temperature',
     'will it rain today',
     'turn off the kitchen lights',
     'turn on the living room lights',
     'set an alarm for five pm',
     'set an alarm for ten am',
     'what time is it in new york',
     'what time is it in berlin in two hours from now',
     'tell me a joke',
     'how are you',
     'when was biden born',
     'how long does it take to boil an egg',
     'how do you make a caesar salad',
     'how much is a euro in dollars'
]

for utterance in utterances:
    print(get_NLU_results(utterance))