In [None]:
from snips_nlu import SnipsNLUEngine
import io
from snips_nlu.default_configs import CONFIG_EN
import json
import pandas as pd
import re

# NLU Snips converter
Quick and dirty notebook to:
* convert the NLU dataset from CSV to JSON format
* train the model
* export the model
* run the engine

A user can skip most steps here if they just want to load the pre-trained model for the whole dataset and test the engine.

## An example NLU model

Load the JSON file with the training data

NOTE: skip this next cell unless you want to reproduce the data. A model has already been exported.

In [None]:
# Read the example training dataset. JSON is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default.
with io.open("lights_dataset.json", "r", encoding="utf-8") as file:
    nlu_data = json.load(file)

# Pretty-print the dataset so its structure can be inspected.
print(json.dumps(nlu_data, indent=2))

Create the engine, train the model, and export the trained model

NOTE: skip this next cell unless you want to reproduce the results.

In [None]:
# Build an engine from the English default configuration, fit it on the
# dataset held in `nlu_data`, and save the fitted engine under "nlu_model".
# NOTE(review): persist() may refuse to write into an already existing
# "nlu_model" directory, which would break a re-run of this cell -- confirm.
nlu_engine = SnipsNLUEngine(config=CONFIG_EN)
nlu_engine.fit(nlu_data)
nlu_engine.persist("nlu_model")

Load the engine with the exported model and test an utterance (while we are at it, let's get the top 3 matches to have a deeper look)

In [None]:
# Restore the engine from the "nlu_model" path written by persist().
loaded_engine = SnipsNLUEngine.from_path("nlu_model")

In [None]:
# Parse a sample utterance, asking for the 3 best results (top_n=3), and
# pretty-print the parsing result as JSON.
utterance = "turn on the lights in the kitchen"
parsing = loaded_engine.parse(utterance, top_n=3)
print(json.dumps(parsing, indent=2))

## Dive into the CSV file and convert it into Snips JSON training format

Load the CSV NLU dataset we want to convert

In [None]:
def load_data(file_name):
    """Read the semicolon-separated NLU CSV and keep only usable rows.

    Rows are kept when ``answer_normalised`` is present and contains at
    least one space.

    Args:
        file_name: path (or file-like object) handed to ``pd.read_csv``.

    Returns:
        The filtered DataFrame with a fresh 0..n-1 index; the previous row
        labels are kept in an ``index`` column because ``reset_index`` is
        called without ``drop=True``.
    """
    raw_df = pd.read_csv(file_name, sep=';')
    answered_df = raw_df.dropna(axis=0, how='any', subset=['answer_normalised'])
    has_space = answered_df['answer_normalised'].str.contains(' ')
    return answered_df[has_space].reset_index()

In [None]:
# Load and filter the annotated CSV; the bare name on the last line makes the
# notebook display the resulting DataFrame.
nlu_whole_dataset_df = load_data('NLU-Data-Home-Domain-Annotated-All.csv')
nlu_whole_dataset_df

### Huge block ahead!
We need to convert this CSV to snips format similar to the example skill

NOTE: the next 3 cells are optional; only run them if you want to reproduce the data.

In [None]:
# Module-level parser state: True when the NEXT chunk handed to
# utterance_data_process() is an entity chunk, False when it is plain text.
is_entity = False

def utterance_data_process(data):
    """
    Convert one chunk of a bracket-split annotation into a Snips utterance chunk.

    The chunks of one annotation are expected in order, like this:
        ['wake me up at ', 'time : five am', ' ', 'date : this week', '']

    The first one is pure text and from there it alternates between entities and pure text.

    For this example it would be:
        Text, Entity, Text, Entity, Text

    The position inside the annotation is tracked by the module-level
    ``is_entity`` flag, which is flipped on every call. Callers must therefore
    reset ``is_entity`` to ``False`` before starting a new annotation.

    Args:
        data: one chunk (string) of the split annotation.

    Returns:
        ``{"text": data}`` for a text chunk; a dict with "entity", "slot_name"
        and "text" keys for an entity chunk of the form ``"<type> : <value>"``;
        ``None`` for an entity chunk that has no ``' : '`` separator.
    """
    global is_entity
    chunk_is_entity = is_entity
    # Flip the flag first so the alternation continues even for malformed chunks.
    is_entity = not chunk_is_entity

    if not chunk_is_entity:
        return {"text": data}

    splitted = data.split(' : ')
    if len(splitted) < 2:
        # Malformed entity chunk (no "type : value" separator): signal the
        # caller to drop it instead of swallowing every possible exception.
        return None

    return {
        "entity": splitted[0],
        "slot_name": splitted[0],
        "text": splitted[1]
    }


def split(text):
    """Cut an annotated utterance at every '[' and ']' character.

    The returned list alternates between plain-text chunks and the contents
    of the bracketed spans, starting with a plain-text chunk.
    """
    bracket_pattern = r"[\[\]]"
    return re.split(bracket_pattern, text)


def intents(csv_dataset):
    """Build the Snips "intents" section from the annotated dataset.

    Each row contributes one utterance to the intent named
    ``"<scenario>_<intent>"``. The row's ``answer_annotation`` is split at its
    brackets and every chunk is converted with ``utterance_data_process``;
    chunks that come back as ``None`` or carry an empty text are dropped.

    Args:
        csv_dataset: mapping/DataFrame with "scenario", "intent" and
            "answer_annotation" columns of equal length.

    Returns:
        dict mapping intent name -> ``{"utterances": [{"data": [...]}, ...]}``.
    """
    global is_entity
    intents_by_name = {}

    # Iterate over the column values directly instead of indexing by row
    # number, so a DataFrame whose index is not 0..n-1 works as well.
    rows = zip(
        csv_dataset["scenario"],
        csv_dataset["intent"],
        csv_dataset["answer_annotation"],
    )

    for scenario, intent, annotation in rows:
        intent_name = f'{scenario}_{intent}'
        intent_entry = intents_by_name.setdefault(intent_name, {"utterances": []})

        # utterance_data_process() alternates text/entity through the global
        # flag; every annotation must start from the "text" state, whatever
        # an earlier call left behind.
        is_entity = False
        try:
            chunks = [utterance_data_process(chunk) for chunk in split(annotation)]
        finally:
            # Leave the shared flag clean even if a chunk raised.
            is_entity = False

        utterance_data = [
            chunk for chunk in chunks
            if chunk is not None and chunk['text'] != ''
        ]
        intent_entry["utterances"].append({"data": utterance_data})

    return intents_by_name

def extract_entities(utterance):
    """Extract the ``(entity_type, entity_text)`` pairs annotated in an utterance.

    Entities are written as ``[<type> : <text>]`` inside the utterance.
    Bracketed spans that lack the ``' : '`` separator are skipped, matching
    how such chunks are dropped when the intents are built, instead of
    raising ``IndexError``.

    Args:
        utterance: annotated utterance string.

    Returns:
        List of ``(entity_type, entity_text)`` tuples in order of appearance.
    """
    entities = []
    for annotated in re.findall(r'\[(.*?)\]', utterance):
        parts = annotated.split(' : ')
        if len(parts) < 2:
            # Malformed annotation without "type : text" -- nothing to extract.
            continue
        entities.append((parts[0], parts[1]))
    return entities


def convert_entities_to_snips_format(utterance):
    """Turn the entities of one annotated utterance into Snips entity entries.

    Returns:
        List of ``(entity_type, {"value": entity_text, "synonyms": []})``
        tuples, one per entity returned by ``extract_entities``.
    """
    return [
        (entity_type, {"value": entity_text, "synonyms": []})
        for entity_type, entity_text in extract_entities(utterance)
    ]


def utterances(dataset):
    """Build the Snips "entities" section from the annotated dataset.

    Every entity found in ``dataset['answer_annotation']`` is appended to the
    ``data`` list of its entity type; a type is created on first sight with
    ``automatically_extensible`` and ``use_synonyms`` both enabled.

    Args:
        dataset: mapping/DataFrame with an "answer_annotation" column.

    Returns:
        dict mapping entity type -> Snips entity definition.
    """
    output = {}
    for utterance in dataset['answer_annotation']:
        for entity_type, entity_data in convert_entities_to_snips_format(utterance):
            if entity_type not in output:
                output[entity_type] = {
                    # Plain booleans: bool('true') only worked because any
                    # non-empty string is truthy (bool('false') is True too).
                    "automatically_extensible": True,
                    "use_synonyms": True,
                    'data': []
                }

            output[entity_type]['data'].append(entity_data)

    return output


def to_snips(dataset):
    """Assemble the complete Snips training dataset for ``dataset``.

    Returns:
        dict with the "entities" section, the "intents" section and the
        "language" set to "en".
    """
    entity_section = utterances(dataset)
    intent_section = intents(dataset)
    return {
        "entities": entity_section,
        "intents": intent_section,
        "language": "en",
    }

In [None]:
# Convert the filtered CSV DataFrame into the Snips training-dataset dict.
nlu_whole_dataset_snips = to_snips(nlu_whole_dataset_df)

Export `nlu_whole_dataset_snips` to a JSON file (optional)

In [None]:
# Write the converted dataset to disk so the training cell can reload it
# without re-running the conversion.
with open('nlu_whole_dataset_snips.json', 'w') as outfile:
    json.dump(nlu_whole_dataset_snips, outfile)

### Fit the model
NOTE: Skip this unless you want to reproduce the results. An exported model has been provided.

Load the converted NLU dataset, train the model, and export the model (optional)

In [None]:
# Reload the converted dataset from the JSON file written by the export cell.
with io.open("nlu_whole_dataset_snips.json", "r") as file:
    nlu_whole_dataset_snips = json.load(file)

# Fit a fresh engine (English default configuration) on the whole dataset.
nlu_engine = SnipsNLUEngine(config=CONFIG_EN)
nlu_engine.fit(nlu_whole_dataset_snips)

# Save the fitted engine so it can be loaded without retraining.
# NOTE(review): persist() may refuse to write into an already existing
# 'nlu_whole_dataset_engine' directory on a re-run -- confirm.
nlu_engine.persist('nlu_whole_dataset_engine')

### Load and use the model

In [None]:
# Restore the engine trained on the whole dataset from its persisted path.
whole_loaded_engine = SnipsNLUEngine.from_path("nlu_whole_dataset_engine")

In [None]:
# Parse a sample utterance with the whole-dataset engine, asking for the 3
# best results (top_n=3), and pretty-print the result as JSON.
parsing = whole_loaded_engine.parse("Can you turn on the lights in the livingroom", top_n=3)
print(json.dumps(parsing, indent=2))

TODO: f1 benchmark for each intent with full report