In [48]:
import pandas as pd
import numpy as np

from gensim.models import Word2Vec

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import f1_score
from sklearn.model_selection import learning_curve,validation_curve
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


In [9]:
def load_data(file_name):
    return pd.read_csv(file_name, sep=';')

In [14]:
# For this example, we will keep tokenization easy
def preprocess_lower(token):
    #utility for preprocessing
    return token.lower()

In [None]:
def process_training_data(training_data):
    # process the training data and split it between independent and dependent variables
    training_sentences = [list(map(preprocess_lower,sentence.split(" "))) for sentence in list(training_data.Utterence.values)]
    target_class = training_data.Intent.values
    label_encoded_Y = preprocessing.LabelEncoder().fit_transform(list(target_class))
    return target_class, training_sentences, label_encoded_Y

In [28]:
def process_user_query(training_data):
    # process the training data and split it between independent and dependent variables
    training_sentences = [list(map(preprocess_lower,sentence.split(" "))) for sentence in training_data]
    return training_sentences

In [36]:
def train_word2vec_model(train_sentences_list):
    # training word2vec on sentences list (inputted by user)
    model = Word2Vec(train_sentences_list, vector_size=100, window=4, min_count=1, workers=4)
    return model

In [38]:
def convert_training_data_vectors(model, train_sentences_list):
    #get the sentences average vector
    training_sectences_vector = list()
    for sentence in train_sentences_list:
        sentence_vetor = [list(model.wv[token]) for token in sentence if token in model.wv.key_to_index.keys()]
        training_sectences_vector.append(list(np.mean(sentence_vetor, axis=0)))
    return training_sectences_vector

In [None]:
# This code needs to be used with training_rf_prediction_model to make a funtion that trains several models
# TODO: Use the multi-training model function instead of a function for each:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, train_size=0.80, random_state=42)

# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

objects = ('Multi-NB','DTs', 'AdaBoost', 'KNN', 'RF', 'SVM')


# function to train classifier
def train_classifier(clf, X_train, y_train):    
    clf.fit(X_train, y_train)

# function to predict features 
def predict_labels(clf, features):
    return(clf.predict(features))

A = MultinomialNB(alpha=1.0,fit_prior=True)
B = DecisionTreeClassifier(random_state=42)
C = AdaBoostClassifier(n_estimators=100) 
D = KNeighborsClassifier(n_neighbors=1)
E = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
F = svm.SVC(kernel='linear', C=1)


# Loop to call function for each model
# Add compare of best model to export selection in the future
clf = [A,B,C,D,E,F]
cf_labels = ['NB', 'DT', 'ADA', 'KN', 'RF', 'SVM']
pred_val = [0,0,0,0,0,0]

for a in range(0,6):
    train_classifier(clf[a], X_train, y_train)
    y_pred = predict_labels(clf[a],X_test)
    pred_val[a] = f1_score(y_test, y_pred) 
    print(pred_val[a])
    cf_name = cf_labels[a]

In [31]:
def training_rf_prediction_model(training_data_vectors, label_encoded_Y):
    # training model on user inputted data 
    random_forest_model = RandomForestClassifier()
    # here use the split function and divide the data into training and testing 
    x_train,x_test,y_train,y_test=train_test_split(training_data_vectors,label_encoded_Y,
    train_size=0.8,test_size=0.2)
    random_forest_model.fit(x_train, y_train)
    y_pred = random_forest_model.predict(x_test)
    print(accuracy_score(y_test, y_pred))
    return random_forest_model

In [None]:
def training_svm_prediction_model(training_data_vectors, label_encoded_Y):
    svm_model = svm.SVC(gamma='auto')
    svm_model.fit(training_data_vectors, label_encoded_Y)
    return svm_model

In [34]:
def process_data_flow(file_name):
    training_data = load_data(file_name)
    training_data.dropna(axis=0, how='any', subset=['answer_normalised'], inplace=True)
    target_class, training_sentences, label_encoded_Y = process_training_data(training_data)
    word2vec_model = train_word2vec_model(train_sentences_list=training_sentences)
    training_data_vectors = convert_training_data_vectors(word2vec_model, train_sentences_list=training_sentences)
    prediction_model = training_rf_prediction_model(training_data_vectors, label_encoded_Y)
    #intent prediction on user query
    userQuery = ["turn off the lights"]
    user_query_vectors = convert_training_data_vectors(word2vec_model, process_user_query(userQuery))
    predicted_class = prediction_model.predict(user_query_vectors)[0]
    predicted_intent = target_class[list(label_encoded_Y).index(predicted_class)]
    return predicted_intent

In [41]:
# Seperated training stuff to get model
training_data = load_data('NLU-Data-Home-Domain-Annotated-All.csv')
training_data.dropna(axis=0, how='any', subset=['answer_normalised'], inplace=True)
target_class, training_sentences, label_encoded_Y = process_training_data(training_data)
word2vec_model = train_word2vec_model(train_sentences_list=training_sentences)
training_data_vectors = convert_training_data_vectors(word2vec_model, train_sentences_list=training_sentences)
prediction_model = training_rf_prediction_model(training_data_vectors, label_encoded_Y)

0.5893416927899686


In [46]:
# Seperated stuff to run model inference for an utterance
userQuery = ["how do you feel"]
user_query_vectors = convert_training_data_vectors(word2vec_model, process_user_query(userQuery))
predicted_class = prediction_model.predict(user_query_vectors)[0]
predicted_intent = target_class[list(label_encoded_Y).index(predicted_class)]
predicted_intent

'quirky'

In [39]:
print("Predicted class: ", process_data_flow('NLU-Data-Home-Domain-Annotated-All.csv'))

0.5965758379551483
Predicted class:  hue_lightoff


sample_input_data.csv

Utterence,Intent

hi can I have an Apple Watch,service

how much I will be paying monthly,service

you still around,YOU_THERE

are you still there,YOU_THERE

you there,YOU_THERE

Speak to me if you are there,YOU_THERE

you around,YOU_THERE

https://www.kaggle.com/shoumikgoswami/ner-using-random-forest-and-crf

https://towardsdatascience.com/named-entity-recognition-and-classification-with-scikit-learn-f05372f07ba2

For CRFs for the entity tagging

https://nanonets.com/blog/named-entity-recognition-with-nltk-and-spacy/

What abou this?
https://www.mindmeld.com/docs/userguide/intent_classifier.html


DATA: 
https://github.com/xliuhw/NLU-Evaluation-Data/blob/master/AnnotatedData/NLU-Data-Home-Domain-Annotated-All.csv


NLU_data_df = load_data('NLU-Data-Home-Domain-Annotated-All.csv')

NLU_data_df = load_data('NLU-Data-Home-Domain-Annotated-All.csv')

In [None]:
# TODO
# train and save state of model (X)
# use saved model to make predictions (X)
# train/test split (is built into training_rf_prediction_model)
# train multiple classifiers (use the code above and training_rf_prediction_model to make mutli-model-training)
# k-folds
# test each one
# select best model
# save model like a pickle or whatever
# add entity tagging code (use CRFs)
# add function that combines both intent and entities for total intent

In [11]:
nlu_data_df = load_data('NLU-Data-Home-Domain-Annotated-All.csv')
nlu_data_df

Unnamed: 0,userid,answerid,scenario,intent,status,answer_annotation,notes,suggested_entities,answer_normalised,answer,question
0,1.0,1.0,alarm,set,,wake me up at [time : five am] [date : this week],,"date, time",wake me up at five am this week,wake me up at 5am this week,Write what you would tell your PDA in the foll...
1,1.0,2.0,alarm,set,,wake me up at [time : nine am] on [date : friday],,"date, time",wake me up at nine am on friday,wake me up at 9am on Friday,Write what you would tell your PDA in the foll...
2,1.0,3.0,alarm,set,,set an alarm for [time : two hours from now],,"date, time",set an alarm for two hours from now,set an alarm for two hours from now,Write what you would tell your PDA in the foll...
3,1.0,31.0,audio,volume_mute,,quiet,,player_setting,quiet,Olly quiet!,Write what you would tell your PDA in the foll...
4,1.0,32.0,audio,volume_mute,IRR_XL,stop,,player_setting,stop,Stop!,Write what you would tell your PDA in the foll...
...,...,...,...,...,...,...,...,...,...,...,...
25711,,781.0,general,praise,,"that's cool, musch appreciated, olly.",,,,"that's cool, musch appreciated, olly.",
25712,,782.0,general,praise,,"you are hero, appreciated.",,,,"you are hero, appreciated.",
25713,,783.0,general,praise,,"thanks, that's nice.",,,,"thanks, that's nice.",
25714,,784.0,general,praise,,"that's cool, thank you so much.",,,,"that's cool, thank you so much.",


In [22]:
def process_training_data(training_data):
    # process the training data and split it between independent and dependent variables
    training_sentences = [list(map(preprocess_lower,sentence.split(" "))) for sentence in list(training_data.answer_normalised.values)]
    target_class = training_data.intent.values
    label_encoded_Y = preprocessing.LabelEncoder().fit_transform(list(target_class))
    return target_class, training_sentences, label_encoded_Y

In [20]:
nlu_data_df.dropna(axis=0, how='any', subset=['answer_normalised'], inplace=True)
nlu_data_df

Unnamed: 0,userid,answerid,scenario,intent,status,answer_annotation,notes,suggested_entities,answer_normalised,answer,question
0,1.0,1.0,alarm,set,,wake me up at [time : five am] [date : this week],,"date, time",wake me up at five am this week,wake me up at 5am this week,Write what you would tell your PDA in the foll...
1,1.0,2.0,alarm,set,,wake me up at [time : nine am] on [date : friday],,"date, time",wake me up at nine am on friday,wake me up at 9am on Friday,Write what you would tell your PDA in the foll...
2,1.0,3.0,alarm,set,,set an alarm for [time : two hours from now],,"date, time",set an alarm for two hours from now,set an alarm for two hours from now,Write what you would tell your PDA in the foll...
3,1.0,31.0,audio,volume_mute,,quiet,,player_setting,quiet,Olly quiet!,Write what you would tell your PDA in the foll...
4,1.0,32.0,audio,volume_mute,IRR_XL,stop,,player_setting,stop,Stop!,Write what you would tell your PDA in the foll...
...,...,...,...,...,...,...,...,...,...,...,...
20844,1010.0,27380.0,email,query,,do i have emails,,,do i have emails,Do I have emails,Write what you would tell your PDA in the foll...
20845,1010.0,27381.0,email,query,,what emails are new,,,what emails are new,What emails are new?,Write what you would tell your PDA in the foll...
20846,1010.0,27400.0,email,query,,do i have new emails from [person : john],,person,do i have new emails from john,Do I have new emails from John,How would you ask your PDA if you have receive...
20847,1010.0,27401.0,email,query,,has [person : john] sent me an email,,person,has john sent me an email,Has John sent me an email?,How would you ask your PDA if you have receive...


In [24]:
process_training_data(nlu_data_df)

(array(['set', 'set', 'set', ..., 'query', 'query', 'query'], dtype=object),
 [['wake', 'me', 'up', 'at', 'five', 'am', 'this', 'week'],
  ['wake', 'me', 'up', 'at', 'nine', 'am', 'on', 'friday'],
  ['set', 'an', 'alarm', 'for', 'two', 'hours', 'from', 'now'],
  ['quiet'],
  ['stop'],
  ['pause', 'for', 'ten', 'seconds'],
  ['pink', 'is', 'all', 'we', 'need'],
  ['make', 'the', 'lighting', 'bit', 'more', 'warm', 'here'],
  ['please', 'set', 'the', 'lighting', 'suitable', 'for', 'reading'],
  ['turn', 'the', 'lights', 'off', 'please'],
  ['time', 'to', 'sleep'],
  ['and', 'the', 'darkness', 'has', 'fallen'],
  ['turn', 'off', 'the', 'light', 'in', 'the', 'bathroom'],
  ['dim', 'the', 'lights', 'in', 'the', 'hall'],
  ['turn', 'the', 'lights', 'off', 'in', 'the', 'bedroom'],
  ['set', 'lights', 'to', 'twenty', 'percent'],
  ['dim', 'the', 'lights', 'in', 'the', 'kitchen'],
  ['make', 'a', 'room', 'darker'],
  ['clean', 'the', 'flat'],
  ["it's", 'dirty', 'here', 'make', 'some', 'noise'],

0            wake me up at five am this week
1            wake me up at nine am on friday
2        set an alarm for two hours from now
3                                      quiet
4                                       stop
                        ...                 
25711                                    NaN
25712                                    NaN
25713                                    NaN
25714                                    NaN
25715                                    NaN
Name: answer_normalised, Length: 25716, dtype: object