# Data
- Each sentence is fed to the classifier as its spaCy document vector (`Doc.vector`).

# Training Process

In [None]:
import pandas as pd
import numpy as np
import spacy
from joblib import dump, load
from sklearn.neural_network import MLPClassifier
from preprocess_data import raw_to_json, unzip_entities, split_entities, tags_patterns_mix, get_responses, remove_fallback

#### Data

In [None]:
# Load the training and test sheets from their CSV files.
df_train = pd.read_csv('data/Training - Training.csv')
df_test = pd.read_csv('data/Training - Test.csv')
# Convert the training frame with the project helper raw_to_json.
# NOTE(review): the structure of `intents` is defined in preprocess_data - confirm there.
intents = raw_to_json(df_train)
# Parent tags handed to unzip_entities together with the intents.
parent_tags = ['navigate', 'find', 'action']
data = unzip_entities(intents, parent_tags)
# Responses are collected before remove_fallback is applied to `data`.
responses = get_responses(data)
data = remove_fallback(data)
# tags_patterns is later iterated with .iterrows() and read through its
# 'tag' and 'pattern' columns (see train_pipeline).
tags_patterns = tags_patterns_mix(data)

In [None]:
# Shared spaCy pipeline; used for lemmatisation and for document vectors.
nlp = spacy.load("en_core_web_sm")


def lemmatizer(text):
    """Return the lemma of every token spaCy produces for ``text``.

    Args:
        text: Raw sentence to analyse.

    Returns:
        List of lemma strings, one per token, in token order.
    """
    return [token.lemma_ for token in nlp(text)]

def train_pipeline(tags_patterns):
    """Collect tags, lemmas and parsed documents from the training patterns.

    Args:
        tags_patterns: Table exposing ``iterrows()`` whose rows carry a
            ``'tag'`` and a ``'pattern'`` entry.

    Returns:
        Tuple ``(tags, word_list, word_tag_data, word_vector)``:
        the tag of every row, the flat list of all lemmas, one
        ``(lemmas, tag)`` pair per row, and the spaCy document of every
        pattern - all in row order.
    """
    tags, word_list, word_tag_data, word_vector = [], [], [], []

    for _, record in tags_patterns.iterrows():
        label, sentence = record['tag'], record['pattern']

        tags.append(label)
        lemmas = lemmatizer(sentence)
        word_list.extend(lemmas)
        word_tag_data.append((lemmas, label))
        word_vector.append(nlp(sentence))

    return tags, word_list, word_tag_data, word_vector

def prepare_training_data(word_tag_data, word_list, tags, word_vector):
    """Build the feature matrix and label vector for the classifier.

    A sample's features are the spaCy document vector of its lemmas joined
    with single spaces; its label is the position of its tag in ``tags``.

    Args:
        word_tag_data: Iterable of ``(lemmas, tag)`` pairs, ``lemmas`` being
            a list of strings.
        word_list: Unused; kept so existing calls keep working.
        tags: List of tags; a tag's index in this list is its class label.
        word_vector: Unused; kept so existing calls keep working.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays holding the document vectors and
        the integer labels, both in the order of ``word_tag_data``.

    Raises:
        ValueError: If a tag from ``word_tag_data`` is missing from ``tags``.
    """
    X = []
    y = []

    for lemmas, tag in word_tag_data:
        # Only the document vector is used as a feature, so no bag-of-words
        # encoding is computed for the sample.
        X.append(np.array(nlp(' '.join(lemmas)).vector))
        y.append(tags.index(tag))

    return np.array(X), np.array(y)

def bag_of_words(tokenized_sentence, words):
    """Encode a tokenised sentence as a binary vector over ``words``.

    Args:
        tokenized_sentence: Container of tokens tested with ``in``.
        words: Vocabulary; one output slot per entry, in the same order.

    Returns:
        ``float32`` NumPy array of length ``len(words)`` holding 1 where the
        vocabulary entry occurs in ``tokenized_sentence`` and 0 elsewhere.
    """
    present = [word in tokenized_sentence for word in words]
    return np.array(present, dtype=np.float32)

def pipe_new_input(text):
    """Turn a raw sentence into the single-row feature matrix for the model.

    The sentence is lemmatised, the lemmas are joined with single spaces and
    the spaCy document vector of that string is used as the feature vector.
    No bag-of-words encoding is involved, so the function does not depend on
    the global vocabulary.

    Args:
        text: Sentence to encode. An empty string is replaced by a single
            space first (the original author notes that ``''`` otherwise
            causes an error).

    Returns:
        NumPy array with one row, i.e. the document vector reshaped to
        ``(1, -1)``.
    """
    if text == '':  # otherwise error
        text = ' '

    lemmas = lemmatizer(text)
    vec = np.array(nlp(' '.join(lemmas)).vector)

    return vec.reshape(1, -1)
    
def predict(text, model):
    """Return the class probabilities ``model`` assigns to ``text``.

    Args:
        text: Raw sentence; encoded with ``pipe_new_input``.
        model: Fitted estimator exposing ``predict_proba``.

    Returns:
        Whatever ``model.predict_proba`` returns for the encoded sentence.
    """
    features = pipe_new_input(text)
    return model.predict_proba(features)

def get_tag_from_prediction(result, tags, threshold, fallback = 'fallback'):
    """Map a probability vector to a tag, or to ``fallback`` if unsure.

    Args:
        result: Class probabilities (array-like).
        tags: Sequence of tags indexed by class position.
        threshold: Minimum top probability required to accept a tag.
        fallback: Value returned when the top probability is below
            ``threshold``.

    Returns:
        The tag at the position of the highest probability, or ``fallback``.
    """
    return fallback if np.max(result) < threshold else tags[np.argmax(result)]
    
def test(df, model, tags, threshold = 0.4, col_name = 'Sentence', col_tag = 'Tag'):
    """Run the model over a labelled frame and report its accuracy.

    Args:
        df: Frame with one sentence and one expected tag per row.
        model: Fitted estimator passed on to ``predict``.
        tags: Sequence of tags indexed by class position.
        threshold: Top probability below which ``'fallback'`` is predicted.
        col_name: Column holding the sentences.
        col_tag: Column holding the expected tags.

    Returns:
        Tuple ``(df, output)``: a copy of ``df`` extended with the columns
        ``predicted_tags``, ``matches``, ``probas`` and
        ``predicted_tags_if_not_fallback``, and the accuracy formatted as a
        string.
    """
    thresholded, unthresholded, confidences, hits = [], [], [], []

    for _, record in df.iterrows():
        sentence, expected = record[col_name], record[col_tag]
        proba = predict(sentence, model)
        guess = get_tag_from_prediction(proba, tags, threshold)

        # Threshold 0.0 yields the best tag regardless of confidence.
        unthresholded.append(get_tag_from_prediction(proba, tags, 0.0))
        confidences.append(np.max(proba))
        thresholded.append(guess)
        hits.append(guess == expected)

    df = df.assign(
        predicted_tags = thresholded,
        matches = hits,
        probas = confidences,
        predicted_tags_if_not_fallback = unthresholded,
    )

    acc = df['matches'].sum() / len(df)
    return df, f'Accuracy: {acc}'

def get_random_response_from_tag(tag, responses):
    """Pick one of the responses stored under ``tag`` at random.

    Args:
        tag: Key into ``responses``.
        responses: Mapping from tag to a sequence of candidate responses.

    Returns:
        One element drawn with ``np.random.choice`` from ``responses[tag]``.
    """
    candidates = responses[tag]
    return np.random.choice(candidates)

### Data preparation

In [None]:
# Punctuation tokens that must not become vocabulary entries.
IGNORE = ['?', '!', '.', ',']

# Raw tags, lemmas, (lemmas, tag) pairs and parsed documents per pattern.
tags, word_list, word_tag_data, word_vector = train_pipeline(tags_patterns)

# Vocabulary: unique lemmas in sorted order, without the IGNORE tokens.
word_list = sorted({lemma for lemma in word_list if lemma not in IGNORE})

# Label space: unique tags in sorted order; a tag's index is its class id.
tags = sorted(set(tags))

X, y = prepare_training_data(word_tag_data, word_list, tags, word_vector)

### Model training

In [None]:
# One hidden layer of 16 logistic units. Note the trailing comma in (16,):
# "(16)" would be the bare integer 16 rather than a one-element tuple.
clf = MLPClassifier(
    random_state=1,
    activation='logistic',
    max_iter=50000,
    hidden_layer_sizes=(16,),
).fit(X, y)

### Testing
- Calculates the accuracy of the model on the test data.
- Shows all misclassified test sentences as a markdown table.

In [None]:
# Evaluate on the test set with a 0.25 confidence threshold. `acc` is the
# formatted "Accuracy: ..." string returned by test(), not a number.
df_test_result, acc = test(df_test, clf, tags, col_name = 'Sentence', col_tag = 'Tag', threshold = 0.25)
print(acc)

Accuracy: 0.43333333333333335

In [None]:
# Keep only the rows whose prediction differs from the expected tag.
df_test_result[df_test_result['matches'].eq(False)].to_markdown()

|    | Sentence                                      | Tag                    | Tag_parent           | predicted_tags     | matches   |   probas | predicted_tags_if_not_fallback   |
|---:|:----------------------------------------------|:-----------------------|:---------------------|:-------------------|:----------|---------:|:---------------------------------|
|  3 | Where can I view my feed?                     | find_personal_feed     | find                 | find_edit_profile  | False     | 0.842587 | find_edit_profile                |
|  9 | What do I have to do to publish my own trade? | find_create_trade      | find                 | find_edit_profile  | False     | 0.955156 | find_edit_profile                |
| 10 | supercalifragilisticexpialidocious            | fallback               | fallback             | help               | False     | 0.914312 | help                             |
| 11 | Get me to the homepage.                       | navigate_homepage      | navigate             | action_logout      | False     | 0.797475 | action_logout                    |
| 13 | Show my personal feed.                        | navigate_personal_feed | navigate             | find_personal_feed | False     | 0.925741 | find_personal_feed               |
| 14 | View the overall feed.                        | navigate_public_feed   | navigate             | action_create_post | False     | 0.960393 | action_create_post               |
| 15 | I want to sign in.                            | navigate_auth          | navigate             | action_logout      | False     | 0.976178 | action_logout                    |
| 16 | I would like to register.                     | navigate_auth          | navigate             | action_logout      | False     | 0.535739 | action_logout                    |
| 17 | Expose the page where I can make a post.      | navigate_create_post   | navigate             | find_create_trade  | False     | 0.942616 | find_create_trade                |
| 18 | Display the page for creating a trade.        | navigate_create_trade  | navigate             | find_auth          | False     | 0.691495 | find_auth                        |
| 21 | Hello bot.                                    | greeting               | greeting             | find_homepage      | False     | 0.403676 | find_homepage                    |
| 22 | That's all. Bye.                              | goodbye                | goodbye              | action_logout      | False     | 0.515347 | action_logout                    |
| 23 | Great. Thanks for your help.                  | thanks                 | thanks               | help               | False     | 0.389974 | help                             |
| 24 | I don't know what to do.                      | help                   | help                 | joke               | False     | 0.822507 | joke                             |
| 25 | I'm bored. Tell me something funny.           | joke                   | joke                 | action_create_post | False     | 0.69906  | action_create_post               |
| 26 | What's C2G?                                   | about_c2g              | about_c2g            | find_follow        | False     | 0.573128 | find_follow                      |
| 29 | Alexa is better than you.                     | about_bot_other_bots   | about_bot_other_bots | action_logout      | False     | 0.976478 | action_logout                    |

### Saving the model & data

In [None]:
# Persist the trained classifier together with the data stored next to it.
for artifact, target in (
    (clf, 'utils/classifier.model'),
    (responses, 'utils/responses.data'),
    (tags, 'utils/tags.data'),
    (word_list, 'utils/word_list.data'),
):
    dump(artifact, target)

### Testing the model with own input

In [None]:
# Classify one hand-written sentence. If the top probability is below 0.20
# the 'fallback' tag is used. The last expression is the sampled response.
txt = 'Get me to my homepage'
r = predict(txt, clf)
pred = get_tag_from_prediction(r, tags, 0.20)
get_random_response_from_tag(pred, responses)

'navigate::profile'