<a href="https://colab.research.google.com/github/EduardoMoraesRitter/NLP-Chatbot/blob/main/analyticsWatson.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Update/installs Watson SDK, scikit-learn and termcolor 
#!pip install -U scikit-learn
!pip install termcolor
!pip install "tensorflow-tensorboard<0.2.0,>=0.1.0"
#!pip install "watson-developer-cloud"
!pip install "ibm-watson"
#!pip install --upgrade watson-developer-cloud
!pip install nltk

# Supporting Libs
import re
import os
import sys
import json
import time
import nltk
import sklearn
import itertools
import matplotlib
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

import ibm_watson

# Watson APIs Libs
#from watson_developer_cloud import AssistantV1

# Metrics & ML
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, precision_score, accuracy_score, recall_score, f1_score

# Visualization configs
from termcolor import colored, cprint
from IPython.display import display, HTML
%matplotlib inline
matplotlib.style.use('ggplot')
pd.options.display.max_colwidth = 150

In [6]:
# Fill in your Watson Assistant (WAS) credentials before running the cells below.
#   WAS_WORKSPACE - ID of the workspace (skill) to analyse
#   WAS_API_KEY   - API key used to authenticate against the service
#   WAS_URL       - base URL of the Watson Assistant service instance
# NOTE(review): keep real values out of version control - consider reading them
# from environment variables instead of typing them into the notebook.
WAS_WORKSPACE = ""
WAS_API_KEY = ""
WAS_URL = "https://api.us-east.assistant.watson.cloud.ibm.com"

In [7]:
from ibm_watson import AssistantV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

# Build the Watson Assistant V1 client authenticated with the API key above.
authenticator = IAMAuthenticator(WAS_API_KEY)
assistant = AssistantV1(
    version='2019-09-20',  # API version date sent with every request
    authenticator=authenticator
)

# Point the client at the configured service instance.
assistant.set_service_url(WAS_URL)
# Workspace under analysis; every later cell reads this name rather than WAS_WORKSPACE.
original_workspace_id = WAS_WORKSPACE

In [8]:
def check_wksp_status(check_workspace_id, wait_seconds=30, max_attempts=None):
    """Block until the given workspace reports the status 'Available'.

    The workspace is polled through the module-level `assistant` client. While
    it is not available the function sleeps `wait_seconds` and tries again, so
    this call blocks the rest of the notebook.

    Args:
        check_workspace_id: ID of the workspace (skill) to poll.
        wait_seconds: Pause between two polls, in seconds (default 30).
        max_attempts: Maximum number of polls before giving up. ``None``
            (default) keeps polling until the workspace is available or failed.

    Raises:
        RuntimeError: The service reports the status 'Failed'; waiting longer
            would never succeed.
        TimeoutError: `max_attempts` polls were made without the workspace
            becoming available.
    """
    attempts = 0

    while True:
        print('Testing workspace...' + check_workspace_id)
        workspace = assistant.get_workspace(workspace_id=check_workspace_id).get_result()
        status = workspace['status']

        print('Workspace status: {0}'.format(status))
        if status == 'Available':
            print('Ready to go!')
            return

        # A failed training never recovers by itself - stop instead of looping forever.
        if status == 'Failed':
            raise RuntimeError('Workspace {0} failed to train.'.format(check_workspace_id))

        attempts += 1
        if max_attempts is not None and attempts >= max_attempts:
            raise TimeoutError(
                'Workspace {0} still not available after {1} checks.'.format(check_workspace_id, attempts)
            )

        print('In training...wait {0}s and try again.'.format(wait_seconds))
        time.sleep(wait_seconds)

def printred(str_temp,isbold):
    """Print `str_temp` in red, additionally in bold when `isbold` is truthy."""
    text_attrs = ['bold'] if isbold else None
    print(colored(str_temp, 'red', attrs=text_attrs))

In [None]:
# Fetch every intent of the workspace together with its training examples,
# chart how many examples each intent has, and build the flat
# (Example, Intent) frame used by the following cells.
# NOTE(review): list_intents / list_examples may return paginated results -
# confirm that large workspaces are fully covered.
list_original_intents = assistant.list_intents(workspace_id = original_workspace_id).get_result()

list_original_examples = []
list_original_intent_names = []

# One {'classes', 'size'} record per intent, plus the running total of examples
intent_sizes = []
avg_size = 0

for intent in list_original_intents['intents']:
    intent_name = intent['intent']
    examples = assistant.list_examples(
        workspace_id = original_workspace_id,
        intent = intent_name
    ).get_result()
    example_count = len(examples['examples'])

    avg_size = avg_size + example_count
    # Recorded once per intent, so intents without any example show up too (size 0)
    intent_sizes.append({'classes': intent_name, 'size': example_count})

    for example in examples['examples']:
        list_original_examples.append(example['text'])
        list_original_intent_names.append(intent_name)

# Print the chart on the screen (pandas cannot plot an empty frame)
intent_size_chart = pd.DataFrame(intent_sizes, columns=['classes', 'size'])
if not intent_size_chart.empty:
    intent_size_chart.plot(kind='bar',x='classes', y='size',figsize=(30,7))

# Mount the data frame: one row per training example with the intent it belongs to
intent_distribution = pd.DataFrame({
    'Example': list_original_examples,
    'Intent': list_original_intent_names
}, columns=['Example','Intent'])

In [None]:
# Mean number of training examples per intent.
# A workspace without intents yields 0 instead of a ZeroDivisionError.
intent_total = len(list_original_intents['intents'])
final_avg_size = avg_size/intent_total if intent_total else 0.0

print("Average of instances by intention: " + str(final_avg_size))

In [None]:
# Tolerated deviation from the mean, expressed as a fraction of the mean
cte_coef_disc = 0.5

print(colored("\nIntentions with a discrepancy of examples (intent # examples):\n", attrs=['bold']))

if final_avg_size < 5:
    print(colored(">>> The sample presented does not meet the minimum required for training (5 examples) and, therefore, the deviations will not be calculated.", 'red', attrs=['bold']))
else:
    # Intents whose size differs from the mean by more than this are reported
    allowed_deviation = final_avg_size * cte_coef_disc

    for intent in list_original_intents['intents']:
        intent_name = intent['intent']
        examples = assistant.list_examples(
            workspace_id = original_workspace_id,
            intent = intent_name
        ).get_result()
        example_count = len(examples['examples'])

        diff = float(example_count) - final_avg_size
        if abs(diff) <= allowed_deviation:
            continue

        # How far beyond the tolerated band this intent lies
        overshoot = str(round(abs(diff) - allowed_deviation, 2))
        if diff > 0:
            printred("[+] >>> " + intent_name + ' # ' + str(example_count) + ' / has ' + overshoot + ' more examples than expected.',True)
        else:
            printred("[-] >>> " + intent_name + ' # ' + str(example_count) + ' / has ' + overshoot + ' fewer examples than expected.',True)

In [None]:
# Count the examples of every intent once and reuse the counts for both
# thresholds, instead of querying the service twice per intent.
intent_example_counts = []
for intent in list_original_intents['intents']:
    examples = assistant.list_examples(
        workspace_id = original_workspace_id,
        intent = intent['intent']
    ).get_result()
    intent_example_counts.append((intent['intent'], len(examples['examples'])))

print(colored("\nIntent without minimal amount of examples (5):\n", attrs=['bold']))

# Checks whether the required minimum has been met
for intent_name, example_count in intent_example_counts:
    if example_count < 5:
        printred(">>> " + intent_name + ' # ' + str(example_count),True)

print(colored("\n\nIntention without minimum amount of examples SUGGESTED (10):\n", attrs=['bold']))

# Checks if the suggested minimum has been met
for intent_name, example_count in intent_example_counts:
    if example_count < 10:
        printred(">>> " + intent_name + ' # ' + str(example_count),True)

In [None]:
print(colored("\nSelect repeated examples from our training set:\n", attrs=['bold']))

# Frequency of every example text in the training set
fdist = nltk.FreqDist(intent_distribution['Example'])

# Texts occurring more than once, computed a single time for constant-time lookups
duplicated_texts = {text for text, occurrences in fdist.items() if occurrences > 1}

# Rows (sorted by text) whose example is duplicated somewhere in the set
repeated = [
    row
    for _, row in intent_distribution.sort_values("Example").iterrows()
    if row['Example'] in duplicated_texts
]
for y in repeated:
    print(y['Example'] + ' # ' + y['Intent'])

# Only congratulate when nothing at all was listed above
if not repeated:
    print(colored("There are no repeated examples in our set. Congratulations!", 'green'))

In [26]:
def get_logs(wid, pl, to=200):
    """Collect conversation logs of a workspace, following pagination.

    Calls `assistant.list_logs` repeatedly, using the cursor advertised in the
    `pagination` part of each response, and concatenates the `logs` lists.

    Args:
        wid: Workspace ID whose logs are requested.
        pl: `page_limit` forwarded to every `list_logs` call (log entries per
            page; the original author notes the service default is 100).
        to: Maximum number of pages to fetch (default 200). The first page is
            always fetched.

    Returns:
        A list with the log entries of all fetched pages, in retrieval order.
    """
    # The cursor is taken verbatim from the query string of `next_url`; it may
    # be the last parameter, so it ends at '&', '#' or the end of the string.
    cursor_regex = r"[?&]cursor=([^&#]*)"

    def _next_cursor(pagination):
        """Return the cursor of the following page, or None on the last page."""
        if not pagination:
            return None
        # NOTE(review): some API versions expose the cursor directly as
        # `next_cursor`; when absent, fall back to parsing `next_url`.
        direct_cursor = pagination.get('next_cursor')
        if direct_cursor:
            return direct_cursor
        next_url = pagination.get('next_url')
        if not next_url:
            return None
        match = re.search(cursor_regex, next_url, re.IGNORECASE)
        return match.group(1) if match and match.group(1) else None

    response = assistant.list_logs(workspace_id=wid, page_limit=pl).get_result()
    logs = list(response.get('logs', []))
    pages_fetched = 1

    while pages_fetched < to:
        cursor = _next_cursor(response.get('pagination'))
        if not cursor:
            break

        response = assistant.list_logs(workspace_id=wid, page_limit=pl, cursor=cursor).get_result()
        logs.extend(response.get('logs', []))
        pages_fetched += 1

    return logs

In [None]:
# Gather conversation logs of the workspace.
# The arguments are (workspace id, page_limit, max pages): with page_limit=1 and
# 10 pages this requests only a small sample - presumably about 10 entries;
# raise both values to analyse more of the history.
logs = get_logs(original_workspace_id, 1, 10)
# Number of log entries retrieved (shown as the cell output)
len(logs)

In [30]:
def mount_logs_dump(logs):
    """Flatten raw Watson Assistant log entries into a DataFrame.

    Only entries that have a response containing `input.text` produce a row.

    Args:
        logs: Iterable of log dicts as returned by `get_logs` (each with an
            optional 'request' and 'response' part).

    Returns:
        A DataFrame with one row per usable log entry and the columns:
        Example (stripped user text), Intent_1 / Confidence_1 (top intent, or
        'irrelevant' / '0' when none was detected), Intent_2 / Confidence_2
        (second intent when alternate intents were requested and returned,
        otherwise 'N/A' / '0'), Entities (first detected entity or '') and
        Erro (first log message of the output or '').
    """
    columns = ['Example','Intent_1','Confidence_1','Intent_2','Confidence_2', 'Entities', 'Erro']
    rows = []

    for log in logs:
        lresponse = log.get('response')
        if not lresponse:
            continue

        user_input = lresponse.get('input') or {}
        if 'text' not in user_input:
            continue

        log_messages = (lresponse.get('output') or {}).get('log_messages') or []
        entities = lresponse.get('entities') or []
        intents = lresponse.get('intents') or []

        row = {
            'Example': user_input['text'].strip(),
            'Intent_1': 'irrelevant',
            'Confidence_1': '0',
            'Intent_2': 'N/A',
            'Confidence_2': '0',
            'Entities': entities[0] if entities else '',
            'Erro': log_messages[0]["msg"] if log_messages else '',
        }

        if intents:
            row['Intent_1'] = intents[0]['intent']
            row['Confidence_1'] = intents[0]['confidence']

            # A second intent exists only if alternates were requested AND the
            # service actually returned more than one candidate.
            wants_alternates = (log.get('request') or {}).get('alternate_intents') == True
            if wants_alternates and len(intents) > 1:
                row['Intent_2'] = intents[1]['intent']
                row['Confidence_2'] = intents[1]['confidence']

        rows.append(row)

    df_temp = pd.DataFrame(
        {column: [row[column] for row in rows] for column in columns},
        columns=columns
    )

    return df_temp

In [None]:
# Flatten the raw log entries into a table and show it
list_logs = mount_logs_dump(logs)
display(list_logs)

# True when at least one log row is available; the cells below are skipped otherwise
flag_logs = not list_logs.empty

In [16]:
# Build the per-log frame (intent label, counter, confidence, entity, error)
# used by the statistics and charts below
if flag_logs:
    dist_logs = pd.DataFrame(columns=['intent', 'sizes','confidencesum', 'entities', 'erro'])

    # Intents that are not worth analysing
    intent_blacklist = ['greetings','fim']

    for idx,log in list_logs.iterrows():
        if log['Intent_1'] in intent_blacklist:
            continue

        # Logs that carry an error message are grouped under the label 'Erro'
        has_error = log['Erro'] != ""
        intent_label = 'Erro' if has_error else log['Intent_1']

        dist_logs.loc[idx] = pd.Series({
            'intent': intent_label,
            'entities': log['Entities'],
            'erro': log['Erro'],
            'sizes': 1,
            'confidencesum': float(log['Confidence_1']),
        })

In [None]:
# Overall average confidence under progressively stricter filters
if flag_logs:
    def _mean_confidence_pct(frame):
        """Return the mean of `confidencesum` in `frame` as a percentage string (2 decimals)."""
        return str(round(frame['confidencesum'].mean()*100,2))

    # Row filters shared by the statistics below
    without_entities = dist_logs['entities'] == ""
    # Confidence above 0.10 is what this notebook treats as "not irrelevant"
    above_irrelevant = dist_logs['confidencesum'] > 0.10
    without_error = dist_logs['erro'] == ""

    print("Average confidence (with irrelevant) = " + _mean_confidence_pct(dist_logs) + " % ")
    print("Average confidence (with irrelevant, without entities) = " + _mean_confidence_pct(dist_logs[without_entities]) + " % ")
    print("Average confidence (without irrelevant) = " + _mean_confidence_pct(dist_logs[above_irrelevant]) + " % ")
    print("Average confidence (without irrelevant, entities) = " + _mean_confidence_pct(dist_logs[above_irrelevant & without_entities]) + " % ")
    print("Average confidence (without irrelevant, entities, erro) = " + _mean_confidence_pct(dist_logs[above_irrelevant & without_entities & without_error]) + " % ")


    # Share of logs carrying an error message; guarded so that an empty frame
    # (every log blacklisted) does not raise ZeroDivisionError
    volume = len(dist_logs)
    error_count = len(dist_logs[dist_logs['erro'] != ""])
    error_pct = round((error_count*100) / volume,2) if volume else 0.0
    print("Volume:", volume, "Erro:", error_count, "=", str(error_pct),"%")

In [None]:
# Group the logs by intent, count the events and compute the average confidence of each group.
if flag_logs:

    # Only logs in which no entity was detected
    dist_logs_filter = dist_logs[dist_logs['entities'] == ""]

    # Average confidence per intent. The mean is restricted to the numeric
    # `confidencesum` column (cast to float): averaging the text columns is
    # meaningless and raises in recent pandas versions.
    grouped1 = (
        dist_logs_filter.astype({'confidencesum': float})
        .groupby('intent')[['confidencesum']]
        .mean()
    )
    # Number of logs per intent (every column holds the same count)
    grouped2 = dist_logs_filter.groupby('intent').count()

    # Minimum number of samples (exclusive) for an intent to be charted
    cte_logs = 5

    print(colored('More asked intentions with more than ' + str(cte_logs) + ' samples\n', attrs=['bold']))
    grouped4 = grouped2.where(lambda x : x['sizes'] > cte_logs).dropna().sort_values(by='sizes')

    grouped4.index.name='intent'
    grouped4['intent']=grouped4.index

    # Displays the distribution log on the screen (pandas cannot plot an empty frame)
    if grouped4.empty:
        print('No intent has more than ' + str(cte_logs) + ' samples.')
    else:
        grouped4.plot(kind='bar',x='intent', y='confidencesum',figsize=(30,7),title='Most-asked intentions with more than ' + str(cte_logs) + ' samples')

In [None]:
if flag_logs:
    # Join the per-intent counts with the per-intent averages on the intent index;
    # the count copy of `confidencesum` (suffix _x) is not needed afterwards
    df_intersection = pd.merge(
        grouped4, grouped1, left_index=True, right_index=True
    ).drop(columns='confidencesum_x')

    print(colored('Average confidence by intention (with more than ' + str(cte_logs) + ' samples):\n', attrs=['bold']))
    display(df_intersection.sort_values(by='confidencesum_y'))

In [None]:
# Bar chart of the average confidence per intent, lowest confidence first

if flag_logs:
    confidence_title = 'Average confidence by intention (with more than ' + str(cte_logs) + ' samples)'
    confidence_ranked = df_intersection.sort_values(by='confidencesum_y')
    confidence_ranked.plot(kind='bar', x='intent', y='confidencesum_y', figsize=(30,7), title=confidence_title)

In [None]:
# Combined chart: number of logs per intent (line, secondary axis) over the
# average confidence per intent (bars)
if flag_logs:
    # Work on trimmed copies rather than dropping columns in place, so the cell
    # can be re-run without a KeyError on already-removed columns
    sizes_by_intent = grouped4.drop(columns='confidencesum', errors='ignore')
    confidence_by_intent = df_intersection.drop(columns='sizes', errors='ignore')

    ax = sizes_by_intent.plot(kind='line', x='intent', y='sizes', figsize=(30,7), color='b', linewidth=3, secondary_y=True, legend=True)
    confidence_by_intent.plot(ax=ax, kind='bar', x='intent', y='confidencesum_y', figsize=(30,7), title='Average confidence by intention x size', legend=True)

    plt.gcf().autofmt_xdate()
    plt.show()

In [None]:
# Logs answered with a confidence below this value are considered "low confidence"
MIN_CONFIDENCE = 0.5

if flag_logs:
    counter = 0          # number of distinct low-confidence inputs
    arr_inputs = []      # distinct low-confidence inputs, first occurrence order
    arr_intents = []     # intent predicted for each entry of arr_inputs
    arr_confidence = []  # confidence of each entry of arr_inputs
    arr_repeated = []    # every low-confidence input, duplicates included

    # Set mirror of arr_inputs for constant-time membership tests
    seen_inputs = set()

    for idx,log in list_logs.iterrows():
        current_intent = log['Intent_1']

        if current_intent not in intent_blacklist and float(log['Confidence_1']) < MIN_CONFIDENCE:
            arr_repeated.append(log['Example'])

            if log['Example'] not in seen_inputs:
                seen_inputs.add(log['Example'])
                counter = counter + 1
                arr_inputs.append(log['Example'])
                arr_intents.append(log['Intent_1'])
                arr_confidence.append(log['Confidence_1'])

    low_confidence = pd.DataFrame({
        'Example': arr_inputs,
        'Intent': arr_intents,
        'Confidence': arr_confidence,
    }, columns=['Example','Intent','Confidence'])

    # display() returns None, so it must not be wrapped in print()
    display(arr_inputs[0:20])

In [None]:
if flag_logs:
    print(colored("\nIt selects the examples with greater occurrence (more than 2 repetitions) and less confidence:\n", attrs=['bold']))

    # How often each low-confidence input was asked
    fdist = nltk.FreqDist(arr_repeated)

    # Most frequent first; only inputs seen more than twice are reported
    ranked_inputs = sorted(fdist.items(), key=lambda t:t[-1], reverse=True)
    frequent_inputs = [(k, v) for k, v in ranked_inputs if v > 2]

    for k, v in frequent_inputs:
        print("[" + str(v) + "] > ",k)

    flag = bool(frequent_inputs)
    if not flag:
        print("No repeated logs were found.")

In [24]:
# Build the (Example, Intent) frame that is split into training and test sets below
le_examples = []
le_intents = []

for intent in list_original_intents['intents']:
    intent_name = intent['intent']
    examples = assistant.list_examples(
        workspace_id = original_workspace_id,
        intent = intent_name
    ).get_result()

    # One entry per example text, labelled with the intent it belongs to
    intent_texts = [ex['text'] for ex in examples['examples']]
    le_examples.extend(intent_texts)
    le_intents.extend([intent_name] * len(intent_texts))

list_ml_examples = pd.DataFrame({
    'Example': le_examples,
    'Intent': le_intents,
}, columns=['Example','Intent'])

In [None]:
# Because we need to submit some examples to test the template, this task can generate costs if you have already exceeded 10,000 free monthly requests.
# Estimated number of calls: 20% of the examples (the test split), times 3.
length_test_set = len(list_ml_examples)*0.2*3
print("Exemplos: "+ str(length_test_set))

print(colored('ATENTION! When you run this test, you may be charged up to US$ ' + str(round(length_test_set*0.0025,2)) + '. Those costs are associated with ' 
              + str(round(length_test_set)) + ' calls to be made to your workspace! Enter OK to accept...','red', attrs=['bold']))

acceptance = input()

# The confirmation must actually gate the paid calls: anything other than
# "ok" (any letter case, surrounding blanks ignored) stops the notebook here.
if acceptance.strip().lower() == 'ok':
    print("OK! Let's continue...")
else:
    raise RuntimeError("Test run not confirmed - stopping before any billable call is made.")

In [None]:
def group_and_mount_intents(train_set):
    """Group training examples by intent in the structure WAS accepts.

    Args:
        train_set: DataFrame with the columns 'Example' (text) and 'Intent'.

    Returns:
        A list with one ``{'intent': name, 'examples': [{'text': ...}, ...]}``
        dict per intent, in order of first appearance. A notice is printed for
        every intent that has fewer than 10 examples.
    """
    # intent name -> example texts, keeping the order of first appearance
    examples_by_intent = {}
    for intent_name, text in zip(train_set['Intent'], train_set['Example']):
        examples_by_intent.setdefault(intent_name, []).append(text)

    workspace_intens = []
    for intent_name, texts in examples_by_intent.items():
        if len(texts) < 10:
            print("[ATTENTION] Intent #{0} has few examples [{1}] than expected, and therefore is not a good example for this test..".format(intent_name, len(texts)))

        workspace_intens.append({
            'intent': intent_name,
            'examples': [{'text': text} for text in texts],
        })

    print('\nIntentions mounted.')

    return workspace_intens

def create_test_workspace(train_set, name='Create_by_Watson_Studio', language='pt-br'):
    """Create a temporary workspace trained on `train_set` and wait until it is ready.

    The new workspace receives the intents built from `train_set` and the
    entities listed for the original workspace, so tests can run without
    touching the current chatbot.

    Args:
        train_set: DataFrame with 'Example' and 'Intent' columns used for training.
        name: Name given to the temporary workspace.
        language: Language code of the temporary workspace (default 'pt-br').

    Returns:
        The ID of the created workspace, once `check_wksp_status` has returned.
    """
    intents_json = group_and_mount_intents(train_set)

    entities_list = assistant.list_entities(
        workspace_id = original_workspace_id
    ).get_result()

    created = assistant.create_workspace(
        name = name,
        entities = entities_list['entities'],
        intents = intents_json,
        language = language
    ).get_result()
    new_workspace_id = created["workspace_id"]

    print('Skill replicated, wait to be ready...')

    # Blocks until the new workspace has finished training
    check_wksp_status(new_workspace_id)

    return new_workspace_id


def mount_confusion_matrix(test_set,teste_wid):
    """Classify every test example with the test workspace and collect the outcome.

    Each example is sent to `assistant.message` with alternate intents enabled.
    When the service returns no intent at all, the prediction is recorded as
    'irrelevant' with confidence 0 rather than being dropped, so such misses
    count against the metrics.

    Args:
        test_set: DataFrame with the columns 'Example' and 'Intent' (true label).
        teste_wid: ID of the workspace used for the classification.

    Returns:
        A tuple ``(resultados, cm_true, cm_predicted, cm_predicted_2)``:
        `resultados` is a DataFrame with the columns Question, True,
        Predicted_1, Predicted_2, Conf_1, Conf_2, Delta (Conf_1 - Conf_2) and
        Missed ('X' when the top prediction differs from the true label); the
        three lists hold the true labels, the first and the second predictions.
    """
    columns = ['Question','True','Predicted_1','Predicted_2','Conf_1','Conf_2','Delta','Missed']
    rows = []

    for _, row in test_set.iterrows():
        message = { 'text': row['Example'] }

        response = assistant.message(workspace_id=teste_wid,input=message,alternate_intents=True).get_result()
        intents = response.get('intents') or []

        if intents:
            predicted_1, conf_1 = intents[0]['intent'], intents[0]['confidence']
        else:
            # Nothing recognised: an 'irrelevant' prediction, not a skipped sample
            predicted_1, conf_1 = 'irrelevant', 0

        if len(intents) > 1:
            predicted_2, conf_2 = intents[1]['intent'], intents[1]['confidence']
            delta = float(conf_1) - float(conf_2)
        else:
            predicted_2, conf_2 = 'irrelevant', 0
            # A single candidate wins by the full margin; no candidate has no margin
            delta = 1 if intents else 0

        rows.append({
            'Question': row['Example'],
            'True': row['Intent'],
            'Predicted_1': predicted_1,
            'Predicted_2': predicted_2,
            'Conf_1': conf_1,
            'Conf_2': conf_2,
            'Delta': delta,
        })

    resultados = pd.DataFrame(rows, columns=columns)
    # Vectorised flag; also safe when the test set is empty
    resultados['Missed'] = np.where(resultados['True'] != resultados['Predicted_1'], 'X', '')

    cm_true = [entry['True'] for entry in rows]
    cm_predicted = [entry['Predicted_1'] for entry in rows]
    cm_predicted_2 = [entry['Predicted_2'] for entry in rows]

    return resultados, cm_true, cm_predicted, cm_predicted_2

def plot_confusion_matrix(cm, classes=None, normalize=False, title='', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm: Square confusion matrix (rows = true labels, columns = predictions).
        classes: Tick labels, in matrix order. Defaults to the row indices.
        normalize: When True every row is divided by its sum before drawing, so
            both the colours and the cell texts show per-class ratios.
        title: Title of the plot.
        cmap: Matplotlib colour map used for the heat map.
    """
    cm = np.asarray(cm)
    if classes is None:
        classes = list(range(cm.shape[0]))

    # Normalise BEFORE drawing so the image and the cell texts show the same values
    if normalize:
        row_sums = cm.sum(axis=1)[:, np.newaxis]
        # Rows without any sample stay at 0 instead of becoming NaN
        cm = np.divide(cm.astype('float'), row_sums, out=np.zeros(cm.shape, dtype=float), where=row_sums != 0)
        print("Normalized confusion matrix")
    else:
        print('Matrix of confusion, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Ratios are shown with two decimals, counts as plain integers
    cell_format = '.2f' if cm.dtype.kind == 'f' else 'd'

    # Light text on dark cells, dark text on light cells
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_format),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Number of train/test rounds; every round creates (and deletes) one test workspace
N_ROUNDS = 1
run_wids = []
run_round_results = []
run_total_true = []
run_total_predicted = []
run_total_predicted_2 = []

for run_counter in range(0,N_ROUNDS):

    # Divide the training and test sets with a ratio of 80/20
    # NOTE(review): the split is neither seeded nor stratified, so results vary between runs.
    X_train, X_test, y_train, y_test = train_test_split(list_ml_examples, list_ml_examples.Intent, test_size=0.2)

    # Create Skill for test
    test_wid = create_test_workspace(X_train)
    run_wids.append(test_wid)

    try:
        run_temp_result, run_temp_true, run_temp_predicted, run_temp_predicted_2 = mount_confusion_matrix(X_test,test_wid)
    finally:
        # Always delete the Skill / Workspace created for testing, even when scoring fails
        assistant.delete_workspace(workspace_id=test_wid)

    run_round_results.append(run_temp_result)
    run_total_true = run_total_true + run_temp_true
    run_total_predicted = run_total_predicted + run_temp_predicted
    # Second predictions accumulate on their own list (not on the first predictions)
    run_total_predicted_2 = run_total_predicted_2 + run_temp_predicted_2

# Combine the per-round results once, after the loop
if run_round_results:
    run_total_df_result = pd.concat(run_round_results)
else:
    run_total_df_result = pd.DataFrame(columns=['Question','True','Predicted_1','Predicted_2','Conf_1','Conf_2','Delta','Missed'])

In [None]:
# Labels of the matrix: every true intent in order of first appearance, plus 'irrelevant'
class_names = list(dict.fromkeys(run_total_df_result['True'])) + ['irrelevant']

# Large canvas so that every intent label stays readable
figure_size = (50,50)
mpl.rcParams['figure.figsize'] = figure_size

cnf_matrix = confusion_matrix(run_total_true, run_total_predicted, labels=class_names)
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Matrix of Confusion for Intentions')

In [None]:
# Weighted precision, recall and F1 over all test rounds
weighted_metrics = [
    ("Precision", precision_score),
    ("\n\nRecall", recall_score),
    ("\n\nF1", f1_score),
]
for metric_header, metric_fn in weighted_metrics:
    print(metric_header)
    print("Weighted: " + str(metric_fn(run_total_true, run_total_predicted, average='weighted')))

# Metrics for each intent
print(classification_report(run_total_true, run_total_predicted))