In [41]:
import converters
import analyzers
import numpy as np
import pandas as pd
import time
import tqdm
from sklearn.metrics import classification_report, accuracy_score, f1_score

In [17]:
# CSV locations of the train/test splits, grouped by split size.
small_train_path, small_test_path = (
    'data/hwu_small_train.csv',
    'data/hwu_small_test.csv',
)
large_train_path, large_test_path = (
    'data/hwu_large_train.csv',
    'data/hwu_large_test.csv',
)

In [None]:
# Export the small training split as a header-less CSV with columns X then y
# (presumably the file uploaded to Watson for training -- confirm).
docs, X, y = converters.parse_data(small_train_path)

df = pd.DataFrame({'X': X, 'y': y})
# header=False is the documented way to omit the header row; header=None only
# had that effect because None is falsy.
df.to_csv('hwu_watson_small.csv', header=False, columns=['X', 'y'], index=False)

In [37]:
# Export the large training split as a header-less CSV with columns X then y
# (presumably the file uploaded to Watson for training -- confirm).
docs, X, y = converters.parse_data(large_train_path)

df = pd.DataFrame({'X': X, 'y': y})
# header=False is the documented way to omit the header row; header=None only
# had that effect because None is falsy.
df.to_csv('hwu_watson_large.csv', header=False, columns=['X', 'y'], index=False)

In [5]:
!pip install --upgrade "ibm-watson>=5.2.2"

Collecting ibm-watson>=5.2.2
  Downloading ibm-watson-5.2.3.tar.gz (406 kB)
[K     |████████████████████████████████| 406 kB 744 kB/s 
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
[?25hCollecting websocket-client==1.1.0
  Downloading websocket_client-1.1.0-py2.py3-none-any.whl (68 kB)
[K     |████████████████████████████████| 68 kB 1.9 MB/s 
Collecting ibm-cloud-sdk-core==3.*,>=3.3.6
  Downloading ibm-cloud-sdk-core-3.11.3.tar.gz (45 kB)
[K     |████████████████████████████████| 45 kB 2.5 MB/s 
[?25hCollecting PyJWT<3.0.0,>=2.0.1
  Downloading PyJWT-2.1.0-py3-none-any.whl (16 kB)
Building wheels for collected packages: ibm-watson, ibm-cloud-sdk-core
  Building wheel for ibm-watson (PEP 517) ... [?25ldone
[?25h  Created wheel for ibm-watson: filename=ibm_watson-5.2.3-py3-none-any.whl size=403336 sha256=30ab54d59ccd9f94bd3c9495f1b8b97d62ccd0f719e619dbf7ea3fe249bac14

In [1]:
import json
from ibm_watson import AssistantV2
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

In [64]:
def calc_results(test_path, api_key=None, assistant_id=None,
                 service_url='https://api.eu-de.assistant.watson.cloud.ibm.com',
                 version='2021-06-14'):
    """Send every test utterance to Watson Assistant and collect the responses.

    Parameters
    ----------
    test_path : str
        Path passed to ``converters.parse_data``; only the second element of
        its return value (the utterances) is used here.
    api_key : str, optional
        IAM API key from the Watson web interface. When omitted, the
        ``WATSON_API_KEY`` environment variable is used, falling back to an
        empty string (the previous hard-coded value).
    assistant_id : str, optional
        Assistant id from the Watson web interface. When omitted, the
        ``WATSON_ASSISTANT_ID`` environment variable is used, falling back to
        an empty string (the previous hard-coded value).
    service_url : str, optional
        Watson Assistant service endpoint.
    version : str, optional
        API version date passed to ``AssistantV2``.

    Returns
    -------
    results : list of dict
        One response per utterance, in input order. A response with no
        intents is replaced by a placeholder whose only intent is ``'Other'``,
        so ``results`` always lines up with the test labels.
    times : list of float
        Elapsed seconds for each request.
    cant_classify_count : int
        Number of utterances for which Watson returned no intent.
    """
    import os  # local import keeps the cell self-contained

    # Credentials come from the caller or the environment, never from the
    # notebook source.
    if api_key is None:
        api_key = os.environ.get('WATSON_API_KEY', '')
    if assistant_id is None:
        assistant_id = os.environ.get('WATSON_ASSISTANT_ID', '')

    _, X, _ = converters.parse_data(test_path)
    results = []
    cant_classify_count = 0
    times = []

    authenticator = IAMAuthenticator(api_key)
    assistant = AssistantV2(
        version=version,
        authenticator=authenticator
    )
    assistant.set_service_url(service_url)

    for sent in tqdm.tqdm(X):
        # perf_counter is monotonic, so per-request latency is not distorted
        # by wall-clock adjustments.
        start_time = time.perf_counter()
        result = assistant.message_stateless(
            assistant_id=assistant_id,
            input={
                'message_type': 'text',
                'text': sent,
                'options': {'alternate_intents': True}
            }
        ).get_result()
        times.append(time.perf_counter() - start_time)

        if not result['output']['intents']:
            cant_classify_count += 1
            print(sent)
            print(result)
            # Keep going (no early exit) and record a placeholder intent so
            # every utterance still has a prediction.
            result = {'output': {'intents': [{'intent': 'Other'}]}}
        results.append(result)
    return results, times, cant_classify_count

In [65]:
# Query Watson once per utterance of the large test split; this is slow
# because every utterance is a separate request.
results, times, cant_classify_count = calc_results(large_test_path)

100%|██████████| 5518/5518 [16:40<00:00,  5.52it/s]


In [66]:
# Sanity check: number of collected responses and how many came back with no intent.
len(results), cant_classify_count

(5518, 0)

In [60]:
# Evaluate on the small test split.
# This cell computes its own predictions: the notebook-level `results`/`times`
# hold the large-split run, whose length does not match the small-split labels.
# NOTE(review): presumably the assistant queried here should be the one trained
# on the small training export -- confirm before running.
small_results, small_times, small_cant_classify_count = calc_results(small_test_path)

_, _, y = converters.parse_data(small_test_path)
y_pred = [r['output']['intents'][0]['intent'] for r in small_results]
print(classification_report(y, y_pred))
print("Accuracy: ", accuracy_score(y, y_pred))
print("F1-Score: ", f1_score(y, y_pred, average='macro'))
print(f"Mean response time: {np.mean(small_times)} +- {np.std(small_times)} sec.")

                          precision    recall  f1-score   support

             alarm_query       0.89      0.89      0.89        19
            alarm_remove       0.36      0.45      0.40        11
               alarm_set       0.78      0.74      0.76        19
       audio_volume_down       0.50      0.88      0.64         8
       audio_volume_mute       0.73      0.53      0.62        15
         audio_volume_up       0.67      0.62      0.64        13
          calendar_query       0.36      0.47      0.41        19
         calendar_remove       0.89      0.84      0.86        19
            calendar_set       0.77      0.53      0.62        19
          cooking_recipe       0.80      0.84      0.82        19
        datetime_convert       0.50      0.75      0.60         8
          datetime_query       0.88      0.74      0.80        19
        email_addcontact       0.53      1.00      0.70         8
             email_query       0.77      0.89      0.83        19
      ema

In [67]:
# Evaluate the large-split responses held in `results`/`times`.
# Gold labels are the third element returned by parse_data.
_, _, y = converters.parse_data(large_test_path)

# The first intent of each response is taken as the prediction.
y_pred = []
for r in results:
    top_intent = r['output']['intents'][0]
    y_pred.append(top_intent['intent'])

print(classification_report(y, y_pred))
print("Accuracy: ", accuracy_score(y, y_pred))
print("F1-Score: ", f1_score(y, y_pred, average='macro'))
print(f"Mean response time: {np.mean(times)} +- {np.std(times)} sec.")

                          precision    recall  f1-score   support

             alarm_query       0.97      0.91      0.94        94
            alarm_remove       0.84      0.80      0.82        54
               alarm_set       0.81      0.86      0.84        96
       audio_volume_down       0.72      0.95      0.82        40
       audio_volume_mute       0.93      0.66      0.77        76
         audio_volume_up       0.87      0.89      0.88        73
          calendar_query       0.62      0.57      0.59        95
         calendar_remove       0.86      0.87      0.87       102
            calendar_set       0.72      0.86      0.78        91
          cooking_recipe       0.89      0.92      0.90       107
        datetime_convert       0.81      1.00      0.90        47
          datetime_query       0.82      0.93      0.87        92
        email_addcontact       0.79      0.89      0.84        38
             email_query       0.85      0.91      0.88        93
      ema