In [1]:
import hashlib
import json
import os
import time
from collections import defaultdict

import pandas as pd
import requests
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split

In [2]:
# Path of the labelled-utterance CSV; it is relative, so it resolves against
# the kernel's current working directory.
data_path = 'data/chinese_general.csv'

In [3]:
# Load the raw CSV and reshape it into two columns:
#   intent   <- the CSV's 'Question' column
#   question <- the CSV's 'Tag' column; rows with no Tag fall back to the
#               'Question' text itself.
data_df = (
    pd.read_csv(data_path)
    .assign(Tag=lambda frame: frame['Tag'].fillna(frame['Question']))
    .loc[:, ['Question', 'Tag']]
    .rename(columns={'Question': 'intent', 'Tag': 'question'})
)
data_df.head()

Unnamed: 0,intent,question
0,可供購買產品,可供購買產品
1,可供購買產品,我有咩可以買?
2,可供購買產品,你地有咩產品provide?
3,可供購買產品,我可以喺邊度睇到產品資料
4,可供購買產品,依家有咩產品提供?


In [4]:
# Hold out 40% of the utterances for evaluation; the fixed seed keeps the split
# repeatable. No `stratify` argument is passed, so the split is not balanced
# per intent.
X_train, X_test, y_train, y_test = train_test_split(
    data_df['question'],
    data_df['intent'],
    test_size=0.4,
    random_state=42
)

In [5]:
# Training frame: one row per (intent, utterance) pair with a fresh 0..n-1 index.
train_df = pd.DataFrame({'intent': y_train, 'question': X_train})
train_df = train_df.reset_index(drop=True)
# Replace each intent name with its SHA-224 hex digest (presumably to obtain
# ASCII-safe identifiers for the services below -- TODO confirm).
# hashlib only accepts bytes: text labels are encoded as UTF-8 first so this
# works for unicode labels too; labels that are already byte strings hash
# exactly as before.
train_df['intent'] = [
    hashlib.sha224(
        label if isinstance(label, bytes) else label.encode('utf-8')
    ).hexdigest()
    for label in train_df['intent']
]
train_df.head()

Unnamed: 0,intent,question
0,9e8517f88fbcc89b0b7ac8005adc3d837a2855e2e09eb4...,建立戶口
1,61a890677d6f8184da7013233f9e2ebb746f4ddc8afe65...,要點先攞到我嘅帳戶資料?
2,be6b5bdd853ab3f0118f2fc78753695ca280f95e316dd9...,自動轉帳
3,51227ab034ea805ed3c430f1703afcc1e99c0e3b1689c1...,我點樣投訴?
4,45bc064f7604cd508d0da5a3489a65d79e3b98345f1e9a...,我想update電郵地址


In [6]:
# Group the training utterances by their (hashed) intent:
# variations[intent] -> list of example questions, in training-row order.
variations = defaultdict(list)

for hashed_intent, utterance in train_df[['intent', 'question']].itertuples(index=False, name=None):
    variations[hashed_intent].append(utterance)

In [7]:
# Evaluation frame: one row per (intent, utterance) pair with a fresh 0..n-1 index.
test_df = pd.DataFrame({'intent': y_test, 'question': X_test})
test_df = test_df.reset_index(drop=True)
# Hash intent names the same way as the training frame so labels are comparable.
# hashlib only accepts bytes: text labels are encoded as UTF-8 first so this
# works for unicode labels too; labels that are already byte strings hash
# exactly as before.
test_df['intent'] = [
    hashlib.sha224(
        label if isinstance(label, bytes) else label.encode('utf-8')
    ).hexdigest()
    for label in test_df['intent']
]
test_df.head()

Unnamed: 0,intent,question
0,bbd37e89792d16bcbd6f0ef4009514793a58460684d2cd...,間舖頭可以幫我做咩?
1,063ccfb6af96000b19ecd7e5a83703e1863fcbc134580b...,有咩可以比我買?
2,9e8517f88fbcc89b0b7ac8005adc3d837a2855e2e09eb4...,開account有咩requirement?
3,b97e2b7df6e7dd6daba7d7ab488317608d33d11169274b...,驗證付款
4,b2bfe4907a054f96a16c957b289ca3baa104a248497345...,我想apply分期付款


In [8]:
# Total number of labelled utterances (train + test).
data_df.shape[0]

294

In [9]:
# Number of utterances in the training split.
train_df.shape[0]

176

In [10]:
# Number of utterances in the held-out test split.
test_df.shape[0]

118

In [11]:
# Number of distinct intents present in the training split.
train_df['intent'].nunique()

36

# Clare

In [61]:
# Clare API credentials.
# The bearer token is read from the environment instead of being committed with
# the notebook: export CLARE_API_KEY before starting the kernel. A missing
# variable raises KeyError here, before any request is sent.
# NOTE(review): the token that used to be hardcoded in this cell has been
# exposed in the notebook history and should be revoked/rotated.
key = os.environ['CLARE_API_KEY']
headers = {'Authorization': 'Bearer {}'.format(key),
           'Accept': 'application/json',
           'Content-Type': 'application/json'}

In [62]:
# Add intent category
# Creates (or updates) the category the benchmark intents are filed under and
# keeps the returned id for the "Add intents" step.

data = {
    'language': 'zh-hk',
    'name': 'test',
    'feedback': True,
    'suggestion': True,
    'active': True,
    'ordering': 0,
    'confidenceOverride': False,
    'nerDisabled': False
}

r = requests.post('https://hk-demo56.clare.ai/api/v1/AddOrUpdateIntentCategory', data=json.dumps(data), headers=headers)
# Surface HTTP failures (bad token, server error) as a clear HTTPError instead
# of an opaque KeyError/ValueError when the error body is parsed below.
r.raise_for_status()
category_id = r.json()['categoryId']

In [63]:
# Add intents
# Uploads one Clare intent per hashed intent name, using the training
# utterances as its 'userSays' examples.

for intent, examples in variations.items():

    data = {
        'categoryId': category_id,
        'language': 'zh-hk',
        'question': intent,
        'answer': '-',
        'answerFacebook': {},
        'labels': [],
        'userSays': examples,
        'active': True,
    }

    r = requests.post('https://hk-demo56.clare.ai/api/v1/AddOrUpdateIntent', data=json.dumps(data), headers=headers)

    # Report rejected uploads instead of ignoring the response: a silently
    # missing intent would make every test question for it count as wrong.
    if not r.ok:
        print('Failed to add intent {} (HTTP {}): {}'.format(intent, r.status_code, r.text))

In [64]:
# Test questions
# Sends every held-out question to the Clare similarity endpoint and compares
# the top result's 'Question' field with the expected (hashed) intent.

# A wrong top answer scored above this threshold counts as a false positive.
SCORE_THRESHOLD = 0.8

start = time.time()

correct, pred = [], []  # `pred` is not filled in by this cell
false_pos = 0
for question, expected_intent in zip(test_df['question'], test_df['intent']):

    payload = {'dbName': 'hkDemo56',
               'logging': 1,
               'classify_intent': 3,
               'autoCorrection': 0,
               'memoryRecentConversations': 0,
               'query': question}

    r = requests.get('http://35.160.77.29:58074/word2vec/most_similar_sentence_zh', params=payload)
    # Stop on HTTP errors rather than scoring an error page as a wrong answer.
    r.raise_for_status()

    # Parse the body once; an empty/missing result list means "no prediction"
    # and is scored as a miss instead of raising IndexError.
    results = r.json().get('results') or []
    top = results[0] if results else None

    if top is not None and top['Question'] == expected_intent:
        correct.append(1)
    else:
        if top is not None and top['Score'] > SCORE_THRESHOLD:
            false_pos += 1
        correct.append(0)

end = time.time()

# Guard the percentage against an empty test set.
accuracy = 100.0 * sum(correct) / len(correct) if correct else 0.0

print('# Tested: {}'.format(len(correct)))
print('# Correct: {}'.format(sum(correct)))
print('# Wrong: {}'.format(len(correct) - sum(correct)))
print('% correct: {}'.format(accuracy))
print('# false positives: {}'.format(false_pos))
print('Processing time: {} seconds'.format(end - start))

# Tested: 118
# Correct: 79
# Wrong: 39
% correct: 66.9491525424
# false positives: 3
Processing time: 75.8179199696 seconds


# Dialogflow

In [65]:
# Dialogflow v1 intents endpoint (used by the "Create intents" cell).
url = 'https://api.dialogflow.com/v1/intents?v=20180910&lang=zh-HK'

# The access token is read from the environment instead of being committed with
# the notebook: export DIALOGFLOW_TOKEN before starting the kernel. A missing
# variable raises KeyError here, before any request is sent.
# NOTE(review): the token that used to be hardcoded in this cell has been
# exposed in the notebook history and should be revoked/rotated.
# This rebinds the `headers` name used by the Clare cells above.
headers = {'Authorization': 'Bearer {}'.format(os.environ['DIALOGFLOW_TOKEN']),
           'Accept': 'application/json',
           'Content-Type': 'application/json'}

In [66]:
# Create intents
# Posts one Dialogflow intent per hashed intent name, with the training
# utterances as its 'userSays' examples.

for intent in variations:

    data = {'languageCode': 'zh-HK',
            'name': intent,
            'userSays': [{'data': [{'text': question}]} for question in variations[intent]],
            'auto': True}

    r = requests.post(url, data=json.dumps(data), headers=headers)

    # Pause between requests (presumably to stay under the API rate limit --
    # TODO confirm).
    time.sleep(1)

    # Function-call form of print: matches the other cells and is valid on
    # both Python 2 and Python 3.
    if r.status_code != 200:
        print(r.text)

In [67]:
# Test questions
# Sends every held-out question to the Dialogflow query endpoint and compares
# the matched intent name with the expected (hashed) intent.

# The query endpoint gets its own name, defined once outside the loop, so the
# intents `url` used by the "Create intents" cell is no longer overwritten
# when this cell runs.
query_url = 'https://api.dialogflow.com/v1/query?v=20180910'

# A wrong answer scored above this threshold counts as a false positive.
SCORE_THRESHOLD = 0.8

start = time.time()

correct, pred = [], []  # `pred` is not filled in by this cell
false_pos = 0
for i, (question, expected_intent) in enumerate(zip(test_df['question'], test_df['intent'])):

    # Each question is sent under its own session id.
    data = {'lang': 'zh-HK',
            'query': question,
            'sessionId': str(i)}

    r = requests.post(query_url, data=json.dumps(data), headers=headers)
    result = r.json()['result']

    # Empty metadata means no intent was matched: scored as a miss and never
    # as a false positive.
    metadata = result.get('metadata')
    if metadata and metadata.get('intentName') == expected_intent:
        correct.append(1)
    else:
        if metadata and result['score'] > SCORE_THRESHOLD:
            false_pos += 1
        correct.append(0)

end = time.time()

# Guard the percentage against an empty test set.
accuracy = 100.0 * sum(correct) / len(correct) if correct else 0.0

print('# Tested: {}'.format(len(correct)))
print('# Correct: {}'.format(sum(correct)))
print('# Wrong: {}'.format(len(correct) - sum(correct)))
print('% correct: {}'.format(accuracy))
print('# false positives: {}'.format(false_pos))
print('Processing time: {} seconds'.format(end - start))

# Tested: 118
# Correct: 67
# Wrong: 51
% correct: 56.7796610169
# false positives: 0
Processing time: 24.3930449486 seconds


# Watson

In [68]:
from watson_developer_cloud import AssistantV1

In [69]:
# Watson Assistant client.
# The IAM API key is read from the environment instead of being committed with
# the notebook: export WATSON_IAM_APIKEY before starting the kernel. A missing
# variable raises KeyError here, before the client is created.
# NOTE(review): the key that used to be hardcoded in this cell has been exposed
# in the notebook history and should be revoked/rotated.
assistant = AssistantV1(
    version='2018-09-20',
    iam_apikey=os.environ['WATSON_IAM_APIKEY'],
    url='https://gateway.watsonplatform.net/assistant/api'
)

In [70]:
# Add intents
# Creates one Watson intent per hashed intent name; duplicate utterances are
# collapsed with set() before being sent as examples.

for intent, utterances in variations.items():
    unique_examples = [{'text': text} for text in set(utterances)]
    response = assistant.create_intent(
        workspace_id='d1e1e46c-5fb3-45a8-8578-d157aa908f1b',
        intent=intent,
        examples=unique_examples,
    )
    r = response.get_result()

In [36]:
# Test questions
# Sends every held-out question to Watson Assistant and compares the first
# returned intent with the expected (hashed) intent. A wrong first intent with
# confidence above 0.8 is counted as a false positive.

start = time.time()

correct, pred = [], []
false_pos = 0
for question, expected_intent in zip(test_df['question'], test_df['intent']):
    r = assistant.message(
        workspace_id='d1e1e46c-5fb3-45a8-8578-d157aa908f1b',
        input={'text': question},
    ).get_result()

    # First returned intent, or None when Watson returned no intents.
    top_intent = r['intents'][0] if r['intents'] else None
    is_hit = top_intent is not None and top_intent['intent'] == expected_intent

    correct.append(1 if is_hit else 0)
    if not is_hit and top_intent is not None and top_intent['confidence'] > 0.8:
        false_pos += 1

end = time.time()

n_tested = len(correct)
n_correct = sum(correct)

print('# Tested: {}'.format(n_tested))
print('# Correct: {}'.format(n_correct))
print('# Wrong: {}'.format(n_tested - n_correct))
print('% correct: {}'.format(100.0 * n_correct / n_tested))
print('# false positives: {}'.format(false_pos))
print('Processing time: {} seconds'.format(end - start))

# Tested: 118
# Correct: 77
# Wrong: 41
% correct: 65.2542372881
# false positives: 2
Processing time: 34.7310609818 seconds
