-
Notifications
You must be signed in to change notification settings - Fork 1
/
tools.py
145 lines (112 loc) · 4.3 KB
/
tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import random
from record import Record
import pandas as pd
import requests
import json
import os
from statics import email, api_key
def load_stop_words(path=os.path.realpath(os.path.join(__file__, '../persian-stopwords.txt'))):
    """
    Read a stop-word list from a UTF-8 text file, one entry per line.

    :param path: location of the stop-word file; defaults to
        ``persian-stopwords.txt`` resolved in the directory of this module
    :return: a list of strings obtained by splitting the file content on
        newline characters (a file ending with a newline therefore yields a
        final empty string)
    """
    # The context manager closes the handle even when reading or decoding
    # raises, which the manual open()/close() pair did not guarantee.
    with open(path, 'r', encoding='utf8') as stopwords_file:
        return stopwords_file.read().split('\n')
def get_clusters(algorithm, corpus, number_of_clusters):
    """
    Run a clustering routine over a corpus and hand back its result.

    :param algorithm: a callable invoked as ``algorithm(corpus, number_of_clusters)``
    :param corpus: an array of documents
    :param number_of_clusters: number of clusters that should be made
    :return: whatever ``algorithm`` returns (intended to be a list of Cluster objects)
    """
    clusters = algorithm(corpus, number_of_clusters)
    return clusters
def make_corpus(records):
    """
    Collect the ``a_pre`` attribute of every record, in order.

    :param records: iterable of objects exposing an ``a_pre`` attribute
        (presumably the preprocessed answer text -- confirm against Record)
    :return: a list with one ``a_pre`` value per record
    """
    return [record.a_pre for record in records]
# Endpoints of the diaalog chatbot API, with the account credentials from
# ``statics`` appended as query parameters.
createUrl = "".join(["http://api.diaalog.ir/chatbot/intents/create?user=", email, "&api_key=", api_key])
buildUrl = "".join(["http://api.diaalog.ir/chatbot/build?user=", email, "&api_key=", api_key])
def _speech_response_template(records):
    """Build the ``{{[...]|random}}`` template listing every record's ``a_pre``."""
    # Answers are wrapped in double quotes verbatim (no escaping), exactly as
    # this module has always sent them. Joining avoids the previous
    # "is this the last object?" identity check, which dropped a comma when
    # the last record object also appeared earlier in the list.
    quoted_answers = ['"' + rec.a_pre + '"' for rec in records]
    return "{{[" + ",".join(quoted_answers) + "]|random}}"


def make_dialog(records, title):
    """
    Create an intent on the chatbot API from a group of records, upload its
    training sentences, then trigger a build.

    Every record's ``a_pre`` becomes one candidate reply in the intent's
    ``speechResponse`` template and every ``q_pre`` becomes one training
    sentence. The payloads and the API responses are printed to stdout.

    :param records: sequence of objects exposing ``a_pre`` and ``q_pre`` strings
    :param title: intent name; also used as the prefix of the intent id
    :return: None
    """
    speech_response = _speech_response_template(records)
    training_data = [{"text": rec.q_pre, "entities": []} for rec in records]
    print('speechResponse', speech_response)
    print('trainingData', training_data)

    data = {
        'apiTrigger': False,
        'botName': email,
        'intentId': title + '_dialog_' + email,
        'labeledSentences': [],
        'name': title,
        'parameters': [],
        'speechResponse': speech_response,
        # Sent empty here; the sentences are uploaded by the train request below.
        'trainingData': "",
        'userDefined': True,
    }

    # Step 1: create the intent and read back the id the API assigned to it.
    r = requests.post(url=createUrl, json=data)
    print('create :', r)
    json_response = json.loads(r.content)
    print(json_response)
    intent_id = json_response['_id']
    print('ID is :', intent_id)

    # Step 2: upload the training sentences for that intent.
    trainAPI = "http://api.diaalog.ir/chatbot/train/" + intent_id + "?user=" + email + "&api_key=" + api_key
    r2 = requests.post(url=trainAPI, json=training_data)
    print('Train :', r2)
    json_response2 = json.loads(r2.content)
    print(json_response2)

    # Step 3: trigger a build.
    r3 = requests.post(url=buildUrl, json={})
    print('build:', r3)
    json_response3 = json.loads(r3.content)
    print(json_response3)
def build():
    """
    POST an empty JSON body to ``buildUrl`` and print the response object.

    :return: None
    """
    response = requests.post(url=buildUrl, json={})
    print('build', response)
def delete_dialog(id, do_build=True):
    """
    Send a DELETE request for one intent and print the response object.

    :param id: intent identifier, inserted into the request URL as a string
    :param do_build: when true, call ``build()`` after the delete request
    :return: None
    """
    endpoint = "".join(["http://api.diaalog.ir/chatbot/intents/", id, "?user=", email, "&api_key=", api_key])
    response = requests.delete(url=endpoint)
    print('delete :', response)
    if not do_build:
        return
    build()
def delete_all_dialogs(keep_first=3):
    """
    Delete the intents returned by the API's intent listing, then build once.

    :param keep_first: non-negative number of leading intents in the listing
        to leave untouched; defaults to 3, the value previously hard-coded
        here (presumably the bot's built-in intents -- confirm against the API)
    :return: None
    """
    r = requests.get(url="http://api.diaalog.ir/chatbot/intents?user=" + email + "&api_key=" + api_key)
    print(r)
    json_res = json.loads(r.content)
    # Extract every id up front so a malformed entry fails before any delete.
    id_list = [lis['_id']['$oid'] for lis in json_res]
    for intent_id in id_list[keep_first:]:
        # Skip the per-delete build; a single build runs after the loop.
        delete_dialog(intent_id, False)
    build()
def get_answer(clusters, test_string, distance_algorithm, is_lda=False):
    """
    Answer a query with a randomly chosen raw answer from the cluster that
    contains the question closest to the query.

    :param clusters: sequence of objects whose ``get_records()`` returns a
        sequence of records exposing ``q_pre`` and ``a_raw``
    :param test_string: the query; passed as the first argument to
        ``distance_algorithm``
    :param distance_algorithm: callable ``(test_string, q_pre)`` returning a
        comparable distance; the smallest value wins, first one on ties
    :param is_lda: when true, the random pick is limited to indices
        ``0 .. int(0.7 * len(records))`` of the winning cluster's records
        instead of the whole list
    :return: the ``a_raw`` attribute of the picked record
    """
    # Start from infinity so any finite distance, however large, can win.
    # The previous fixed sentinel (999999999) ignored larger distances and
    # silently fell through to the last cluster.
    min_dist = float('inf')
    # If nothing compares below infinity (e.g. there are no records at all),
    # index -1 still selects the last cluster, as before.
    best_cluster_index = -1
    for i, c in enumerate(clusters):
        for r in c.get_records():
            dist = distance_algorithm(test_string, r.q_pre)
            if dist < min_dist:
                min_dist = dist
                best_cluster_index = i

    best_cluster_records = clusters[best_cluster_index].get_records()
    record_count = len(best_cluster_records)
    max_records_index = int(record_count * .7) if is_lda else record_count - 1
    # random.randint includes both end points.
    return best_cluster_records[random.randint(0, max_records_index)].a_raw
def make_records(df_pre, df_raw):
    """
    Build one ``Record`` per row of ``df_pre``.

    Each record receives, as strings and in this order: the first and second
    column of the row in ``df_raw``, then the first and second column of the
    same row in ``df_pre``.

    :param df_pre: table whose ``shape[0]`` fixes the number of records
        (presumably the preprocessed question/answer sheet -- confirm)
    :param df_raw: table read at the same row positions as ``df_pre``
    :return: a list of ``Record`` objects, one per row
    """
    row_count = df_pre.shape[0]
    return [
        Record(
            str(df_raw.iat[row, 0]),
            str(df_raw.iat[row, 1]),
            str(df_pre.iat[row, 0]),
            str(df_pre.iat[row, 1]),
        )
        for row in range(row_count)
    ]
def read_data(data_path):
    """
    Load the ``preprocessed`` and ``Raw`` sheets of an Excel workbook.

    :param data_path: path of the workbook, passed to ``pd.read_excel``
    :return: a tuple ``(df_pre, df_raw)`` holding the two sheets in that order
    """
    preprocessed, raw = (
        pd.read_excel(data_path, sheet_name=sheet)
        for sheet in ('preprocessed', 'Raw')
    )
    return preprocessed, raw
def divide_train_test(records, train_percent):
    """
    Randomly split records into a training part and a test part.

    :param records: iterable of records; it is copied before shuffling, so
        the caller's object keeps its original order
    :param train_percent: fraction of the records (e.g. ``0.8``) that goes to
        the training part; the cut index is ``int(train_percent * len(records))``
    :return: a tuple ``(train_records, test_records)`` of two lists
    """
    # Shuffle a private copy so the caller's list is not reordered.
    shuffled = list(records)
    random.shuffle(shuffled)
    split_at = int(train_percent * len(shuffled))
    return shuffled[:split_at], shuffled[split_at:]