In [18]:
import requests
from requests_toolbelt.multipart.encoder import MultipartEncoder
import pickle
from data_generator import DataGenerator
import sentence_from_template as sft
from sklearn.metrics import accuracy_score
import numpy as np
from time import sleep
import uuid
from tqdm.notebook import tqdm

In [2]:
def create_grand_truth(sentence, slots):
    grand_truth = []
    for i, slot in enumerate(slots):
        if slot != "O":
            grand_truth.append((sentence[i], slot))
    return grand_truth

def concat_operator_post(words, slots, word):
    result_word = word
    for i, slot in enumerate(slots):
        if slot == "operator_post":
            result_word = result_word + " " + words[i]
        else:
            break
    return result_word, "operator"

def concat_bnumber_post(words, slots, word):
    result_word = word
    for i, slot in enumerate(slots):
        if slot == "bnumber_post":
            result_word = result_word + " " + words[i]
        else:
            break
    return result_word, "bnumber"

def concat_pnumber_post(words, slots, word):
    result_word = word
    for i, slot in enumerate(slots):
        if slot == "pnumber_post":
            result_word = result_word + " " + words[i]
        else:
            break
    return result_word, "pnumber"

def concat_charge_type_post(words, slots, word):
    result_word = word
    for i, slot in enumerate(slots):
        if slot == "charge_type_post":
            result_word = result_word + " " + words[i]
        else:
            break
    return result_word, "charge_type"

def concat_post(grand_truth):
    words, slots = [ i for i, j in grand_truth ], [ j for i, j in grand_truth ]
    new_words, new_slots = [], []
    for i, slot in enumerate(slots):
        if slot == "operator":
            w, s = concat_operator_post(words[i+1:], slots[i+1:], words[i])
            new_words.append(w)
            new_slots.append(s)
        elif slot == "bnumber":
            w, s = concat_bnumber_post(words[i+1:], slots[i+1:], words[i])
            new_words.append(w)
            new_slots.append(s)
        elif slot == "pnumber":
            w, s = concat_pnumber_post(words[i+1:], slots[i+1:], words[i])
            new_words.append(w)
            new_slots.append(s)
        elif slot == "charge_type":
            w, s = concat_charge_type_post(words[i+1:], slots[i+1:], words[i])
            new_words.append(w)
            new_slots.append(s)
        elif "_post" not in slot:
            new_words.append(words[i])
            new_slots.append(slots[i])
    return new_words, new_slots

In [3]:
labels_conv = {
    "Operatorname": "operator",
    "AMOUNT": "amount",
    "ORIGINACCOUNT": "bnumber",
    "MOBILENUMBER": "pnumber",
    "chargeType": "charge_type",
}

labels_conv_inv = {
    "operator": "Operatorname",
    "amount": "AMOUNT",
    "bnumber": "ORIGINACCOUNT",
    "pnumber": "MOBILENUMBER",
    "charge_type": "chargeType",
}

In [8]:
def create_prediction(pred, gt):
    pred_dict = {}
    for p in pred:
        pred_dict[p["name"]] = p["value"]

    if ("unit" in gt.keys()):
        if "توم" in gt["unit"]:
            gt["amount"] = gt["amount"] + "0"
    # print(pred_dict)
    # print(gt)
    prediction_list, grand_truth_list = [], []
    for k in gt.keys():
        if k == "unit":
            continue
        pred_key = labels_conv_inv[k]
        if (pred_key not in pred_dict.keys()) or (pred_dict[pred_key] == None):
            prediction_list.append("----")
        else:
            prediction_list.append(pred_dict[pred_key])
        grand_truth_list.append(gt[k])

    return grand_truth_list, prediction_list

In [19]:
with open("data.pickle", 'rb') as f:
    data = pickle.load(f)

grand_truths = []
for sentence, slots in zip(data["x"], data["y"]):
    gt = create_grand_truth(sentence, slots)
    new_words, new_slots = concat_post(gt)
    grand_truths.append(dict(list(zip(new_slots, new_words))))


api_url = "https://api.msgata.com/nlu"
accuracies = []
results = []
outputs = []

for i, sentence in tqdm(enumerate(data["x"][:2000])):
    sentence = " ".join(sentence)
    # print(sentence)
    mp_encoder = MultipartEncoder(fields={'text': sentence, "language": "fa", "userId": str(uuid.uuid4())})
    headers = {'Content-Type': mp_encoder.content_type, "devkey": "397124958FA659C9F1A5C7BC96788",
               "token": "12", "Cache-Control": "no-cache",
               "Pragma": "no-cache"}
    res = requests.post(api_url, data=mp_encoder, headers=headers)
    js = res.json()
    result = js['Response']['responses'][0]['entities']
    y, y_pred = create_prediction(result, grand_truths[i])
    results.append((y, y_pred))
    acc = accuracy_score(y, y_pred)
    accuracies.append(acc)
    outputs.append((y, y_pred))
    # print("-----------------")
    # print("")
    # print("")
    # sleep(10)

print(np.mean(accuracies))
print(np.std(accuracies))
print(np.round(np.mean(accuracies), 2))
print(np.round(np.std(accuracies), 2))

0it [00:00, ?it/s]

0.8759
0.16441316857235005


In [52]:
total_slots, missed_slots = 0, 0

for y, y_pred in outputs:
    for i in y:
        total_slots = total_slots + 1
        if i not in y_pred:
            missed_slots = missed_slots + 1

total_slots, total_slots-missed_slots, np.round((total_slots - missed_slots) /  total_slots, 2)

(6460, 5527, 0.86)

In [53]:
with open("output_txt/output.pickle", "wb") as f:
    pickle.dump(outputs, f)

In [45]:
for i, sentence in enumerate(data["x"][:2000]):
    sentence = " ".join(sentence)
    y, y_pred = outputs[i]

    with open("./output_txt/_{0}.txt".format(i), "w", encoding='utf-8') as f:
        lines = ["Sentence: \n", sentence+"\n", "Correct: \n",]
        # f.write('Sentence: \n')
        # f.write(sentence)
        # f.write('\n')
        # f.write('Correct: \n')
        for k in y:
            if k in y_pred:
                lines.append(k + ", ")
                # f.write(k)
                # f.write('\n')
        # f.write('Missed: \n')
        lines.append("\nMissed: \n")
        for k in y:
            if k not in y_pred:
                lines.append(k)
                # f.write(k)
                # f.write('\n')
        print(lines)
        f.writelines(lines)


['Sentence: \n', 'شارژ 6839494 ریالی اینترنت همراه اول واسه خودم میخواستم\n', 'Correct: \n', '6839494, ', 'اینترنت, ', 'همراه اول, ', '\nMissed: \n']
['Sentence: \n', 'واس کانتکت خودم توسط حساب 4455747125666513 شارژ 4590411 تومان همراه اول کنید\n', 'Correct: \n', '45904110, ', 'همراه اول, ', '\nMissed: \n', '4455747125666513']
['Sentence: \n', 'جون مادرت شارژ رایتل اینترنت 8272654 تومن می خوام بوسیله کارت غدچژذ چظخچپطع واس شماره رثو غدنژ رفخذخحض\n', 'Correct: \n', 'رایتل, ', 'اینترنت, ', '82726540, ', 'رثو غدنژ رفخذخحض, ', '\nMissed: \n', 'غدچژذ چظخچپطع']
['Sentence: \n', 'واس مخاطب 91731294043 بوسیله کارت شماره 6663331132434560 شارژ معمولی 8612242 تومانی رایتل بریزین\n', 'Correct: \n', 'معمولی, ', '86122420, ', 'رایتل, ', '\nMissed: \n', '91731294043', '6663331132434560']
['Sentence: \n', 'برای خوم شارژ ایرانسل 5137044 تومنی اینترنت می خری لطفا\n', 'Correct: \n', 'ایرانسل, ', '51370440, ', 'اینترنت, ', '\nMissed: \n']
['Sentence: \n', 'یه شارژ 9474626 تومنی سیم میخری\n', 'Correct: \n'

In [32]:
type(i)

int

In [None]:
print(results[-1])
print(data["x"][3])
print(y)
print(y_pred)

In [54]:


api_url = "https://api.msgata.com/nlu"
mp_encoder = MultipartEncoder(fields={'text': "واس بنده شارژ 15000 تومنی رایتل بخر از حساب 5859831022518434 برای شماره سبا خیاطی",
                                      "language": "fa", "userId": "10"})

# mp_encoder = MultipartEncoder(fields={'text': "خواهش می کنم شارژ خلیج فارس آنلاین اینترنت 8974748 تومن میگیرین به وسیله کارت شماره 3620409980983447 برای مخاطبین 57680905141",
#                                       "language": "fa", "userId": "10"})


# mp_encoder = MultipartEncoder(fields={'text': "واس کانتکت خودم توسط حساب 4455747125666513 شارژ 4590411 تومان همراه اول کنید",
#                                       "language": "fa", "userId": "10"})

mp_encoder = MultipartEncoder(fields={'text': " ".join(data["x"][3]),
                                      "language": "fa", "userId": "10"})

headers = {'Content-Type': mp_encoder.content_type, "devkey": "397124958FA659C9F1A5C7BC96788", "token": "12"}
res = requests.post(api_url, data=mp_encoder, headers=headers)
js = res.json()
js['Response']['responses'][0]['entities']


[{'name': 'chargeType', 'type': 'string', 'value': 'معمولی'},
 {'name': 'id', 'type': 'number', 'value': '1'},
 {'name': 'internet', 'type': 'boolean', 'value': 'false'},
 {'name': 'chargecode', 'type': 'number', 'value': '20'},
 {'name': 'AMOUNT', 'type': 'number', 'value': '86122420'},
 {'name': 'Operatorname', 'type': 'string', 'value': 'رایتل'},
 {'name': 'mobileoperator', 'type': 'string', 'value': 'RIT'},
 {'name': 'operatorId', 'type': 'number', 'value': '6'},
 {'name': 'vat', 'type': 'number', 'value': '0'},
 {'name': 'mobileNumber', 'type': 'number', 'value': None},
 {'name': 'originAccount', 'type': 'number', 'value': None}]

In [None]:
js['Response']['responses'][0]['entities']