# 语病检测 - 科大讯飞

Author: Zexin Xu, Zilu Zhang

In [1]:
import pandas as pd

## 数据预处理
For this dataset only. Do not run this for other datasets.

* `tunit_df` holds one row per T-unit (each row of the cleaned spreadsheet)
* `sen_df` holds one row per full sentence (consecutive T-units joined together)

In [None]:
# Load the annotated spreadsheet, discard the unused 'say' column, keep only
# rows that have both a text and a label, and renumber the rows from 0.
df = (
    pd.read_excel('mysen_edit.xlsx')
    .drop(columns=['say'])
    .dropna(subset=['sentences', 'correct_final'])
    .reset_index(drop=True)
)
# Strip the '_x000D_' + newline markers first, then any newline that is left
# (presumably line-break residue from the Excel export -- confirm).
for pattern in (r'_x000D_\n', r'\n'):
    df['sentences'] = df['sentences'].str.replace(pattern, '', regex=True)
df.head()

#NOTE Check if there is any empty cell in 'correct' or 'sen'
# df['correct'].isnull().values.any()
# df['sen'].isnull().values.any()

In [None]:
# T-unit level table: the text and its label, renamed to the column names
# shared with the sentence-level table.
tunit_df = df[['sentences', 'correct_final']].rename(
    columns={
        'sentences': 'sentence',
        'correct_final': 'ground_truth_label',
    }
)
tunit_df.head()

In [None]:
# Build the sentence-level table by concatenating consecutive T-units.
# A row with sen == 0 is followed by a comma and the sentence continues; any
# other value closes the sentence with a full stop. A sentence is labelled
# correct only if every T-unit in it is labelled correct.
sent = ""
correct = True
sent_arr = []
correct_arr = []
for _, row in df.iterrows():
    sent += row['sentences']
    correct = correct and row['correct_final']
    if row['sen'] == 0:
        sent += "，"
    else:
        sent += "。"
        # str.replace returns a new string; the result must be assigned,
        # otherwise the clean-up is silently skipped.
        sent = sent.replace("_x000D_\n", "")
        sent_arr.append(sent)
        correct_arr.append(correct)
        # reset the accumulators for the next sentence
        sent = ""
        correct = True

# NOTE(review): T-units after the last sentence-closing row (sen != 0) are
# never appended -- confirm the data always ends on a closing row.
sen_df = pd.DataFrame({
    'sentence': sent_arr,
    'ground_truth_label': correct_arr
})
sen_df.head()

## 科大讯飞API调用

* Credit: https://www.xfyun.cn/doc/nlp/textCorrection/API.html#%E8%BF%94%E5%9B%9E%E7%BB%93%E6%9E%9C

In [14]:
# -*- coding:utf-8 -*-
from datetime import datetime
from wsgiref.handlers import format_date_time
from time import mktime
import hashlib
import base64
import hmac
from urllib.parse import urlencode
import json
import requests


class AssembleHeaderException(Exception):
    """Raised when a request URL cannot be split into host and path.

    The text is available both as ``.message`` and through ``str(exc)``.
    """

    def __init__(self, msg):
        # Forward to Exception so ``args``/``str()`` carry the message even
        # when ``msg`` is supplied as a keyword argument.
        super().__init__(msg)
        self.message = msg


class Url:
    """Plain holder for the three pieces of a split request URL."""

    def __init__(self, host, path, schema):
        # schema: the leading part of the URL up to and including "://"
        self.schema = schema
        # host: everything between the schema and the first "/"
        self.host = host
        # path: the remainder, starting at that "/"
        self.path = path


class WebsocketDemo:
    """Client for one text-correction request against the endpoint in ``self.url``.

    Despite the name, the request is a plain HTTPS POST sent with ``requests``.
    Each instance carries the credentials and the single text to be checked;
    call :meth:`get_result` to send it and obtain the decoded result.
    """

    # Seconds to wait for the HTTP response. Without a timeout a stalled
    # connection would block a batch run indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, APPId, APISecret, APIKey, Text):
        """Store the credentials and the text to check.

        :param APPId: application id, sent in the request header and body
        :param APISecret: secret used as the HMAC key when signing the request
        :param APIKey: key embedded in the authorization string
        :param Text: the text to be checked (a ``str``; it is UTF-8 encoded)
        """
        self.appid = APPId
        self.apisecret = APISecret
        self.apikey = APIKey
        self.text = Text
        self.url = 'https://api.xf-yun.com/v1/private/s9a87e3ec'

    def sha256base64(self, data):
        """Return the base64 text of the SHA-256 digest of ``data`` (bytes).

        Not called by the other methods of this class.
        """
        sha256 = hashlib.sha256()
        sha256.update(data)
        digest = base64.b64encode(sha256.digest()).decode(encoding='utf-8')
        return digest

    def parse_url(self, requset_url):
        """Split ``requset_url`` into schema, host and path.

        :param requset_url: URL of the form ``<schema>://<host>/<path>``
        :return: a ``Url`` holding the three parts
        :raises AssembleHeaderException: if the URL has no ``://`` separator,
            an empty host, or no ``/`` after the host
        """
        # str.find returns -1 instead of raising, so malformed URLs reach the
        # checks below and are reported with the dedicated exception type.
        stidx = requset_url.find("://")
        if stidx < 0:
            raise AssembleHeaderException("invalid request url:" + requset_url)
        host = requset_url[stidx + 3:]
        schema = requset_url[:stidx + 3]
        edidx = host.find("/")
        if edidx <= 0:
            raise AssembleHeaderException("invalid request url:" + requset_url)
        path = host[edidx:]
        host = host[:edidx]
        u = Url(host, path, schema)
        return u

    # build the signed request url
    def assemble_ws_auth_url(self, requset_url, method="POST", api_key="", api_secret=""):
        """Return ``requset_url`` with signed ``host``/``date``/``authorization`` query parameters.

        The signature is an HMAC-SHA256 (keyed with ``api_secret``) over the
        host, the current date and the request line, base64-encoded and
        embedded in a base64-encoded authorization string.

        :param requset_url: endpoint URL to sign
        :param method: HTTP method placed in the signed request line
        :param api_key: key placed in the authorization string
        :param api_secret: secret used as the HMAC key
        :return: the URL followed by ``?`` and the url-encoded parameters
        :raises AssembleHeaderException: if the URL cannot be parsed
        """
        u = self.parse_url(requset_url)
        host = u.host
        path = u.path
        # Current time formatted as an HTTP date,
        # e.g. "Thu, 12 Dec 2019 01:57:27 GMT".
        now = datetime.now()
        date = format_date_time(mktime(now.timetuple()))
        signature_origin = "host: {}\ndate: {}\n{} {} HTTP/1.1".format(host, date, method, path)
        signature_sha = hmac.new(api_secret.encode('utf-8'), signature_origin.encode('utf-8'),
                                 digestmod=hashlib.sha256).digest()
        signature_sha = base64.b64encode(signature_sha).decode(encoding='utf-8')
        authorization_origin = "api_key=\"%s\", algorithm=\"%s\", headers=\"%s\", signature=\"%s\"" % (
            api_key, "hmac-sha256", "host date request-line", signature_sha)
        authorization = base64.b64encode(authorization_origin.encode('utf-8')).decode(encoding='utf-8')
        values = {
            "host": host,
            "date": date,
            "authorization": authorization
        }

        return requset_url + "?" + urlencode(values)

    def get_body(self):
        """Build the JSON-serialisable request body for ``self.text``.

        The text is UTF-8 encoded and base64-wrapped under
        ``payload.input.text``; the result is requested as raw UTF-8 JSON.
        NOTE(review): the meaning of ``status: 3`` is defined by the API and
        is not evident from this file.
        """
        body = {
            "header": {
                "app_id": self.appid,
                "status": 3,
                # "uid": "your_uid"
            },
            "parameter": {
                "s9a87e3ec": {
                    # "res_id": "your_res_id",
                    "result": {
                        "encoding": "utf8",
                        "compress": "raw",
                        "format": "json"
                    }
                }
            },
            "payload": {
                "input": {
                    "encoding": "utf8",
                    "compress": "raw",
                    "format": "plain",
                    "status": 3,
                    "text": base64.b64encode(self.text.encode("utf-8")).decode('utf-8')
                }
            }
        }
        return body

    def get_result(self):
        """Send the request and return the decoded correction result.

        :return: the object obtained by base64-decoding and JSON-parsing
            ``payload.result.text`` of the response
        :raises KeyError: if the response has no ``payload.result.text``; the
            message includes the HTTP status and the response body
        :raises requests.exceptions.RequestException: on connection problems
            or when no response arrives within ``REQUEST_TIMEOUT`` seconds
        """
        request_url = self.assemble_ws_auth_url(self.url, "POST", self.apikey, self.apisecret)
        headers = {'content-type': "application/json", 'host': 'api.xf-yun.com', 'app_id': self.appid}
        body = self.get_body()
        response = requests.post(request_url, data=json.dumps(body), headers=headers,
                                 timeout=self.REQUEST_TIMEOUT)
        envelope = json.loads(response.content.decode())
        try:
            encoded_text = envelope['payload']['result']['text']
        except KeyError as err:
            # Keep the exception type, but show what the server actually
            # returned instead of just the missing key.
            raise KeyError(
                "no payload.result.text in API response (HTTP %s): %s"
                % (response.status_code, envelope)
            ) from err
        return json.loads(base64.b64decode(encoded_text).decode())


In [None]:
def get_result(result):
    """Turn one error-category list from the API into a table cell value.

    :param result: a sized object (e.g. the list for one error category)
    :return: ``pd.NA`` when ``result`` is empty; otherwise ``str(result)``,
        which is also printed
    """
    if len(result) <= 0:
        return pd.NA
    rendered = str(result)
    print(rendered)
    return rendered
    
def generate_result(df, app_id="", api_secret="", api_key=""):
    """Run every text in ``df`` through the correction API and store the findings.

    For each row one request is sent; each error category of the response is
    written into its own column of ``df`` (``pd.NA`` when the category is
    empty, otherwise the stringified list). ``df`` is modified in place.

    :param df: frame with a ``sentence`` column (``sentences`` is accepted as
        a fallback); its index must be integer, it drives the progress print
    :param app_id: API application id (defaults to the empty placeholder)
    :param api_secret: API secret (defaults to the empty placeholder)
    :param api_key: API key (defaults to the empty placeholder)
    :return: None
    :raises KeyError: if ``df`` has neither text column, or the API response
        lacks one of the expected categories
    """
    # The frames built in this notebook name the text column 'sentence'.
    if 'sentence' in df.columns:
        text_column = 'sentence'
    elif 'sentences' in df.columns:
        text_column = 'sentences'
    else:
        raise KeyError(
            "expected a 'sentence' or 'sentences' column, got: %s" % list(df.columns)
        )

    # response key -> output column (insertion order fixes the column order)
    category_columns = {
        'pol': "政治术语纠错",                 # political terms
        'char': "别字纠错",                    # wrong characters
        'word': "别词纠错",                    # wrong words
        'redund': "语法纠错-冗余",             # grammar: redundancy
        'miss': "语法纠错-缺失",               # grammar: missing element
        'order': "语法纠错-乱序",              # grammar: word order
        'dapei': "搭配纠错",                   # collocation
        'punc': "标点纠错",                    # punctuation
        'idm': "成语纠错",                     # idioms
        'org': "机构名纠错",                   # organisation names
        'leader': "领导人职称纠错",            # leader titles
        'number': "数字纠错",                  # numbers
        'addr': "地名纠错",                    # place names
        'name': "全文人名纠错",                # person names
        'grammar_pc': "句式杂糅/语义重复",     # mixed patterns / semantic repetition
    }

    for i, row in df.iterrows():
        demo = WebsocketDemo(app_id, api_secret, api_key, row[text_column])
        result = demo.get_result()
        for key, column in category_columns.items():
            df.loc[i, column] = get_result(result[key])
        if i % 100 == 0:
            print(i, "iters done...")

# The T-unit API run is performed in the next cell, which also saves the
# result to CSV. Repeating it here would send every row to the API a second
# time and the first result would simply be overwritten.

In [None]:
from pathlib import Path

# Create the output folder before the API loop starts: to_csv does not
# create missing directories, and failing after the run would lose its results.
Path('kedaxunfei').mkdir(parents=True, exist_ok=True)

# Work on a copy so tunit_df itself keeps only text and label.
kd_tunit_df = tunit_df.copy()
generate_result(kd_tunit_df)
kd_tunit_df.to_csv('kedaxunfei/tunit_df_result.csv', index=False, encoding='utf-8-sig')

In [None]:
from pathlib import Path

# Create the output folder before the API loop starts: to_csv does not
# create missing directories, and failing after the run would lose its results.
Path('kedaxunfei').mkdir(parents=True, exist_ok=True)

# Work on a copy so sen_df itself keeps only text and label.
kd_sen_df = sen_df.copy()
generate_result(kd_sen_df)
kd_sen_df.to_csv('kedaxunfei/sen_df_result.csv', index=False, encoding='utf-8-sig')

## Result processing

In [11]:
# Reload the saved API output for both granularities (T-unit first, then sentence).
kd_tunit_result, kd_sen_result = [
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (
        'kedaxunfei/tunit_df_result.csv',
        'kedaxunfei/sen_df_result.csv',
    )
]

def result_processing(df, tru_df):
    """Derive the per-row error count and the predicted label, in place.

    Adds/overwrites three columns of ``df``:

    * ``纠错数`` -- number of error-category columns holding a finding
      (i.e. a non-null cell)
    * ``prediction_label`` -- ``True`` when no category has a finding
    * ``ground_truth_label`` -- copied from ``tru_df`` (aligned on the index)

    :param df: API result frame, one column per error category
    :param tru_df: frame providing ``ground_truth_label``
    :return: None
    """
    # Count only the error-category columns. Counting nulls across the whole
    # row would be thrown off by a null in any other column (e.g. an empty
    # text that comes back from CSV as NaN).
    category_columns = (
        "政治术语纠错", "别字纠错", "别词纠错",
        "语法纠错-冗余", "语法纠错-缺失", "语法纠错-乱序",
        "搭配纠错", "标点纠错", "成语纠错", "机构名纠错",
        "领导人职称纠错", "数字纠错", "地名纠错", "全文人名纠错",
        "句式杂糅/语义重复",
    )
    present = [col for col in category_columns if col in df.columns]
    df['纠错数'] = df[present].notna().sum(axis='columns')
    # A row is predicted correct exactly when the API reported nothing.
    df['prediction_label'] = df['纠错数'] <= 0
    df['ground_truth_label'] = tru_df['ground_truth_label']
    
# Label each reloaded result table and write it next to the raw output
# (T-unit level first, then sentence level).
for frame, truth, out_path in (
    (kd_tunit_result, tunit_df, 'kedaxunfei/tunit_df_result_mod.csv'),
    (kd_sen_result, sen_df, 'kedaxunfei/sen_df_result_mod.csv'),
):
    result_processing(frame, truth)
    frame.to_csv(out_path, index=False, encoding='utf-8-sig')

## Confusion Matrix 混淆矩阵

In [None]:
def print_evaluation(golds, predictions):
    """
    Builds evaluation statistics comparing golds and predictions, each of which is a sequence of 0/1
    (or False/True) labels. Despite the name, nothing is printed: the report is returned as a string.
    Reports accuracy as well as precision/recall/F1 of the positive class (label 1/True), which can be
    informative if either the golds or predictions are highly biased.

    The two sequences are paired by position, so pandas Series with any index are accepted.
    For empty input every ratio is reported as 0.

    :param golds: gold labels
    :param predictions: pred labels
    :return: the multi-line report string
    :raises Exception: if the two sequences differ in length
    """
    if len(golds) != len(predictions):
        raise Exception("Mismatched gold/pred lengths: %i / %i" % (len(golds), len(predictions)))
    num_correct = 0
    num_pos_correct = 0
    num_pred = 0
    num_gold = 0
    num_total = 0
    # zip pairs the labels positionally; indexing a Series with 0..n-1 would
    # be a label lookup and fail for any other index.
    for gold, prediction in zip(golds, predictions):
        if prediction == gold:
            num_correct += 1
        if prediction == 1:
            num_pred += 1
        if gold == 1:
            num_gold += 1
        if prediction == 1 and gold == 1:
            num_pos_correct += 1
        num_total += 1
    acc = float(num_correct) / num_total if num_total > 0 else 0.0
    output_str = "Accuracy: %i / %i = %f" % (num_correct, num_total, acc)
    prec = float(num_pos_correct) / num_pred if num_pred > 0 else 0.0
    rec = float(num_pos_correct) / num_gold if num_gold > 0 else 0.0
    f1 = 2 * prec * rec / (prec + rec) if prec > 0 and rec > 0 else 0.0
    output_str += ";\nPrecision (fraction of predicted positives that are correct): %i / %i = %f" % (num_pos_correct, num_pred, prec)
    output_str += ";\nRecall (fraction of true positives predicted correctly): %i / %i = %f" % (num_pos_correct, num_gold, rec)
    output_str += ";\nF1 (harmonic mean of precision and recall): %f;\n" % f1
    return output_str

# Reload the labelled tables and print one evaluation report per granularity.
kd_tunit_result = pd.read_csv('kedaxunfei/tunit_df_result_mod.csv', encoding='utf-8')
kd_sen_result = pd.read_csv('kedaxunfei/sen_df_result_mod.csv', encoding='utf-8')
for heading, frame in (
    ("------ Tunit Evaluation ------", kd_tunit_result),
    ("------ Sentence Evaluation ------", kd_sen_result),
):
    print(heading)
    print(print_evaluation(frame['ground_truth_label'], frame['prediction_label']))