# Baidu Cloud Chinese Semantic Checker

This notebook checks the semantics of Chinese text (T-units and sentences) using Baidu Cloud's API.

Author: Zexin Xu, Zilu Zhang

## Data Preprocessing

This section only preprocesses the data and is specific to this dataset. Do not run it on other datasets.

* `tunit_df` contains the T-unit data
* `sen_df` contains the sentence data

In [None]:
import pandas as pd

In [None]:
# Load the spreadsheet, discard the unused 'say' column, drop rows that lack
# either the text or its label, and renumber the remaining rows from 0.
df = (
    pd.read_excel('mysen_edit.xlsx')
    .drop(columns=['say'])
    .dropna(subset=['sentences', 'correct_final'])
    .reset_index(drop=True)
)
# Remove the literal "_x000D_" marker together with its newline first, then
# any newlines that are still left in the text.
df['sentences'] = (
    df['sentences']
    .str.replace(r'_x000D_\n', '', regex=True)
    .str.replace(r'\n', '', regex=True)
)
df.head()
#NOTE Check if there is any empty cell in 'correct' or 'sen'
# df['correct'].isnull().values.any()
# df['sen'].isnull().values.any()

In [None]:
# T-unit table: one row per row of df, keeping only the text and its label
# under the column names used by the rest of the notebook.
tunit_df = df[['sentences', 'correct_final']].rename(
    columns={'sentences': 'sentence', 'correct_final': 'ground_truth_label'}
)
tunit_df.head()

In [None]:
# Merge consecutive rows of df into full sentences.
# A row with sen == 0 is joined to the following row with a Chinese comma;
# any other value of sen closes the sentence with a Chinese full stop.
# A sentence's label is the logical AND of the labels of all its rows.
# Rows after the last closing row are never emitted (nothing flushes them).
sent = ""
correct = True
sent_arr = []
correct_arr = []
for _, row in df.iterrows():
    sent += row['sentences']
    correct = correct and row['correct_final']
    if row['sen'] == 0:
        sent += "，"
    else:
        sent += "。"
        # str.replace returns a new string, so the result must be assigned
        # back; the bare call used previously had no effect.
        sent = sent.replace("_x000D_\n", "")
        sent_arr.append(sent)
        correct_arr.append(correct)
        # Start accumulating the next sentence.
        sent = ""
        correct = True

sen_df = pd.DataFrame({
    'sentence': sent_arr,
    'ground_truth_label': correct_arr
})
sen_df.head()

## Baidu API

In [None]:
import requests
import json

# Baidu Cloud credentials; fill these in before running the cell.
API_key = ""
Secret_key = ""


def get_access_token():
    """
    Request an access token from Baidu's OAuth endpoint using the API key (AK)
    and secret key (SK) defined above.

    :return: the access token as a string, or None if the response does not
        contain an "access_token" field
    """
    url = "https://aip.baidubce.com/oauth/2.0/token"
    # Use the names actually defined in this cell; the previous references to
    # API_KEY / SECRET_KEY raised NameError.
    params = {"grant_type": "client_credentials", "client_id": API_key, "client_secret": Secret_key}
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    token = requests.post(url, params=params, timeout=30).json().get("access_token")
    # Do not wrap a missing token in str(): that would yield the string "None"
    # and silently produce an invalid request URL later on.
    return None if token is None else str(token)

access_token = get_access_token()
if access_token is None:
    # Fail here, where the cause is obvious, instead of in a later API call.
    raise RuntimeError("Could not obtain a Baidu access token; check API_key and Secret_key.")

In [None]:
import os

# Send every row of sen_df to Baidu's ecnet endpoint and store the result.
# NOTE(review): the variable is called baidu_tunit_df but holds sentence data
# and is saved as sen_df_result.csv; presumably this cell is re-run with
# tunit_df and 'baidu/tunit_df_result.csv' for the T-unit data -- confirm.
baidu_tunit_df = sen_df.copy()

url = "https://aip.baidubce.com/rpc/2.0/nlp/v1/ecnet?charset=UTF-8&access_token=" + access_token

headers = {
    'Content-Type': 'application/json',
    'Accept': 'application/json'
}

# Create the output directory up front so the results of all the API calls
# are not lost on the final to_csv.
os.makedirs('baidu', exist_ok=True)

for i, row in baidu_tunit_df.iterrows():
    # The text column of sen_df / tunit_df is named 'sentence' (singular);
    # 'sentences' only exists in the raw df and raised KeyError here.
    payload = json.dumps({
        "text": row['sentence']
    })
    response = requests.request("POST", url, headers=headers, data=payload, timeout=30).json()
    item = response.get('item')
    if item is None:
        # The response carries no result; report it in full rather than
        # failing with a bare KeyError.
        raise RuntimeError("Baidu ecnet request failed at row %s: %s" % (i, response))
    # "纠错内容": the fragments returned by the API, stored as text.
    baidu_tunit_df.loc[i, "纠错内容"] = str(item['vec_fragment'])
    # "纠错结果": 0 when the returned score is positive, otherwise 1.
    baidu_tunit_df.loc[i, "纠错结果"] = 0 if item['score'] > 0 else 1
    if i % 50 == 0:
        print(i, "iters done...")

baidu_tunit_df.to_csv('baidu/sen_df_result.csv', index=False, encoding='utf-8-sig')

## Result preprocessing

In [6]:
# Reload the saved API results for both granularities.
# (tunit_df_result.csv is not written by any cell visible in this notebook.)
csv_encoding = 'utf-8-sig'
baidu_tunit_df = pd.read_csv('baidu/tunit_df_result.csv', encoding=csv_encoding)
baidu_sen_df = pd.read_csv('baidu/sen_df_result.csv', encoding=csv_encoding)

# Overwrite the label column with the labels from the in-memory tables
# (values are matched by row index).
baidu_tunit_df = baidu_tunit_df.assign(ground_truth_label=tunit_df['ground_truth_label'])
baidu_sen_df = baidu_sen_df.assign(ground_truth_label=sen_df['ground_truth_label'])

# Save the relabelled results next to the originals.
baidu_tunit_df.to_csv('baidu/tunit_df_result_mod.csv', index=False, encoding=csv_encoding)
baidu_sen_df.to_csv('baidu/sen_df_result_mod.csv', index=False, encoding=csv_encoding)

## Confusion Matrix

In [None]:
def print_evaluation(golds, predictions):
    """
    Build an evaluation report comparing golds and predictions, each of which is a sequence of 0/1 labels.
    The report contains accuracy as well as precision/recall/F1 of the positive class (label 1), which can
    be informative if either the golds or predictions are highly biased.

    Despite its name, this function does not print anything; it returns the report so the caller can print it.

    :param golds: gold labels (any sized iterable, e.g. a list or a pandas Series)
    :param predictions: predicted labels, same length as golds
    :return: the report as a multi-line string
    :raises Exception: if golds and predictions have different lengths
    """
    if len(golds) != len(predictions):
        raise Exception("Mismatched gold/pred lengths: %i / %i" % (len(golds), len(predictions)))
    num_correct = 0
    num_pos_correct = 0
    num_pred = 0
    num_gold = 0
    num_total = len(golds)
    # Pair the labels by position. Indexing with golds[idx] is a label lookup
    # on a pandas Series and fails when its index is not 0..n-1.
    for gold, prediction in zip(golds, predictions):
        if prediction == gold:
            num_correct += 1
        if prediction == 1:
            num_pred += 1
        if gold == 1:
            num_gold += 1
        if prediction == 1 and gold == 1:
            num_pos_correct += 1
    # Every ratio falls back to 0.0 when its denominator is zero, so empty
    # input yields an all-zero report instead of ZeroDivisionError.
    acc = float(num_correct) / num_total if num_total > 0 else 0.0
    output_str = "Accuracy: %i / %i = %f" % (num_correct, num_total, acc)
    prec = float(num_pos_correct) / num_pred if num_pred > 0 else 0.0
    rec = float(num_pos_correct) / num_gold if num_gold > 0 else 0.0
    f1 = 2 * prec * rec / (prec + rec) if prec > 0 and rec > 0 else 0.0
    output_str += ";\nPrecision (fraction of predicted positives that are correct): %i / %i = %f" % (num_pos_correct, num_pred, prec)
    output_str += ";\nRecall (fraction of true positives predicted correctly): %i / %i = %f" % (num_pos_correct, num_gold, rec)
    output_str += ";\nF1 (harmonic mean of precision and recall): %f;\n" % f1
    return output_str

# Print one evaluation report per granularity: gold label vs. the "纠错结果" column.
for banner, result_df in (
    ("------ Tunit Evaluation ------", baidu_tunit_df),
    ("------ Sentence Evaluation ------", baidu_sen_df),
):
    print(banner)
    print(print_evaluation(result_df['ground_truth_label'], result_df['纠错结果']))