# CCKS2022基于知识图谱的优质文章识别

In [None]:
# Install the packages listed in requirements.txt via the %pip magic.
%pip install -r requirements.txt

## 零、数据预处理

In [None]:
import pandas as pd
import numpy as np

train_df = pd.read_json('data/ccks2022/train.json', encoding='utf8', lines=True)
test_df = pd.read_json('data/ccks2022/test.unlabel.json', encoding='utf8', lines=True)


def _clean_text(text):
    """Remove layout noise and bracket punctuation from one document string.

    The replacements run in a fixed order: double spaces are deleted, then
    newlines, ``<br/>`` tags, ``《``, ``》`` and ``？`` each become one space,
    and finally ``【`` / ``】`` are deleted.
    """
    return (
        text.replace('  ', '')
        .replace('\n', ' ')
        .replace('<br/>', ' ')
        .replace("《", " ")
        .replace("》", " ")
        .replace("？", " ")
        .replace("【", "")
        .replace("】", "")
    )


def _add_text_column(frame):
    """Strip `title`/`content` in place and append the cleaned `text` column.

    `text` is ``lower(title) + '[SEP]' + lower(content)``. The two parts are
    lower-cased *before* joining so the separator stays the literal upper-case
    ``[SEP]`` in every split (lower-casing the joined string would turn it
    into ``[sep]``).
    """
    frame['title'] = frame['title'].apply(lambda x: x.strip())
    frame['content'] = frame['content'].fillna('').apply(lambda x: x.strip())
    joined = frame['title'].str.lower() + '[SEP]' + frame['content'].str.lower()
    # Assign the whole column at once; writing through `frame['text'].iloc[i]`
    # is chained assignment and is not guaranteed to update `frame`.
    frame['text'] = joined.apply(_clean_text)


# ---------------- Feature engineering ----------------
# Encode the label column as integer codes; `lbl` holds the original values.
train_df['label'], lbl = pd.factorize(train_df['label'])

# Combine title and content into a single `text` feature, identically for
# the training and the test split.
_add_text_column(train_df)
_add_text_column(test_df)

In [None]:
# Drop the raw metadata columns and move `entities` to the last position.
# `entities` is selected by name instead of by a negative positional slice,
# so the result no longer depends on the column order of the input JSON.
train_out = pd.concat(
    [
        train_df.drop(['url', 'title', 'pub_time', 'content', 'entities'], axis=1),
        train_df[['entities']],
    ],
    axis=1,
)
train_out.to_csv('data/ccks2022/train.tsv', encoding='utf8', sep='\t', index=False)

test_out = pd.concat(
    [
        test_df.drop(['url', 'title', 'pub_time', 'content', 'entities'], axis=1),
        test_df[['entities']],
    ],
    axis=1,
)
test_out.to_csv('data/ccks2022/test.tsv', encoding='utf8', sep='\t', index=False)
train_out

In [None]:
# Append entity names to the training text.
import json

train_entities = []

for entity_map in train_df['entities']:
    entities = ""
    for key in entity_map:
        # The length is checked BEFORE appending, so the finished string may
        # run past 256 characters by at most one entity name. Once the limit
        # is reached nothing more can be added, hence `break`.
        # NOTE(review): the test-set cell checks the length AFTER appending
        # (strictly < 256) - confirm whether the two should be aligned.
        if len(entities) >= 256:
            break
        entities += "[SEP]" + key
    train_entities.append(entities)

train = train_out.drop(['entities'], axis=1)

# First 256 characters of the text followed by the entity suffix. The column
# is assigned as a whole; element-wise `train['text'].iloc[i] = ...` is
# chained assignment and is not guaranteed to write into `train`.
train['text'] = [
    text[:256] + suffix for text, suffix in zip(train['text'], train_entities)
]

train.to_csv('data/ccks2022/train.tsv', encoding='utf8', sep='\t', index=False)
train

In [None]:
# Append entity names to the test text and record how many entities each
# test row has (`test_entity_nums` is read again by the ensembling cell).
test_entities = []
test_entity_nums = []

for entity_map in test_df['entities']:
    entities = ""
    for key in entity_map:
        candidate = entities + "[SEP]" + key
        # Skip any name that would push the suffix to 256 characters or more;
        # a later, shorter name may still fit, so keep scanning.
        if len(candidate) < 256:
            entities = candidate
    test_entities.append(entities)
    # Every entity counts, including those that did not fit into the suffix.
    test_entity_nums.append(len(entity_map))

test = test_out.drop(['entities'], axis=1)

# First 256 characters of the text followed by the entity suffix. The column
# is assigned as a whole; element-wise `test['text'].iloc[i] = ...` is
# chained assignment and is not guaranteed to write into `test`.
test['text'] = [
    text[:256] + suffix for text, suffix in zip(test['text'], test_entities)
]

test.to_csv('data/ccks2022/test.tsv', encoding='utf8', sep='\t', index=False)
test

In [None]:
# Corpus for continued pre-training: the training frame without its `label`
# column, concatenated with the test frame.
# NOTE(review): axis=1 puts the two frames side by side (aligned on the row
# index) instead of stacking their rows - confirm this is the layout that
# ccks2022_for_pretraining.py expects.
# NOTE(review): no `sep` is passed, so this `.tsv` file is comma-separated -
# confirm against the reader.
train_for_pretraining = train.drop(columns=['label'])

data_for_pretraining = pd.concat((train_for_pretraining, test), axis=1)
data_for_pretraining.to_csv(
    'data/ccks2022/data_for_pretraining/train.tsv',
    index=None,
    encoding='utf8',
)

In [None]:
# Hold out 20% of the labelled data as a dev set, with a fixed seed.
from sklearn.model_selection import train_test_split

train_data, val_data = train_test_split(train, random_state=2022, test_size=0.2)

# Both labelled splits are written tab-separated.
for split_frame, split_path in (
    (train_data, 'data/ccks2022/train.tsv'),
    (val_data, 'data/ccks2022/dev.tsv'),
):
    split_frame.to_csv(split_path, sep='\t', index=None, encoding='utf8')

# NOTE(review): no `sep` here, so test.tsv is rewritten comma-separated,
# unlike the two files above - confirm against the prediction script.
test.to_csv('data/ccks2022/test.tsv', index=None, encoding='utf8')
len(train_data), len(val_data), len(test)

## 一、继续预训练

在预训练完成后将输出目录下的 `pytorch_model.pth.ep27` 重命名为 `pytorch_model.bin`，并将其与 `bert_config.json` 文件一起移动到包含 `vocab.txt` 的 `ccks2022/pretrained_models/nezha-base-wwm-pretrained/` 目录下。
> 注：已经放置了预训练好的模型在该目录下

In [None]:
!set CUDA_VISIBLE_DEVICES=0,1
!python ccks2022_for_pretraining.py \
  --task_name=text-clf \
  --do_train \
  --data_dir=data/ccks2022/data_for_pretraining/ \
  --bert_model=pretrained_models/nezha-base-wwm \
  --max_seq_length=512 \
  --train_batch_size=28 \
  --learning_rate=3e-5  \
  --num_train_epochs=27.0 \
  --output_dir=output/ccks2022/continue_pretraining/1/

## 二、微调训练

#### (1.1) 非 K 折训练 —— GRU to Attention

In [None]:
!set CUDA_VISIBLE_DEVICES=0,1
!python ccks2022_classifier.py \
  --task_name=text-clf \
  --do_train \
  --do_eval \
  --data_dir=data/ccks2022/ \
  --bert_model=pretrained_models/nezha-base-wwm \
  --max_seq_length=512 \
  --train_batch_size=32 \
  --eval_batch_size=32 \
  --learning_rate=3e-5  \
  --num_train_epochs=5.0 \
  --model=1 \
  --output_dir=output/ccks2022/1/

#### (1.2) 非 K 折的推理预测 —— GRU to Attention

In [None]:
# Run ccks2022_pred.py on data/ccks2022/ with the model files found in
# trained_models/ccks2022/; output goes to output/ccks2022/1/predict/.
# NOTE(review): training writes to output/ccks2022/1/, so the trained model
# presumably has to be copied into trained_models/ccks2022/ first - confirm.
!python ccks2022_pred.py \
  --data_dir=data/ccks2022/ \
  --trained_model_dir=trained_models/ccks2022/ \
  --output_dir=output/ccks2022/1/predict/ \
  --eval_batch_size=64 \
  --max_seq_length=512

#### (1.3) 非 K 折训练 —— Attention to GRU

In [None]:
!set CUDA_VISIBLE_DEVICES=0,1
!python ccks2022_classifier.py \
  --task_name=text-clf \
  --do_train \
  --do_eval \
  --data_dir=data/ccks2022/ \
  --bert_model=pretrained_models/nezha-base-wwm \
  --max_seq_length=512 \
  --train_batch_size=32 \
  --eval_batch_size=32 \
  --learning_rate=3e-5  \
  --num_train_epochs=7.0 \
  --model=2 \
  --output_dir=output/ccks2022/2/

#### (1.4) 非 K 折的推理预测 —— Attention to GRU

In [None]:
# Run ccks2022_pred.py on data/ccks2022/ with the model files found in
# trained_models/ccks2022/; output goes to output/ccks2022/2/predict/.
# NOTE(review): this reads the same trained_model_dir as the model-1
# prediction cell - presumably the directory contents must be swapped to the
# model-2 checkpoint before running; confirm.
!python ccks2022_pred.py \
  --data_dir=data/ccks2022/ \
  --trained_model_dir=trained_models/ccks2022/ \
  --output_dir=output/ccks2022/2/predict/ \
  --eval_batch_size=64 \
  --max_seq_length=512

#### (2) 5 折训练

In [None]:
# K-fold fine-tuning via ccks2022_classifier_kfold.py, starting from the
# continued-pretraining checkpoint directory nezha-base-wwm-pretrained.
# This cell does not set CUDA_VISIBLE_DEVICES itself; the former
# `!set CUDA_VISIBLE_DEVICES=0,1` line is disabled.
!python ccks2022_classifier_kfold.py \
  --task_name=text-clf \
  --do_train \
  --data_dir=data/ccks2022/ \
  --bert_model=pretrained_models/nezha-base-wwm-pretrained \
  --max_seq_length=512 \
  --train_batch_size=30 \
  --learning_rate=4e-5  \
  --num_train_epochs=6.0 \
  --output_dir=output/ccks2022/kfold/1/

## 三、结果融合与数据增强

In [None]:
# pandas must be bound to `pd`; binding it to `np` would shadow numpy.
import pandas as pd

# One predicted label per line, one file per model.
df1 = pd.read_csv('./output/ccks2022/1/predict/test_results.txt', encoding='utf8', header=None, names=['label'])
df2 = pd.read_csv('./output/ccks2022/2/predict/test_results.txt', encoding='utf8', header=None, names=['label'])
df3 = pd.read_csv('./output/ccks2022/kfold/1/test_results_2.txt', encoding='utf8', header=None, names=['label'])

# A length mismatch would silently misalign the vote (NaN sums), so fail early.
assert len(df1) == len(df2) == len(df3), "prediction files differ in length"

# Majority vote: `df_temp` holds the per-row sum of the three labels and a
# row is labelled 1 when the sum is at least 2, otherwise 0.
df_temp = pd.DataFrame({'label': df1['label'] + df2['label'] + df3['label']})
df_out = pd.DataFrame({'label': (df_temp['label'] >= 2).astype('int64')})

assert len(df_out) == len(df1)
assert len(test_entity_nums) == len(df_out), "entity counts do not match predictions"

# Entity-count override: rows with at least 50 entities are forced to 1 and
# rows with at most 12 entities are forced to 0 (the two ranges are disjoint).
# `.loc` with a boolean mask writes into `df_out` directly, unlike the
# chained `df_out['label'].iloc[i] = ...`.
entity_counts = pd.Series(test_entity_nums, index=df_out.index)
df_out.loc[entity_counts >= 50, 'label'] = 1
df_out.loc[entity_counts <= 12, 'label'] = 0

df_out

## 四、结果输出

In [None]:
import json
import os
import pandas as pd

# Re-read the unlabeled test set to recover the `url` of every row and pair
# it, by position, with the ensembled label in `df_out`.
df = pd.read_json('data/ccks2022/test.unlabel.json', encoding='utf8', lines=True)
labels = df_out
assert len(labels) == len(df), "number of predictions does not match the test set"

# One JSON object per line: {"url": ..., "label": ...}.
records = [
    json.dumps({'url': url, 'label': int(label)}, ensure_ascii=False)
    for url, label in zip(df['url'], labels['label'])
]
res = "".join(record + "\n" for record in records)

# Preview: the first 100 characters of the first output line.
if records:
    print((records[0] + "\n")[:100])

# Create the output directory if needed; the `with` block closes the file.
os.makedirs("result", exist_ok=True)
with open("result/result.txt", encoding="utf-8", mode='w') as f:
    f.write(res)