## 数据处理脚本

In [2]:
import os
import pandas as pd
import json
import numpy as np
import random
from copy import copy
from tqdm import tqdm
from data import preProcess
import random
data_path = 'data'

In [2]:
# Load the label dictionary sheet and index every label path by its
# terminal ('末级') label.
df = pd.read_excel(os.path.join(data_path, '事项分类.xlsx'), sheet_name='字典')
print(df.columns.tolist())
label_cols = ['一级', '二级', '三级', '四级', '五级', '六级']
label_set = {}
for row in tqdm(df.index.values):
    last_label = str(df.loc[row, '末级'])
    # Keep only the levels that are actually filled in for this row.
    labels = [
        str(df.loc[row, lv])
        for lv in label_cols
        if not pd.isnull(df.loc[row, lv])
    ]
    if last_label not in label_set:
        label_set[last_label] = labels
        continue
    # Duplicate terminal label: the first path seen is kept; report the
    # clashing row together with the path that is already stored.
    print(row, last_label)
    print(label_set[last_label])

['末级', '一级', '二级', '三级', '四级', '五级', '六级']


100%|██████████| 1091/1091 [00:00<00:00, 10547.10it/s]

1074 其他
['政风行风党风', '违法违纪', '其他']
1090 其他
['政风行风党风', '违法违纪', '其他']





In [None]:
# Read both sheets of the 2022 workbook; the second sheet is given the
# first sheet's column names.
# NOTE(review): `names=` is combined with the default header row, so if
# 'Sheet1(2)' has no header line its first data row is consumed as the
# header — confirm against the workbook.
workbook_2022 = os.path.join(data_path, '2022底数.xlsx')
df1 = pd.read_excel(workbook_2022, sheet_name='Sheet1')
shared_columns = df1.columns.tolist()
df2 = pd.read_excel(workbook_2022, sheet_name='Sheet1(2)', names=shared_columns)
df1.columns.tolist()

In [None]:
# Drop rows that are empty in every column, then stack both sheets into
# one frame with a fresh 0..n-1 index.
df3, df4 = (sheet.dropna(axis=0, how='all') for sheet in (df1, df2))
print(len(df3), len(df4))
df = pd.concat([df3, df4], ignore_index=True)
print(len(df))

1048575 302014
1350589


In [19]:
class LabelTreeNode:
    """Node of a label hierarchy whose children are keyed by label text."""

    def __init__(self, label) -> None:
        self.label = label
        # Maps child label -> child node; iteration follows insertion order.
        self.children = {}

    def add_child(self, label):
        """Create a child named ``label`` unless one already exists.

        An existing child (and its subtree) is left untouched. Always
        returns ``None``.
        """
        # Membership is tested on the dict itself instead of materialising
        # a list of keys and scanning it on every call.
        if label not in self.children:
            self.children[label] = LabelTreeNode(label)

    def get_child_by_label(self, label):
        """Return the direct child named ``label``, or ``None`` if absent."""
        return self.children.get(label)
        
def dfs_write_label(cur_node: LabelTreeNode, cur_label, filestream):
    """Write the full path of ``cur_node`` and of every descendant.

    Each path is written on its own line, levels joined by ``'##'``, in
    pre-order (a node before its children, children in insertion order).

    Args:
        cur_node: Subtree root to dump.
        cur_label: Path prefix accumulated so far; ``''`` for a top-level
            node, otherwise the parent path followed by ``'##'``.
        filestream: Object with a ``write(str)`` method.
    """
    path = cur_label + cur_node.label
    filestream.write(path + '\n')
    child_prefix = path + '##'
    for child in cur_node.children.values():
        dfs_write_label(child, child_prefix, filestream)

def dfs_get_label(target, cur_node: LabelTreeNode, cur_label, label_list) -> bool:
    """Depth-first search for the first node whose label equals ``target``.

    The match is by node name only, so when several nodes share a name the
    first one met in pre-order (children in insertion order) wins.

    Args:
        target: Label text to look for.
        cur_node: Subtree root to search.
        cur_label: Path prefix accumulated so far; ``''`` for a top-level
            node, otherwise the parent path followed by ``'##'``.
        label_list: Output list. On success the ``'##'``-joined path of the
            match and of each ancestor inside the searched subtree is
            appended, deepest first (callers reverse it).

    Returns:
        True if a matching node was found in this subtree, else False.
    """
    path = cur_label + cur_node.label
    if cur_node.label == target:
        label_list.append(path)
        return True

    child_prefix = path + '##'
    # any() stops at the first subtree that reports a hit.
    found = any(
        dfs_get_label(target, child, child_prefix, label_list)
        for child in cur_node.children.values()
    )
    if found:
        label_list.append(path)
    return found
    

In [7]:
from config import config
# Build the labelled sample list and the label tree from the 2022 data.
total_item = []
label_tree = LabelTreeNode('root')
max_sequnce_len = 0
total_len = 0

for row in tqdm(df.index.values):
    # The 'SERIALNUM' column is not needed for the training samples; the
    # former unused assignment to `id` (which shadowed the builtin) is gone.
    title = str(df.loc[row, '标题'])
    # NOTE(review): a missing title becomes the literal string 'nan' here,
    # while missing content is mapped to '' — confirm this is intended.
    content = df.loc[row, '内容']
    text = preProcess('' if pd.isnull(content) else str(content))

    # Length statistics cover every row, including rows skipped below.
    max_sequnce_len = max(max_sequnce_len, len(text))
    total_len += len(text)

    # Rows whose terminal label is not in the label dictionary are dropped.
    last_label = str(df.loc[row, 'ZEROLABEL'])
    if last_label not in label_set:
        continue
    labels = label_set[last_label]
    total_item.append({
        'title': title,
        'text': text,
        'label': labels
    })

    # Insert the label path into the tree, one level at a time.
    curNode = label_tree
    for label in labels:
        curNode.add_child(label)
        curNode = curNode.get_child_by_label(label)

100%|██████████| 1350589/1350589 [01:20<00:00, 16837.37it/s]


In [12]:
# Display the top-level label names in the order they were inserted.
label_tree.children.keys()

dict_keys(['城乡建设', '劳动社保', '市场监管', '经济财贸', '其他', '农林牧渔', '交通运输', '公共安全', '公卫医疗', '民政社区', '政风行风党风', '科教文体', '自然资源与环境保护', '司法行政'])

In [8]:
# Report text-length statistics; the mean is taken over all rows of df,
# not only over the rows that were kept in total_item.
row_count = len(df.index.values)
print('最大句子长度：', max_sequnce_len)
print('平均句子长度：', total_len / row_count)

最大句子长度： 1418
平均句子长度： 170.40746000448692


In [26]:
# Look up one label by name and print its path, shallowest level first.
label_list = []
target = '新冠'
for top_node in label_tree.children.values():
    # Stop at the first top-level subtree that contains the target.
    if dfs_get_label(target, top_node, '', label_list):
        break
# dfs_get_label appends deepest-first; flip to root-to-leaf order.
label_list.reverse()
print(label_list)

['公卫医疗', '公卫医疗##公共卫生', '公卫医疗##公共卫生##新冠']


In [24]:
# Write label.txt: one '##'-joined label path per line for every node
# below the (unwritten) root.
with open(os.path.join(config.path.data, 'label.txt'), 'w') as f:
    for top_node in label_tree.children.values():
        dfs_write_label(top_node, '', f)


In [29]:
# Reset the split containers and shuffle the samples in place.
# NOTE(review): no random seed is set in the visible code, so the split
# differs between runs — confirm whether reproducibility is needed.
train_item, dev_item, test_item = [], [], []
random.shuffle(total_item)

In [30]:
# Split the shuffled samples by position: first 80% train, then two 10%
# slices.
need_test_set = False
sample_count = len(total_item)
train_end = sample_count * 8 // 10
dev_end = sample_count * 9 // 10

train_item = total_item[:train_end]
if need_test_set:
    dev_item = total_item[train_end:dev_end]
    test_item = total_item[dev_end:]
    print('划分成 训练集-样本{}条，验证集-样本{}条，测试集-样本{}条'.format(len(train_item), len(dev_item), len(test_item)))
else:
    # NOTE(review): only the last 10% is used for validation here, so the
    # 80%-90% slice ends up in no split — confirm this is intentional.
    dev_item = total_item[dev_end:]
    print('划分成 训练集-样本{}条，验证集-样本{}条'.format(len(train_item), len(dev_item)))

划分成 训练集-样本825836条，验证集-样本103230条


In [34]:
# Write train.txt: '<title>:<text>\t<path1>,<path2>,...' per sample, where
# the paths are the cumulative '##'-joined prefixes of the sample's labels.
with open(os.path.join(config.path.data, 'train.txt'), 'w') as f:
    for item in tqdm(train_item):
        labels = item['label']
        # Skip samples without text; a sample with an empty label path has
        # nothing to write either (it previously raised IndexError).
        if item['text'] == '' or not labels:
            continue
        # Build the hierarchy straight from the sample's own label path.
        # The earlier name-based tree search on the leaf label returned the
        # first node with that name, which is the wrong branch whenever a
        # name occurs more than once in the tree (e.g. '其他'), and it
        # walked the tree once per sample.
        label_list = [
            '##'.join(labels[:depth]) for depth in range(1, len(labels) + 1)
        ]
        f.write(item['title'] + ':' + item['text'] + '\t')
        f.write(','.join(label_list) + '\n')
        # NOTE(review): this extra newline leaves a blank line after every
        # sample; dev.txt is written without it. Kept to preserve the file
        # format — confirm the reader expects it.
        f.write('\n')


  0%|          | 978/825836 [00:00<02:51, 4814.94it/s]

100%|██████████| 825836/825836 [02:52<00:00, 4789.24it/s]


In [35]:

# Write dev.txt: '<title>:<text>\t<path1>,<path2>,...' per sample, where
# the paths are the cumulative '##'-joined prefixes of the sample's labels.
with open(os.path.join(config.path.data, 'dev.txt'), 'w') as f:
    for item in tqdm(dev_item):
        labels = item['label']
        # Skip samples without text; a sample with an empty label path has
        # nothing to write either (it previously raised IndexError).
        if item['text'] == '' or not labels:
            continue
        # Build the hierarchy straight from the sample's own label path.
        # The earlier name-based tree search on the leaf label returned the
        # first node with that name, which is the wrong branch whenever a
        # name occurs more than once in the tree (e.g. '其他'), and it
        # walked the tree once per sample.
        label_list = [
            '##'.join(labels[:depth]) for depth in range(1, len(labels) + 1)
        ]
        f.write(item['title'] + ':' + item['text'] + '\t')
        f.write(','.join(label_list) + '\n')


100%|██████████| 103230/103230 [00:21<00:00, 4770.96it/s]


In [None]:

# Write test.txt in the same layout as train/dev:
# '<title>:<text>\t<path1>,<path2>,...'.
if need_test_set:
    with open(os.path.join(config.path.data, 'test.txt'), 'w') as f:
        for item in test_item:
            labels = item['label']
            if item['text'] == '' or not labels:
                continue
            # Samples carry their label path under 'label'; the keys
            # 'label_1'..'label_4' read here before are never set on the
            # items, so this branch raised KeyError as soon as it ran.
            label_list = [
                '##'.join(labels[:depth]) for depth in range(1, len(labels) + 1)
            ]
            # Title and text are joined with ':' like in the other writers.
            f.write(
                item['title'] + ':' + item['text'] + '\t'
                + ','.join(label_list) + '\n'
            )

### 使用2023的数据生成测试集

In [4]:
# Load the 2023 samples and collect id / title / cleaned text / raw text.
df = pd.read_excel(os.path.join(data_path, '2023样本0904.xlsx'))
total_item = []
max_sequnce_len = 0
total_len = 0

for row in tqdm(df.index.values):
    # Named order_id rather than `id` so the builtin is not shadowed.
    order_id = str(df.loc[row, '工单编号'])
    title = str(df.loc[row, '标题'])
    # Read the content cell once; it feeds both the cleaned and raw text.
    content = df.loc[row, '内容']
    text = preProcess('' if pd.isnull(content) else str(content))
    max_sequnce_len = max(max_sequnce_len, len(text))
    total_len += len(text)

    total_item.append({
        'id': order_id,
        'title': title,
        'text': text,
        # NOTE(review): missing content is stored as the string 'nan' here
        # (str() of the null cell) — confirm that is wanted in the output.
        'ori_text': str(content)
    })

100%|██████████| 865381/865381 [00:45<00:00, 19029.45it/s]


In [5]:
# Report text-length statistics over all rows of the 2023 frame.
row_count = len(df.index.values)
print('最大句子长度：', max_sequnce_len)
print('平均句子长度：', total_len / row_count)

最大句子长度： 2379
平均句子长度： 169.9272540071945


In [None]:
# Write one '<title>:<text>' line per sample that has non-empty text.
# NOTE(review): samples with empty text are left out here but are kept in
# the result table built afterwards, so the two are not aligned row by
# row — verify against whatever consumes data.txt.
with open('data/data.txt', 'w') as f:
    f.writelines(
        item['title'] + ':' + item['text'] + '\n'
        for item in total_item
        if item['text'] != ''
    )

In [7]:
# Assemble the result table: one row per sample, including samples whose
# cleaned text is empty.
ids = [item['id'] for item in total_item]
titles = [item['title'] for item in total_item]
ori_tests = [item['ori_text'] for item in total_item]
# Processed text uses the same '<title>:<text>' layout as data.txt.
texts = [item['title'] + ':' + item['text'] for item in total_item]
df = pd.DataFrame({
    '工单编号': ids,
    '标题': titles,
    '内容': ori_tests,
    '处理后文本': texts,
})

In [9]:
# Save the result table to an Excel workbook under ./results.
# NOTE(review): `index=None` is presumably meant to suppress the index
# column; the documented parameter type is bool — confirm and consider
# `index=False`.
df.to_excel('./results/result2023.xlsx', index=None)