### 分类统计

In [2]:
import pandas as pd
from collections import Counter

tr_data_path = 'train.json'
data = pd.read_json(tr_data_path)
data.shape

(79676, 3)

In [3]:
count_res = Counter(data['action_cause'])
freq_res = sorted([(x[0], x[1] / 79676) for x in list(count_res.items())], key=lambda x: x[1], reverse=True)
top_freq = sum([x[1] for x in freq_res[:10]])
tail_freq = sum([x[1] for x in freq_res[-50:]])
print(top_freq, tail_freq)

0.6743310407148952 0.00931271650183243


In [6]:
criminal_data_path = '../criminal/train.json'
data_cri = pd.read_json(criminal_data_path)
data_cri['first_charge'] = data_cri['charge'].apply(lambda x: x[0])
count_res = Counter(data_cri['first_charge'])
freq_res = sorted([(x[0], x[1] / 79676) for x in list(count_res.items())], key=lambda x: x[1], reverse=True)
top_freq = sum([x[1] for x in freq_res[:10]])
tail_freq = sum([x[1] for x in freq_res[-50:]])
print(top_freq, tail_freq)

0.8046965209096841 0.00719162608564687


### 合并生成字典

In [10]:
file_list = ['D:\\sjx\\RUC研究生\\毕业论文\\code\\dicts\\THUOCL_caijing.txt',
             'D:\\sjx\\RUC研究生\\毕业论文\\code\\dicts\\THUOCL_diming.txt',
             'D:\\sjx\\RUC研究生\\毕业论文\\code\\dicts\\THUOCL_law.txt']

word_set = set()
for current_file in file_list:
    with open(current_file, 'r', encoding='utf-8') as file:
        for line in file.readlines():
            item = line.split('\t')[0].strip()
            word_set.add(item)

word_set.remove('')

print(len(word_set))
new_file = 'D:\\sjx\\RUC研究生\\毕业论文\\code\\dicts\\user_dict.txt'
with open(new_file, 'w', encoding='utf-8') as file:
    for word in word_set:
        file.write(word+'\n')

58350


### 合并停用词表

In [19]:
stopword_list = ['D:\\sjx\\RUC研究生\\毕业论文\\code\\dicts\\baidu_stopwords.txt',
             'D:\\sjx\\RUC研究生\\毕业论文\\code\\dicts\\cn_stopwords.txt',
             'D:\\sjx\\RUC研究生\\毕业论文\\code\\dicts\\scu_stopwords.txt',
             'D:\\sjx\\RUC研究生\\毕业论文\\code\\dicts\\hit_stopwords.txt']

word_set = set()
for current_file in stopword_list:
    with open(current_file, 'r', encoding='utf-8') as file:
        for line in file.readlines():
            word_set.add(line)

print(len(word_set))
new_file = 'D:\\sjx\\RUC研究生\\毕业论文\\code\\dicts\\user_stopword.txt'
with open(new_file, 'w', encoding='utf-8') as file:
    for word in word_set:
        file.write(word)

2317


### 分词

In [16]:
import pandas as pd
from collections import Counter

tr_data_path = 'train.json'
data = pd.read_json(tr_data_path)
data.head(1)

Unnamed: 0,fact,laws,action_cause
0,原告诉称，2017年8月27日，三原告亲属杨明红（已死亡，身份证号码：）因子宫肌瘤到被告处就...,"[[{'title': '中华人民共和国民法通则', 'date': {'year': 20...",医疗损害责任纠纷


In [18]:
# 对于train文件大概需要40分钟进行分词
from pkuseg import pkuseg
seg_model = pkuseg(user_dict='D:\\sjx\\RUC研究生\\毕业论文\\code\\dicts\\user_dict.txt')
data['words'] = data['fact'].apply(lambda x: seg_model.cut(x))
data['words'].head()

0    [原告, 诉称, ，, 2017年, 8月, 27日, ，, 三, 原告, 亲属, 杨, 明...
1    [原告, 诉称, ，, 1999年, 底, ，, 原, 、, 被告, 经人, 介绍, 相识,...
2    [原告, （, 反诉, 被告, ）, 钱, 洪, 、, 程, 玲玲, 向, 本院, 在, 本...
3    [原告, 中国农业银行, 股份有限公司, 弥勒市, 支行, 诉称, ：, 2013年, 9月...
4    [许葵, 向, 本院, 提出, 诉讼, 请求, ：, 1, 、, 判令, 被告, 立即, 偿...
Name: words, dtype: object

In [20]:
# 统计出低频词并查看高频词
# 713mins 
from collections import Counter
word_frequency = dict(Counter(sum(data['words'],[])))

ValueError: not enough values to unpack (expected 2, got 1)

In [48]:
low_freq = set()
for (word, freq) in zip(word_frequency.keys(), word_frequency.values()):
    if freq <= 3:
        low_freq.add(word)
print(len(low_freq))

word_freq_sorted = sorted(word_frequency,reverse=True)
print(word_freq_sorted[:10])
print(list(low_freq)[:20])

476453
['￥￥', '￥．930000', '￥．8500', '￥．698000', '￥．668000', '￥﹤', '￥陆', '￥贰拾万', '￥贰', '￥肆']
['2147.20', '东500', '吴展勇', '汪涛负', '8011772', '冀雅', '65284', '任玉婷', '232806', '141690', '东德信', '欣春苑', '酶Ⅲ', '24364', '慰斗厂', '怀望', '79649.66', '输入端', '斛圩', '兰传']


In [50]:
# 停用词与低频词过滤
stopword_dict = set()
with open('D:\\sjx\\RUC研究生\\毕业论文\\code\\dicts\\user_stopword.txt', 'r', encoding='utf-8') as file:
    for line in file.readlines():
        stopword_dict.add(line)

def need_to_filter(item):
    return item in stopword_dict or item in low_freq
data['words_filtered'] = data['words'].apply(lambda x: [item for item in x if (not need_to_filter(item))])

In [54]:
def illegal_char(item):
    if len(item) == 0:
        return True
    for char in item:
        if char.isdigit() or not ('\u4e00' <= char <= '\u9fff'):
            return True
    return False
data['words_filtered_nums'] = data['words_filtered'].apply(lambda x: [item for item in x if (not illegal_char(item))])

In [56]:
data['input_sentence'] = data['words_filtered_nums'].apply(lambda x: " ".join(x))
data.loc[:,['input_sentence','action_cause']] .to_csv('./data/train_wordseq.csv', sep = '|')

### 整理为读入模型的数据

In [1]:
import pandas as pd
data = pd.read_csv('./data/train_wordseq.csv', encoding='utf-8', sep = '|')
data = data[['input_sentence','action_cause']]
data.head(1)

Unnamed: 0,input_sentence,action_cause
0,原告 诉称 三 原告 亲属 杨 明红 已 死亡 身份证 号码 因 子宫 肌瘤 到 被告处 就...,医疗损害责任纠纷


In [2]:
data['sentence_length'] = data['input_sentence'].apply(lambda x: len(x.split(' ')))
data['sentence_length'].describe()

count    79676.000000
mean       544.263454
std        500.104900
min          9.000000
25%        236.000000
50%        404.000000
75%        682.000000
max      15677.000000
Name: sentence_length, dtype: float64

In [6]:
data['action_cause'].value_counts()

民间借贷纠纷                    17031
机动车交通事故责任纠纷                9079
金融借款合同纠纷                   6461
离婚纠纷                       6253
买卖合同纠纷                     6226
                          ...  
监护权纠纷                        12
建设工程监理合同纠纷                   12
土地承包经营权互换合同纠纷                12
船舶抵押合同纠纷                     12
认定公民无民事行为能力、限制民事行为能力案件       11
Name: action_cause, Length: 257, dtype: int64