In [1]:
import numpy as np
import pandas as pd
import pickle
import jieba
import json
import re

In [2]:
train_data_path = '.../data/atec_nlp_sim_train.csv'  # 训练数据
train_add_data_path = '../data/atec_nlp_sim_train_add.csv'  # 添加训练数据
stop_words_path = '../data/stop_words.txt'  # 停用词路径
tokenize_dict_path = '../data/dict_all.txt'  # jieba分词新自定义字典
spelling_corrections_path = '../data/spelling_corrections.json'

In [3]:
train_data_df = pd.read_csv(train_data_path, sep='\t', header=None,names=["index", "s1", "s2", "label"])
train_add_data_df = pd.read_csv(train_add_data_path, sep='\t', header=None, names=["index", "s1", "s2", "label"])
train_all = pd.concat([train_data_df, train_add_data_df])

In [4]:
train_all.reset_index(drop=True, inplace=True)

In [5]:
train_all.head()

Unnamed: 0,index,s1,s2,label
0,1,﻿怎么更改花呗手机号码,我的花呗是以前的手机号码，怎么更改成现在的支付宝的号码手机号,1
1,2,也开不了花呗，就这样了？完事了,真的嘛？就是花呗付款,0
2,3,花呗冻结以后还能开通吗,我的条件可以开通花呗借款吗,0
3,4,如何得知关闭借呗,想永久关闭借呗,0
4,5,花呗扫码付钱,二维码扫描可以用花呗吗,0


### 分词及处理

In [6]:
jieba.load_userdict(tokenize_dict_path)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\zxq\AppData\Local\Temp\jieba.cache
Loading model cost 1.078 seconds.
Prefix dict has been built succesfully.


In [7]:
# 停用词表
stopwords = [line.strip() for line in open(stop_words_path, 'r', encoding='utf-8').readlines()]

In [8]:
# 拼错词替换表
with open(spelling_corrections_path,"r",encoding="utf-8") as file:
    spelling_corrections = json.load(file)

In [9]:
def transform_other_word(str_text,reg_dict):
    """
    替换词
    :param str_text:待替换的句子
    :param reg_dict:替换词字典
    :return:
    """
    for token_str,replac_str in reg_dict.items():
        str_text = str_text.replace(token_str, replac_str)
    return str_text

In [10]:
def seg_sentence(sentence, stop_words):
    """
    对句子进行分词
    :param sentence:句子，停用词
    """
    sentence_seged = jieba.cut(sentence.strip())
    word_list = [i for i in sentence_seged if i not in stop_words and i != ' ']
    return word_list

In [11]:
def preprocessing_word(s1_train, s2_train, stopwords, spelling_corrections):

    # 去除句子中的脱敏数字***，替换成一
    re_object = re.compile(r'\*+')

    s1_all = []
    s2_all = []
    all = []

    for s1_,s2_ in zip(s1_train, s2_train):
        s1 = re_object.sub(u"十一", s1_)
        s2 = re_object.sub(u"十一", s2_)
        spell_corr_s1 = transform_other_word(s1, spelling_corrections)
        spell_corr_s2 = transform_other_word(s2, spelling_corrections)

        # 分词
        seg_s1 = seg_sentence(spell_corr_s1, stopwords)
        seg_s2 = seg_sentence(spell_corr_s2, stopwords)

        all.extend(seg_s1)
        all.extend(seg_s2)
        s1_all.append(seg_s1)
        s2_all.append(seg_s2)
    source_list = []
    # source_list = list(set(all))
    source_list.append('<UNK>')
    source_list.append('<PAD>')
    source_list.extend(list(set(all)))
    word2id = {}
    id2word = {}
    for index, char in enumerate(source_list):
        word2id[char] = index
        id2word[index] = char

    return s1_all, s2_all, word2id, id2word

In [12]:
s1_train = train_all["s1"].tolist()
s2_train = train_all["s2"].tolist()
y_train = train_all["label"].tolist()

In [13]:
s1_word_all, s2_word_all, word2id, id2word = preprocessing_word(s1_train, s2_train, stopwords, spelling_corrections)

In [14]:
def make_word2id(data, word2id):
    data2id = []
    for word_list in data:
        id_list = [word2id.get(i) if word2id.get(i) is not None else word2id.get('<PAD>') for i in word_list]
        data2id.append(id_list)
    return data2id

In [15]:
def all_data_set(s1_all, s2_all, word2id, y_train, max_l=15):
    pad = word2id['<PAD>']
    all_data = []
    s1_data_id = make_word2id(s1_all, word2id)
    s2_data_id = make_word2id(s2_all, word2id)
    s1_all_new = []
    s2_all_new = []
    y = []
    for i in range(len(s1_data_id)):
        if len(s1_data_id[i]) > max_l:
            s1_set = s1_data_id[i][:max_l]
        else:
            s1_set = np.concatenate((s1_data_id[i], np.tile(pad, max_l - len(s1_data_id[i]))), axis=0)
        if len(s2_data_id[i]) > max_l:
            s2_set = s2_data_id[i][:max_l]
        else:
            s2_set = np.concatenate((s2_data_id[i], np.tile(pad, max_l - len(s2_data_id[i]))), axis=0)
        y_set = [1,0] if y_train[i] == 0 else [0,1]
        s1_all_new.append(s1_set)
        s2_all_new.append(s2_set)
        y.append(y_set)
    return s1_all_new, s2_all_new, y

In [16]:
s1_word_id_all, s2_word_id_all, y_set = all_data_set(s1_word_all, s2_word_all, word2id, y_train, max_l=15)

In [17]:
train_all["s1_word_all"] = s1_word_all

In [18]:
train_all["s2_word_all"] = s2_word_all

In [19]:
train_all["s1_word_id_all"] = s1_word_id_all

In [20]:
train_all["s2_word_id_all"] = s2_word_id_all

In [21]:
train_all["y_set"] = y_set

In [22]:
train_all.tail()

Unnamed: 0,index,s1,s2,label,s1_word_all,s2_word_all,s1_word_id_all,s2_word_id_all,y_set
102472,63127,花呗分期还一期后能用吗,分期是还花呗吗,0,"[花呗, 分期, 一期, 能用]","[分期, 是, 花呗]","[7035, 7398, 4411, 13074, 1, 1, 1, 1, 1, 1, 1,...","[7398, 8573, 7035, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 0]"
102473,63128,我的支付宝手机号很花呗手机号不一样怎么办,支付宝上的手机号，怎么和花呗上的不一样,1,"[支付宝, 手机号, 很, 花呗, 手机号, 不, 一样, 怎么办]","[支付宝, 手机号, 怎么, 花呗, 不, 一样]","[6908, 2417, 9236, 7035, 2417, 10236, 12577, 1...","[6908, 2417, 216, 7035, 10236, 12577, 1, 1, 1,...","[0, 1]"
102474,63129,借呗这个月的分期晚几天还可以吗,借呗分期后可以更改分期时间吗,0,"[借呗, 月, 分期, 晚, 几天, 可以]","[借呗, 分期, 可以, 更换, 分期, 时间]","[9420, 8805, 7398, 12104, 9256, 5330, 1, 1, 1,...","[9420, 7398, 5330, 10058, 7398, 2973, 1, 1, 1,...","[1, 0]"
102475,63130,我怎么没有花呗临时额度了,花呗有零时额度吗,0,"[怎么, 没, 花呗, 临时, 额度]","[花呗, 有, 临时, 额度]","[216, 9291, 7035, 760, 6554, 1, 1, 1, 1, 1, 1,...","[7035, 7114, 760, 6554, 1, 1, 1, 1, 1, 1, 1, 1...","[1, 0]"
102476,63131,怎么授权芝麻信用给花呗,花呗授权联系人怎么授权,0,"[怎么, 授权, 信用度, 度, 给, 花呗]","[花呗, 授权, 联系人, 怎么, 授权]","[216, 10193, 9022, 7661, 5728, 7035, 1, 1, 1, ...","[7035, 10193, 1901, 216, 10193, 1, 1, 1, 1, 1,...","[1, 0]"


In [23]:
# 将数据存到一个大列表里面，格式是[[s1,s2,y],[s1,s2,y],[s1,s2,y].......]
all_data = []
for i in range(len(s1_word_id_all)):
    all_data.append([s1_word_id_all[i],s2_word_id_all[i],y_set[i]])

In [24]:
# 将数据存入pickle中
with open("../processed_data/word_data.pk", 'wb') as f1:
    pickle.dump((all_data,word2id,id2word), f1)