In [1]:
import pandas as pd
import numpy as np
import jieba
import jieba.posseg as pseg
from string import punctuation as p
from collections import defaultdict
import operator
import re
import jieba.analyse
jieba.analyse.set_stop_words('stopwords.txt')

In [18]:
# Matches runs of digits; they are stripped as the last normalisation step.
replace_numbers = re.compile(r'\d+', re.IGNORECASE)


def re_word(text):
    """Normalise one raw sentence before word segmentation.

    The steps run in a fixed order: lower-case the text, drop http(s) URLs,
    map a few variant spellings onto one canonical word, delete runs of
    ASCII spaces, and finally delete all digits.

    Parameters
    ----------
    text : str
        Raw sentence.

    Returns
    -------
    str
        The cleaned sentence.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    rewrites = (
        # remove http URL
        (r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", ""),
        # unify variant wordings
        (r"登陆", "登录"),
        (r"原件", "组件"),
        (r"新人", "新员工"),
        (r"变更", "变动"),
        # drop ASCII spaces
        (r" +", ""),
    )

    cleaned = text.lower()
    for pattern, replacement in rewrites:
        cleaned = re.sub(pattern, replacement, cleaned)

    return replace_numbers.sub('', cleaned)

In [19]:
# Domain vocabulary for the segmenter, applied strictly in the listed order.
# An entry is either (word, pos_tag) -> jieba.add_word(word, tag=pos_tag)
# or ((left, right), None)           -> jieba.suggest_freq((left, right), True)
# NOTE(review): the suggest_freq entries presumably keep the two characters
# from being merged into one word - confirm against the jieba documentation.
_dictionary_steps = [
    ('关联关系', 'n'),
    ('工单', 'n'),
    (('亲', '工'), None),
    ('云平台', 'n'),
    ('全速云', 'n'),
    ('云cas', 'eng'),
    ('子菜单', 'n'),
    (('对', '子'), None),
    ('外网', 'n'),
    ('内网', 'n'),
    ('功能码', 'n'),
    ('线上', 'a'),
    ('清楚点', 'a'),
    ('一级菜单', 'n'),
    ('某一步', 'm'),
    ('下一步', 'm'),
    ('入职', 'v'),
    ('找你', 'v'),
    ('找谁', 'v'),
    ('找不到', 'v'),
    ('通用岗位', 'v'),
    ('走流程', 'v'),
    ('登录不了', 'v'),
    ('被锁', 'v'),
    ('没反应', 'v'),
    ('挂了', 'v'),
    ('登录不上', 'v'),
    ('不存在', 'v'),
    ('新员工', 'n'),
    ('新用户', 'n'),
    ('新组件', 'n'),
    ('菜单名', 'n'),
    ('用户名', 'n'),
    ('岗位变动', 'n'),
    ('人事变动', 'n'),
    ('dns解析', 'n'),
    ('wiki地址', 'n'),
]

for _entry, _pos in _dictionary_steps:
    if _pos is None:
        jieba.suggest_freq(_entry, True)
    else:
        jieba.add_word(_entry, tag=_pos)

In [20]:
# Load the sentence pairs; the preview below shows the columns
# `Unnamed: 0`, `official` and `additional`.
df = pd.read_csv('dataset.csv')

In [21]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,official,additional
0,请问员工转岗，该怎么申请权限？,请问员工轮岗，该怎么申请权限？
1,请问员工转岗，该怎么申请权限？,我转到别的部门了，怎么添加新部门的权限？
2,请问员工转岗，该怎么申请权限？,转岗了，要怎么开通岗位权限
3,请问员工转岗，该怎么申请权限？,你好，我转到运维部了，请问怎么申请权限
4,请问员工转岗，该怎么申请权限？,岗位变动，权限需要怎么申请


In [22]:
# Pull the two sentence columns out of the frame as plain arrays.
official, additional = (df[column].values for column in ('official', 'additional'))

In [23]:
def seg_and_tag(arr):
    """Clean, segment and POS-tag every sentence in ``arr``.

    Each sentence is normalised with ``re_word``, stripped of ASCII and
    full-width punctuation, cut with ``pseg.cut`` and rendered as
    space-separated ``word/flag`` tokens.

    Parameters
    ----------
    arr : iterable of str
        Raw sentences.

    Returns
    -------
    list of str
        One ``"word/flag word/flag ..."`` string per input sentence, in
        input order. A sentence that yields no tokens becomes ``""``.
    """
    # Loop-invariant, so build it once per call; a set gives O(1) lookups
    # instead of scanning the punctuation string for every character.
    stop_chars = frozenset(p + "~·！@#￥%……&*（）——=+-{}【】：；“”‘’《》，。？、|、")

    sentence_with_tag = []
    for line in arr:
        line = re_word(line)
        line = "".join(c for c in line if c not in stop_chars)
        tagged = [
            "{}/{}".format(word, flag)
            for word, flag in pseg.cut(line)
            # re_word only removes ASCII spaces. A whitespace-only token
            # (tab, full-width space, ...) would be written as " /x" and be
            # read back by str.split() as a token with an empty word.
            if word.strip()
        ]
        sentence_with_tag.append(" ".join(tagged))
    return sentence_with_tag

In [24]:
# Tag both columns, then store them side by side as a two-column CSV.
off_list = seg_and_tag(official)
add_list = seg_and_tag(additional)

# Each list becomes one column of an (n_rows, 2) array.
out = np.column_stack((off_list, add_list))
dataframe = pd.DataFrame(out, columns=['official', 'additional'])
dataframe.to_csv('train_original.csv', index=False)

print('Done!')

Done!


In [25]:
# Dump every tagged sentence, one per line: all `official` rows first, then
# all `additional` rows.
with open('statistic_original.txt', 'w', encoding='utf-8') as f:
    # Leaving the `with` block flushes and closes the file, so the former
    # flush() after every single line was unnecessary.
    f.writelines(line + '\n' for line in off_list + add_list)
print('Done!')

Done!


# Statistics

In [None]:
# Count how often each POS flag occurs in the tagged corpus.
# Keys of `dic` are stored with a leading slash, e.g. '/n'.
with open('statistic_original.txt', 'r', encoding='utf-8') as f:
    dic = defaultdict(int)
    word_flag_pairs = (
        token.split('/')
        for sentence in f
        for token in sentence.strip('\n').split()
    )
    for _, flag in word_flag_pairs:
        dic['/' + flag] += 1
print('Finishe counted!')

In [None]:
# Report the number of distinct tags, then one "tag<TAB>occurrences" line each.
# len() replaces the manual counting loop, and the per-tag loop uses its own
# variable so `count` still holds the number of distinct tags afterwards.
count = len(dic)
print("total_tags : {}".format(count))
for tag, occurrences in dic.items():
    print("{}\t{}".format(tag, occurrences))

## Remove stopwords

In [27]:
# Read the stop-word file eagerly and close it straight away. `stop_words` is
# now a list of raw lines (trailing newlines kept) rather than an open handle,
# so nothing leaks and cells that iterate over it can be re-run safely.
with open('stopwords.txt', 'r', encoding='utf-8') as stop_file:
    stop_words = stop_file.readlines()

In [28]:
# One entry per line of the stop-word source, with all whitespace removed
# (blank lines therefore become '').
stops = ["".join(raw_line.split()) for raw_line in stop_words]

In [29]:
# Output handle for the stop-word-filtered corpus. It is written by the
# filtering cell and closed by the cell that prints the tag counts, so those
# cells must run in order.
w_file = open('statistic_original_without_stopwords.txt', 'w', encoding='utf-8')

In [30]:
# Re-count POS flags while ignoring stop words, and write a copy of the corpus
# to `w_file` in which stop words lose their "/flag" suffix.
stop_set = set(stops)  # O(1) membership instead of scanning the list per token
with open('statistic_original.txt', 'r', encoding='utf-8') as f:
    dic = defaultdict(int)
    for line in f:
        write_list = []
        for token in line.strip('\n').split():
            word, flag = token.split('/')
            if word not in stop_set:
                dic['/' + flag] += 1
                write_list.append("{}/{}".format(word, flag))
            else:
                # Stop words are kept in the text but written untagged.
                write_list.append(word)
        w_file.write(" ".join(write_list) + '\n')
# Push buffered lines to disk so the file is complete once this cell reports
# success, even before the handle is closed.
w_file.flush()
print('Finishe counted!')

Finishe counted!


In [31]:
# Close the filtered-corpus handle first, so the file is complete on disk even
# if the report below fails.
w_file.close()

# len() replaces the manual counting loop, and the per-tag loop uses its own
# variable so `count` still holds the number of distinct tags afterwards.
count = len(dic)
print("total_tags : {}".format(count))
for tag, occurrences in dic.items():
    print("{}\t{}".format(tag, occurrences))

total_tags : 14
/n	1334
/v	1007
/vn	52
/eng	533
/ns	34
/b	20
/a	34
/m	25
/d	8
/f	12
/nr	12
/c	1
/l	2
/t	1


In [33]:
# Keep only the tags that occur more than twice.
keys = [tag for tag, occurrences in dic.items() if occurrences > 2]

In [34]:
# Show the retained tags (each carries a leading '/').
keys

['/n', '/v', '/vn', '/eng', '/ns', '/b', '/a', '/m', '/d', '/f', '/nr']

## Write down the tags

In [35]:
# For every retained tag, write tags/tag-<name>.txt: a header line holding the
# tag, a blank line, then "word<TAB>count" rows by descending frequency.

# Parse the filtered corpus once instead of re-reading it for every tag.
# Tokens without '/' are stop words that were written untagged.
with open('statistic_original_without_stopwords.txt', 'r', encoding='utf-8') as f:
    tagged_tokens = [
        token.rpartition('/')
        for line in f
        for token in line.strip('\n').split()
        if '/' in token
    ]

for tag in keys:
    name = tag.split('/')[1]
    dictionary = defaultdict(int)
    for word, _, flag in tagged_tokens:
        # Compare the flag exactly. The former substring test `tag in word`
        # let '/n' also match '/ns' and '/nr', and '/v' also match '/vn'.
        if '/' + flag == tag:
            dictionary[word] += 1
    sorted_dict = sorted(dictionary.items(), key=operator.itemgetter(1), reverse=True)
    # `with` closes (and therefore flushes) every output file, including the
    # last one, which previously stayed open.
    with open('tags/tag-{}.txt'.format(name), 'w', encoding='utf-8') as w_file:
        w_file.write(tag + '\n\n')
        for k, v in sorted_dict:
            # Words seen only once are dropped, except for the /eng tag.
            if tag == '/eng' or v > 1:
                w_file.write("{}\t{}\n".format(k, v))
print('Done!')

Done!


In [36]:
# POS tags (without the leading '/') passed as allowPOS to keyword extraction.
tags = [
    'a', 'b', 'd', 'eng', 'f', 'm',
    'n', 'nr', 'ns', 'v', 'vn',
]

# Keyword (Components) extraction

In [37]:
# Extract up to five keywords per raw sentence (official rows first, then the
# additional ones), restricted to the POS tags listed in `tags`.
total_sentences = np.concatenate([official, additional], axis=0)
key_word_each_sent = [
    jieba.analyse.extract_tags(sentence, topK=5, allowPOS=tags)
    for sentence in total_sentences
]

print('Done!')

Done!


In [38]:
# NOTE(review): this displays the keyword list of every sentence; consider
# showing only a slice to keep the notebook output short.
key_word_each_sent

[['转岗', '权限', '员工', '申请'],
 ['转岗', '权限', '员工', '申请'],
 ['转岗', '权限', '员工', '申请'],
 ['转岗', '权限', '员工', '申请'],
 ['转岗', '权限', '员工', '申请'],
 ['岗位', '流程', '变更', '新增', '已有'],
 ['岗位', '流程', '变更', '新增', '已有'],
 ['RMP', '组件', '权限', '流程', '申请'],
 ['RMP', '组件', '权限', '流程', '申请'],
 ['RMP', '组件', '权限', '流程', '申请'],
 ['RMP', '组件', '权限', '流程', '申请'],
 ['RMP', '组件', '权限', '流程', '申请'],
 ['RMP', '组件', '权限', '流程', '申请'],
 ['RMP', '组件', '权限', '流程', '申请'],
 ['RMP', '组件', '权限', '流程', '申请'],
 ['RMP', '组件', '权限', '流程', '申请'],
 ['RMP', '组件', '权限', '流程', '申请'],
 ['RMP', '组件', '权限', '流程', '申请'],
 ['RMP', '组件', '权限', '流程', '申请'],
 ['RMP', '组件', '权限', '流程', '申请'],
 ['RMP', '组件', '权限', '流程', '申请'],
 ['RMP', '组件', '权限', '流程', '申请'],
 ['RMP', '组件', '权限', '流程', '申请'],
 ['RMP', '组件', '权限', '流程', '申请'],
 ['RMP', '组件', '权限', '流程', '申请'],
 ['wsms', '组件', '权限', '流程', '申请'],
 ['gdui', 'business', '流程', '角色', '申请'],
 ['SI', '权限', '流程', '申请'],
 ['SI', '权限', '流程', '申请'],
 ['SI', '权限', '流程', '申请'],
 ['SI', '权限', '流程', '申请'],
 ['

# Prepare Dataset

In [39]:
# Despite its name this is an open file object, not a path: the handle that
# the dataset-writing cell emits the character-level labels into.
write_path = open('../dataset/train.txt', 'w', encoding='utf-8')

In [41]:
# Emit the character-level training file: one "<char> <label>" row per
# character and a blank line after every sentence.
#   tagged token "word/flag" -> first char B-IPT, remaining chars I-IPT
#   untagged token           -> every char O
try:
    with open('statistic_original_without_stopwords.txt', 'r', encoding='utf-8') as f:
        for line in f:
            for token in line.strip('\n').split(' '):
                if '/' in token:
                    text = token.split('/')[0]
                    # A token with an empty word part has no character to
                    # label; indexing its first character would raise.
                    if not text:
                        continue
                    write_path.write(text[0] + ' B-IPT\n')
                    for ch in text[1:]:
                        write_path.write(ch + ' I-IPT\n')
                else:
                    for ch in token:
                        write_path.write(ch + ' O\n')
            write_path.write('\n')
finally:
    # The handle was opened in an earlier cell and never closed; close it here
    # so the dataset is actually flushed to disk.
    write_path.close()
print('Done!')

Done!
