In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from utils.utils import clean_data


class Preprocessor:
    """Load labelled resume spreadsheets and split them into train/validation sets.

    Every regular file found in each first-level sub-directory of ``data_dir``
    is read with ``pandas.read_excel`` as a two-column sheet
    (``resume``, ``label_detail``). Each file gets a 1-based ``resume_id``;
    ``label`` is the part of ``label_detail`` before the first ``-``.

    Attributes:
        df: all loaded rows (columns: resume, label_detail, resume_id, label).
        train_data: ``.values`` of ``clean_data`` applied to the resume column.
        trian_label: label array (historical misspelling, kept for callers).
        train_label: read-only, correctly spelled alias of ``trian_label``.
        train_x, train_y, val_x, val_y: result of ``train_test_split``.

    Args:
        data_dir: root directory holding one sub-directory per resume group.
        test_size: fraction of rows placed in the validation split.
        random_state: seed forwarded to ``train_test_split``.

    Raises:
        ValueError: if no file is found under ``data_dir``.
    """

    def __init__(self, data_dir='./data/resume_data/', test_size=0.2, random_state=0):
        self.data_dir = data_dir
        self.test_size = test_size
        self.random_state = random_state

        self.df = pd.DataFrame()        # rows loaded so far
        self.train_data, self.trian_label = None, None
        self.train_x, self.train_y, self.val_x, self.val_y = None, None, None, None

        self._load_xlsx_files()

    @property
    def train_label(self):
        """Correctly spelled alias of the historical ``trian_label`` attribute."""
        return self.trian_label

    def _load_xlsx_files(self):
        """Read every spreadsheet under ``data_dir`` and build the data splits."""
        frames = []
        nums_resume = 0
        # os.listdir returns entries in arbitrary order; sorting keeps resume
        # ids, row order and therefore the seeded split reproducible.
        for sub_dir in sorted(os.listdir(self.data_dir)):
            cur_dir = os.path.join(self.data_dir, sub_dir)
            if not os.path.isdir(cur_dir):
                continue
            for filename in sorted(os.listdir(cur_dir)):
                file_path = os.path.join(cur_dir, filename)
                if os.path.isfile(file_path):
                    nums_resume += 1
                    frames.append(self._read_frame(file_path, nums_resume))

        if not frames:
            raise ValueError('no resume files found under %r' % (self.data_dir,))

        # Concatenate once instead of once per file (avoids quadratic copying).
        self.df = pd.concat(frames, ignore_index=True)

        self.train_data, self.trian_label = self._parse_orig_data(self.df)

        # Split into training and validation sets.
        self.train_x, self.val_x, self.train_y, self.val_y = train_test_split(
            self.train_data,
            self.trian_label,
            test_size=self.test_size,
            random_state=self.random_state)

    @staticmethod
    def _read_frame(file, id):
        """Read one spreadsheet and return it with resume_id and label columns added."""
        data = pd.read_excel(file, header=None, names=['resume', 'label_detail'])
        data['resume_id'] = np.full(data.shape[0], id)
        # NOTE(review): astype(str) turns empty cells into the literal text
        # 'nan' -- confirm that downstream cleaning removes such rows.
        data['resume'] = data['resume'].astype(str)
        data['label_detail'] = data['label_detail'].astype(str)
        data['label'] = data['label_detail'].str.split("-").str.get(0)
        return data

    def read_xlsx(self, file, id):
        """Read one spreadsheet and append its rows to ``self.df``.

        Args:
            file: path of the spreadsheet.
            id: value stored in the ``resume_id`` column for every row.
        """
        self.df = pd.concat([self.df, self._read_frame(file, id)])

    def _parse_orig_data(self, orig_data):
        """Return ``(cleaned resume values, label values)`` for ``orig_data``."""
        label = orig_data['label'].values
        train_data = clean_data(orig_data['resume']).values         # data cleaning
        return train_data, label



In [2]:
# Build the preprocessor; its constructor loads the resume spreadsheets and
# creates the train/validation split as a side effect.
raw = Preprocessor()
            

In [4]:
# Cleaned resume text lines produced by the preprocessor (before the train/validation split).
train_data = raw.train_data

In [5]:
# Number of text lines loaded (127188 in the recorded run).
train_data.size

127188

In [6]:
import jieba
import os

# Register every custom dictionary found in the dictionary folder with jieba.
dict_dir = './data/word_dictionaries/'
for dict_name in os.listdir(dict_dir):
    jieba.load_userdict(os.path.join(dict_dir, dict_name))


# Segment each cleaned line and re-join its tokens with single spaces.
# NOTE(review): use_paddle=True presumably has no effect unless
# jieba.enable_paddle() was called first -- confirm which mode is intended.
res = pd.Series(
    [' '.join(jieba.cut(line, use_paddle=True)) for line in train_data]
).astype(str)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/km/12m35ksj2q5fvp5_wrw_r5lr0000gn/T/jieba.cache
Loading model cost 0.876 seconds.
Prefix dict has been built successfully.


In [10]:
# Persist the segmented lines, one per CSV row (no header, no index column).
# Create the output directory first so the cell also works on a fresh checkout.
os.makedirs('./seg_word', exist_ok=True)
res.to_csv('./seg_word/seg_data.csv', sep=',', header=False, index=False)

In [11]:
# Reload the segmented lines strictly as text.
# dtype=str stops pandas from re-parsing numeric-looking lines (e.g. dropping
# leading zeros); keep_default_na=False keeps empty lines and tokens such as
# 'null' / 'NA' as written instead of turning them into NaN -> 'nan'.
f = pd.read_csv('./seg_word/seg_data.csv', sep=',', header=None,
                dtype=str, keep_default_na=False)

In [21]:
# Materialise the in-memory segmented lines as a numpy unicode-string array.
resume = np.array(res, dtype=np.str_)

In [20]:
# Display the reloaded CSV frame as a 2-D numpy string array (one column per CSV field).
np.array(f, dtype=np.str_)

array([['简历'],
       ['d3e2aa8d52990af80HR92ti0F1dY3429UqaQuejnw'],
       ['余刚'],
       ...,
       ['一种 使用 信号 突然 消失 和 出现 判断 一船 多码 的 系统'],
       ['教育 经历'],
       ['bookmarkGoBack201009201406 唐山 师范学院 电子信息科学与技术 本科 统招']],
      dtype='<U4352')

In [8]:
# First (only) CSV column as a plain Python list of segmented lines.
# NOTE(review): rebinds `resume`, which another cell assigns from `res` as a
# numpy array -- which definition is live depends on cell execution order.
resume = f[0].to_list()

0                                                        简历
1                 d3e2aa8d52990af80HR92ti0F1dY3429UqaQuejnw
2                                                        余刚
3             188106641721250725373 @ qqcom   男生 日   199109
4         武汉   离职 随时 到 岗     6 年 工作 经验   求职意向   Java    ...
                                ...                        
127183                            船舶 航线 识别方法 装置 电子设备 及 存储介质
127184                          船期 相似性 判断 方法 装置 电子设备 及 存储介质
127185                    一种 使用 信号 突然 消失 和 出现 判断 一船 多码 的 系统
127186                                                教育 经历
127187    bookmark   GoBack201009     201406   唐山 师范学院  ...
Length: 127188, dtype: object

In [23]:
# Spot-check: tokens of one segmented line (tokens are separated by single spaces).
resume[10].split(' ')

['锁', 'postman', '正则表达式', 'elasticjobxxljobfiddlerjmeter', '多线程', 'sftp', '等']

In [37]:
# Vocabulary: every distinct space-separated token that occurs in the segmented lines.
# Sorting makes the order (and every index later derived from it) reproducible;
# plain list(set(...)) order changes between interpreter runs because string
# hashing is randomised.
# NOTE(review): consecutive spaces yield an empty-string token, which is kept
# here as before -- confirm whether '' should be part of the vocabulary.
vocab = sorted({word for line in resume for word in line.split(' ')})

In [38]:
import sys

sys.path.append('..')
from text2vec import SentenceModel, EncoderType
from text2vec import Word2Vec

def compute_emb(model, all_embedding=None, sentences=None):
    """Encode ``sentences`` with ``model`` and store one vector per sentence.

    Args:
        model: object exposing ``encode(list_of_str)`` that returns an array
            whose first axis is aligned with the input list.
        all_embedding: dict updated in place with ``sentence -> embedding``;
            a new dict is created when omitted.
        sentences: iterable of strings to embed. When omitted, the notebook
            global ``vocab`` is used (the original behaviour).

    Returns:
        The ``all_embedding`` dict that was filled.
    """
    if all_embedding is None:
        all_embedding = {}
    if sentences is None:
        sentences = vocab  # backward-compatible fallback to the notebook global
    sentences = list(sentences)

    sentence_embeddings = model.encode(sentences)
    print(type(sentence_embeddings), sentence_embeddings.shape)

    # encode() returns one row per input sentence, in input order.
    all_embedding.update(zip(sentences, sentence_embeddings))
    return all_embedding


if __name__ == "__main__":
    # Chinese sentence-embedding model (CoSENT), recommended for Chinese semantic
    # matching; supports further fine-tuning. Currently disabled.
    # t2v_model = SentenceModel("shibing624/text2vec-base-chinese",
    #                           encoder_type=EncoderType.FIRST_LAST_AVG)
    # compute_emb(t2v_model)


    # # Multilingual sentence-embedding model (Sentence-BERT), recommended for English
    # # semantic matching; supports further fine-tuning. Currently disabled.
    # sbert_model = SentenceModel("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    #                             encoder_type=EncoderType.MEAN)
    # compute_emb(sbert_model)

    # Chinese word-vector model (word2vec), suited to literal matching and cold start.
    # This is the active path: it fills all_embedding via compute_emb.
    all_embedding = {}
    w2v_model = Word2Vec("w2v-light-tencent-chinese")
    compute_emb(w2v_model, all_embedding)

  from tqdm.autonotebook import tqdm
2022-08-10 22:11:33.036 | INFO     | text2vec.word2vec:__init__:79 - Load pretrained model:w2v-light-tencent-chinese, path:/Users/soul/.text2vec/datasets/light_Tencent_AILab_ChineseEmbedding.bin
2022-08-10 22:11:34.687 | DEBUG    | text2vec.word2vec:__init__:92 - Load w2v from /Users/soul/.text2vec/datasets/light_Tencent_AILab_ChineseEmbedding.bin, spend 1.65 sec


<class 'numpy.ndarray'> (57943, 200)


In [40]:
# Tabulate the embeddings: each dict key (vocabulary entry) becomes a column,
# each row is one embedding dimension.
df = pd.DataFrame(data=all_embedding, )

In [44]:
# Persist the embedding table to disk.
# NOTE(review): sep='\n' makes the field delimiter a newline, the same character
# that normally ends a row, so the file presumably cannot be read back as a
# table -- confirm the intended format before relying on this file.
df.to_csv('./seg_word/word2vec.csv', sep='\n')

In [178]:
specialchars = ['<pad>', '<unk>']           # special tokens: padding and unknown word
embedding_dim = 200                         # width of the word vectors stored in all_embedding

# Special tokens are written into all_embedding below, so skip them here:
# otherwise re-running this cell would list them twice in vocab and leave
# word2idx / idx2word inconsistent.
vocab = specialchars + [word for word in all_embedding if word not in specialchars]
vocab_size = len(vocab)
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Special tokens get vectors drawn uniformly from [-1, 1); the generator is
# seeded so the matrix is identical on every run.
rng = np.random.default_rng(0)
for token in specialchars:
    all_embedding[token] = rng.uniform(low=-1, high=1, size=embedding_dim)

# Build the word <-> index lookups and copy each vector into its matrix row.
word2idx = {}
idx2word = {}
for index, word in enumerate(vocab):
    word2idx[word] = index
    idx2word[index] = word
    embedding_matrix[index] = all_embedding[word]

In [None]:
# Scratch cell (recorded without an execution count).
# NOTE(review): this import binds only the name `gensim`; the bare `Word2Vec`
# below still refers to the class imported from text2vec earlier in the
# notebook, not to gensim's Word2Vec.
import gensim.models.word2vec
Word2Vec

In [98]:
import re


def delelem(data, min_chinese=2):
    """Strip whitespace and symbol runs, then keep only sufficiently Chinese lines.

    For every line: all whitespace is removed, then every run of two or more
    consecutive non-word characters is removed (a single punctuation mark
    between word characters is left in place). The line is kept only if it
    still contains at least ``min_chinese`` characters in the range
    U+4E00..U+9FA5.

    :param data: iterable of str
    :param min_chinese: minimum number of Chinese characters a line must
        contain to be kept (default 2, the original threshold)
    :return: list of the cleaned lines that were kept, in input order
    """
    whitespace = re.compile(r'\s')
    symbol_run = re.compile(r'\W{2,}')
    chinese_char = re.compile(r"[\u4e00-\u9fa5]")

    res_del = []
    for line in data:
        # Spaces are already covered by \s, so no separate space-removal pass is needed.
        compact = symbol_run.sub('', whitespace.sub('', line))

        # Drop lines that do not contain enough Chinese characters.
        if len(chinese_char.findall(compact)) >= min_chinese:
            res_del.append(compact)

    return res_del

In [99]:
def mulunusechar(data):
    """Collect the set of special symbols that occur in ``data``.

    A special symbol is any character that is neither a word character
    (letter, digit, underscore, CJK character) nor whitespace. The full-width
    comma '，' and full stop '。' are deliberately left out of the result, so
    passing the result to a symbol-removal step keeps them in the text.

    The earlier version used the anchored pattern ``^\\S`` and therefore
    recorded at most the first symbol of each line (and none when that first
    symbol was whitespace); every symbol of every line is collected now.

    :param data: iterable of str
    :return: set of single-character strings
    """
    # Not a word character, not whitespace, and not one of the two kept marks.
    special_symbol = re.compile(r'[^\w\s。，]')

    res = set()
    for line in data:
        res.update(special_symbol.findall(line))
    return res


In [100]:
def sympop(data,sym):
    """
    Remove every occurrence of the given symbols from each line.

    :param data: iterable of str
    :param sym: list of symbols, or any other iterable of symbols (for example
        a set); a plain string is treated as a sequence of single characters
    :return: list of the lines with all symbols removed, in input order
    """
    # Normalise the symbols to a list; a list argument is used as given.
    symbols = sym if type(sym) == list else list(sym)

    cleaned = []
    for text in data:
        for symbol in symbols:
            text = text.replace(symbol, '')
        cleaned.append(text)

    return cleaned