# Split Word

## 解决了如下几个问题
- 裁剪词库: 

开源词库特别大,加载慢,如何根据当前的任务裁剪. 因为字典表非常大,比如腾讯开源的词库有8,824,330个词, 解压后有16G. 针对当前的训练数据往往只是很小一部分,在训练时完全加载这么大的数据集是完全没有必要的,可以提前准备一个mini的字典.


- 减少oov的大小:

因为中文有分词的问题, 这样分词结果可能和字典表不匹配,导致oov,但是可以通过变通的办法取近似值. 比如"2019年年底",可以通过"2019" 和 "年底"来取平均值 



# 腾讯词向量下载地址

- https://ai.tencent.com/ailab/nlp/data/Tencent_AILab_ChineseEmbedding.tar.gz



In [1]:
!hostname

ai-prd-05


In [2]:
#Adjust the working folder
import sys
import os
import tqdm
import numpy as np
import pandas as pd
#print(globals())
file_folder = globals()['_dh'][0]
wk_dir = os.path.dirname(file_folder)
os.chdir(wk_dir)


# from code_felix.utils_.util_pandas import *

# from code_felix.feature.category import *
import matplotlib.pyplot as plt
# from code_felix.feature.read_file import _get_transaction, _summary_card_trans_col

#trans_new =  get_trans(trans_new_file)

plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号

train_file='./input/data_train.csv'
word2vec_model = './Tencent_AILab_ChineseEmbedding.txt'
jieba_dict = './input/jieba.txt'

vector_size=200

In [3]:
%%time

import jieba

print('begin load dit')
#jieba.load_userdict(jieba_dict)
print('end load dit')
text = "欧阳建国是创新办主任也是欢聚时代公司云计算方面的专家"
#text = "真的挺好用, 吃不上"

# jieba.cut() 方法接受两个输入参数: 
# 需要分词的字符串
# cut_all 参数用来控制是否采用全模式

# 精确模式，默认模式就是精确模式
seg_list = jieba.cut(text, cut_all = False)
print('Default Mode:\n' + '/' .join(seg_list))

# 全模式
seg_list = jieba.cut(text, cut_all = True)
print( "Full Mode:\n" + '/' .join(seg_list))

# jieba.cut_for_search() 方法接受一个参数：
# 需要分词的字符串
# 该方法适合用于搜索引擎构建倒排索引的分词，粒度比较细

# 搜索引擎模式
seg_list = jieba.cut_for_search(text)
print('Research Mode:\n' + '/'.join(seg_list))
 
list(jieba.dt.FREQ.keys())[:10]

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache


begin load dit
end load dit


Loading model cost 0.586 seconds.
Prefix dict has been built succesfully.


Default Mode:
欧阳/建国/是/创新/办/主任/也/是/欢聚/时代/公司/云/计算/方面/的/专家
Full Mode:
欧阳/欧阳建/建国/国是/创新/办/主任/也/是/欢聚/时代/代公/公司/云/计算/方面/的/专家
Research Mode:
欧阳/建国/是/创新/办/主任/也/是/欢聚/时代/公司/云/计算/方面/的/专家
CPU times: user 564 ms, sys: 64.9 ms, total: 629 ms
Wall time: 633 ms


In [4]:
# https://www.kaggle.com/eray1yildiz/using-lstms-with-attention-for-emotion-recognition/notebook
# https://ai.tencent.com/ailab/nlp/embedding.html
#blog : https://blog.csdn.net/sinat_26917383/article/details/83999966

#如何利用词库,增强jieba: https://blog.csdn.net/chinatelecom08/article/details/84588071

## Simple test jieba

## 使用Jieba 对训练数据做分词

In [5]:
%%time

import jieba
train= pd.read_csv(train_file, encoding='gb18030', delimiter='\t', header=None)
train = train[:20]
train.head()
train['jieba'] = train.iloc[:,2].apply(lambda text: ' '.join(jieba.cut(str(text), cut_all = False)))
train['jieba_len'] = train['jieba'].apply(lambda text: len(text.split(' ')))
train.head()

import collections
count = collections.Counter()

for text in train['jieba'].values:
    for word in text.split(' '):
        count[word] +=1

print(len(count))
print(list(count)[:10])

223
['买', '这套', '系统', '本来', '是', '用来', '做', '我们', '公司', '的']
CPU times: user 236 ms, sys: 43.8 ms, total: 280 ms
Wall time: 359 ms


In [6]:
%%time

from file_cache.utils.util_log import *

@timed()
def gen_mini_embedding(wv_from_text, word_list):
    from multiprocessing.dummy import Pool

    from functools import partial

    partition_num = 8
    import math
    partition_length = math.ceil(len(word_list)/partition_num)

    partition_list = [ word_list[i:i+partition_length]  for i in range(0, len(word_list), partition_length )]
    logger.debug(f'The word list split to {len(partition_list)} partitions:{[ len(partition) for partition in partition_list]}')
    thread_pool = Pool(processes=partition_num)
    process = partial(gen_mini_partition,wv_from_text=wv_from_text )

    wv_list = thread_pool.map(process, partition_list)
    thread_pool.close(); thread_pool.join()

    del wv_from_text

    return pd.concat(wv_list)


@timed()
#直接使用KeyedVectors.load_word2vec_format 加载非常慢
def load_embedding(path):
    embedding_index = {}
    f = open(path,encoding='utf8')
    for index,line in enumerate(f):
        if index == 0:
            continue
        values = line.split(' ')
        word = values[0]
        coefs = np.asarray(values[1:],dtype='float32')
        embedding_index[word] = coefs
    f.close()
    return embedding_index

@timed()
def filter_duplicate_words(file_list):
    import jieba
    jieba.load_userdict(jieba_dict)

    logger.debug(f'load jieba dict:{len(list(jieba.dt.FREQ.keys()))} from file:{jieba_dict}')
    word_count = 0
    word_set = set()
    for cut_all in [True, False]:
        for file in file_list:
            input_text = pd.read_csv(file, encoding='gb18030', delimiter='\t', header=None)
            input_text['jieba'] = input_text.iloc[:, 2].apply(lambda text: ' '.join(jieba.cut(str(text), cut_all)))
            for index, text in enumerate(input_text['jieba'].values):
                for word in text.strip().split(' '):
                    word_set.add(word)
                    word_count += 1
            logger.debug(f'There are {len(word_set)} words after file:{file}')
    logger.debug(f'There are {len(word_set)} word were parser from word_count:{word_count} file_list:{file_list}')
    return sorted(list(word_set))


#@timed()
def gen_mini_partition( word_set,  wv_from_text, local=False):
    if local:
        word_set = word_set[:3000]
        logger.debug("Run app with local model")

    mini = pd.DataFrame( np.zeros((len(word_set), vector_size)),  index=word_set, )
    #for i in tqdm(range(len(word_set))):
    for i in range(len(word_set)):
        word = word_set[i]
        vector = wordVec(word, wv_from_text, 1, 3)
        if vector is not None:
            mini.loc[word] = vector
        else:
            logger.debug(f'Can not find vec for:{len(word)},{word}')
            mini.loc[word] = np.zeros(vector_size)

    return mini
 

def compute_ngrams(word, min_n, max_n):
    # BOW, EOW = ('<', '>')  # Used by FastText to attach to all words as prefix and suffix
    extended_word = word
    ngrams = []
    for ngram_length in range(min_n, min(len(extended_word), max_n) + 1):
        for i in range(0, len(extended_word) - ngram_length + 1):
            ngrams.append(extended_word[i:i + ngram_length])
    res =  list(set(ngrams))
    return res
 
    
#中文特有,因为如果分词总会有一些不再字典里面,并且这个问题会比英语更严重
#但是可以通过ngrams来求平均值来得到一个平均的vector
def wordVec(word,wv_from_text:dict,min_n = 1, max_n = 3):
    '''
    ngrams_single/ngrams_more,主要是为了当出现oov的情况下,最好先不考虑单字词向量
    '''

    # 如果在词典之中，直接返回词向量
    if word in wv_from_text:
        return wv_from_text[word]
    else:  
        word_size = vector_size
        # 计算word的ngrams词组
        ngrams = compute_ngrams(word,min_n = min_n, max_n = max_n)
        # 不在词典的情况下
        word_vec = np.zeros(word_size, dtype=np.float32)
        ngrams_found = 0
        ngrams_single = [ng for ng in ngrams if len(ng) == 1]
        ngrams_more = [ng for ng in ngrams if len(ng) > 1]
        # 先只接受2个单词长度以上的词向量
        for ngram in ngrams_more:
            if ngram in wv_from_text:
                word_vec += wv_from_text[ngram]
                ngrams_found += 1
                #print(ngram)
        # 如果，没有匹配到，那么最后是考虑单个词向量
        if ngrams_found == 0:
            for ngram in ngrams_single:
                if ngram in wv_from_text:
                    word_vec += wv_from_text[ngram]
                    ngrams_found += 1
                elif ngram.lower() in wv_from_text:
                    word_vec += wv_from_text[ngram.lower()]
                    ngrams_found += 1
                else:
                    logger.warning(f'Can not find {ngram} in wv')
        if ngrams_found > 0:
            return word_vec / max(1, ngrams_found)
        else:
            logger.error('all ngrams for word "%s" absent from model' % word)
            return None
 
 

2019-04-09 00:14:26,347 util_log.py[92] INFO Start the program at:ai-prd-05, 173.36.99.83, with:Load module


yes
CPU times: user 176 ms, sys: 52.5 ms, total: 229 ms
Wall time: 487 ms


In [7]:
%%time
embed = load_embedding(word2vec_model)

word_list = filter_duplicate_words([train_file,])


2019-04-09 00:14:26,830 util_log.py[47] INFO load_embedding begin with(1 paras) :['./Tencent_AILab_ChineseEmbedding.txt'], []
2019-04-09 00:20:51,738 util_log.py[62] INFO cost 06.4 min:load_embedding(['./Tencent_AILab_ChineseEmbedding.txt'], []), return:('dict:8824330',), end 
2019-04-09 00:20:51,741 util_log.py[47] INFO filter_duplicate_words begin with(1 paras) :[['./input/data_train.csv']], []
2019-04-09 00:24:19,001 <timed exec>[47] DEBUG load jieba dict:15215979 from file:./input/jieba.txt
2019-04-09 00:24:28,617 <timed exec>[58] DEBUG There are 251218 words after file:./input/data_train.csv
2019-04-09 00:24:44,710 <timed exec>[58] DEBUG There are 256172 words after file:./input/data_train.csv
2019-04-09 00:24:44,711 <timed exec>[59] DEBUG There are 256172 word were parser from word_count:7335836 file_list:['./input/data_train.csv']
2019-04-09 00:24:44,871 util_log.py[62] INFO cost 03.9 min:filter_duplicate_words([['./input/data_train.csv']], []), return:('list:256172',), end 


CPU times: user 9min 2s, sys: 18.9 s, total: 9min 20s
Wall time: 10min 18s


In [8]:
%%time

data = gen_mini_embedding(embed, word_list)
logger.debug(f'The length of the vector is {data.shape}')

fname = "./output/mini.kv"
np.savetxt(fname, data.reset_index().values,
           delimiter=" ",
           header="{} {}".format(len(data), len(data.columns)),
           comments="",
           fmt=["%s"] + ["%.6f"] * len(data.columns))

logger.info(f'Mini dict save to {fname}')

2019-04-09 00:24:44,878 util_log.py[47] INFO gen_mini_embedding begin with(2 paras) :['dict', 'list'], []
2019-04-09 00:24:44,901 <timed exec>[15] DEBUG The word list split to 8 partitions:[32022, 32022, 32022, 32022, 32022, 32022, 32022, 32018]
2019-04-09 00:24:44,972 <timed exec>[130] ERROR all ngrams for word "" absent from model
2019-04-09 00:24:45,017 <timed exec>[77] DEBUG Can not find vec for:0,
2019-04-09 00:24:45,047 <timed exec>[130] ERROR all ngrams for word "	" absent from model
2019-04-09 00:24:45,054 <timed exec>[77] DEBUG Can not find vec for:1,	
2019-04-09 00:24:51,986 <timed exec>[130] ERROR all ngrams for word "《" absent from model
2019-04-09 00:24:52,024 <timed exec>[77] DEBUG Can not find vec for:1,《
2019-04-09 00:25:19,903 <timed exec>[130] ERROR all ngrams for word "，" absent from model
2019-04-09 00:25:19,910 <timed exec>[77] DEBUG Can not find vec for:1,，
2019-04-09 00:25:21,391 util_log.py[62] INFO cost 36.5 sec:gen_mini_embedding(['dict', 'list'], []), return:

CPU times: user 53.8 s, sys: 6.19 s, total: 60 s
Wall time: 58.8 s


In [10]:
%%time
import gensim
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(fname, binary=False)


2019-04-09 00:25:45,107 textcleaner.py[37] INFO 'pattern' package not found; tag filters are not available for English
2019-04-09 00:25:45,114 utils_any2vec.py[170] INFO loading projection weights from ./output/mini.kv
2019-04-09 00:25:45,115 smart_open_lib.py[149] DEBUG {'kw': {}, 'mode': 'rb', 'uri': './output/mini.kv'}
2019-04-09 00:25:45,187 smart_open_lib.py[621] DEBUG encoding_wrapper: {'errors': 'strict', 'encoding': None, 'mode': 'rb', 'fileobj': <_io.BufferedReader name='./output/mini.kv'>}
2019-04-09 00:26:17,828 utils_any2vec.py[232] INFO loaded (256172, 200) matrix from ./output/mini.kv


CPU times: user 31.3 s, sys: 1.75 s, total: 33.1 s
Wall time: 34.1 s


In [11]:
word_vectors.similar_by_word('探望')

2019-04-09 00:26:17,834 keyedvectors.py[1360] INFO precomputing L2-norms of word weight vectors
  self.vectors_norm = (self.vectors / sqrt((self.vectors ** 2).sum(-1))[..., newaxis]).astype(REAL)
  if np.issubdtype(vec.dtype, np.int):


[('看望', 0.9192631244659424),
 ('探视', 0.8276310563087463),
 ('医院探望', 0.7732120156288147),
 ('拜访', 0.7015642523765564),
 ('慰问', 0.7004474401473999),
 ('探亲', 0.6589413285255432),
 ('病重', 0.6578432321548462),
 ('问安', 0.6455398201942444),
 ('病了', 0.6374185085296631),
 ('照料', 0.6373718976974487)]

In [12]:
word_vectors.similar_by_word('系统')

  if np.issubdtype(vec.dtype, np.int):


[('系统中', 0.8185241222381592),
 ('系统对', 0.7760351300239563),
 ('软件系统', 0.7727014422416687),
 ('系统功能', 0.7607237100601196),
 ('管理系统', 0.7511553764343262),
 ('对系统', 0.7509126663208008),
 ('系统提供', 0.7387993335723877),
 ('系统支持', 0.7378956079483032),
 ('系统系统', 0.7339468002319336),
 ('系统的功能', 0.7321740388870239)]

In [13]:
word_vectors['探望']

array([ 0.21462 ,  0.139884,  0.391754,  0.231987,  0.11961 , -0.312598,
       -0.102125,  0.020797,  0.361761, -0.09875 , -0.047344,  0.129789,
        0.219998, -0.407498,  0.161724, -0.118959, -0.192629, -0.294972,
       -0.136594, -0.199864,  0.082437,  0.378111,  0.022619,  0.093166,
        0.104366,  0.571521, -0.243688,  0.446157,  0.359877,  0.052411,
       -0.06698 ,  0.102023,  0.178072,  0.049176, -0.301317, -0.044619,
        0.053053,  0.23609 , -0.085294, -0.035479,  0.335128, -0.326366,
       -0.090003,  0.744669, -0.345959, -0.213992, -0.287085, -0.381243,
        0.087105,  0.069898, -0.175255, -0.233278, -0.172423,  0.10459 ,
        0.283894,  0.354634,  0.032908,  0.140956, -0.14814 ,  0.204862,
       -0.186671,  0.242083,  0.620884,  0.150089,  0.072815, -0.047854,
       -0.078099, -0.216561, -0.162142, -0.011308, -0.762996, -0.124233,
        0.127123,  0.445678, -0.0127  ,  0.008824,  0.343807, -0.144965,
       -0.134015,  0.321934,  0.113197,  0.022872, 

In [19]:
word_vectors.similar_by_word('年底',20)

  if np.issubdtype(vec.dtype, np.int):


[('年末', 0.8267966508865356),
 ('春节前', 0.7716149091720581),
 ('下半年', 0.7590392827987671),
 ('上半年', 0.7561988830566406),
 ('月底', 0.7522507905960083),
 ('年初', 0.7464196681976318),
 ('春节后', 0.7402433156967163),
 ('年关', 0.7323346734046936),
 ('元旦', 0.728956401348114),
 ('2018年', 0.7254775762557983),
 ('今年年底', 0.7181286811828613),
 ('12月份', 0.7129232883453369),
 ('年底前', 0.7124335765838623),
 ('12月底', 0.7045530080795288),
 ('12月', 0.7028988003730774),
 ('临近年底', 0.6996776461601257),
 ('年中', 0.6951011419296265),
 ('明年', 0.6944761872291565),
 ('3月份', 0.6896295547485352),
 ('到春节', 0.6865017414093018)]

In [14]:
list(word_vectors.vocab.keys())[100:110]

['0.75',
 '0.8',
 '0.81',
 '0.83',
 '0.85元',
 '00',
 '0000',
 '00000',
 '0000000',
 '0000000000000000']

In [15]:
word_vectors.wv.vocab.keys()

  if __name__ == '__main__':


dict_keys(['', '\t', '!', '#', '#13', '#14', '#313#', '#315#', '#60616263', '#__&', '$', '%', '%+', '&', "'", '(', ')', '*', '+', '+&', '+1', '+1GB', '+2', '+4', '+Vx', '+_+', ',', '-', '.', '.&', '..', '...', '....', '.....', '......', '......&', '.......', '........', '.........', '..........', '...........', '............', '.............', '..............', '...............', '................', '..................', '...................', '....................', '.....................', '.......................', '..........................', '...................................', '.........................................', '...__', '.._', '._', '.action', '.aspx', '.cn', '.com', '.com.cn', '.htm', '.html', '.net', '.uk', '.y', '.一', '.人', '.价格', '.公司', '.医院', '.安全', '.服务', '.深圳', '.联系方式', '/', '0', '0%', '0&', '0+2', '0.', '0.015', '0.06', '0.11', '0.117', '0.11元', '0.2', '0.24', '0.25', '0.3', '0.35', '0.35元', '0.3年', '0.5', '0.52元', '0.5kg', '0.5立方', '0.6', '0.7', '0.75', '0.8