In [1]:
from pathlib import Path
import os

ROOT_PATH = Path('.')

VOCAB_PATH = ROOT_PATH.joinpath('data/vocab.txt')

STOP_WORDS_PATH = ROOT_PATH.joinpath('data/哈工大停用词表.txt')

In [2]:
from nlputils.vocab_generator import VocabGenerator
from nlputils.tokenizer import BasicTokenizer, pad_sequence_to_fixed_length

import logging
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s %(name)s %(levelname)s %(message)s",
                    datefmt='%Y-%m-%d  %H:%M:%S %a')

tokenizer = BasicTokenizer(language='cn')

gen = VocabGenerator(coverage=1.0)

samples = ['《荒野大镖客：救赎2》拥有一个巨大的开放世界，而且充满活力，不过单人模式下在这个世界中逛久了，总是会感觉有些无聊。于是下面这位玩家Alex Tanaka决定让自己化身为西部大恶人', '他的做法就是绑架游戏中每个郡的治安官，然后在风景宜人的地方与他们玩决斗游戏。决斗的结果他会直接远景截图，当成风景照传到网上，当然他基本只发自己吊打对方的照片。']

In [3]:
sequence = list(range(20))
padded_sequence = pad_sequence_to_fixed_length(sequence, max_length=30)
print(padded_sequence, len(padded_sequence))
padded_sequence = pad_sequence_to_fixed_length(sequence, max_length=30, padding_mode='left')
print(padded_sequence, len(padded_sequence))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]30
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]30


In [4]:
tokenizer.load_stopwords(str(STOP_WORDS_PATH))
list(tokenizer.get_stopwords())[:10]

2020-02-07  11:55:30 Fri nlputils.tokenizer INFO Load stop words from data\哈工大停用词表.txt.


['', '啦', '［②ｉ］', '吓', '照', '是的', '极了', '②ｃ', '\t', '为什么']

In [5]:
string = '我只能 搞笑了'
print(tokenizer.tokenize(string, no_stop_words=False))
print(tokenizer.tokenize(string, no_stop_words=True))

Building prefix dict from the default dictionary ...
2020-02-07  11:55:30 Fri jieba DEBUG Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\LAGSAU~1\AppData\Local\Temp\jieba.cache
2020-02-07  11:55:30 Fri jieba DEBUG Loading model from cache C:\Users\LAGSAU~1\AppData\Local\Temp\jieba.cache
Loading model cost 1.241 seconds.
2020-02-07  11:55:31 Fri jieba DEBUG Loading model cost 1.241 seconds.
Prefix dict has been built succesfully.
2020-02-07  11:55:31 Fri jieba DEBUG Prefix dict has been built succesfully.
['我', '只能', ' ', '搞笑', '了']
['只能', '搞笑']


In [9]:
seg_samples = [tokenizer.tokenize(x) for x in samples]
print(seg_samples)

[['《', '荒野', '大', '镖客', '：', '救赎', '2', '》', '拥有', '一个', '巨大', '的', '开放', '世界', '，', '而且', '充满活力', '，', '不过', '单人', '模式', '下', '在', '这个', '世界', '中逛', '久', '了', '，', '总是', '会', '感觉', '有些', '无聊', '。', '于是', '下面', '这位', '玩家', 'Alex', ' ', 'Tanaka', '决定', '让', '自己', '化身为', '西部', '大', '恶人'], ['他', '的', '做法', '就是', '绑架', '游戏', '中', '每个', '郡', '的', '治安', '官', '，', '然后', '在', '风景', '宜人', '的', '地方', '与', '他们', '玩', '决斗', '游戏', '。', '决斗', '的', '结果', '他会', '直接', '远景', '截图', '，', '当成', '风景', '照', '传到', '网上', '，', '当然', '他', '基本', '只发', '自己', '吊打', '对方', '的', '照片', '。']]


In [10]:
gen.generate_vocab(seg_samples)
vocab = gen.get_vocab()
gen.save_vocab_to(str(VOCAB_PATH))
vocab[:20]

['[PAD]',
 '[BOS]',
 '[EOS]',
 '[UNK]',
 '[CLS]',
 '[SEP]',
 '[MASK]',
 '的',
 '，',
 '。',
 '大',
 '世界',
 '在',
 '自己',
 '他',
 '游戏',
 '风景',
 '决斗',
 '《',
 '荒野']

In [11]:
tokenizer.load_vocab(vocab)
list(tokenizer.get_vocab())[:20]

['照',
 '无聊',
 '玩家',
 '他会',
 '的',
 '绑架',
 '照片',
 '游戏',
 '有些',
 '每个',
 '宜人',
 '地方',
 '对方',
 '充满活力',
 '[MASK]',
 '决定',
 '基本',
 '。',
 '[SEP]',
 '他']

In [12]:
token2id = tokenizer.get_token2id()
list(token2id.items())

[('[PAD]', 0),
 ('[BOS]', 1),
 ('[EOS]', 2),
 ('[UNK]', 3),
 ('[CLS]', 4),
 ('[SEP]', 5),
 ('[MASK]', 6),
 ('的', 7),
 ('，', 8),
 ('。', 9),
 ('大', 10),
 ('世界', 11),
 ('在', 12),
 ('自己', 13),
 ('他', 14),
 ('游戏', 15),
 ('风景', 16),
 ('决斗', 17),
 ('《', 18),
 ('荒野', 19),
 ('镖客', 20),
 ('：', 21),
 ('救赎', 22),
 ('2', 23),
 ('》', 24),
 ('拥有', 25),
 ('一个', 26),
 ('巨大', 27),
 ('开放', 28),
 ('而且', 29),
 ('充满活力', 30),
 ('不过', 31),
 ('单人', 32),
 ('模式', 33),
 ('下', 34),
 ('这个', 35),
 ('中逛', 36),
 ('久', 37),
 ('了', 38),
 ('总是', 39),
 ('会', 40),
 ('感觉', 41),
 ('有些', 42),
 ('无聊', 43),
 ('于是', 44),
 ('下面', 45),
 ('这位', 46),
 ('玩家', 47),
 ('Alex', 48),
 (' ', 49),
 ('Tanaka', 50),
 ('决定', 51),
 ('让', 52),
 ('化身为', 53),
 ('西部', 54),
 ('恶人', 55),
 ('做法', 56),
 ('就是', 57),
 ('绑架', 58),
 ('中', 59),
 ('每个', 60),
 ('郡', 61),
 ('治安', 62),
 ('官', 63),
 ('然后', 64),
 ('宜人', 65),
 ('地方', 66),
 ('与', 67),
 ('他们', 68),
 ('玩', 69),
 ('结果', 70),
 ('他会', 71),
 ('直接', 72),
 ('远景', 73),
 ('截图', 74),
 ('当成', 75),
 ('照', 76),


In [13]:
list(tokenizer.get_id2token().items())[:10]

[(0, '[PAD]'),
 (1, '[BOS]'),
 (2, '[EOS]'),
 (3, '[UNK]'),
 (4, '[CLS]'),
 (5, '[SEP]'),
 (6, '[MASK]'),
 (7, '的'),
 (8, '，'),
 (9, '。')]

In [14]:
ids1 = tokenizer.convert_tokens_to_ids(seg_samples[0])
ids2 = tokenizer.encode(samples[0])
print(ids1)
print(ids2)
ids1 == ids2

[18, 19, 10, 20, 21, 22, 23, 24, 25, 26, 27, 7, 28, 11, 8, 29, 30, 8, 31, 32, 33, 34, 12, 35, 11, 36, 37, 38, 8, 39, 40, 41, 42, 43, 9, 44, 45, 46, 47, 48, 49, 50, 51, 52, 13, 53, 54, 10, 55]
[18, 19, 10, 20, 21, 22, 23, 24, 25, 26, 27, 7, 28, 11, 8, 29, 30, 8, 31, 32, 33, 34, 12, 35, 11, 36, 37, 38, 8, 39, 40, 41, 42, 43, 9, 44, 45, 46, 47, 48, 49, 50, 51, 52, 13, 53, 54, 10, 55]


True

In [15]:
print(tokenizer.encode(samples[0], max_length=16, truncate_mode='right'))
print(tokenizer.encode(samples[0], max_length=16, truncate_mode='left'))

[18, 19, 10, 20, 21, 22, 23, 24, 25, 26, 27, 7, 28, 11, 8, 29]
[43, 9, 44, 45, 46, 47, 48, 49, 50, 51, 52, 13, 53, 54, 10, 55]


In [16]:
print(tokenizer.encode(samples[0], max_length=70, padding_mode='right'))
print(tokenizer.encode(samples[0], max_length=70, padding_mode='left'))

[18, 19, 10, 20, 21, 22, 23, 24, 25, 26, 27, 7, 28, 11, 8, 29, 30, 8, 31, 32, 33, 34, 12, 35, 11, 36, 37, 38, 8, 39, 40, 41, 42, 43, 9, 44, 45, 46, 47, 48, 49, 50, 51, 52, 13, 53, 54, 10, 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18, 19, 10, 20, 21, 22, 23, 24, 25, 26, 27, 7, 28, 11, 8, 29, 30, 8, 31, 32, 33, 34, 12, 35, 11, 36, 37, 38, 8, 39, 40, 41, 42, 43, 9, 44, 45, 46, 47, 48, 49, 50, 51, 52, 13, 53, 54, 10, 55]


In [17]:
print(tokenizer.decode(ids1))
print(tokenizer.decode(ids2))

《荒野大镖客：救赎2》拥有一个巨大的开放世界，而且充满活力，不过单人模式下在这个世界中逛久了，总是会感觉有些无聊。于是下面这位玩家Alex Tanaka决定让自己化身为西部大恶人
《荒野大镖客：救赎2》拥有一个巨大的开放世界，而且充满活力，不过单人模式下在这个世界中逛久了，总是会感觉有些无聊。于是下面这位玩家Alex Tanaka决定让自己化身为西部大恶人


In [18]:
sample = '你好啊小老弟。'
ids = tokenizer.encode(sample)
print(ids)
print(tokenizer.decode(ids))

[3, 3, 3, 9]
[UNK][UNK][UNK]。
