# 生成词汇表与数据编码

In [1]:
import os
import codecs
import collections
from operator import itemgetter

In [2]:
# Source corpus files (Chinese side and English side of the en-zh data).
zh_file = "./data/en-zh/train.trg.zh"
en_trg_file = "./data/en-zh/train.trg.en-zh.en"

# Root of the data tree and the directory that receives the generated files.
# NOTE: "TRIAN" is a typo for "TRAIN"; the name is kept because later cells
# reference it.
DATA_ROOT = "./data"
TRIAN_DIR = os.path.join(DATA_ROOT, "train")

# exist_ok=True replaces the exists()-then-makedirs() pair: it is a no-op when
# the directory is already there and cannot fail if it appears in between.
os.makedirs(TRIAN_DIR, exist_ok=True)

In [3]:
# Chinese side: vocabulary size cap and the three file paths under TRIAN_DIR.
ZH_VOCAB_SIZE = 4000
ZH_VOCAB_OUTPUT, ZH_RAW, ZH_TRAIN = (
    os.path.join(TRIAN_DIR, zh_name)
    for zh_name in ("zh.vocab", "zh.raw", "zh.train")
)

# English side: vocabulary size cap and the three file paths under TRIAN_DIR.
EN_VOCAB_SIZE = 10000
EN_VOCAB_OUTPUT, EN_RAW, EN_TRAIN = (
    os.path.join(TRIAN_DIR, en_name)
    for en_name in ("en.vocab", "en.raw", "en.train")
)

In [13]:
def build_sorted_words(file_path):
    """Return the distinct whitespace-separated tokens of a file, most frequent first.

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Returns:
        A list of unique tokens ordered by descending occurrence count.
        Tokens with equal counts keep the order in which they were first seen.
    """
    frequencies = collections.Counter()
    with codecs.open(file_path, "r", "utf-8") as corpus:
        for line in corpus:
            # split() without a separator already ignores surrounding whitespace.
            frequencies.update(line.split())

    # most_common() with no argument lists every token, highest count first.
    return [token for token, _ in frequencies.most_common()]


def bulid_vocab(sorted_words, vocab_size, file_path):
    """Write a vocabulary file: reserved tokens first, then the given words.

    Args:
        sorted_words: Words in the order they should appear after the
            reserved tokens (the callers pass them most frequent first).
        vocab_size: Maximum number of entries written, reserved tokens included.
        file_path: Destination file; written as UTF-8, one entry per line.
    """
    reserved = ["<unk>", "<sos>", "<eos>"]
    # A corpus may already contain a reserved token literally; skip it here so
    # it is not written a second time further down the file.
    vocab = reserved + [word for word in sorted_words if word not in reserved]
    # Slicing is a no-op when the list is shorter than vocab_size.
    vocab = vocab[:vocab_size]
    with codecs.open(file_path, 'w', 'utf-8') as f:
        for word in vocab:
            f.write(word + "\n")
    print("Build {} done.".format(file_path))
    
    
class Tokenizer(object):
    """Convert between words and integer ids using a vocabulary file.

    The vocabulary file holds one entry per line; an entry's id is its
    zero-based line number.
    """

    def __init__(self, vocab_file):
        """Load ``vocab_file`` and build the word->id and id->word tables."""
        self.vocab_file = vocab_file
        self.vocab_list = self.load_vocab()
        self.word2idx = self.build_word2idx()
        self.idx2word = self.bulid_idx2word()

    def load_vocab(self):
        """Return the vocabulary entries in file order, whitespace-stripped."""
        # The vocabulary file is written as UTF-8, so decode it explicitly
        # instead of relying on the platform's default encoding.
        with open(self.vocab_file, "r", encoding="utf-8") as f:
            return [line.strip() for line in f]

    def build_word2idx(self):
        """Return a dict mapping each vocabulary word to its index."""
        return {word: idx for idx, word in enumerate(self.vocab_list)}

    def bulid_idx2word(self):
        """Return a dict mapping each index to its vocabulary word."""
        return dict(enumerate(self.vocab_list))

    def wtoi(self, word_list):
        """Map words to ids; words not in the vocabulary get the ``<unk>`` id.

        If the vocabulary has no ``<unk>`` entry, unknown words map to None.
        """
        unk_idx = self.word2idx.get("<unk>")
        return [self.word2idx.get(word, unk_idx) for word in word_list]

    def itow(self, idx_list):
        """Map ids back to words; ids outside the vocabulary map to None."""
        return [self.idx2word.get(idx) for idx in idx_list]
    
def zh_to_list(sentence):
    """Split a Chinese sentence into a list of single characters.

    Whitespace characters (spaces, tabs, newlines) are dropped rather than
    returned as empty-string tokens, so a sentence with inner spaces does not
    produce entries that would later be looked up as unknown words.

    Args:
        sentence: The sentence text.

    Returns:
        A list with one entry per non-whitespace character, in order.
    """
    return [char for char in sentence if not char.isspace()]

def en_to_list(sentence):
    """Split an English sentence into its whitespace-separated words."""
    # split() with no separator already discards leading/trailing whitespace.
    words = sentence.split()
    return words

In [5]:
# words = build_sorted_words(en_trg_file)

In [5]:
## Process the Chinese side: collect the words of the Chinese corpus, most frequent first.
words = build_sorted_words(zh_file)

In [6]:
# Write the Chinese vocabulary file (at most ZH_VOCAB_SIZE entries) to ZH_VOCAB_OUTPUT.
bulid_vocab(words, ZH_VOCAB_SIZE, ZH_VOCAB_OUTPUT)

Build ./data/train/zh.vocab done.


In [11]:
# Load the Chinese vocabulary file into a Tokenizer.
tokenizer = Tokenizer(ZH_VOCAB_OUTPUT)

In [15]:
# Sample Chinese sentence used to try out the tokenizer.
s = "我爱北京天安门"

In [16]:
# Split the sample sentence into characters and map each one to its vocabulary id.
tokenizer.wtoi(zh_to_list(s))

[5, 397, 843, 1836, 126, 376, 634]