# 生成词汇表与数据编码

In [1]:
import os
import codecs
import collections
from operator import itemgetter

In [2]:
# Input corpora: tokens on each line are separated by spaces.
zh_file = "./data/train/train.trg.zh"  # Chinese, one character per token
en_file = "./data/train/train.trg.en"  # English, one word per token
DATA_ROOT = "./data"
TRIAN_DIR = os.path.join(DATA_ROOT, "train")
# exist_ok=True creates the directory only when missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(TRIAN_DIR, exist_ok=True)

In [3]:
# Special tokens placed at the head of every vocabulary file.
PAD = "<pad>"
UNK = "<unk>"
BOS = "<bos>"
EOS = "<eos>"

# Integer ids paired with the special tokens above, in the same order.
N_PAD = 0
N_UNK = 1
N_BOS = 2
N_EOS = 3

# Chinese side: vocabulary size and output paths.
ZH_VOCAB_SIZE = 4000
ZH_VOCAB_OUTPUT = os.path.join(TRIAN_DIR, "zh.vocab")
ZH_TRAIN = os.path.join(TRIAN_DIR, "zh.train")
ZH_DEV = os.path.join(TRIAN_DIR, "zh.dev")

# English side: vocabulary size and output paths.
EN_VOCAB_SIZE = 10000
EN_VOCAB_OUTPUT = os.path.join(TRIAN_DIR, "en.vocab")
EN_TRAIN = os.path.join(TRIAN_DIR, "en.train")
EN_DEV = os.path.join(TRIAN_DIR, "en.dev")  # original author's note: possibly unused for now?

In [6]:
def build_sorted_words(file_path):
    """Return the distinct tokens of a UTF-8 text file, most frequent first.

    Every line is split on whitespace and each resulting token is counted.
    Tokens with equal counts keep the order in which they were first seen.
    """
    frequencies = collections.Counter()
    with codecs.open(file_path, "r", "utf-8") as corpus:
        for sentence in corpus:
            # split() without arguments already ignores leading/trailing
            # whitespace, so no separate strip() is needed.
            frequencies.update(sentence.split())

    # most_common() with no argument sorts all items by descending count.
    return [token for token, _ in frequencies.most_common()]


def bulid_vocab(sorted_words, vocab_size, file_path):
    """Write a vocabulary file with the special tokens first.

    The file holds one token per line, encoded as UTF-8: PAD, UNK, BOS and
    EOS, followed by the entries of ``sorted_words`` in the given order,
    truncated to at most ``vocab_size`` lines in total.

    Args:
        sorted_words: Tokens ordered by descending frequency.
        vocab_size: Maximum number of lines to write, special tokens included.
        file_path: Destination path; an existing file is overwritten.
    """
    specials = [PAD, UNK, BOS, EOS]
    # A corpus may literally contain a special token such as "<unk>". Keeping
    # it would list the token twice, and the later line would override the
    # reserved id once the file is loaded into a word -> index mapping.
    vocab = specials + [word for word in sorted_words if word not in specials]
    # Slicing is a no-op when the list is already short enough.
    vocab = vocab[:vocab_size]
    with codecs.open(file_path, 'w', 'utf-8') as f:
        f.write("".join(word + "\n" for word in vocab))
    print("Build {} done.".format(file_path))
    
    
class Tokenizer(object):
    """Convert between tokens and integer ids using a vocabulary file.

    The vocabulary file holds one token per line; a token's id is its
    zero-based line number.
    """

    def __init__(self, vocab_file):
        """Load ``vocab_file`` and build both lookup tables."""
        self.vocab_file = vocab_file
        self.vocab_list = self.load_vocab()
        self.word2idx = self.build_word2idx()
        self.idx2word = self.bulid_idx2word()

    def load_vocab(self):
        """Return the vocabulary as a list, one stripped token per line."""
        # The vocabulary contains non-ASCII tokens, so decode it explicitly as
        # UTF-8 instead of relying on the platform's default encoding.
        with open(self.vocab_file, "r", encoding="utf-8") as f:
            return [line.strip() for line in f]

    def build_word2idx(self):
        """Return a dict mapping each token to its index in ``vocab_list``."""
        return {word: idx for idx, word in enumerate(self.vocab_list)}

    def bulid_idx2word(self):
        """Return a dict mapping each index in ``vocab_list`` to its token."""
        return dict(enumerate(self.vocab_list))

    def wtoi(self, word_list):
        """Map tokens to ids; tokens not in the vocabulary get the UNK id.

        If the vocabulary has no UNK entry either, the id is ``None``.
        """
        idx_list = []
        for word in word_list:
            idx = self.word2idx.get(word)
            if idx is None:
                # Out-of-vocabulary token: fall back to the UNK entry.
                idx = self.word2idx.get(UNK)
            idx_list.append(idx)
        return idx_list

    def itow(self, idx_list):
        """Map ids back to tokens; ids not in the vocabulary become ``None``."""
        return [self.idx2word.get(idx) for idx in idx_list]
    
def zh_to_list(sentence):
    """Split a Chinese sentence into a list of single characters.

    Whitespace characters are dropped wherever they occur, so input that is
    already space-separated does not produce empty-string tokens.
    """
    return [char for char in sentence if not char.isspace()]

def en_to_list(sentence):
    """Split an English sentence into tokens on runs of whitespace."""
    # split() with no separator discards leading and trailing whitespace.
    tokens = sentence.split()
    return tokens

def split_datafile(src_file, train_file, dev_file, vocab_file, nsplit):
    """Encode a corpus as token ids and split it into train and dev files.

    Each line of ``src_file`` is split on whitespace, mapped to ids with
    ``Tokenizer(vocab_file)`` and written as space-separated integers.

    Args:
        src_file: UTF-8 text file with one space-separated sentence per line.
        train_file: Output path that receives the first ``nsplit`` lines.
        dev_file: Output path that receives all remaining lines.
        vocab_file: Vocabulary file passed to ``Tokenizer``.
        nsplit: Number of leading lines that go to the training set.

    Both output files are overwritten.
    """
    tokenizer = Tokenizer(vocab_file)
    # The corpus holds non-ASCII text, so decode it explicitly as UTF-8
    # instead of relying on the platform's default encoding.
    with open(src_file, "r", encoding="utf-8") as src, \
            open(train_file, "w", encoding="utf-8") as train_out, \
            open(dev_file, "w", encoding="utf-8") as dev_out:
        for line_no, line in enumerate(src):
            ids = tokenizer.wtoi(line.split())
            encoded = " ".join(str(i) for i in ids)
            target = train_out if line_no < nsplit else dev_out
            target.write(encoded + "\n")

## 中文处理

In [7]:
nsplit = 180000  # total line count was computed beforehand as 213377; the first 180k lines form the training set
# Words of the raw corpus, sorted by frequency
words = build_sorted_words(zh_file)
# Build the Chinese vocabulary
bulid_vocab(words, ZH_VOCAB_SIZE, ZH_VOCAB_OUTPUT)
# Encode the Chinese corpus, write it to file, and split it into training and dev sets
split_datafile(zh_file, ZH_TRAIN, ZH_DEV, ZH_VOCAB_OUTPUT, nsplit)

Build ./data/train/zh.vocab done.


## 英文处理

In [8]:
# Words of the raw English corpus, sorted by frequency
words = build_sorted_words(en_file)
# Build the English vocabulary
bulid_vocab(words, EN_VOCAB_SIZE, EN_VOCAB_OUTPUT)
# Encode the English corpus, write it to file, and split it into training and dev sets
split_datafile(en_file, EN_TRAIN, EN_DEV, EN_VOCAB_OUTPUT, nsplit)

Build ./data/train/en.vocab done.
