# 基于MindSpore框架的MASS案例实现
## 1 模型简介
微软亚洲研究院于2019年在ICML发表了《MASS: Masked Sequence to Sequence Pre-training for Language Generation》。该工作借鉴了BERT的掩码语言模型（Masked Language Model）预训练任务，提出了掩码序列到序列预训练（MAsked Sequence to Sequence pre-training，MASS）模型，为语言生成任务联合预训练编码器和解码器。

### 1.1 模型结构


MASS 对句子随机屏蔽一个长度为 k 的连续片段，然后通过编码器 - 注意力 - 解码器模型预测生成该片段。
1) 通过序列到序列框架预测被遮掩的token，MASS强制编码器理解未遮掩的token的含义，并鼓励解码器从编码器端提取有用信息。
2) 通过在解码器端预测连续的标记，解码器可以获得比仅预测离散标记时更好的语言建模能力。
3) 解码器的输入进一步遮掩了在编码器端未遮掩的内容（例如在预测片段 x3x4x5x6 时，仅将 x3x4x5 作为输入，其他token用 [M] 屏蔽），鼓励解码器从编码器端提取更多有用的信息，而不只是利用解码器端已有的前文信息。

MASS的编码器-注意力-解码器结构，其中“_”表示被屏蔽掉的token。
![](https://i.imgur.com/Jvhm0Dx.png)

其模型基础结构可以使用任何Seq2Seq的结构，由于Transformer的优越性，故论文中使用6层Transformer Layer作为Encoder和Decoder的基础结构。

### 1.2 模型特点
MASS 有一个重要的超参数 k，表示被屏蔽的连续片段的长度。通过调整 k 的大小，MASS 能涵盖 BERT 中的掩码语言模型训练方法以及 GPT 中标准的语言模型预训练方法，从而成为一个通用的预训练框架（当 k=1 或 k=m 时，其中 m 为句子长度，MASS 的概率形式分别与 BERT 中的掩码语言模型以及 GPT 中的标准语言模型一致）。

## 2 案例实现

### 2.1 准备数据集
案例实现中预训练模型所使用的数据是News Crawl的英语单语数据集，下载好的数据集是一个纯文本文件。接下来需要对该数据进行预处理，包括：对数据进行分词、利用subword-nmt工具学习BPE编码、对分词后的语料应用该BPE编码并构建词汇表。

而微调模型用于文本摘要任务所使用的数据集为Gigaword，该数据集已经分割为训练集、测试集和验证集，每个子集包含原文本（src）和目标摘要（tgt）两个文件。本案例只使用训练集与测试集，数据集文件路径结构如下：

```text
Dataset/
├── news_crawl
│   └── news.2015.txt
└── ggw_data
    ├── test.src.txt
    ├── test.tgt.txt
    ├── train.src.txt
    └── train.tgt.txt
```

In [None]:
"""對數據進行分詞"""
import os
from nltk.tokenize import word_tokenize

src_folder = "/Users/dawnkaslana/Workspace/Dataset/news_crawl/"
out_folder = "./tokenized_corpus/"

def create_tokenized_sentences(file_path, tokenized_file, max_tokens=175):
    """Tokenize a text file line by line and write the result to a new file.

    Every input line is split with NLTK's ``word_tokenize``; the tokens are
    joined back with single spaces and written as one output line. Lines that
    yield more than ``max_tokens`` tokens are skipped.

    Args:
        file_path: Path of the text file to read, one sentence per line.
        tokenized_file: Path of the file to write. Missing parent
            directories are created.
        max_tokens: Largest number of tokens a line may have and still be
            kept. Defaults to 175.
    """
    print(f" | Processing {file_path}.")

    # The target folder may not exist yet; opening the file for writing would
    # otherwise fail with FileNotFoundError.
    target_dir = os.path.dirname(tokenized_file)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Use an explicit encoding so the result does not depend on the platform
    # default, and stream line by line so the whole corpus is never held in
    # memory at once.
    with open(file_path, "r", encoding="utf-8") as reader, \
            open(tokenized_file, "w", encoding="utf-8") as writer:
        for sen in reader:
            tokens = [t for t in word_tokenize(sen) if t != " "]
            if len(tokens) > max_tokens:
                continue
            writer.write(" ".join(tokens) + "\n")

    print(f" | Wrote to {tokenized_file}.")

# Tokenize every ".txt" file of the source folder; each result is stored in
# the output folder under the same name with a "_tokenized.txt" suffix.
for corpus_name in os.listdir(src_folder):
    if corpus_name.endswith(".txt"):
        create_tokenized_sentences(
            os.path.join(src_folder, corpus_name),
            os.path.join(out_folder, corpus_name.replace(".txt", "_tokenized.txt")),
        )

In [None]:
"""利用subword-nmt工具生成bpe檔案"""
import shlex
import subprocess

src_folder_path = '/Users/dawnkaslana/Workspace/Dataset/news_crawl/' # source text folder path.

# The later BPE cell reads the codes from "./all.bpe.codes" relative to the
# working directory, so write them there instead of into the source folder
# (the command changes directory before running subword-nmt).
bpe_codes_path = os.path.abspath("./all.bpe.codes")

# Quote both paths so that spaces or shell metacharacters cannot break, or
# inject into, the command line. A shell is still required for the glob and
# the pipe.
learn_bpe_command = "cd %s && cat *.txt | subword-nmt learn-bpe -s 46000 -o %s" % (
    shlex.quote(src_folder_path),
    shlex.quote(bpe_codes_path),
)

# check=True raises CalledProcessError when the pipeline exits with a
# non-zero status instead of silently continuing without a codes file.
subprocess.run(learn_bpe_command, shell=True, check=True)

In [None]:
"""應用該bpe檔案並構建詞彙表."""
from src.utils import Dictionary
import subprocess

source_folder = os.path.abspath("./tokenized_corpus/")
output_folder = os.path.abspath("./tokenized_corpus/bpe/")
codes = os.path.abspath("./all.bpe.codes")
vocab_path = "./vocab/vocab_en.dict.bin"

# Command prefixes of the subword-nmt CLI; the file arguments are appended
# by bpe_encode.
ENCODER = "subword-nmt apply-bpe -c"
LEARN_DICT = "subword-nmt get-vocab -i"


def bpe_encode(codes_path, src_path, output_path, dict_path):
    """Apply BPE codes to a text file, then extract a vocabulary from the result.

    Runs ``subword-nmt apply-bpe`` followed by ``subword-nmt get-vocab`` as
    child processes.

    Args:
        codes_path: Path of the BPE codes file passed to ``apply-bpe -c``.
        src_path: Path of the input text file.
        output_path: Path where the BPE-encoded text is written; it is also
            the input of the vocabulary step.
        dict_path: Path where the vocabulary is written.

    Raises:
        subprocess.CalledProcessError: If either command exits with a
            non-zero status.
    """
    # Encoding.
    print(" | Applying BPE encoding.")
    encode_command = ENCODER.split() + [codes_path, "-i", src_path, "-o", output_path]
    # check_call stops here on failure, so the vocabulary step never runs on
    # a missing or partial output file.
    subprocess.check_call(encode_command)

    # Learn vocab.
    print(" | Fetching vocabulary from single file.")
    vocab_command = LEARN_DICT.split() + [output_path, "-o", dict_path]
    subprocess.check_call(vocab_command)

# No visible step creates the BPE output folder or the vocabulary folder,
# so make sure both exist before anything is written into them.
os.makedirs(output_folder, exist_ok=True)
os.makedirs(os.path.dirname(os.path.abspath(vocab_path)), exist_ok=True)

# BPE-encode every tokenized ".txt" file and remember where each per-file
# vocabulary was written.
available_dict = []
for file in os.listdir(source_folder):
    if not file.endswith(".txt"):
        continue
    output_path = os.path.join(output_folder, file.replace(".txt", "_bpe.txt"))
    dict_path = os.path.join(output_folder, file.replace(".txt", ".dict"))
    available_dict.append(dict_path)
    bpe_encode(codes, os.path.join(source_folder, file), output_path, dict_path)

# Load the vocabulary files produced by bpe_encode; each line has the form
# "word frequency".
vocab = Dictionary.load_from_text(available_dict)
vocab.persistence(vocab_path)  # Save the vocabulary object as a binary file.
print(f" | Vocabulary Size: {len(vocab)}")

### 2.2 生成NewsCrawl数据集

In [13]:
"""Create News Crawl Pre-Training Dataset."""
import os
from src.dataset import MonoLingualDataLoader
from src.language_model import LooseMaskedLanguageModel
from src.utils import Dictionary

input_folder_path = '/Users/dawnkaslana/Workspace/Dataset/news_crawl/'
output_folder_path = './train_data/news_crawl_dataset/'
vocab_path = './vocab/vocab_en.dict.bin'

def create_pre_train(text_file, max_sen_len):
    """Write one text file out as a TFRecord through MonoLingualDataLoader.

    The record is stored in ``output_folder_path`` under the source file's
    base name, with ``.txt`` replaced by ``_len_<max_sen_len>.tfrecord``.

    Args:
        text_file: Path of the source text file.
        max_sen_len: Maximum sentence length passed to the loader; it also
            becomes part of the output file name.
    """
    # Derive the destination path from the source file name.
    record_name = os.path.basename(text_file).replace(
        '.txt', f'_len_{max_sen_len}.tfrecord'
    )
    record_path = os.path.join(output_folder_path, record_name)

    # The dictionary is read from the persisted vocabulary file on each call.
    dictionary = Dictionary.load_from_persisted_dict(vocab_path)
    masking_model = LooseMaskedLanguageModel(mask_ratio=0.4, mask_all_prob=None)

    data_loader = MonoLingualDataLoader(
        src_filepath=text_file,
        lang="en",
        dictionary=dictionary,
        language_model=masking_model,
        max_sen_len=max_sen_len,
        min_sen_len=10,
    )
    data_loader.write_to_tfrecord(path=record_path)

# Build a pre-training record (max sentence length 32) for every ".txt" file
# of the input folder.
for file in os.listdir(input_folder_path):
    if file.endswith(".txt"):
        create_pre_train(os.path.join(input_folder_path, file), 32)

print(" | Generate Dataset for Pre-training is done.")

# `vocab` exists only as a local inside create_pre_train. Load the dictionary
# here so this cell reports the vocabulary it actually used, rather than
# raising NameError or printing a stale global left over from another cell.
vocab = Dictionary.load_from_persisted_dict(vocab_path)
print(f" | Vocabulary size: {vocab.size}.")

 | Processing corpus /Users/dawnkaslana/Workspace/Dataset/news_crawl/news2007.txt.
 | Shortest len = 1.
 | Longest  len = 3269.
 | Total    sen = 2573547.
 | Write to /Users/dawnkaslana/Workspace/HWmodels/official/nlp/mass/train_data/news_crawl_dataset/news2007_len_32.tfrecord-001-of-001.
 | Generate Dataset for Pre-training is done.
 | Vocabulary size: 353.


### 2.3 生成Gigaword数据集

In [3]:
"""Generate Gigaword dataset."""
import os
from src.dataset import BiLingualDataLoader
from src.language_model import NoiseChannelLanguageModel
from src.utils import Dictionary

input_folder_path = '/Users/dawnkaslana/Workspace/Dataset/ggw_data/'
output_folder_path = './train_data/gigaword_dataset/'
vocab_path = './vocab/vocab_en.dict.bin'

# The same persisted dictionary is used for the source and the target side.
vocab = Dictionary.load_from_persisted_dict(vocab_path)


def _gigaword_loader(split, noise_prob):
    """Build the loader for the ``<split>.src.txt`` / ``<split>.tgt.txt`` pair.

    Args:
        split: File name prefix of the pair, e.g. "train" or "test".
        noise_prob: Value passed as ``add_noise_prob`` to the language model.
    """
    return BiLingualDataLoader(
        src_filepath=os.path.join(input_folder_path, f"{split}.src.txt"),
        tgt_filepath=os.path.join(input_folder_path, f"{split}.tgt.txt"),
        src_dict=vocab,
        tgt_dict=vocab,
        src_lang="en",
        tgt_lang="en",
        language_model=NoiseChannelLanguageModel(add_noise_prob=noise_prob),
        max_sen_len=32,
    )


# Training split.
train = _gigaword_loader("train", 0.)
train_record = os.path.join(output_folder_path, "gigaword_train_dataset.tfrecord")
train.write_to_tfrecord(path=train_record)

# Test split.
test = _gigaword_loader("test", 0)
test_record = os.path.join(output_folder_path, "gigaword_test_dataset.tfrecord")
test.write_to_tfrecord(path=test_record)

print(" | Generate Dataset for fine-tuneing is done.")
print(f" | Vocabulary size: {vocab.size}.")

 | Processing corpus /Users/dawnkaslana/Workspace/Dataset/ggw_data/org_data/train.src.txt.
 | Processing corpus /Users/dawnkaslana/Workspace/Dataset/ggw_data/org_data/train.tgt.txt.
 | Shortest len = 3.
 | Longest  len = 100.
 | Total    sen = 1981314.
 | Total token num=87383811, 87.10378401784284% replaced by <unk>.
 | Write to /Users/dawnkaslana/Workspace/HWmodels/official/nlp/mass/train_data/gigaword_dataset/gigaword_train_dataset.tfrecord-001-of-001.
 | Processing corpus /Users/dawnkaslana/Workspace/Dataset/ggw_data/org_data/test.src.txt.
 | Processing corpus /Users/dawnkaslana/Workspace/Dataset/ggw_data/org_data/test.tgt.txt.
 | Shortest len = 2.
 | Longest  len = 73.
 | Total    sen = 1081.
 | Total token num=46933, 85.92248524492362% replaced by <unk>.
 | Write to /Users/dawnkaslana/Workspace/HWmodels/official/nlp/mass/train_data/gigaword_dataset/gigaword_test_dataset.tfrecord-001-of-001.
 | Generate Dataset for fine-tuneing is done.
 | Vocabulary size: 353.


## 3 预训练NewsCrawl数据

预训练可以通过以下命令启动：

```
python3 train.py --device_target GPU --output_path './output'
```

> 待办：训练、微调和推理都需要在机器上实际运行才行；这三部分的文件将在公用机上编写。

### 3.1 获取配置参数

In [9]:
"""configuration """
import ml_collections
def get_config():
    """Build the configuration object used by the pre-training cells.

    Returns:
        An ``ml_collections.ConfigDict`` holding the fields listed below.
    """
    settings = {
        'dtype': 'float32',  # only support float16 and float32
        'pre_train_dataset': os.path.abspath(
            "./train_data/news_crawl_dataset/news2007_len_32.tfrecord-001-of-001"
        ),
        'epochs': 20,
        'batch_size': 64,
        # NOTE(review): an earlier assignment of "./output/checkpoint/" was
        # immediately overridden by this empty string, so the empty string is
        # the effective value. Confirm which one is intended.
        'checkpoint_path': "",
        'ckpt_prefix': "ckpt",
        'lr': 0.0001,
        'seq_length': 32,  # seq_length of the dataset
        'vocab_size': 353,  # vocab_size of the dataset
        'hidden_size': 1024,
        'beam_width': 4,
        'optimizer': "adam",
        'metric': "rouge",
        'dataset_sink_mode': False,
        'dataset_sink_step': 100,
    }

    config = ml_collections.ConfigDict()
    for field_name, field_value in settings.items():
        config[field_name] = field_value
    return config

In [None]:
import os

from src.dataset import load_dataset

config = get_config()

# RANK_SIZE is read from the environment; "None" is printed when it is unset.
print(f" | Starting training on {os.environ.get('RANK_SIZE')} devices.")

# Load the pre-training records using the batch and sink settings from the
# config.
pre_train_dataset = load_dataset(
    data_files=config.pre_train_dataset,
    batch_size=config.batch_size,
    epoch_count=1,
    sink_mode=config.dataset_sink_mode,
    sink_step=config.dataset_sink_step,
)


