# 蛋白质预训练和性质预测

在这份教程中，我们将介绍如何构建一个序列模型来进行蛋白质性质预测。具体来说，我们将展示如何对模型进行预训练并针对下游任务进行微调。

近年来，随着测序技术的发展，蛋白质序列数据库的规模显著扩大。然而，有标注的蛋白质序列必须通过湿实验才能获得，成本仍然很高。此外，由于有标注的样本数量不足，模型很容易过拟合。借鉴自然语言处理（NLP）的思想，我们可以通过自监督学习在大量无标注的蛋白质序列上进行预训练。这样，我们就可以从蛋白质序列中提取有用的生物信息，并将其迁移到其他有标注的任务中，使这些任务训练得更快、收敛得更稳定。本教程的内容参考了 TAPE 的工作，提供了 Transformer、LSTM 和 ResNet 的模型实现。

# 第一部分：预训练/训练

In [2]:
import os
import sys
# Work from the TAPE app directory; the relative data and model paths used
# later in this notebook are resolved against it.
os.chdir('../apps/pretrained_protein/tape')
# Make project-local modules importable: three levels up (presumably the
# repository root that provides `pahelix` -- confirm) and the app directory.
sys.path.extend(['../../../', './'])

## 加载相关工具

In [3]:
import paddle
from utils import *

# Switch to static-graph mode (for paddle versions >= 2.0.0rc).
paddle.enable_static()

# Execution mode used throughout this notebook.
is_distributed = False
use_cuda = False
thread_num = 8

# Set up the execution-related parameters according to the training mode and
# unpack the entries that the following cells rely on.
exe_params = default_exe_params(
    is_distributed=is_distributed,
    use_cuda=use_cuda,
    thread_num=thread_num,
)
exe, trainer_num, trainer_id, gpu_id, dist_strategy, places = (
    exe_params[key]
    for key in (
        'exe',
        'trainer_num',
        'trainer_id',
        'gpu_id',
        'dist_strategy',
        'places',
    )
)

## 模型配置

模型的配置如下面的 `model_config` 所示。
- 任务相关的配置
    - "task"：训练任务的类型。可选的类型包括：
        - "pretrain"：使用自监督学习方法的预训练任务，如数据集 `TAPE`。
        - "classification"：分类任务，如数据集 `Remote Homology`。
        - "regression"：回归任务，如数据集 `Fluorescence` 和 `Stability`。
        - "seq_classification"：序列分类任务，如数据集 `Secondary Structure`。
    - "class_num"：任务 `classification` 和 `seq_classification` 中类别的数量。
    - "label_name"：数据集中的标签名。
- 模型相关的配置
    - "model_type"：模型的类型。对每个模型，我们需要指定相应的模型超参数。下面是我们支持的模型：
        - "transformer"
            - "hidden_size"
            - "layer_num"
            - "head_num"
        - "lstm"
            - "hidden_size"
            - "layer_num"
        - "resnet"
            - "hidden_size"
            - "layer_num"
            - "filter_size"
- 其他配置（更多细节请查阅代码）
    - "dropout_rate"
    - "weight_decay"
    
下面的 `model_config` 是模型配置的一个示例，任务的名称是 `secondary_structure`。

In [3]:
# Example configuration for the `secondary_structure` task.
model_config = {
    "model_name": "secondary_structure",

    # Task-related settings.
    "task": "seq_classification",  # sequence classification task
    "class_num": 3,  # number of classes
    "label_name": "labels3",  # name of the label in the dataset

    # Model-related settings: model type and its hyper-parameters.
    "model_type": "lstm",
    "hidden_size": 512,
    "layer_num": 3,

    # Other settings.
    "comment": "The following hyper-parameters are optional.",
    "dropout_rate": 0.1,
    "weight_decay": 0.01,
}

## 模型定义

通常情况下，可以使用 Paddle 提供的 `Program` 来构建静态图，并使用 `Executor` 来执行它。

In [4]:
from tape_model import TAPEModel # More details of the network structure are shown in tape_model.py.
from data_gen import setup_data_loader
from pahelix.utils.paddle_utils import load_partial_params

# Build the model wrapper from the configuration dictionary.
model = TAPEModel(model_config=model_config)

lr = 0.0001  # learning rate
batch_size = 32  # batch size
train_data = './demos/secondary_structure_toy_data'

# Prepare train_program: the training graph is built inside its own
# main/startup program pair.
train_program = fluid.Program()
train_startup = fluid.Program()
with fluid.program_guard(train_program, train_startup):
    with fluid.unique_name.guard():
        # NOTE(review): the inference cell calls forward(True), so the flag
        # presumably selects test mode -- confirm in tape_model.py.
        model.forward(False)
        model.cal_loss()

        # Set up the optimizer.
        optimizer = default_optimizer(lr=lr, warmup_steps=0, max_grad_norm=0.1)
        setup_optimizer(optimizer, model, use_cuda, is_distributed)
        optimizer.minimize(model.loss)

        # Set up the data loader, which provides the training data.
        train_data_loader = setup_data_loader(
            model.input_list,
            model_config,
            train_data,
            trainer_id,
            trainer_num,
            places,
            batch_size)
        # Run the startup program once before training starts.
        exe.run(train_startup)

# Optional warm start: uncomment to load a pre-trained model into the
# training program before it is compiled.
# init_model = "./pretrained_model"
# load_partial_params(exe, init_model, train_program)

# Keep a handle on the uncompiled program; it is the one handed to
# fluid.io.save_params when the model is saved.
save_program = train_program
if not is_distributed:
    # Non-distributed runs execute a data-parallel compiled program.
    train_program = fluid.compiler.CompiledProgram(train_program).with_data_parallel(loss_name=model.loss.name)


## 模型训练

In [5]:
task = model_config['task']
model_dir = "./model"  # the directory to save the model
train_metric = get_metric(task)  # evaluation metric for this task
train_fetch_list = model.get_fetch_list()  # information needed for prediction and evaluation

for epoch_id in range(2):
    print('Epoch %d' % epoch_id)
    # Start every epoch with a clean evaluation metric.
    train_metric.clear()
    for data in train_data_loader():
        results = exe.run(
            program=train_program,
            feed=data,
            fetch_list=train_fetch_list,
            return_numpy=False,
        )
        # Fold this batch into the metric and print the running result.
        update_metric(task, train_metric, results)
        train_metric.show()

    # Only trainer 0 writes the per-epoch parameters.
    if trainer_id != 0:
        continue
    epoch_dir = '%s/epoch%d' % (model_dir, epoch_id)
    fluid.io.save_params(exe, epoch_dir, save_program)


Epoch 0
	Example: 78011
	Accuracy: 0.309482
	Example: 144800
	Accuracy: 0.388985
Epoch 1
	Example: 78011
	Accuracy: 0.522811
	Example: 144800
	Accuracy: 0.515760


# 第二部分：模型推断

在这一部分，我们将简要介绍如何使用训练后的模型在给定的氨基酸序列上进行推断。

In [6]:
from pahelix.utils.paddle_utils import load_partial_params
from pahelix.utils.protein_tools import ProteinTokenizer
from data_gen import gen_batch_data

# `np` is used below to convert the fetched result; import it explicitly
# instead of relying on a wildcard import to provide it.
import numpy as np

test_data = './demos/secondary_structure_toy_data'

# Prepare test_program: the inference graph gets its own main/startup
# program pair.
test_program = fluid.Program()
test_startup = fluid.Program()
with fluid.program_guard(test_program, test_startup):
    with fluid.unique_name.guard():
        model.forward(True)
        # NOTE(review): this loader is not consumed by the inference code
        # below, which feeds a hand-built batch instead.
        test_data_loader = setup_data_loader(
            model.input_list,
            model_config,
            test_data,
            trainer_id,
            trainer_num,
            places,
            batch_size)
        exe.run(test_startup)
# NOTE(review): not used by the inference code below.
test_metric = get_metric(task)

init_model = "./model/epoch0"  # path of the saved parameters to load
load_partial_params(exe, init_model, test_program)  # load the init_model

tokenizer = ProteinTokenizer()
test_fetch_list = model.get_fetch_list(is_inference=True)

# Device on which the input batch is created.
if use_cuda:
    place = fluid.CUDAPlace(gpu_id)
else:
    place = fluid.CPUPlace()

examples = [
    'MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKKVADALTNAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKYR',
    # NOTE(review): this entry appears to contain only the letters A/C/G/T and
    # looks like a nucleotide sequence rather than an amino-acid sequence --
    # confirm that it is intended as an example input.
    'CCCACAGACTCAGAGAGAACCCACCATGGTGCTGTCTCCTGACGACAAGACCAACGTCAAGGCCGCCTGGGGTAAGGTCGGCGCGCACGCTGGCGAGTATGGTGCGGAGGCCCTGGAGAGGATGTTCCTGTCCTTCCCCACCACCAAGACCTACTTCCCGCACTTCGACCTGAGCCACGGCTCTGCCCAGGTTAAGGGCCACGGCAAGAAGGTGGCCGACGCGCTGACCAACGCCGTGGCGCACGTGGACGACATGCCCAACGCGCTGTCCGCCCTGAGCGACCTGCACGCGCACAAGCTTCGGGTGGACCCGGTCAACTTCAAGCTCCTAAGCCACTGCCTGCTGGTGACCCTGGCCGCCCACCTCCCCGCCGAGTTCACCCCTGCGGTGCACGCCTCCCTGGACAAGTTCCTGGCTTCTGTGAGCACCGTGCTGACCTCCAAATACCGTTAAGCTGGAGCCTCGGTGGCCATGCTTCTTGCCCCTTTGG',
]
# Data processing: convert the amino acid sequences to token ids and
# assemble them into a single batch.
inputs = gen_batch_data(examples, tokenizer, place)
results = exe.run(
    program=test_program,
    feed=inputs,
    fetch_list=test_fetch_list,
    return_numpy=False)
# return_numpy=False was requested above, so convert the first fetched
# result to a NumPy array before printing.
pred = np.array(results[0])
print(pred)

Load parameters from ./model/epoch0.
[[0.33362675 0.3326527  0.33372056]
 [0.33355793 0.3325862  0.33385593]
 [0.33341178 0.3326046  0.33398363]
 ...
 [0.33260044 0.331602   0.33579758]
 [0.33269978 0.33188707 0.33541316]
 [0.33314735 0.3324862  0.33436644]]
