# 使用自定义数据集训练PromptCLUE模型

In [1]:
# 引入相应的包 Importing libraries
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import time, json
# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

# rich: for a better display on terminal
from rich.table import Column, Table
from rich import box
from rich.console import Console
print("end2...")

end2...


In [2]:
# Notebook-wide configuration (console display; GPU setup)
# Shared rich console used for logging; created with record=True so the
# output can later be exported with console.save_text()
console = Console(record=True)

# to display dataframe in ASCII format
def display_df(df):
    """Build a two-column ASCII rich table from the first two columns of ``df``.

    Rendering is currently switched off (the print call was commented out in
    the original notebook), so this function only constructs the table and
    returns ``None``.

    Args:
        df (pandas.DataFrame): frame whose first column holds the source text
            and whose second column holds the target text.
    """
    table = Table(
        Column("source_text", justify="center"),
        Column("target_text", justify="center"),
        title="Sample Data",
        pad_edge=False,
        box=box.ASCII,
    )

    # rich only accepts renderables (such as str) as cells, so coerce each
    # value; numeric or other non-string cells would otherwise raise.
    for row in df.values.tolist():
        table.add_row(str(row[0]), str(row[1]))

    # TODO: printing is intentionally disabled; re-enable with
    # ``console.print(table)`` when a sample preview is wanted.

# Rich table meant to collect training progress rows (epoch / step / loss)
training_logger = Table(
    Column("Epoch", justify="center"),
    Column("Steps", justify="center"),
    Column("Loss", justify="center"),
    title="Training Status",
    pad_edge=False,
    box=box.ASCII,
)

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("end...")

end...


# Dataset Class 自定义数据集类

In [3]:
class SmallSampleDataSetClass(Dataset):
    """Map-style dataset that tokenizes dataframe rows for seq2seq fine-tuning.

    Each item is built from one row: the source column is always encoded and,
    when a target column was given, the target column is encoded as well.
    Passing ``target_text=None`` yields an inference-only dataset.
    """

    def __init__(
        self, dataframe, tokenizer, source_len, target_len, source_text, target_text
    ):
        """Store the tokenizer, length limits and the text columns.

        Args:
            dataframe (pandas.DataFrame): input dataframe.
            tokenizer: tokenizer exposing ``batch_encode_plus``.
            source_len (int): padded/truncated length of the source text.
            target_len (int): padded/truncated length of the target text.
            source_text (str): column name of the source text.
            target_text (str | None): column name of the target text, or
                ``None`` when no targets are available (prediction).
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = target_len
        self.target_text = self.data[target_text] if target_text is not None else None
        self.source_text = self.data[source_text]

    def __len__(self):
        """Return the number of rows (source and target share the same length)."""
        return len(self.source_text)

    @staticmethod
    def _value_at(column, index):
        """Return the ``index``-th value of ``column`` by position.

        pandas objects are read through ``.iloc`` so that a non-default index
        (e.g. after sampling or filtering) cannot cause a ``KeyError``; plain
        sequences are indexed directly.
        """
        return getattr(column, "iloc", column)[index]

    def _encode(self, text, max_length):
        """Tokenize one text and return ``(input_ids, attention_mask)``.

        Both tensors are 1-D of dtype ``torch.long``.
        """
        # Coerce to str and collapse every whitespace run into a single space.
        text = " ".join(str(text).split())
        encoded = self.tokenizer.batch_encode_plus(
            [text],
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        # Drop only the batch dimension; a bare squeeze() would also collapse
        # a length-1 sequence into a 0-d tensor.
        input_ids = encoded["input_ids"][0].to(dtype=torch.long)
        attention_mask = encoded["attention_mask"][0].to(dtype=torch.long)
        return input_ids, attention_mask

    def __getitem__(self, index):
        """Return the encoded tensors for row ``index``.

        Returns:
            dict: always ``source_ids`` and ``source_mask``; additionally
            ``target_ids`` and ``target_ids_y`` (the same tensor, kept for
            backward compatibility) when a target column is present.
        """
        source_ids, source_mask = self._encode(
            self._value_at(self.source_text, index), self.source_len
        )
        item = {"source_ids": source_ids, "source_mask": source_mask}

        if self.target_text is not None:
            target_ids, _ = self._encode(
                self._value_at(self.target_text, index), self.summ_len
            )
            item["target_ids"] = target_ids
            item["target_ids_y"] = target_ids

        return item
print("end...")

end...


# 训练方法 Train

In [4]:
def train(epoch, tokenizer, model, device, loader, optimizer, scheduler):
    """Run one training epoch over ``loader``.

    For every batch the target ids are shifted by one position to form the
    decoder inputs and the labels (teacher forcing), padding positions in the
    labels are replaced by -100, and one optimizer step plus one scheduler
    step is taken. Progress is printed every 100 steps (never at step 0).

    Args:
        epoch (int): epoch number, used only in the progress message.
        tokenizer: object providing ``pad_token_id``.
        model: model called with ``input_ids``, ``attention_mask``,
            ``decoder_input_ids`` and ``labels``; its first output is the loss.
        device: device the batch tensors are moved to.
        loader: iterable of batches with the keys ``target_ids``,
            ``source_ids`` and ``source_mask``.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
    """
    multi_gpu = torch.cuda.device_count() > 1

    model.train()
    started_at = time.time()
    for step, batch in enumerate(loader):
        targets = batch["target_ids"].to(device, dtype=torch.long)
        # Decoder sees every token except the last one ...
        decoder_inputs = targets[:, :-1].contiguous()
        # ... and must predict the sequence shifted left by one position.
        shifted = targets[:, 1:]
        labels = shifted.clone().detach()
        # -100 marks the padding positions in the labels.
        labels[shifted == tokenizer.pad_token_id] = -100
        input_ids = batch["source_ids"].to(device, dtype=torch.long)
        attention_mask = batch["source_mask"].to(device, dtype=torch.long)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_inputs,
            labels=labels,
        )
        loss = outputs[0]
        if multi_gpu:
            # With several GPUs the loss is averaged into a single scalar.
            loss = loss.mean()

        # Log every 100 steps, skipping step 0.
        if step != 0 and step % 100 == 0:
            elapsed = time.time() - started_at
            loss_value = loss.detach().cpu().numpy()
            seconds_per_step = float(elapsed) / float(step + 0.0001)
            print(step, "epoch: " + str(epoch) + "; loss:{:.4f}; each step's time spent:{:.2f}".format(loss_value, seconds_per_step))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
print("end...")

end...


# 用于验证的方法 Validate

In [5]:
def validate(epoch, tokenizer, model, device, loader,max_length):
    """Generate predictions for every batch in ``loader`` and decode the references.

    Args:
        epoch: accepted for interface compatibility; not used in the body.
        tokenizer: object providing ``decode``.
        model: model providing ``eval`` and ``generate``.
        device: device the batch tensors are moved to.
        loader: iterable of batches with the keys ``target_ids``,
            ``source_ids`` and ``source_mask``.
        max_length (int): ``max_length`` passed to ``model.generate``.

    Returns:
        tuple[list, list]: decoded predictions and decoded reference targets,
        in loader order.
    """

    def _to_text(token_ids):
        # Decode one id sequence, dropping special tokens.
        return tokenizer.decode(
            token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )

    model.eval()
    predictions, actuals = [], []
    with torch.no_grad():
        for step, batch in enumerate(loader):
            references = batch['target_ids'].to(device, dtype = torch.long)
            input_ids = batch['source_ids'].to(device, dtype = torch.long)
            attention_mask = batch['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True,
            )
            decoded_preds = [_to_text(seq) for seq in generated_ids]
            decoded_refs = [_to_text(seq) for seq in references]

            # Progress message every 1000 batches (including batch 0).
            if step % 1000 == 0:
                console.print(f'Completed {step}')

            predictions += decoded_preds
            actuals += decoded_refs
    return predictions, actuals
print("end...")

end...


# 训练类 Trainer

In [6]:
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import StratifiedShuffleSplit

# 训练类：整合数据集类、训练方法、验证方法，加载数据进行训练并验证训练过程的效果
def T5Trainer(
    dataframe, source_text, target_text, model_params, output_dir="./outputs/prompt/"
):
    """Fine-tune a T5-style checkpoint on ``dataframe`` and validate after every epoch.

    The frame is split 80/20 with a stratified shuffle on the target column.
    After each epoch the model and tokenizer are saved to
    ``<output_dir>/model_files`` and the validation generations are written to
    ``<output_dir>/predictions.csv`` (both are overwritten every epoch). The
    recorded console output is saved to ``<output_dir>/logs.txt``.

    Args:
        dataframe (pandas.DataFrame): data containing the two text columns.
        source_text (str): name of the input-text column.
        target_text (str): name of the target-text column; also used as the
            stratification label for the train/validation split.
        model_params (dict): must provide "MODEL", "SEED", "TRAIN_EPOCHS",
            "TRAIN_BATCH_SIZE", "VALID_BATCH_SIZE", "LEARNING_RATE",
            "MAX_SOURCE_TEXT_LENGTH" and "MAX_TARGET_TEXT_LENGTH".
        output_dir (str): directory receiving model files, predictions and
            logs; created if it does not exist.
    """
    n_gpu = torch.cuda.device_count()
    seed = model_params["SEED"]

    # Set random seeds and deterministic cuDNN for reproducibility
    torch.manual_seed(seed)  # pytorch random seed
    np.random.seed(seed)  # numpy random seed
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)  # same seeding as T5Tester
    torch.backends.cudnn.deterministic = True

    # Make sure the predictions/logs can be written even on a fresh checkout
    os.makedirs(output_dir, exist_ok=True)

    # logging
    console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

    # tokenizer for encoding the text
    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

    # Keep a handle on the unwrapped model: saving and generation must go
    # through it, and ``.module`` only exists on the DataParallel wrapper.
    base_model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
    base_model = base_model.to(device)
    model = torch.nn.DataParallel(base_model) if n_gpu > 1 else base_model

    # logging
    console.log(f"[Data]: Reading data...\n")

    # Keep only the two columns that are used
    dataframe = dataframe[[source_text, target_text]]

    # Stratified 80/20 split on the target column (n_splits=1 yields one split)
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)
    train_index, val_index = next(splitter.split(dataframe, dataframe[target_text]))
    train_dataset = dataframe.iloc[train_index, :].reset_index(drop=True)
    val_dataset = dataframe.iloc[val_index, :].reset_index(drop=True)

    # Dataset-size logging
    console.print(f"FULL Dataset: {dataframe.shape}")
    console.print(f"TRAIN Dataset: {train_dataset.shape}")
    console.print(f"VALID Dataset: {val_dataset.shape}\n")

    # Creating the training and validation datasets
    training_set = SmallSampleDataSetClass(
        train_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )
    val_set = SmallSampleDataSetClass(
        val_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )

    # Defining the parameters for creation of dataloaders
    train_params = {
        "batch_size": model_params["TRAIN_BATCH_SIZE"],
        "shuffle": True,
        "num_workers": 0,
    }
    val_params = {
        "batch_size": model_params["VALID_BATCH_SIZE"],
        "shuffle": False,
        "num_workers": 0,
    }

    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)

    # The scheduler is stepped once per batch, so count the batches the
    # loader really yields (this includes the final partial batch).
    total_train_steps = len(training_loader) * model_params["TRAIN_EPOCHS"]
    console.print(f"Total Train Steps: {total_train_steps}\n")

    # Optimizer and linear warmup/decay schedule (10% warmup)
    optimizer = torch.optim.AdamW(model.parameters(), model_params["LEARNING_RATE"])
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_train_steps),
        num_training_steps=total_train_steps,
    )

    model_dir = os.path.join(output_dir, "model_files")
    predictions_path = os.path.join(output_dir, "predictions.csv")

    # Training loop
    console.log(f"[Initiating Fine Tuning]...\n")

    for epoch in range(model_params["TRAIN_EPOCHS"]):
        # 1) train for one epoch
        train(epoch, tokenizer, model, device, training_loader, optimizer, scheduler)

        # 2) save model for each epoch (overwrites the previous epoch)
        console.log(f"[Saving Model]...\n")
        base_model.save_pretrained(model_dir)
        tokenizer.save_pretrained(model_dir)

        torch.cuda.empty_cache()

        # 3) evaluate on the validation split using the unwrapped model
        console.log(f"[Initiating Validation]...\n")
        predictions, actuals = validate(
            epoch,
            tokenizer,
            base_model,
            device,
            val_loader,
            model_params["MAX_TARGET_TEXT_LENGTH"],
        )
        final_df = pd.DataFrame({"Generated Text": predictions, "Actual Text": actuals})
        final_df.to_csv(predictions_path, encoding='utf8', index=False, sep=',')

    console.save_text(os.path.join(output_dir, "logs.txt"))

    console.log(f"[Validation Completed.]\n")
    console.print(
        f"""[Model] Model saved @ {os.path.join(output_dir, "model_files")}\n"""
    )
    console.print(
        f"""[Validation] Generation on Validation data saved @ {os.path.join(output_dir,'predictions.csv')}\n"""
    )
    console.print(f"""[Logs] Logs saved @ {os.path.join(output_dir,'logs.txt')}\n""")
print("end...")

end...


In [8]:
# Model / training hyper-parameters consumed by T5Trainer
model_params = {
    "MODEL": "output/prompt/model_files",  # checkpoint directory to load; the original comment names pretrained_models/PromptCLUE-base as the base model
    "TRAIN_BATCH_SIZE": 10,  # training batch size (the original comment mentions 8, presumably an earlier value)
    "VALID_BATCH_SIZE": 14,  # validation batch size (the original comment mentions 8, presumably an earlier value)
    "TRAIN_EPOCHS": 7,  # number of training epochs
    "VAL_EPOCHS": 7,  # number of validation epochs; NOTE(review): appears not to be read by any active code in this notebook
    "LEARNING_RATE": 4e-5,  # learning rate
    "MAX_SOURCE_TEXT_LENGTH": 1024,  # max length of source text (the original comment mentions 512)
    "MAX_TARGET_TEXT_LENGTH": 8,  # max length of target text (the original comment mentions 64)
    "SEED": 2022,  # set seed for reproducibility
}
print("end...")

end...


In [9]:
# Train the model
# The original note says this uses part of the pCLUE multi-task prompt-learning dataset (1,200,000+ rows)
# The dataframe must have 2 columns:
#   - input: input text
#   - target: target output
df = pd.read_csv('data/prompt/train6452_prompt.tsv', sep='\t', encoding='utf8')  # NOTE(review): original comment claimed 1200k rows; the recorded run reports df.shape (22888, 2)
# df = df.sample(frac=0.01) # Optional: uncomment to train on a 1% sample only
print("df.head:",df.head(n=5))
print("df.shape:",df.shape)
# GPU memory note: if the run fails for lack of GPU memory, check usage with nvidia-smi; if most of the memory is already taken, restart the notebook/Colab runtime
T5Trainer(
    dataframe=df,
    source_text="input",
    target_text="target",
    model_params=model_params,
    output_dir="output/prompt/",
)

torch.cuda.empty_cache()
print("end..")

df.head:                                                input    target
0  这是哪种类别的专利？_一种信号的发送方法及基站、用户设备_一种信号的发送方法及基站、用户设备...  通信/信号/天线
1  这是哪种类别的专利？_一种用于配置无线路由设备的方法与设备。_本申请的目的是提供一种用于配置...  通信/信号/天线
2  这是哪种类别的专利？_用于上行链路多用户传输的ACK/NACK信号处理方法及装置。_本文件涉...  通信/信号/天线
3  这是哪种类别的专利？_一种便于安装和使用的路由器。_本实用新型公开了一种便于安装和使用的路由...  通信/信号/天线
4  这是哪种类别的专利？_一种哈希冲突的处理方法、装置及交换设备。_本发明提供了一种MAC地址哈...  通信/信号/天线
df.shape: (22888, 2)




100 epoch: 0; loss:0.0508; each step's time spent:1.07
200 epoch: 0; loss:0.0154; each step's time spent:1.02
300 epoch: 0; loss:0.0267; each step's time spent:1.01
400 epoch: 0; loss:0.0251; each step's time spent:1.00
500 epoch: 0; loss:0.0102; each step's time spent:0.99
600 epoch: 0; loss:0.0361; each step's time spent:0.99
700 epoch: 0; loss:0.0244; each step's time spent:0.99
800 epoch: 0; loss:0.0574; each step's time spent:0.98
900 epoch: 0; loss:0.0061; each step's time spent:0.98
1000 epoch: 0; loss:0.0110; each step's time spent:0.98
1100 epoch: 0; loss:0.0626; each step's time spent:0.98
1200 epoch: 0; loss:0.1157; each step's time spent:0.98
1300 epoch: 0; loss:0.1111; each step's time spent:0.98
1400 epoch: 0; loss:0.0121; each step's time spent:0.98
1500 epoch: 0; loss:0.0088; each step's time spent:0.98
1600 epoch: 0; loss:0.0363; each step's time spent:0.98
1700 epoch: 0; loss:0.0298; each step's time spent:0.98
1800 epoch: 0; loss:0.1314; each step's time spent:0.98


100 epoch: 1; loss:0.0679; each step's time spent:0.98
200 epoch: 1; loss:0.0096; each step's time spent:0.97
300 epoch: 1; loss:0.0328; each step's time spent:0.97
400 epoch: 1; loss:0.0469; each step's time spent:0.97
500 epoch: 1; loss:0.0139; each step's time spent:0.97
600 epoch: 1; loss:0.0228; each step's time spent:0.97
700 epoch: 1; loss:0.0499; each step's time spent:0.97
800 epoch: 1; loss:0.0187; each step's time spent:0.97
900 epoch: 1; loss:0.0970; each step's time spent:0.97
1000 epoch: 1; loss:0.1187; each step's time spent:0.97
1100 epoch: 1; loss:0.0238; each step's time spent:0.97
1200 epoch: 1; loss:0.0717; each step's time spent:0.97
1300 epoch: 1; loss:0.0405; each step's time spent:0.97
1400 epoch: 1; loss:0.0011; each step's time spent:0.97
1500 epoch: 1; loss:0.0016; each step's time spent:0.97
1600 epoch: 1; loss:0.0132; each step's time spent:0.97
1700 epoch: 1; loss:0.0455; each step's time spent:0.97
1800 epoch: 1; loss:0.0635; each step's time spent:0.97


100 epoch: 2; loss:0.0079; each step's time spent:1.00
200 epoch: 2; loss:0.0019; each step's time spent:0.99
300 epoch: 2; loss:0.0059; each step's time spent:0.99
400 epoch: 2; loss:0.0050; each step's time spent:0.99
500 epoch: 2; loss:0.0010; each step's time spent:0.99
600 epoch: 2; loss:0.0047; each step's time spent:0.99
700 epoch: 2; loss:0.0106; each step's time spent:0.98
800 epoch: 2; loss:0.0168; each step's time spent:0.98
900 epoch: 2; loss:0.0841; each step's time spent:0.98
1000 epoch: 2; loss:0.0576; each step's time spent:0.98
1100 epoch: 2; loss:0.0003; each step's time spent:0.98
1200 epoch: 2; loss:0.0017; each step's time spent:0.98
1300 epoch: 2; loss:0.0854; each step's time spent:0.98
1400 epoch: 2; loss:0.0181; each step's time spent:0.98
1500 epoch: 2; loss:0.0224; each step's time spent:0.98
1600 epoch: 2; loss:0.1780; each step's time spent:0.98
1700 epoch: 2; loss:0.1127; each step's time spent:0.97
1800 epoch: 2; loss:0.0605; each step's time spent:0.97


100 epoch: 3; loss:0.0599; each step's time spent:0.97
200 epoch: 3; loss:0.0019; each step's time spent:0.97
300 epoch: 3; loss:0.0020; each step's time spent:0.97
400 epoch: 3; loss:0.0301; each step's time spent:0.97
500 epoch: 3; loss:0.0941; each step's time spent:0.97
600 epoch: 3; loss:0.0010; each step's time spent:0.97
700 epoch: 3; loss:0.0003; each step's time spent:0.97
800 epoch: 3; loss:0.0295; each step's time spent:0.97
900 epoch: 3; loss:0.0234; each step's time spent:0.97
1000 epoch: 3; loss:0.0389; each step's time spent:0.97
1100 epoch: 3; loss:0.0014; each step's time spent:0.97
1200 epoch: 3; loss:0.0270; each step's time spent:0.97
1300 epoch: 3; loss:0.0114; each step's time spent:0.97
1400 epoch: 3; loss:0.0002; each step's time spent:0.97
1500 epoch: 3; loss:0.0047; each step's time spent:0.97
1600 epoch: 3; loss:0.0004; each step's time spent:0.97
1700 epoch: 3; loss:0.0536; each step's time spent:0.97
1800 epoch: 3; loss:0.0716; each step's time spent:0.97


100 epoch: 4; loss:0.0105; each step's time spent:0.98
200 epoch: 4; loss:0.0112; each step's time spent:0.97
300 epoch: 4; loss:0.0011; each step's time spent:0.97
400 epoch: 4; loss:0.0066; each step's time spent:0.97
500 epoch: 4; loss:0.0027; each step's time spent:0.97
600 epoch: 4; loss:0.0264; each step's time spent:0.97
700 epoch: 4; loss:0.0006; each step's time spent:0.97
800 epoch: 4; loss:0.0177; each step's time spent:0.97
900 epoch: 4; loss:0.0057; each step's time spent:0.97
1000 epoch: 4; loss:0.0002; each step's time spent:0.97
1100 epoch: 4; loss:0.0024; each step's time spent:0.97
1200 epoch: 4; loss:0.0001; each step's time spent:0.97
1300 epoch: 4; loss:0.0283; each step's time spent:0.96
1400 epoch: 4; loss:0.0171; each step's time spent:0.96
1500 epoch: 4; loss:0.0134; each step's time spent:0.96
1600 epoch: 4; loss:0.0376; each step's time spent:0.96
1700 epoch: 4; loss:0.0012; each step's time spent:0.96
1800 epoch: 4; loss:0.0460; each step's time spent:0.96


100 epoch: 5; loss:0.0382; each step's time spent:0.98
200 epoch: 5; loss:0.0013; each step's time spent:0.98
300 epoch: 5; loss:0.0021; each step's time spent:0.97
400 epoch: 5; loss:0.0055; each step's time spent:0.97
500 epoch: 5; loss:0.0239; each step's time spent:0.97
600 epoch: 5; loss:0.0008; each step's time spent:0.97
700 epoch: 5; loss:0.0010; each step's time spent:0.97
800 epoch: 5; loss:0.0193; each step's time spent:0.97
900 epoch: 5; loss:0.0023; each step's time spent:0.97
1000 epoch: 5; loss:0.0044; each step's time spent:0.97
1100 epoch: 5; loss:0.0006; each step's time spent:0.97
1200 epoch: 5; loss:0.0013; each step's time spent:0.97
1300 epoch: 5; loss:0.0039; each step's time spent:0.97
1400 epoch: 5; loss:0.0016; each step's time spent:0.97
1500 epoch: 5; loss:0.0000; each step's time spent:0.97
1600 epoch: 5; loss:0.0000; each step's time spent:0.97
1700 epoch: 5; loss:0.0004; each step's time spent:0.97
1800 epoch: 5; loss:0.0506; each step's time spent:0.97


100 epoch: 6; loss:0.0076; each step's time spent:0.98
200 epoch: 6; loss:0.0027; each step's time spent:0.97
300 epoch: 6; loss:0.0009; each step's time spent:0.97
400 epoch: 6; loss:0.0366; each step's time spent:0.97
500 epoch: 6; loss:0.0023; each step's time spent:0.97
600 epoch: 6; loss:0.0005; each step's time spent:0.97
700 epoch: 6; loss:0.0006; each step's time spent:0.97
800 epoch: 6; loss:0.0036; each step's time spent:0.97
900 epoch: 6; loss:0.0011; each step's time spent:0.97
1000 epoch: 6; loss:0.0056; each step's time spent:0.97
1100 epoch: 6; loss:0.0205; each step's time spent:0.97
1200 epoch: 6; loss:0.0181; each step's time spent:0.97
1300 epoch: 6; loss:0.0004; each step's time spent:0.97
1400 epoch: 6; loss:0.0005; each step's time spent:0.97
1500 epoch: 6; loss:0.0032; each step's time spent:0.97
1600 epoch: 6; loss:0.0018; each step's time spent:0.97
1700 epoch: 6; loss:0.0160; each step's time spent:0.97
1800 epoch: 6; loss:0.0002; each step's time spent:0.97


end..


# 预测 Test

In [10]:
def testing(tokenizer, model, device, loader, max_length):
    """Generate and decode predictions for every batch in ``loader``.

    Args:
        tokenizer: object providing ``decode``.
        model: model providing ``eval`` and ``generate``.
        device: device the batch tensors are moved to.
        loader: iterable of batches with the keys ``source_ids`` and
            ``source_mask``.
        max_length (int): ``max_length`` passed to ``model.generate``.

    Returns:
        list: decoded predictions in loader order.
    """
    generation_kwargs = {
        "max_length": max_length,
        "num_beams": 2,
        "repetition_penalty": 2.5,
        "length_penalty": 1.0,
        "early_stopping": True,
    }

    model.eval()
    predictions = []
    with torch.no_grad():
        for step, batch in enumerate(loader):
            input_ids = batch['source_ids'].to(device, dtype = torch.long)
            attention_mask = batch['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                **generation_kwargs,
            )
            decoded = [
                tokenizer.decode(
                    seq, skip_special_tokens=True, clean_up_tokenization_spaces=True
                )
                for seq in generated_ids
            ]

            # Progress message every 1000 batches (including batch 0).
            if step % 1000 == 0:
                console.print(f'Completed {step}')

            predictions += decoded
    return predictions
print("end...")

end...


In [11]:
# 训练类：整合数据集类、训练方法、验证方法，加载数据进行训练并验证训练过程的效果
def T5Tester(
    dataframe, source_text, model_params, output_dir="./outputs/prompt/"
):
    """Load a fine-tuned checkpoint and write its predictions for ``dataframe``.

    Predictions are saved to ``<output_dir>/predictions.csv`` and the recorded
    console output to ``<output_dir>/logs.txt``.

    Args:
        dataframe (pandas.DataFrame): data containing the input-text column.
        source_text (str): name of the input-text column.
        model_params (dict): must provide "MODEL", "SEED", "TEST_BATCH_SIZE",
            "MAX_SOURCE_TEXT_LENGTH" and "MAX_TARGET_TEXT_LENGTH".
        output_dir (str): directory receiving predictions and logs; created
            if it does not exist.
    """
    n_gpu = torch.cuda.device_count()
    seed = model_params["SEED"]

    # Set random seeds and deterministic cuDNN for reproducibility
    torch.manual_seed(seed)  # pytorch random seed
    np.random.seed(seed)  # numpy random seed
    torch.backends.cudnn.deterministic = True
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    # pandas does not create missing directories, so do it up front
    os.makedirs(output_dir, exist_ok=True)

    # logging
    console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

    # tokenizer for encoding the text
    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

    # Generation runs on the plain model. The original wrapped it in
    # DataParallel and immediately unwrapped it again, which had no effect.
    model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
    model = model.to(device)

    # logging
    console.log(f"[Data]: Reading data...\n")

    # The whole frame is used for prediction
    test_dataset = dataframe

    # Dataset-size logging
    console.print(f"FULL Dataset: {dataframe.shape}")
    console.print(f"TEST Dataset: {test_dataset.shape}")

    # Inference-only dataset: no target column
    testing_set = SmallSampleDataSetClass(
        test_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text=source_text,
        target_text=None
    )

    # Defining the parameters for creation of the dataloader
    test_params = {
        "batch_size": model_params["TEST_BATCH_SIZE"],
        "shuffle": False,
        "num_workers": 0
    }
    testing_loader = DataLoader(testing_set, **test_params)

    # Generate predictions and persist them
    console.log(f"[Initiating Prediction]...\n")
    predictions = testing(
        tokenizer,
        model,
        device,
        testing_loader,
        model_params["MAX_TARGET_TEXT_LENGTH"],
    )
    final_df = pd.DataFrame({"Generated Text": predictions})
    final_df.to_csv(os.path.join(output_dir, "predictions.csv"), encoding='utf8', index=False)

    console.save_text(os.path.join(output_dir, "logs.txt"))

    console.log(f"[Prediction Completed.]\n")
    console.print(
        f"""[Prediction] Generation on Testing data saved @ {os.path.join(output_dir,'predictions.csv')}\n"""
    )
    console.print(f"""[Logs] Logs saved @ {os.path.join(output_dir,'logs.txt')}\n""")
print("end...")

end...


In [12]:
# Model / prediction parameters consumed by T5Tester
model_params = {
    "MODEL": "output/prompt/model_files/",  # directory of the checkpoint to load
    "TEST_BATCH_SIZE": 64,  # prediction batch size (the original comment mentions 8)
    "MAX_SOURCE_TEXT_LENGTH": 1024,  # max length of source text (the original comment mentions 512)
    "MAX_TARGET_TEXT_LENGTH": 10,  # max length of generated text (the original comment mentions 64)
    "SEED": 2022,  # set seed for reproducibility
}
print("end...")

end...


In [13]:
# Run prediction with the fine-tuned model
# The test file is read into a dataframe with a single column:
#   - input: input text
df = pd.read_csv('data/prompt/testA_prompt.tsv', sep='\t', encoding='utf8', header=0, names=["input"])  # NOTE(review): original comment claimed 1200k rows; the recorded run reports df.shape (20839, 1)
# df = df.sample(frac=0.01) # Optional: uncomment to predict on a 1% sample only
print("df.head:",df.head(n=5))
print("df.shape:",df.shape)
# GPU memory note: if the run fails for lack of GPU memory, check usage with nvidia-smi; if most of the memory is already taken, restart the notebook/Colab runtime
T5Tester(
    dataframe=df,
    source_text="input",
    model_params=model_params,
    output_dir="output/prompt/prediction/",
)
torch.cuda.empty_cache()
print("end..")

df.head:                                                input
0  这是哪种类别的专利？_一种耐磨、抗粘钢复合涂层、制备方法及应用。_本发明公开了一种耐磨、抗粘...
1  这是哪种类别的专利？_一种用于提高橡胶抗湿滑性的树脂的制备方法及其应用。_本发明公开了一种用...
2  这是哪种类别的专利？_有机硅改性丙烯酸树脂超亲水防雾涂料及其制作方法。_本发明涉及涂料制造领...
3  这是哪种类别的专利？_一种空调系统及其控制方法、控制装置。_本发明涉及空调领域，公开了一种空...
4  这是哪种类别的专利？_资源申请、分配方法，UE及网络控制单元。_本发明实施例公开了一种资源申...
df.shape: (20839, 1)


end..


In [None]:
# 查看训练后显存占用情况。如果显存被占用，可以kill掉相关的进程
!nvidia-smi
# !fuser -v /dev/nvidia*

In [18]:
# !nvidia-smi -r
# Release the cached GPU memory left over from training. A single call is
# enough: further consecutive calls have nothing left to release.
torch.cuda.empty_cache()