## 1.查看服务器GPU等硬件信息

In [1]:
import torch

# Print the name of the current CUDA device. Querying the device name on a
# runtime without CUDA raises, so report the missing GPU explicitly instead.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name())
else:
    print('No CUDA device available')

Tesla P100-PCIE-16GB


## 2.缓存云盘上的模型和数据集到服务器

In [None]:
import os
import shutil
import zipfile

# File name of the dataset archive to fetch
data_set = 'CCF_2019.zip'
# Directory of the pretrained model to fetch
model = '/content/gdrive/MyDrive/Models/pytorchmodels/FinBERT_L-12_H-768_A-12'

# Extract a zip archive
def unzip(file_path,tar_path):
    """Extract ``file_path`` into ``tar_path`` when it is a ``.zip`` file.

    Files whose extension is not ``.zip`` (compared case-insensitively) are
    left untouched, so callers may pass any downloaded file.

    Args:
        file_path: Path of the candidate archive.
        tar_path: Directory that receives the extracted members.

    Raises:
        zipfile.BadZipFile: If the file has a ``.zip`` extension but is not a
            valid archive.
    """
    if os.path.splitext(file_path)[-1].lower() != '.zip':
        return

    # The context manager closes the archive even if extraction fails midway.
    with zipfile.ZipFile(file_path, "r") as archive:
        archive.extractall(tar_path)

# Cache the dataset and the model on the server
def down_data(dataset_name,model_path,
              drive_dataset_dir='/content/gdrive/MyDrive/DataSets/',
              local_dataset_dir='/content/LocalDataSets/',
              local_model_dir='/content/LocalModels/'):
    """Copy a dataset file and a model directory to local storage.

    The dataset is copied (and extracted when it is a zip archive) only if
    ``dataset_name`` is listed in ``drive_dataset_dir``. The model directory
    is copied to ``local_model_dir/<model directory name>`` unless that
    destination already exists.

    Args:
        dataset_name: File name of the dataset inside ``drive_dataset_dir``.
        model_path: Source directory of the model; a trailing ``/`` is allowed.
        drive_dataset_dir: Directory that is listed to find the dataset.
        local_dataset_dir: Destination directory for the dataset.
        local_model_dir: Parent directory for the copied model.

    Raises:
        FileNotFoundError: If ``drive_dataset_dir`` or ``model_path`` is missing.
    """
    available = os.listdir(drive_dataset_dir)
    print('DataSetList:',available)

    # Fetch the dataset
    if dataset_name in available:
        os.makedirs(local_dataset_dir, exist_ok=True)
        local_copy = os.path.join(local_dataset_dir, dataset_name)
        shutil.copy(os.path.join(drive_dataset_dir, dataset_name), local_copy)

        # unzip() ignores anything that is not a zip archive
        unzip(local_copy, local_dataset_dir)
    else:
        print('Dataset not found, skipping:', dataset_name)

    # Fetch the model. normpath drops a trailing slash, which would otherwise
    # leave an empty directory name and copy straight into local_model_dir.
    model_name = os.path.basename(os.path.normpath(str(model_path)))
    model_target = os.path.join(local_model_dir, model_name)
    try:
        shutil.copytree(model_path, model_target)
    except FileExistsError:
        # Destination already present (e.g. an earlier run); keep it as is.
        print('Model already cached, skipping copy:', model_target)
    
down_data(dataset_name=data_set, model_path=model)

## 3.在服务器上安装环境

In [None]:
# Install the third-party packages used by the later cells (IPython %pip magic).
%pip install transformers datasets tensorboard

## 4.在预训练模型上进行微调

### 可视化

In [None]:
#  可视化
from torch.utils.tensorboard import SummaryWriter
log_writer = SummaryWriter()

run_path=''
%tensorboard --logdir=/Users/mac/PycharmProjects/pythonProject/saved/runs/{run_path}

### 微调

In [None]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import numpy as np

#-------------------- Model and dataset paths --------------------#
# Local directory of the pretrained model
model_path = '/content/LocalModels/FinBERT_L-12_H-768_A-12/'
# Local directory holding train.csv / test.csv / dev.csv
dataset_path = '/content/LocalDataSets/CCF_2019/'
# Output directory for checkpoints and other training output
saved_path = '/content/gdrive/MyDrive/Saved_Models/FinBERT_L-12_H-768_A-12/'
# Ensure the output directory exists; exist_ok avoids the check-then-create race.
os.makedirs(saved_path, exist_ok=True)

# Load the dataset splits
def read_data(base_url):
    """Load ``train.csv``, ``test.csv`` and ``dev.csv`` found under ``base_url``.

    Args:
        base_url: Directory path or URL prefix that contains the three CSV
            files. A trailing ``/`` is optional; an empty string is used
            unchanged, so the file names stay relative.

    Returns:
        The result of ``load_dataset('csv', ...)`` with the splits named
        ``train``, ``test`` and ``dev``.
    """
    # Plain concatenation produced '<dir>train.csv' when the slash was missing.
    prefix = base_url
    if prefix and not prefix.endswith('/'):
        prefix = prefix + '/'

    split_files = {split: prefix + split + '.csv' for split in ('train', 'test', 'dev')}
    return load_dataset('csv', data_files=split_files)


# Tokenize the dataset splits
def tokenize_data(tokenizer:BertTokenizer,word_length:int=32):
    """Load the dataset at the module-level ``dataset_path`` and tokenize it.

    The ``title`` column is encoded with ``tokenizer``, truncated and padded
    to ``word_length`` tokens, and the ``label`` column is renamed to
    ``labels``.

    Args:
        tokenizer: Tokenizer applied to the ``title`` column.
        word_length: Fixed token length used for truncation and padding.

    Returns:
        The mapped dataset object with the renamed ``labels`` column.
    """
    def encode_titles(batch):
        # Called with batches of rows because map() runs with batched=True
        titles = batch['title']
        return tokenizer(titles, max_length=word_length, padding='max_length', truncation=True)

    encoded = read_data(dataset_path).map(encode_titles, batched=True)
    return encoded.rename_column('label', 'labels')


#-------------------- Model training --------------------#
def train_model(bert_path,class_num:int=3):
    """Fine-tune a BERT sequence classifier and evaluate it on the test split.

    Loads the tokenizer and ``BertForSequenceClassification`` weights from
    ``bert_path``, tokenizes the data through ``tokenize_data``, trains on the
    ``train`` split while evaluating on ``dev`` every 50 steps, and finally
    evaluates on ``test``. Checkpoints and other output go to the module-level
    ``saved_path``.

    Args:
        bert_path: Location passed to ``from_pretrained`` for both the
            tokenizer and the model.
        class_num: Number of labels of the classification head.

    Returns:
        The metrics returned by ``Trainer.evaluate`` for the ``test`` split
        (previously computed but discarded).
    """
    # Local import: only needed for the keyword-compatibility checks below.
    import inspect

    # Load the pretrained tokenizer and model
    tokenizer = BertTokenizer.from_pretrained(bert_path)
    model = BertForSequenceClassification.from_pretrained(bert_path, num_labels=class_num)

    # Tokenized train / dev / test splits
    tokenized_datasets = tokenize_data(tokenizer)

    # Evaluation metrics
    def compute_metrics(eval_preds):
        logits, labels = eval_preds
        predictions = np.argmax(logits, axis=-1)
        # NOTE(review): for single-label multi-class data, micro-averaged
        # precision/recall/F1 all equal accuracy; consider average='macro'
        # if per-class performance matters.
        precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='micro')
        acc = accuracy_score(labels, predictions)
        return {
            'accuracy': acc,
            'f1': f1,
            'precision': precision,
            'recall': recall
        }

    #-------------------- Training arguments --------------------#

    # Newer transformers releases renamed `evaluation_strategy` to
    # `eval_strategy`; use whichever name the installed version accepts.
    argument_names = inspect.signature(TrainingArguments.__init__).parameters
    if 'eval_strategy' in argument_names:
        eval_strategy_key = 'eval_strategy'
    else:
        eval_strategy_key = 'evaluation_strategy'

    # TensorBoard event files land under <output_dir>/runs by default.
    args = TrainingArguments(
        output_dir=saved_path,  # checkpoints and other output files
        eval_steps=50,  # evaluate every 50 steps
        logging_strategy="steps",
        logging_steps=50,
        save_strategy="steps",
        save_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        report_to="tensorboard",
        # warmup_steps=500,
        # weight_decay=0.01,
        learning_rate=2e-5,  # initial learning rate
        per_device_train_batch_size=64,  # training batch size
        per_device_eval_batch_size=64,  # evaluation batch size
        num_train_epochs=4,  # number of epochs
        **{eval_strategy_key: 'steps'}
    )

    # Same situation for the Trainer: `tokenizer=` was superseded by
    # `processing_class=` in newer releases.
    trainer_names = inspect.signature(Trainer.__init__).parameters
    if 'processing_class' in trainer_names:
        tokenizer_key = 'processing_class'
    else:
        tokenizer_key = 'tokenizer'

    # Build the trainer
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets["dev"],
        compute_metrics=compute_metrics,
        **{tokenizer_key: tokenizer}
    )

    # Start training
    trainer.train()

    # Evaluate on the test split once training has finished
    test_metrics = trainer.evaluate(eval_dataset=tokenized_datasets['test'])
    print('Test metrics:', test_metrics)
    return test_metrics

train_model(bert_path=model_path, class_num=3)


## 5.推理任务