In [1]:
from transformers import RobertaTokenizer
# Sanity check of the stock roberta-base tokenizer on the sample phrase "XGT and GBL".
# In the printed input_ids, 0 marks the start of the sequence and 2 marks the end.
# NOTE(review): the recorded output holds five ids between those two markers for a
# three-word phrase, so some words appear to be split into sub-word pieces. The
# original author's note said XGT and GBL are kept whole -- confirm with
# tokenizer.convert_ids_to_tokens(...).
# NOTE(review): the original author's note put the tokenizer length limit at roughly
# 256 and said most sentences stay below it -- confirm via tokenizer.model_max_length.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
text = "XGT and GBL"
encoded = tokenizer(text)
print(encoded)

{'input_ids': [0, 1000, 18266, 8, 272, 7976, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


In [2]:
from datasets import load_dataset, DatasetDict
from transformers import RobertaTokenizer
from transformers import RobertaForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer

In [3]:
# Load the roberta-base tokenizer.
# Author's open question: does loading fail while a proxy is enabled?
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [4]:
# Tokenization function: converts raw text into the inputs the model expects.
def tokenize_function(examples, max_length=None):
    """Tokenize the 'text' column of a batch with the module-level ``tokenizer``.

    Args:
        examples: Batch mapping that provides the texts under the 'text' key
            (this is what ``dataset.map(..., batched=True)`` passes in).
        max_length: Optional length to pad/truncate every sequence to. ``None``
            (the default) is forwarded unchanged, which leaves the choice to the
            tokenizer exactly as before this parameter existed. A smaller value
            can be supplied through ``dataset.map(..., fn_kwargs={'max_length': N})``
            to avoid padding short sentences up to the full model length.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

In [5]:
# Load the dataset from the local CSV file; the rows are read from the 'train'
# split of the loaded result below.
raw_dataset = load_dataset('csv', data_files='./dataset.csv')

# Manually split the data into a training set and a test set (80% / 20%).
# A fixed seed keeps the split identical across kernel restarts, so evaluation
# numbers from different runs are computed on the same held-out examples.
SPLIT_SEED = 42
split = raw_dataset['train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)
dataset = DatasetDict({
    'train': split['train'],
    'test': split['test']
})

In [6]:
# Apply the tokenization function to the dataset, in batched mode.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/81939 [00:00<?, ? examples/s]

Map:   0%|          | 0/20485 [00:00<?, ? examples/s]

In [7]:
# Number of classes for the model's classification head.
# Count distinct labels across BOTH splits: a rare class that happened to land
# only in the test split would otherwise be missed and the head would be too small.
label_values = set(dataset['train']['label']) | set(dataset['test']['label'])
num_labels = len(label_values)

In [8]:
# Load the pretrained roberta-base checkpoint for sequence classification, with
# num_labels output classes. The first load downloads the model.
# Author's note: Hugging Face is supposedly blocked, yet the download still
# worked without a proxy.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Training configuration. These values were chosen for a low-spec CPU machine;
# they need to be re-tuned before running on a server.
training_args = TrainingArguments(**{
    # Folder that receives model outputs.
    'output_dir': './results',
    # Number of training epochs.
    'num_train_epochs': 20,
    # Batch size per device for training and for evaluation.
    'per_device_train_batch_size': 2000,
    'per_device_eval_batch_size': 2000,
    # Warm-up steps.
    'warmup_steps': 0,
    # Weight decay.
    'weight_decay': 0.01,
    # Folder that receives logs (the TensorBoard cell reads ./logs).
    'logging_dir': './logs',
    # Evaluate and save once per epoch.
    'evaluation_strategy': "epoch",
    'save_strategy': "epoch",
    # Write a log entry every this many steps.
    'logging_steps': 10,
    # Load the best model once training has finished.
    'load_best_model_at_end': True,
})

In [10]:
def compute_metrics(eval_pred):
    """Report classification accuracy for one evaluation pass.

    Args:
        eval_pred: Evaluation result handed over by the Trainer; its
            ``predictions`` attribute is read as the model's logits and its
            ``label_ids`` attribute as the reference labels.

    Returns:
        dict with a single 'accuracy' entry (fraction of correct predictions).
    """
    logits = eval_pred.predictions
    # Some models return a tuple of outputs; the logits are then the first item.
    if isinstance(logits, tuple):
        logits = logits[0]
    predicted = logits.argmax(axis=-1)
    return {'accuracy': float((predicted == eval_pred.label_ids).mean())}


# Initialize the trainer. Without compute_metrics the per-epoch evaluation only
# reports the loss; passing it adds accuracy to the evaluation logs.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
)

In [14]:
# 加载图像化工具，使用pip install tensorboard安装
%load_ext tensorboard  
%tensorboard --logdir ./logs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Launching TensorBoard...

In [None]:
# Start training.
trainer.train()