In [77]:
from transformers import RobertaTokenizer
from transformers import RobertaForSequenceClassification
from transformers import Trainer, TrainingArguments, pipeline, AutoConfig
from datasets import Dataset

from pathlib import Path
from pprint import pprint
import random
import torch


random.seed(2017)

MODEL_PATH = "/home/mailab017/data_mailab017/hucheng/models/roberta"

In [78]:
tokenizer = RobertaTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)

In [79]:
def read_dataset(dataset_path: str):
    with open(dataset_path, 'r', encoding='utf-8') as f:
        return f.read(-1).splitlines()

def get_dataset(dataset_path: str, kind: str, style: str='sentiment'):
    
    dataset = []
    for i in range(2):
        file_name = "{}.{}.{}".format(style, kind, i)
        file_path = Path.joinpath(Path(dataset_path), file_name)
        dataset.extend([
            { 'label': i, 'text': line } for line in read_dataset(file_path)
        ])

    # 使用shard可以进行划分(num_shards, index)
        
    random.shuffle(dataset)

    dataset = Dataset.from_list(dataset)
    def tokenize_function(examples):
        return tokenizer(
            examples['text'], 
            padding='max_length', 
            truncation=True,
            max_length=256,
        )
    
    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

    return tokenized_dataset

In [80]:
dataset_path = './data/yelp/'

train_dataset = get_dataset(dataset_path, 'train')
test_dataset = get_dataset(dataset_path, 'test')

Map: 100%|██████████| 444101/444101 [00:59<00:00, 7525.67 examples/s]
Map: 100%|██████████| 126670/126670 [00:15<00:00, 8000.50 examples/s]


In [85]:
# %%capture result
# training

model = RobertaForSequenceClassification.from_pretrained(MODEL_PATH, local_files_only=True)

if torch.cuda.is_available():
    device = torch.device('cuda')
    model.to(device)
else:
    print("cuda not found!")
    exit()

training_args = TrainingArguments(
    output_dir="./model/",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_dir = './log/roberta',
    logging_strategy = "steps",
    logging_steps = 10,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
)

trainer =Trainer(
    model = model,
    args=training_args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
)

# trainer.train(resume_from_checkpoint=True)
trainer.train()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /home/mailab017/data_mailab017/hucheng/models/roberta and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.2927,0.283234
2,0.3255,0.286717
3,0.3323,0.330249




TrainOutput(global_step=10410, training_loss=0.3328937906524977, metrics={'train_runtime': 27942.7859, 'train_samples_per_second': 47.68, 'train_steps_per_second': 0.373, 'total_flos': 6.208074755779922e+17, 'train_loss': 0.3328937906524977, 'epoch': 3.0})

In [15]:
trainer.evaluate()

{'eval_loss': 0.7075942158699036}

In [75]:
test_model_path = './model/checkpoint-5000/'

config = AutoConfig.from_pretrained(test_model_path, local_files_only=True)
config.update({"id2label": {
    "0": "negative",
    "1": "positive"
}})

classifier = pipeline(task="sentiment-analysis", model=test_model_path, tokenizer=MODEL_PATH)


refs:

* https://huggingface.co/docs/transformers/training#train-with-pytorch-trainer
* https://hackernoon.com/fine-tuning-roberta-for-topic-classification 
* https://huggingface.co/learn/nlp-course/zh-CN/chapter3/4?fw=pt
* how show the result: https://zhuanlan.zhihu.com/p/452438381

In [76]:
text = "second , the steie , it is atrocious ."
preds = classifier("This movie is disgustingly good !")
# print('result:', result)
print(preds)

[{'label': 'LABEL_1', 'score': 0.589378833770752}]


In [58]:
train_dataset[4]

{'label': 0,
 'text': 'shame on them for no chips and salsa !',
 'input_ids': [0,
  1193,
  4344,
  15,
  106,
  13,
  117,
  8053,
  8,
  33367,
  27785,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
