In [6]:
import re
import torch
from datasets import Dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM

In [7]:
def read_file(file_path):
    """Load the dialogue file as a list of cleaned, non-empty lines.

    Lines whose first character is ``*`` are treated as annotations and
    dropped. Every remaining line is stripped of surrounding whitespace, and
    lines that are empty after stripping are dropped as well, so that a stray
    blank line (for example an extra newline at the end of the file) cannot
    shift the alternating question/answer layout that ``process_data``
    relies on.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        list[str]: The kept lines, in file order.
    """
    cleaned_lines = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # The marker is tested on the raw line, so only a '*' in the very
            # first column marks a line to skip (an indented '*' is kept).
            if line.startswith('*'):
                continue
            stripped = line.strip()
            if stripped:
                cleaned_lines.append(stripped)

    return cleaned_lines

def process_data(data):
    """Split alternating lines into parallel instruction/answer columns.

    Items at even positions (0, 2, 4, ...) are instructions and the item
    following each one is its answer.

    Args:
        data: Iterable of lines laid out as instruction, answer,
            instruction, answer, ...

    Returns:
        dict: ``{"Instruction": [...], "Answer": [...]}`` with two lists of
        equal length.

    Raises:
        ValueError: If ``data`` holds an odd number of lines, i.e. the last
            instruction has no answer. Without this check the two columns
            would silently end up with different lengths.
    """
    lines = list(data)
    if len(lines) % 2 != 0:
        raise ValueError(
            f"expected an even number of lines (instruction/answer pairs), got {len(lines)}"
        )

    # Even-indexed items are instructions, odd-indexed items are answers.
    return {"Instruction": lines[0::2], "Answer": lines[1::2]}

# Build the dataset: read the raw dialogue file, split its lines into
# "Instruction"/"Answer" columns, and wrap the resulting dict in a Dataset.
file_path = './ganyu-eng.txt'  # relative path, resolved against the working directory
data = read_file(file_path)
processed_data = process_data(data)
# `dataset` is the object the training cells below pass to their trainers.
dataset = Dataset.from_dict(processed_data)

In [4]:
# Tokenizer and model are both restored from the same saved checkpoint folder.
checkpoint_dir = './test/checkpoint-3261'
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint_dir)
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(checkpoint_dir)

In [5]:
def formatting_prompts_func(example):
    """Render a batch of instruction/answer pairs as prompt strings.

    Args:
        example: Mapping with parallel sequences under the keys
            ``'Instruction'`` and ``'Answer'``.

    Returns:
        list[str]: One string per instruction, with the question and its
        answer each introduced by a ``###`` marker.
    """
    # The answer is looked up by position, so pair i combines
    # Instruction[i] with Answer[i].
    return [
        f"### Question: {question}\n ### Answer: {example['Answer'][idx]}"
        for idx, question in enumerate(example['Instruction'])
    ]

# The marker below is the same text that formatting_prompts_func places in
# front of each answer (note the leading space after the newline).
response_template = " ### Answer:"
# NOTE(review): per the TRL docs this collator limits the loss to the tokens
# that follow the response template — confirm against the installed TRL version.
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# SFT run over `dataset`; apart from the output directory, every SFTConfig
# option is left at its default.
trainer = SFTTrainer(
    model,
    train_dataset=dataset,
    args=SFTConfig(output_dir="./test02"),
    formatting_func=formatting_prompts_func,
    data_collator=collator,
)

# Start training; the returned TrainOutput is shown as the cell result.
trainer.train()

Map: 100%|██████████| 87/87 [00:00<00:00, 793.34 examples/s]
  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 33/33 [01:24<00:00,  2.55s/it]

{'train_runtime': 84.135, 'train_samples_per_second': 3.102, 'train_steps_per_second': 0.392, 'train_loss': 3.5820317123875474, 'epoch': 3.0}





TrainOutput(global_step=33, training_loss=3.5820317123875474, metrics={'train_runtime': 84.135, 'train_samples_per_second': 3.102, 'train_steps_per_second': 0.392, 'total_flos': 25410139776000.0, 'train_loss': 3.5820317123875474, 'epoch': 3.0})

In [18]:
def tokenize_function(examples, text_column='text', max_length=128, tokenizer_fn=None):
    """Tokenize one batch of texts to a fixed length.

    Every text is truncated and padded to exactly ``max_length`` tokens.

    Args:
        examples: Batch mapping that holds the texts under ``text_column``.
        text_column: Name of the column to tokenize. Defaults to ``'text'``.
        max_length: Target sequence length for truncation and padding.
            Defaults to 128.
        tokenizer_fn: Optional callable used instead of the notebook-level
            ``tokenizer``. It is called as
            ``tokenizer_fn(texts, truncation=True, padding='max_length',
            max_length=max_length)``.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    # The global `tokenizer` is only looked up when no override is supplied,
    # so the function can also be used (and tested) without notebook state.
    tokenize = tokenizer if tokenizer_fn is None else tokenizer_fn
    return tokenize(
        examples[text_column],
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )

# Register a dedicated '[PAD]' token as the tokenizer's pad token.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# NOTE(review): this expects `dataset` to have a "text" column, but the dataset
# built earlier in this notebook only defines "Instruction" and "Answer"
# (and the recorded output shows 52 examples versus 87 in the SFT cell), so
# this cell presumably ran against a different dataset — confirm before re-running.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
# Resize the embedding matrix to the tokenizer's current vocabulary size;
# the recorded output shows Embedding(50258, 768).
model.resize_token_embeddings(len(tokenizer))


Map: 100%|██████████| 52/52 [00:00<00:00, 436.65 examples/s]


Embedding(50258, 768)

In [19]:
# Collator for the plain language-modeling run; mlm=False turns masked-LM off.
# NOTE(review): presumably labels are then derived from input_ids for causal
# LM training — confirm in the transformers documentation.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# 3 epochs with a per-device batch size of 4; checkpoints go to ./results,
# saved every 500 steps with at most 2 kept.
# NOTE(review): the recorded run finished after 39 steps, so the 500-step
# interval is presumably never reached on a dataset of this size.
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
)

# NOTE(review): this rebinds `trainer` (also used for the SFTTrainer above) and
# reuses `model`, so training continues from whatever state `model` is in when
# this cell executes — confirm that is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset 
)

In [20]:
# Run training with whichever object `trainer` currently refers to; the
# returned TrainOutput is shown as the cell result.
trainer.train()

100%|██████████| 39/39 [00:38<00:00,  1.02it/s]

{'train_runtime': 38.1767, 'train_samples_per_second': 4.086, 'train_steps_per_second': 1.022, 'train_loss': 13.900725535857372, 'epoch': 3.0}





TrainOutput(global_step=39, training_loss=13.900725535857372, metrics={'train_runtime': 38.1767, 'train_samples_per_second': 4.086, 'train_steps_per_second': 1.022, 'total_flos': 10190389248000.0, 'train_loss': 13.900725535857372, 'epoch': 3.0})

In [22]:
# Save the current model and the tokenizer into the same directory.
# NOTE(review): the recorded output below lists files directly under
# './results', not './results/GPT2' — presumably stale output from an earlier
# version of this cell; re-run to confirm.
model.save_pretrained('./results/GPT2')
tokenizer.save_pretrained('./results/GPT2')

('./results\\tokenizer_config.json',
 './results\\special_tokens_map.json',
 './results\\vocab.json',
 './results\\merges.txt',
 './results\\added_tokens.json')