### Workflow
1. Data Preprocessing
2. Model Training
3. Testing

Based on the Kaggle tutorial [How to fine-tune GPT-2 for beginners](https://www.kaggle.com/code/changyeop/how-to-fine-tune-gpt-2-for-beginners/notebook).

In [5]:
import numpy as np
import pandas as pd
import re

In [11]:
def cleaning(s):
    """Normalize a piece of article text.

    Steps, in order: remove whole URLs, strip a non-word character that
    directly follows whitespace, strip ``<non-word>,<space>`` sequences,
    delete digit runs, collapse whitespace runs to a single space, and
    delete the characters ``! @ # $ _``.

    Args:
        s: Value to clean. It is converted with ``str()`` first, so
            non-string values are accepted (``None`` becomes ``"None"``).

    Returns:
        str: The cleaned text.
    """
    s = str(s)  # accept any value, not just strings

    # Remove URLs as a unit. The earlier approach deleted the substrings
    # "co" and "https" anywhere in the text, which also ate the "co" inside
    # ordinary words ("economy" -> "enomy") and left URL debris behind.
    s = re.sub(r'https?://\S+', ' ', s)

    # Remove unwanted characters (raw strings keep the regex escapes intact
    # and avoid invalid-escape warnings on newer Python versions).
    s = re.sub(r'\s\W', ' ', s)      # whitespace + one non-word char -> space
    s = re.sub(r'\W,\s', ' ', s)     # non-word char + comma + whitespace -> space
    s = re.sub(r'\d+', '', s)        # drop digit runs
    s = re.sub(r'\s+', ' ', s)       # collapse whitespace
    s = re.sub(r'[!@#$_]', '', s)    # drop leftover symbols

    # Any bare "https" token that was not part of a full URL.
    s = s.replace("https", "")

    # Literal (non-regex) match of the four characters `[\w*`, unchanged from
    # the original. NOTE(review): this looks like it was meant to be a regex;
    # confirm the intent before replacing it.
    s = s.replace("[\\w*", " ")
    return s

In [32]:
# Load the raw articles (the CSV is read as ISO-8859-1) and drop every row
# that has a missing value.
df = pd.read_csv("../datasets/Articles.csv", encoding="ISO-8859-1")
df = df.dropna()

# Write all cleaned articles into one plain-text training file.
# - `with` closes the file even if cleaning or writing raises.
# - The encoding is given explicitly; otherwise open() falls back to the
#   platform default, which may not be able to encode every character.
# - Each article ends with a newline so the last word of one article is not
#   fused with the first word of the next.
with open('../datasets/Articles.txt', 'w', encoding="utf-8") as text_data:
    for article in df["Article"]:
        text_data.write(cleaning(article) + "\n")

In [33]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [39]:
def load_dataset(file_path, tokenizer, block_size=128):
    """Create a ``TextDataset`` from a text file.

    Args:
        file_path: Path of the text file, forwarded as ``file_path``.
        tokenizer: Tokenizer instance, forwarded as ``tokenizer``.
        block_size: Forwarded as ``block_size``; defaults to 128.

    Returns:
        The constructed ``TextDataset``.
    """
    return TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)

def load_data_collator(tokenizer, mlm=False):
    """Create a ``DataCollatorForLanguageModeling`` for the given tokenizer.

    Args:
        tokenizer: Tokenizer instance, forwarded as ``tokenizer``.
        mlm: Forwarded as ``mlm``; defaults to ``False``.

    Returns:
        The constructed ``DataCollatorForLanguageModeling``.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

def train(train_file_path, model_name, output_dir, overwrite_output_dir, per_device_train_batch_size, num_train_epochs, save_steps):
    """Fine-tune a GPT-2 language-model head on a text file and save the result.

    The tokenizer and the model loaded from ``model_name`` are both written
    to ``output_dir`` before training starts; after training,
    ``trainer.save_model()`` is called to persist the trained model.

    Args:
        train_file_path: Text file passed to ``load_dataset``.
        model_name: Name or path given to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory the tokenizer and model are saved to, and the
            ``output_dir`` of the ``TrainingArguments``.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``. Previously this
            argument was accepted but never used, so the caller's value
            had no effect.

    Returns:
        None.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Save the tokenizer next to the model so output_dir is self-contained.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Written before training starts, so this is the not-yet-fine-tuned model.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,  # honor the caller's value instead of dropping it
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [35]:
# --- Model and paths ---
model_name = "gpt2"
train_file_path = "../datasets/Articles.txt"
# NOTE(review): absolute path, presumably tied to the hosting environment;
# confirm before running elsewhere.
output_dir = "/notebooks/GPT2_result"

# --- Values passed to train() ---
num_train_epochs = 5.0
per_device_train_batch_size = 8
save_steps = 500

In [40]:
# Start fine-tuning with the values from the configuration cell above.
train(
    train_file_path,
    model_name,
    output_dir,
    overwrite_output_dir=False,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
)

Downloading (…)"pytorch_model.bin";:  73%|███████▎  | 398M/548M [00:14<00:04, 30.1MB/s] 