<a href="https://colab.research.google.com/github/AnDDoanf/learn_NLP/blob/master/notebooks/a_GPT2_Fine_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Workflow
1. Data Preprocessing
2. Model Training
3. Testing

This notebook follows the Kaggle tutorial [How to fine-tune GPT-2 for beginners](https://www.kaggle.com/code/changyeop/how-to-fine-tune-gpt-2-for-beginners/notebook).

In [1]:
!pip install transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
import pandas as pd
import re

In [None]:
def cleaning(s):
    """Strip noise from one article and return the cleaned text.

    The value is converted with ``str`` and then run through a fixed series
    of substitutions, applied in the order written below.

    Args:
        s: The raw article; any object accepted by ``str``.

    Returns:
        The cleaned string.
    """
    s = str(s)  # non-string values (e.g. numbers, NaN) are stringified first

    # Raw string literals keep the regex escapes (\s, \W, \d) intact without
    # relying on Python passing unknown escape sequences through unchanged.
    s = re.sub(r'\s\W', ' ', s)     # whitespace followed by a non-word char
    s = re.sub(r'\W,\s', ' ', s)    # non-word char, comma, whitespace
    s = re.sub(r'\d+', '', s)       # drop every run of digits
    s = re.sub(r'\s+', ' ', s)      # collapse whitespace runs to one space
    s = re.sub(r'[!@#$_]', '', s)   # drop these punctuation characters

    # NOTE(review): plain substring removal, so "co" is also stripped from the
    # middle of ordinary words (e.g. "economy" -> "enomy"). Kept as-is so the
    # output stays identical; confirm whether only URL fragments were meant.
    s = s.replace("co", "")
    s = s.replace("https", "")

    # NOTE(review): str.replace is literal, so this only matches the exact
    # four characters `[\w*`; it looks like a regex was intended - confirm.
    s = s.replace(r"[\w*", " ")
    return s

In [None]:
# df = pd.read_csv("../datasets/Articles.csv", encoding="ISO-8859-1")
# df = df.dropna()
# text_data = open('../datasets/Articles.txt', 'w', encoding="utf-8")
# # An explicit encoding is required when writing the file
# for idx, item in df.iterrows():
#   article = cleaning(item["Article"])
#   text_data.write(article)
# text_data.close()

In [None]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [None]:
def load_dataset(file_path, tokenizer, block_size=128):
    """Wrap a text file in a ``TextDataset``.

    Args:
        file_path: Path of the text file handed to ``TextDataset``.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Block size handed to ``TextDataset`` (default 128).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    return TextDataset(
        file_path=file_path,
        tokenizer=tokenizer,
        block_size=block_size,
    )

def load_data_collator(tokenizer, mlm=False):
    """Create a ``DataCollatorForLanguageModeling`` for the given tokenizer.

    Args:
        tokenizer: Tokenizer forwarded to the collator.
        mlm: Forwarded to the collator's ``mlm`` argument; defaults to
            ``False``.

    Returns:
        The constructed collator.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

def train(train_file_path, model_name, output_dir, overwrite_output_dir, per_device_train_batch_size, num_train_epochs, save_steps):
    """Fine-tune a GPT-2 checkpoint on a text file and save the result.

    Args:
        train_file_path: Text file used to build the training dataset.
        model_name: Name or path given to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory that receives the tokenizer, the initial model
            snapshot, and the ``Trainer`` output.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments`` as the checkpoint
            interval.

    Returns:
        None. The tokenizer and model are written under ``output_dir``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # The Trainer below is not given the tokenizer, so it is saved here to
    # make output_dir loadable on its own afterwards.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Snapshot of the weights before any training step has run.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        # Must be forwarded explicitly; otherwise the caller's value is
        # ignored and TrainingArguments uses its own default interval.
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [None]:
# Fetch the training corpus into the working directory (saved as Articles.txt).
# NOTE(review): presumably this is the file written by the commented-out
# preprocessing cell above - confirm against the repository.
!wget https://raw.githubusercontent.com/AnDDoanf/learn_NLP/master/datasets/Articles.txt

--2023-03-09 01:44:02--  https://raw.githubusercontent.com/AnDDoanf/learn_NLP/master/datasets/Articles.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4721884 (4.5M) [text/plain]
Saving to: ‘Articles.txt’


2023-03-09 01:44:04 (218 MB/s) - ‘Articles.txt’ saved [4721884/4721884]



In [None]:
# --- Training configuration -------------------------------------------------
# Corpus location:
#   local run -> train_file_path = "../datasets/Articles.txt"
#   Colab run -> the file sits in the working directory
train_file_path = "Articles.txt"

# Checkpoint name passed to from_pretrained, and where results are written.
model_name = "gpt2"
output_dir = "/notebooks/GPT2_result"

# Values handed to train() in the next cell.
per_device_train_batch_size = 8
num_train_epochs = 5.0
save_steps = 500

In [None]:
# Run fine-tuning with the values from the configuration cell.
# overwrite_output_dir is the only argument fixed here rather than configured.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=False,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

Step,Training Loss
500,3.6994
1000,3.4088
1500,3.1663
2000,3.1265
2500,2.9754
3000,2.9591
3500,2.8561
4000,2.8525
4500,2.7915
5000,2.7821


Saving model checkpoint to /notebooks/GPT2_result/checkpoint-500
Configuration saved in /notebooks/GPT2_result/checkpoint-500/config.json
Configuration saved in /notebooks/GPT2_result/checkpoint-500/generation_config.json
Model weights saved in /notebooks/GPT2_result/checkpoint-500/pytorch_model.bin
Saving model checkpoint to /notebooks/GPT2_result/checkpoint-1000
Configuration saved in /notebooks/GPT2_result/checkpoint-1000/config.json
Configuration saved in /notebooks/GPT2_result/checkpoint-1000/generation_config.json
Model weights saved in /notebooks/GPT2_result/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to /notebooks/GPT2_result/checkpoint-1500
Configuration saved in /notebooks/GPT2_result/checkpoint-1500/config.json
Configuration saved in /notebooks/GPT2_result/checkpoint-1500/generation_config.json
Model weights saved in /notebooks/GPT2_result/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to /notebooks/GPT2_result/checkpoint-2000
Configuration saved in 

In [None]:
from transformers import PreTrainedTokenizer, GPT2TokenizerFast, GPT2LMHeadModel, GPT2Tokenizer

In [None]:
def load_model(model_path):
    """Load a ``GPT2LMHeadModel`` from ``model_path`` and return it."""
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Load a ``GPT2Tokenizer`` from ``tokenizer_path`` and return it."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)


def generate_text(sequence, max_length, model_path="/notebooks/GPT2_result"):
    """Sample a continuation of ``sequence`` and print it.

    Args:
        sequence: Prompt to continue; it is converted to ``str`` before
            tokenisation.
        max_length: Passed to ``model.generate`` as ``max_length``.
        model_path: Directory holding both the saved model and tokenizer.
            Defaults to the output directory used by the training cells.

    Returns:
        None. The decoded text is printed.
    """
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)

    # Calling the tokenizer (rather than .encode) also yields the attention
    # mask, so generate() does not have to infer it from the pad token,
    # which is set to the EOS id below.
    encoded = tokenizer(str(sequence), return_tensors='pt')
    final_outputs = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

In [None]:
# Interactive test: the first input line is the prompt, the second is the
# max_length value; int() raises ValueError if the second is not an integer.
sequence = input()
max_len = int(input())
generate_text(sequence, max_len)

News
50


loading configuration file /notebooks/GPT2_result/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "use_cache": true,
  "vocab_size": 50257
}

loadin

