<a href="https://colab.research.google.com/github/AnDDoanf/learn_NLP/blob/master/notebooks/a_GPT2_Fine_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Workflow
1. Data Preprocessing
2. Model Training
3. Testing

This notebook follows the Kaggle tutorial [How to fine-tune GPT-2 for beginners](https://www.kaggle.com/code/changyeop/how-to-fine-tune-gpt-2-for-beginners/notebook).

In [5]:
# Install the Hugging Face transformers library into the runtime.
# The install is unpinned; the run recorded below resolved it to transformers 4.26.1,
# so pin a version if that exact environment has to be reproduced.
!pip install transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m99.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1


In [6]:
import numpy as np
import pandas as pd
import re

In [7]:
def cleaning(s):
    """Normalize one piece of raw article text.

    Args:
        s: Value to clean. It is passed through ``str()`` first, so non-string
            input is accepted (``None`` becomes ``"None"``).

    Returns:
        str: The text after the substitutions below have been applied in order.
    """
    s = str(s)

    # Raw strings are used for every pattern so that \s, \W and \d reach the
    # regex engine as written instead of being parsed as (invalid) Python
    # string escapes.
    s = re.sub(r'\s\W', ' ', s)      # whitespace + one non-word character -> one space
    s = re.sub(r'\W,\s', ' ', s)     # non-word character + comma + whitespace -> one space
    s = re.sub(r'\d+', '', s)        # drop runs of digits
    s = re.sub(r'\s+', ' ', s)       # collapse whitespace runs
    s = re.sub(r'[!@#$_]', '', s)    # drop a few punctuation characters

    # Remove "co" only when it stands alone as a token. A plain
    # str.replace("co", "") also deleted it from inside ordinary words
    # ("company" -> "mpany"). NOTE(review): presumably this targets link
    # fragments such as "t.co" -- confirm against the raw data.
    s = re.sub(r'\bco\b', '', s)
    s = s.replace("https", "")

    # str.replace is a literal (non-regex) replacement: this only matches the
    # exact four characters  [\w*  and is kept as-is for compatibility.
    s = s.replace(r"[\w*", " ")
    return s

In [None]:
# df = pd.read_csv("../datasets/Articles.csv", encoding="ISO-8859-1")
# df = df.dropna()
# text_data = open('../datasets/Articles.txt', 'w', encoding="utf-8")
# # An explicit encoding must be given when opening the file for writing
# for idx, item in df.iterrows():
#   article = cleaning(item["Article"])
#   text_data.write(article)
# text_data.close()

In [12]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [13]:
def load_dataset(file_path, tokenizer, block_size=128):
    """Build a ``TextDataset`` over a text file.

    Args:
        file_path: Path of the text file handed to ``TextDataset``.
        tokenizer: Tokenizer instance handed to ``TextDataset``.
        block_size: Forwarded unchanged to ``TextDataset`` (defaults to 128).

    Returns:
        The newly constructed ``TextDataset``.
    """
    return TextDataset(
        block_size=block_size,
        file_path=file_path,
        tokenizer=tokenizer,
    )

def load_data_collator(tokenizer, mlm=False):
    """Create the language-modeling data collator used for training.

    Args:
        tokenizer: Tokenizer instance handed to the collator.
        mlm: Forwarded to ``DataCollatorForLanguageModeling``; ``False`` by
            default.

    Returns:
        The newly constructed ``DataCollatorForLanguageModeling``.
    """
    return DataCollatorForLanguageModeling(mlm=mlm, tokenizer=tokenizer)

def train(train_file_path, model_name, output_dir, overwrite_output_dir, per_device_train_batch_size, num_train_epochs, save_steps):
    """Fine-tune a GPT-2 language-model head on a plain-text file.

    Args:
        train_file_path: Path of the training text file.
        model_name: Name or path given to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory that receives the tokenizer files, the model
            files and the ``Trainer`` output.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``.

    Returns:
        None. The results are the files written under ``output_dir``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Store the tokenizer next to the model files.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # This writes the weights as loaded, i.e. before any fine-tuning step;
    # trainer.save_model() is called again once training has finished.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
            output_dir=output_dir,
            overwrite_output_dir=overwrite_output_dir,
            per_device_train_batch_size=per_device_train_batch_size,
            num_train_epochs=num_train_epochs,
            # Previously this argument was accepted but never forwarded, so
            # the caller's value had no effect on the training arguments.
            save_steps=save_steps,
        )

    trainer = Trainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [9]:
# Fetch the training text file (Articles.txt) from the learn_NLP GitHub repository
# into the current working directory.
!wget https://raw.githubusercontent.com/AnDDoanf/learn_NLP/master/datasets/Articles.txt

--2023-03-08 14:37:17--  https://raw.githubusercontent.com/AnDDoanf/learn_NLP/master/datasets/Articles.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4721884 (4.5M) [text/plain]
Saving to: ‘Articles.txt’


2023-03-08 14:37:19 (291 MB/s) - ‘Articles.txt’ saved [4721884/4721884]



In [10]:
# --- Training configuration -------------------------------------------------
# Path of the training text file.
#   local run: train_file_path = "../datasets/Articles.txt"
#   Colab run: the value below
train_file_path = "Articles.txt"

# Pretrained checkpoint to start from, and where results are written.
model_name = "gpt2"
output_dir = "/notebooks/GPT2_result"

# Values handed to train() for the optimisation loop.
num_train_epochs = 5.0
per_device_train_batch_size = 8
save_steps = 500

In [None]:
# Start fine-tuning with the values from the configuration cell.
# overwrite_output_dir is the only argument given inline (False).
train(
    model_name=model_name,
    train_file_path=train_file_path,
    output_dir=output_dir,
    overwrite_output_dir=False,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    save_steps=save_steps,
)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

***** Running training *****
  Num examples = 8024
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5015
  Number of trainable parameters = 124439808


Step,Training Loss
500,3.6994


Saving model checkpoint to /notebooks/GPT2_result/checkpoint-500
Configuration saved in /notebooks/GPT2_result/checkpoint-500/config.json
Configuration saved in /notebooks/GPT2_result/checkpoint-500/generation_config.json
Model weights saved in /notebooks/GPT2_result/checkpoint-500/pytorch_model.bin


In [4]:
!git init
!git config --global user.email "thuanan24@gmail.com"
!git config --global user.name "AnDDoanf"

!git remote add origin 
!git add notebooks/a_GPT_Fine_Tuning.ipynb
!git commit -m 'update GPT2 fine tuning'
!git push origin master  

Reinitialized existing Git repository in /content/.git/
usage: git remote add [<options>] <name> <url>

    -f, --fetch           fetch the remote branches
    --tags                import all tags and associated objects when fetching
                          or do not fetch any tag at all (--no-tags)
    -t, --track <branch>  branch(es) to track
    -m, --master <branch>
                          master branch
    --mirror[=(push|fetch)]
                          set up remote as a mirror to push to or fetch from

fatal: pathspec 'notebooks/a_GPT_Fine_Tuning.ipynb' did not match any files
On branch master

Initial commit

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31m.config/[m
	[31mArticles.txt[m
	[31mcached_lm_GPT2Tokenizer_128_Articles.txt[m
	[31mcached_lm_GPT2Tokenizer_128_Articles.txt.lock[m
	[31msample_data/[m

nothing added to commit but untracked files present (use "git add" to track)
error: src refspec master does not match 