In [2]:
# Mount Google Drive at /content/drive so later cells can read project files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install transformers;
!pip install sentencepiece;

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m53.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m96.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, http

In [4]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; the recorded output shows the token being stored locally.
# Presumably required for the Hub upload configured later (push_to_hub=True) — confirm.
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
import json
from copy import deepcopy
import random
from tqdm.notebook import tqdm

from transformers import (
    PreTrainedTokenizerFast as BaseGPT2Tokenizer,
    EncoderDecoderModel,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    AdamW
)
import pickle

# Project-local KoBERT tokenizer, imported straight from the mounted Drive folder.
# NOTE(review): this dotted path presumably resolves relative to the working directory
# (/content, where Drive is mounted as ./drive), so it would break if the cwd changed
# before this cell runs — confirm.
from drive.MyDrive.WARNING_PRIVATE_FOLDER.OSSP2AntiGPT9.ai.lib.tokenization_kobert import KoBertTokenizer
from drive.MyDrive.WARNING_PRIVATE_FOLDER.OSSP2AntiGPT9.ai.lib import tokenization_kobert
# Source-side tokenizer built from the 'monologg/kobert' checkpoint.
# NOTE: `src_tokenizer` is rebound to an unpickled tokenizer in a later cell.
src_tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')

Downloading (…)zer_78b3253a26.model:   0%|          | 0.00/371k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/77.8k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/426 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'KoBertTokenizer'.


In [6]:

class GPT2Tokenizer(BaseGPT2Tokenizer):
    """Target-side tokenizer whose special-token hook appends EOS to a sequence."""

    def build_inputs_with_special_tokens(self, token_ids, _=None):
        """Return ``token_ids`` followed by the EOS token id.

        Args:
            token_ids: List of token ids for the sequence.
            _: Ignored second sequence. It defaults to ``None`` so the hook can be
                called with a single sequence, as the base-class signature allows;
                two-argument positional calls keep working unchanged.

        Returns:
            A new list: ``token_ids + [eos_token_id]``.
        """
        # NOTE(review): fast tokenizers may add special tokens in their backend rather
        # than through this hook — verify that encoded labels really end with EOS.
        return token_ids + [self.eos_token_id]


# Target tokenizer from the KoGPT2 checkpoint; BOS and EOS deliberately share '</s>'.
trg_tokenizer = GPT2Tokenizer.from_pretrained(
    "skt/kogpt2-base-v2",
    bos_token='</s>', eos_token='</s>', unk_token='<unk>',
    pad_token='<pad>', mask_token='<mask>',
)

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.83M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

In [7]:
# Switch into the project folder on Drive before unpickling.
# NOTE(review): presumably needed so the pickled tokenizer's module/paths resolve — confirm.
%cd /content/drive/MyDrive/WARNING_PRIVATE_FOLDER/OSSP2AntiGPT9/ai
# Rebind `src_tokenizer` to a previously pickled tokenizer instance.
# SECURITY: pickle.load can execute arbitrary code from the file; only load pickles you created yourself.
with open('/content/drive/MyDrive/WARNING_PRIVATE_FOLDER/OSSP2AntiGPT9/ai/no_commit/save_src_tk.pickle','rb') as f:
    src_tokenizer = pickle.load(f)



In [8]:
# Return to /content and print the working directory to confirm the switch.
%cd /content
!pwd

/content
/content


## 데이터 불러오기 && 데이터셋 만들기 

In [9]:
def split_input_dict(input_dict, ratio = 0.01, seed = 42):
    """Shuffle ``input_dict['utterance']`` and split it into train/validation dicts.

    The first ``int(len(utterances) * ratio)`` shuffled items become the validation
    split and the remainder the training split. The shuffle order is the same one
    ``random.seed(seed); random.shuffle(...)`` would produce, but it is drawn from a
    private generator on a copy of the list, so neither the caller's dict nor the
    global ``random`` state is modified.

    Args:
        input_dict: Mapping with an ``'utterance'`` list; other keys are carried over.
        ratio: Fraction of utterances placed in the validation split.
        seed: Seed for the shuffle, making the split reproducible.

    Returns:
        ``(train_dict, valid_dict)``: two new dicts with the same keys as
        ``input_dict``. Non-utterance values are shared with the input in
        ``train_dict`` and deep-copied in ``valid_dict``.

    Raises:
        KeyError: If ``input_dict`` has no ``'utterance'`` key.
    """
    # Work on a copy: shuffling in place would silently reorder the caller's data.
    utterances = list(input_dict['utterance'])
    random.Random(seed).shuffle(utterances)
    split_point = int(len(utterances) * ratio)

    train_dict = {**input_dict, 'utterance': utterances[split_point:]}
    # Deep-copy only the metadata; copying every utterance just to discard it is wasteful.
    valid_dict = {
        key: utterances[:split_point] if key == 'utterance' else deepcopy(value)
        for key, value in input_dict.items()
    }
    return train_dict, valid_dict

In [10]:
def read_input(path):
    """Load the JSON corpus at ``path`` and return parallel sentence lists per split.

    The file is parsed as JSON, split with ``split_input_dict`` (default ratio and
    seed), and each split's ``'utterance'`` entries are unpacked into their
    ``'standard_form'`` and ``'dialect_form'`` strings.

    Args:
        path: Location of the JSON file to read.

    Returns:
        ``(train_standard, train_dialect, valid_standard, valid_dialect)`` where
        the two lists of each split are index-aligned.
    """
    with open(path, 'rb') as handle:
        corpus = json.load(handle)
    train_part, valid_part = split_input_dict(corpus)

    def _collect(utterances):
        # Single pass with one progress bar per split.
        standard_forms, dialect_forms = [], []
        for entry in tqdm(utterances):
            standard_forms.append(entry['standard_form'])
            dialect_forms.append(entry['dialect_form'])
        return standard_forms, dialect_forms

    train_standard, train_dialect = _collect(train_part['utterance'])
    valid_standard, valid_dialect = _collect(valid_part['utterance'])
    return train_standard, train_dialect, valid_standard, valid_dialect

In [11]:
class PairedDataset:
    """Index-addressable dataset of sentence pairs that are tokenized on access.

    Items from ``file_standard`` are encoded with ``src_tokenizer`` as model inputs;
    the item at the same index in ``file_dialect`` is encoded with ``tgt_tokenizer``
    and its ``'input_ids'`` become the ``'labels'``. Despite the parameter names,
    the first sequence is simply "source" and the second "target".
    """

    def __init__(self, src_tokenizer, tgt_tokenizer, file_standard, file_dialect):
        """Store both tokenizers and sequences, plus a list of ``[source, target]`` pairs."""
        self.src_tokenizer = src_tokenizer
        self.trg_tokenizer = tgt_tokenizer
        self.data_s = file_standard
        self.data_d = file_dialect
        # Paired view of the two sequences; its length defines the dataset size.
        self.data = [
            [source, self.data_d[position]]
            for position, source in enumerate(self.data_s)
        ]

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize pair ``index`` and return the source encoding with ``'labels'`` added."""
        source_text = self.data_s[index]
        target_text = self.data_d[index]
        encoded = self.src_tokenizer(
            source_text,
            return_attention_mask=False,
            return_token_type_ids=False,
        )
        target_encoding = self.trg_tokenizer(target_text, return_attention_mask=False)
        encoded['labels'] = target_encoding['input_ids']
        return encoded

In [12]:
# Load and split the corpus, then wrap both splits as datasets.
# Note the argument order: dialect sentences go in the first (source) slot and standard
# sentences in the second (label) slot, so training pairs are dialect -> standard.
train_s ,train_d , valid_s , valid_d = read_input("/content/drive/MyDrive/WARNING_PRIVATE_FOLDER/OSSP2AntiGPT9/ai/no_commit/real_data.json")
dataset = PairedDataset(src_tokenizer, trg_tokenizer, train_d , train_s)
eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, valid_d , valid_s)

  0%|          | 0/1220349 [00:00<?, ?it/s]

  0%|          | 0/12326 [00:00<?, ?it/s]

## 사전학습 모델 불러오기

In [13]:
# Load the 'leadawon/ossp-v0_2' encoder-decoder checkpoint (downloaded per the recorded output)
# as the starting point for this training run.
model = EncoderDecoderModel.from_pretrained('leadawon/ossp-v0_2')
# Bare expression: display the module tree as the cell output.
model

Downloading (…)lve/main/config.json:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

EncoderDecoderModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(12291, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

## 학습 코드

In [14]:
# Print the working directory; the relative output_dir used for training is presumably
# created under it — confirm.
!pwd

/content


In [15]:
# One interval for both evaluation and checkpointing: load_best_model_at_end needs the
# save schedule aligned with the eval schedule, so keep them tied to a single constant.
EVAL_SAVE_STEPS = 10000
# Per-device batch size shared by training and evaluation.
BATCH_SIZE = 24

# Collates dataset items into batches using the source tokenizer; the model is passed
# so the collator can prepare decoder inputs when the model supports it.
collator = DataCollatorForSeq2Seq(src_tokenizer, model=model)

arguments = Seq2SeqTrainingArguments(
    output_dir='ossp-v0_3',  # also the local clone of the Hub repo when push_to_hub=True
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=EVAL_SAVE_STEPS,
    save_strategy="steps",
    save_steps=EVAL_SAVE_STEPS,
    num_train_epochs=4,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_ratio=0.1,
    gradient_accumulation_steps=1,
    save_total_limit=1,
    dataloader_num_workers=1,
    fp16=True,  # mixed precision
    load_best_model_at_end=True,
    push_to_hub=True,
    learning_rate = 5e-5
)

trainer = Seq2SeqTrainer(
    model=model,
    args=arguments,
    data_collator=collator,
    train_dataset=dataset,
    eval_dataset=eval_dataset
)

Cloning https://huggingface.co/leadawon/ossp-v0_3 into local empty directory.


In [16]:
# Run fine-tuning; the evaluation/checkpoint cadence comes from `arguments`.
trainer.train()

# Save the in-memory model under the output directory. Since load_best_model_at_end=True
# was set, this is presumably the best evaluated checkpoint rather than the final step — confirm.
model.save_pretrained("ossp-v0_3/best_model")



Step,Training Loss,Validation Loss
10000,0.3999,0.407919
20000,0.4441,0.455541
30000,0.4361,0.437814
40000,0.4302,0.425534
50000,0.4392,0.407628
60000,0.3714,0.40063
70000,0.3694,0.390813
80000,0.3591,0.380976
90000,0.3594,0.376197
100000,0.3567,0.366661




In [17]:
# Upload the trainer's output to the Hugging Face Hub; the recorded output shows the
# push going to the leadawon/ossp-v0_3 repository.
trainer.push_to_hub()

Upload file runs/Apr15_07-08-36_7aeb1107af9b/events.out.tfevents.1681542544.7aeb1107af9b.2538.0:  41%|####1   …

To https://huggingface.co/leadawon/ossp-v0_3
   6996d30..2735f58  main -> main

   6996d30..2735f58  main -> main

To https://huggingface.co/leadawon/ossp-v0_3
   2735f58..590ea18  main -> main

   2735f58..590ea18  main -> main



'https://huggingface.co/leadawon/ossp-v0_3/commit/2735f58d0956308850a445d4dcdf4c64c53cf949'