In [None]:
# Mount Google Drive at /content/drive; the corpus and model paths used later
# in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# import
import re
import json
import torch
import random
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset, random_split
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel


## Dataset

In [None]:
# Dataset class
class BeautyDataset(Dataset):
    """Pre-tokenized (content, title) pairs for causal-LM fine-tuning.

    Every pair is rendered into one training string of the form
    ``<s> Title: {title} [SEP] Content: {content}</s>`` and tokenized once,
    up front, in ``__init__``.

    Args:
        txt_list: iterable of content strings.
        label_list: iterable of titles, paired positionally with ``txt_list``.
        tokenizer: callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` entries.
        max_length: length each sequence is truncated / padded to.
    """

    def __init__(self, txt_list, label_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []

        for body, title in zip(txt_list, label_list):
            # Strip every newline from the content before templating.
            body = "".join(body.split("\n"))
            sample = f'<s> Title: {title} [SEP] Content: {body}</s>'

            encoded = tokenizer(
                sample,
                truncation=True,
                max_length=max_length,
                padding="max_length",
            )

            self.input_ids.append(torch.tensor(encoded['input_ids']))
            self.attn_masks.append(torch.tensor(encoded['attention_mask']))

    def __len__(self):
        """Number of encoded samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` tensors of sample ``idx``."""
        ids = self.input_ids[idx]
        mask = self.attn_masks[idx]
        return ids, mask


In [None]:
def read_content_title(path=None):
  """Read every ``*.csv`` file in a folder into one concatenated DataFrame.

  Each file is read with ``header=None`` and its first line skipped, so the
  resulting columns are labelled by position (0, 1, ...). Files that cannot
  be read are reported and skipped.

  Args:
    path: folder containing the CSV files. Defaults to the beauty corpus
      folder on the mounted Google Drive.

  Returns:
    A DataFrame with the rows of all readable files and a fresh integer index.

  Raises:
    ValueError: if no CSV file in ``path`` could be read.
  """
  import glob

  if path is None:
    path = r'/content/drive/MyDrive/General Purpose Web Scraping Tool/rinna_testing/Corpus/beauty_newdataset/all' # folder

  # glob returns files in filesystem order; sort so that the row order (and
  # therefore any seeded sampling done downstream) is reproducible.
  all_files = sorted(glob.glob(glob.escape(path) + "/*.csv"))

  frames = []
  for filename in all_files:
    try:
      frames.append(pd.read_csv(filename, index_col=None, header=None, skiprows=1, encoding='utf-8'))
    except Exception as err:
      # Best effort: skip unreadable files, but say why. Catching Exception
      # (not a bare except) lets KeyboardInterrupt stop the loop.
      print(f"reading error in{filename}: {err!r}")

  if not frames:
    raise ValueError(f"no readable CSV files found in {path}")

  frame = pd.concat(frames, axis=0, ignore_index=True)
  print(f"length of data frame: {len(frame)}")
  return frame

In [None]:
# Data load function
def load_beauty_dataset(tokenizer, random_seed=1, sample_size=5000, max_length=512):
    """Load the corpus, sample it, tokenize it and split it 90/10.

    Args:
        tokenizer: tokenizer used both to measure the content lengths and to
            encode the samples inside ``BeautyDataset``.
        random_seed: seed for the row sampling and for the train/validation
            split, so a given seed always yields the same datasets.
        sample_size: maximum number of rows to draw; capped at the number of
            available rows.
        max_length: length every sample is truncated / padded to.

    Returns:
        ``(train_dataset, val_dataset)`` as produced by ``random_split``.
    """
    df = read_content_title()
    df = df[[0, 1]].rename(columns={0: 'content', 1: 'title'})

    # Empty CSV cells are read as NaN (a float): they would break the text
    # handling in BeautyDataset or end up as the literal title "nan".
    df = df.dropna(subset=['content', 'title']).astype(str)

    # Never request more rows than exist, and honour the caller's seed.
    df = df.sample(min(sample_size, len(df)), random_state=random_seed)

    # Informational only: longest content in tokens. Samples are still
    # truncated / padded to `max_length` below.
    longest = max((len(tokenizer.encode(description)) for description in df['content']), default=0)
    print("Max length: {}".format(longest))

    dataset = BeautyDataset(df['content'].tolist(), df['title'].tolist(), tokenizer, max_length=max_length)

    train_size = int(0.9 * len(dataset))
    # A seeded generator makes the split reproducible for a given random_seed.
    split_rng = torch.Generator().manual_seed(random_seed)
    train_dataset, val_dataset = random_split(
        dataset, [train_size, len(dataset) - train_size], generator=split_rng)
    print(len(dataset))

    # return
    return train_dataset, val_dataset

In [None]:
from transformers import T5Tokenizer, AutoModelForCausalLM, GPT2LMHeadModel

# Load the rinna tokenizer, registering the special tokens used by the
# training template. '<pad>' is passed as the pad token here; the model's
# embedding matrix is resized in a later cell to match len(tokenizer).
tokenizer = T5Tokenizer.from_pretrained("rinna/japanese-gpt2-medium", bos_token='<s>', eos_token='</s>', pad_token='<pad>')
# The rinna model card instructs setting this by hand because the flag is not
# picked up from the tokenizer config on load; without it, upper-case Latin
# letters are not lower-cased before tokenization.
tokenizer.do_lower_case = True

model = AutoModelForCausalLM.from_pretrained("rinna/japanese-gpt2-medium")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# The tokenizer was loaded with an extra special token, so resize the model's
# token embeddings to the tokenizer's current vocabulary size.
model.resize_token_embeddings(len(tokenizer))

Embedding(32001, 1024)

In [None]:
# Build the train/validation datasets. range(1) runs a single trial
# (trial_no == 0); trial_no is passed as load_beauty_dataset's random_seed.
for trial_no in range(1):
  print("Loading dataset...")
  train_dataset, val_dataset = load_beauty_dataset(tokenizer, trial_no)

Loading dataset...
reading error in/content/drive/MyDrive/General Purpose Web Scraping Tool/rinna_testing/Corpus/beauty_newdataset/all/beauty_490820.csv
reading error in/content/drive/MyDrive/General Purpose Web Scraping Tool/rinna_testing/Corpus/beauty_newdataset/all/beauty_442040.csv
reading error in/content/drive/MyDrive/General Purpose Web Scraping Tool/rinna_testing/Corpus/beauty_newdataset/all/beauty_513726.csv
reading error in/content/drive/MyDrive/General Purpose Web Scraping Tool/rinna_testing/Corpus/beauty_newdataset/all/beauty_283249.csv
reading error in/content/drive/MyDrive/General Purpose Web Scraping Tool/rinna_testing/Corpus/beauty_newdataset/all/beauty_693263.csv
reading error in/content/drive/MyDrive/General Purpose Web Scraping Tool/rinna_testing/Corpus/beauty_newdataset/all/beauty_895475.csv
reading error in/content/drive/MyDrive/General Purpose Web Scraping Tool/rinna_testing/Corpus/beauty_newdataset/all/beauty_742192.csv
reading error in/content/drive/MyDrive/Gene

  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated"


5000


In [None]:
print("Start training...")
# NOTE(review): evaluation and checkpointing both happen every 5000 steps. The
# logged run had 7500 optimization steps in total, so only step 5000 was
# evaluated/saved and load_best_model_at_end restored that checkpoint --
# confirm this cadence is intended.
# NOTE(review): learning_rate=0.001 looks high for fine-tuning a pretrained
# model of this size -- confirm.
training_args = TrainingArguments(output_dir=r'/content/drive/MyDrive/Models/title_to_text_generation/gpt2_rinna_epoch_5_data_20000', num_train_epochs=5,
                                logging_steps=5000, load_best_model_at_end=True,
                                save_strategy='steps',
                                evaluation_strategy="steps",
                                save_steps=5000,
                                per_device_train_batch_size=3, per_device_eval_batch_size=3,
                                learning_rate=0.001,
                                warmup_steps=1, weight_decay=0.0001, logging_dir='logs')


def collate_batch(data):
    """Stack (input_ids, attention_mask) pairs into a causal-LM batch.

    Labels are a copy of the input ids with every padded position (attention
    mask 0) set to -100, the index the LM loss ignores. Samples are padded to
    a fixed length, so without this the loss would be computed on the padding
    tokens as well.

    Args:
        data: list of ``(input_ids, attention_mask)`` tensor pairs.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels`` tensors.
    """
    input_ids = torch.stack([f[0] for f in data])
    attention_mask = torch.stack([f[1] for f in data])
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100
    return {'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels}


trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset,
                  eval_dataset=val_dataset, data_collator=collate_batch)
trainer.train()

Start training...


***** Running training *****
  Num examples = 4500
  Num Epochs = 5
  Instantaneous batch size per device = 3
  Total train batch size (w. parallel, distributed & accumulation) = 3
  Gradient Accumulation steps = 1
  Total optimization steps = 7500
  Number of trainable parameters = 336129024


Step,Training Loss,Validation Loss
5000,0.6237,0.827514


***** Running Evaluation *****
  Num examples = 500
  Batch size = 3
Saving model checkpoint to /content/drive/MyDrive/Models/title_to_text_generation/gpt2_rinna_epoch_5_data_20000/checkpoint-5000
Configuration saved in /content/drive/MyDrive/Models/title_to_text_generation/gpt2_rinna_epoch_5_data_20000/checkpoint-5000/config.json
Model weights saved in /content/drive/MyDrive/Models/title_to_text_generation/gpt2_rinna_epoch_5_data_20000/checkpoint-5000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from /content/drive/MyDrive/Models/title_to_text_generation/gpt2_rinna_epoch_5_data_20000/checkpoint-5000 (score: 0.8275141716003418).


TrainOutput(global_step=7500, training_loss=0.47258819580078126, metrics={'train_runtime': 9198.2277, 'train_samples_per_second': 2.446, 'train_steps_per_second': 0.815, 'total_flos': 2.089576562688e+16, 'train_loss': 0.47258819580078126, 'epoch': 5.0})

In [None]:
import os

output_dir = '/content/drive/MyDrive/Models/title_to_text_generation/gpt2_rinna_epoch_5_data_20000'

# Write the model (weights + config) and the tokenizer files into output_dir
# with save_pretrained(); both can be restored later via from_pretrained().
# If the model object exposes a `.module` attribute, that inner object is the
# one saved; otherwise the model itself is.
model_to_save = getattr(model, 'module', model)
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Configuration saved in /content/drive/MyDrive/Models/title_to_text_generation/gpt2_rinna_epoch_5_data_20000/config.json
Model weights saved in /content/drive/MyDrive/Models/title_to_text_generation/gpt2_rinna_epoch_5_data_20000/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Models/title_to_text_generation/gpt2_rinna_epoch_5_data_20000/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Models/title_to_text_generation/gpt2_rinna_epoch_5_data_20000/special_tokens_map.json
added tokens file saved in /content/drive/MyDrive/Models/title_to_text_generation/gpt2_rinna_epoch_5_data_20000/added_tokens.json


('/content/drive/MyDrive/Models/title_to_text_generation/gpt2_rinna_epoch_5_data_20000/tokenizer_config.json',
 '/content/drive/MyDrive/Models/title_to_text_generation/gpt2_rinna_epoch_5_data_20000/special_tokens_map.json',
 '/content/drive/MyDrive/Models/title_to_text_generation/gpt2_rinna_epoch_5_data_20000/spiece.model',
 '/content/drive/MyDrive/Models/title_to_text_generation/gpt2_rinna_epoch_5_data_20000/added_tokens.json')

In [None]:
# Reload the fine-tuned model from the directory it was saved to above,
# replacing the in-memory `model`.
model = AutoModelForCausalLM.from_pretrained("/content/drive/MyDrive/Models/title_to_text_generation/gpt2_rinna_epoch_5_data_20000")

loading configuration file /content/drive/MyDrive/Models/title_to_text_generation/gpt2_rinna_epoch_5_data_20000/config.json
Model config GPT2Config {
  "_name_or_path": "/content/drive/MyDrive/Models/title_to_text_generation/gpt2_rinna_epoch_5_data_20000",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 1,
  "embd_pdrop": 0.1,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1024,
  "n_head": 16,
  "n_inner": 4096,
  "n_layer": 24,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
     

In [None]:
text = '【ブローチェ　アヴェダ】また、ピンクブラウンにはメリットが多いのも魅力の一つ。'
# Mirror the training template ('<s> Title: ... [SEP] Content: ...') exactly;
# the model was never shown a newline between title and content.
prompt = f'Title: {text} [SEP] Content:'
# add_special_tokens=False stops the tokenizer from appending '</s>' to the
# prompt, which would otherwise ask the model to continue after end-of-sequence.
encoded = tokenizer(f"<s> {prompt}", return_tensors="pt", add_special_tokens=False).to(model.device)
generated = encoded.input_ids
# Greedy decoding (do_sample=False): the sampling knobs top_k / top_p /
# temperature do not apply and are omitted. attention_mask and pad_token_id
# are passed explicitly so generate() does not have to guess them.
sample_outputs = model.generate(generated, attention_mask=encoded.attention_mask,
                                do_sample=False, max_length=512, num_return_sequences=1,
                                pad_token_id=tokenizer.pad_token_id)
pred_text = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)
           

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


In [None]:
# Show the decoded output sequence (the prompt followed by the generated text).
print(pred_text)

itle: 【ブローチェ アヴェダ】また、ピンクブラウンにはメリットが多いのも魅力の一つ。 Content:】ブリーチなしでも発色しやすいピンクブラウンカラー。 ブリーチなしでも発色しやすいピンクブラウンは、ミルクティーのようなカラーに色味が変わります。 また、ピンクブラウンは赤みがかったり、透明感がでたりと、色味がマイルドなのが特徴です。 ブリーチなしでも発色しやすいピンクブラウンは、以下の記事で確認してみましょう!関連記事はこちらブリーチなしで作る?取り入れたいピンクカラーのオーダー方法とおすすめスタイル集ピンクカラーは、ピンクとベージュの定番色。 ブリーチなしでも発色しやすいピンクカラーは、今季のトレンドカラーなんです♡ 今回はそんなピンクカラーのレングス別・明るさ別に大特集!色決め手したいブリーチいらずのおすすめスタイルを紹介していきます。


ref: https://qiita.com/m__k/items/36875fedf8ad1842b729