In [None]:
import torch

print('cuda ',torch.version.cuda,
      '\ndevice ', torch.cuda.get_device_name(0))

In [None]:
!pip install transformers

In [2]:
# import 
from torch.utils.data import Dataset, random_split
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM
import glob


### load model

In [3]:
seed = 42
# seed
torch.manual_seed(seed)
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B", bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B").cuda()
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### load data

In [4]:
train_path = "corpus"

In [5]:
def read_content_title(train_path):
    
    # skip firt row
    path = train_path # folder
    all_files = glob.glob(path + "/*.csv")
    list_ = []

    for filename in all_files:
        df_t = pd.read_csv(filename, index_col=None, header=None, skiprows=1)
        list_.append(df_t)


    frame = pd.concat(list_, axis=0, ignore_index=True)
    print(f"length of data frame: {len(frame)}")
    return frame

In [6]:
class BeautyDataset(Dataset):
    def __init__(self, txt_list, label_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
    
        for txt, label in zip(txt_list, label_list):
            encodings_dict = tokenizer(f'<|startoftext|>Review: {txt}\nSentiment: {label}<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [7]:
df = read_content_title(train_path)
df = df[[0, 1]]
df.columns = ['content', 'title']
#df = df.sample(30000, random_state=1)

max_length = max([len(tokenizer.encode(description)) for description in df['content']])
print("Max length: {}".format(max_length))

Token indices sequence length is longer than the specified maximum sequence length for this model (2253 > 2048). Running this sequence through the model will result in indexing errors


length of data frame: 22
Max length: 2253


In [9]:
dataset = BeautyDataset(df['content'].tolist(), df['title'].tolist(), tokenizer, max_length=max_length)
dataset.__getitem__(5)

(tensor([50257, 14832,    25,  ..., 50258, 50258, 50258]),
 tensor([1, 1, 1,  ..., 0, 0, 0]))

In [10]:

train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])


In [11]:
print(len(dataset))

22


In [None]:
save_dir='./titlegen-gptj'
training_args = TrainingArguments(output_dir=save_dir, num_train_epochs=5, logging_steps=10000, save_steps=10000,
                                  per_device_train_batch_size=6, per_device_eval_batch_size=6,warmup_steps=100, fp16=True,
                                  load_best_model_at_end=True,
                                  weight_decay=0.01, logging_dir=f'./logsj')

In [None]:
trainer=Trainer(model=model, args=training_args, train_dataset=train_dataset,
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])})

trainer.train()

REF : 
https://github.com/dredwardhyde/gpt-neo-fine-tuning-example/blob/main/gpt_j_deepspeed.py

In [None]:
import os

output_dir = './gptj-title'

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`

model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)