In [1]:
import os
import sys
# Root of the local opengpt checkout. Export OPENGPT_ROOT to point somewhere else
# instead of editing this cell; the default is the original hardcoded location.
OPENGPT_ROOT = os.environ.get("OPENGPT_ROOT", "/data/zeljko/projects/opengpt/")
if OPENGPT_ROOT not in sys.path:  # re-running this cell must not stack duplicate entries
    sys.path.insert(0, OPENGPT_ROOT)

# Hugging Face cache locations, set here before the transformers/datasets imports
# in the next cell. setdefault keeps a value that is already exported in the
# environment and only falls back to the original hardcoded directory otherwise.
HF_CACHE_DIR = "/data/zeljko/.cache/huggingface"
os.environ.setdefault('HF_DATASETS_CACHE', HF_CACHE_DIR)
os.environ.setdefault('TRANSFORMERS_CACHE', HF_CACHE_DIR)

# IPython autoreload in mode 2: imported modules are reloaded before each cell
# runs, so edits to the local opengpt package are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, pipeline
import pickle
import pandas as pd
import datasets


from opengpt.config import Config
from opengpt.model_utils import add_tokens_to_model_and_tokenizer
from opengpt.dataset_utils import create_labels, pack_examples
from opengpt.data_collator import DataCollatorWithPadding

2023-05-08 18:13:35.960625: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Read the training configuration, then load the tokenizer and the causal LM it names
config = Config(yaml_path='../configs/example_train_config.yaml')
base_checkpoint = config.train.model

tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
# The tokenizer's maximum length follows the configured training sequence length
tokenizer.model_max_length = config.train.max_seq_len

model = AutoModelForCausalLM.from_pretrained(base_checkpoint)

In [5]:
# Project helper from opengpt.model_utils, called with the config, tokenizer and model.
# NOTE(review): presumably registers the special tokens used by the prompt format
# (e.g. <|user|>, <|ai|>, <|eos|>) and resizes the model embeddings to match —
# confirm against opengpt.model_utils.
add_tokens_to_model_and_tokenizer(config, tokenizer, model)



### Load data

In [8]:
# Fixed seed so the shuffled order (and therefore the packing done later on)
# is the same on every Restart & Run All; change it to draw a different order.
SHUFFLE_SEED = 42

# Load the training CSV file(s) listed in the config
train_dataset = datasets.Dataset.from_csv(config.train.datasets)
if config.train.shuffle_dataset:
    print("Shuffling dataset!")
    # Without a seed, Dataset.shuffle draws fresh entropy and is not reproducible
    train_dataset = train_dataset.shuffle(seed=SHUFFLE_SEED)

Shuffling dataset


#### Remove all columns that we do not need. If the dataset has to be filtered, do it before the columns are removed.

In [9]:
# Keep only the 'text' column; every other column is dropped
all_columns = list(train_dataset.column_names)
text_pos = all_columns.index('text')  # raises ValueError when there is no 'text' column
to_remove = all_columns[:text_pos] + all_columns[text_pos + 1:]
train_dataset = train_dataset.remove_columns(to_remove)

In [11]:
def _tokenize_text(examples):
    """Tokenize a batch of 'text' entries without adding special tokens."""
    return tokenizer(examples['text'], add_special_tokens=False)


def _add_labels(examples):
    """Pass a tokenized batch through the project's create_labels helper."""
    return create_labels(examples, config, tokenizer)


def _pack(examples):
    """Pass a batch through the project's pack_examples helper using the configured length and packing type."""
    return pack_examples(
        examples,
        config.train.max_seq_len,
        packing_type=config.train.packing_type,
    )


# Ignore max_seq_len warning, it is handled by the packer or data_collator
train_dataset = train_dataset.map(
    _tokenize_text,
    batched=True,
    num_proc=1,
    remove_columns=["text"],
)

# Settings shared by the two batched passes below
batch_map_kwargs = dict(batched=True, batch_size=1000, num_proc=1)

# Create labels
train_dataset = train_dataset.map(_add_labels, **batch_map_kwargs)

# We only do packing for the train set
train_dataset = train_dataset.map(_pack, **batch_map_kwargs)

Map:   0%|          | 0/29660 [00:00<?, ? examples/s]

In [13]:
# Hugging Face training arguments come straight from the config
hf_args = config.train.hf_training_arguments.to_dict()
training_args = TrainingArguments(**hf_args)

# Project collator, given the pad token id, the configured ignore index and the max sequence length
dc = DataCollatorWithPadding(
    tokenizer.pad_token_id,
    config.train.ignore_index,
    max_seq_len=config.train.max_seq_len,
)

# Training only: no evaluation set is passed
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=None, data_collator=dc)

In [16]:
# Show the dataset summary (features and row count) after tokenization, labelling and packing
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 8771
})

In [17]:
# Run training with the Trainer built earlier; the returned TrainOutput is shown as the cell result
trainer.train()



Step,Training Loss
100,1.6505
200,1.4897
300,1.4459
400,1.4165
500,1.3998


TrainOutput(global_step=548, training_loss=1.4748950457050853, metrics={'train_runtime': 388.7346, 'train_samples_per_second': 22.563, 'train_steps_per_second': 1.41, 'total_flos': 1931665648896000.0, 'train_loss': 1.4748950457050853, 'epoch': 1.0})

# Test Generation

In [18]:
# Text-generation pipeline around the trained model, placed on the device the model is on
gen = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    device=model.device,
)

In [19]:
# Prompt for the generation test. The format with special tokens is required, because of training.
t = "<|user|> What is diabetes? <|eos|> <|ai|>"

In [39]:
# Temperature is important, and depending on your model different values will be good (this one is for gpt-2)
generations = gen(t, do_sample=True, max_length=128, temperature=0.2)
# The pipeline returns a list of results; print the text of the first one
print(generations[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50267 for open-end generation.


<|user|> What is diabetes? <|eos|> <|ai|> Diabetes is a condition in which the body's insulin levels are too low, which can lead to high blood sugar levels.
References:
- https://www.nhs.uk/conditions/diabetes/ 
