In [8]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForCausalLM, TrainingArguments, Trainer, pipeline
import seaborn as sns
import torch

In [2]:
# Load the dataset from the Hugging Face Hub by its repository id.
data = load_dataset(path="avaliev/chat_doctor")

Causal language model

In [35]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")


def _tokenize_outputs(batch):
    """Tokenize the "output" column of one batch."""
    return tokenizer(batch["output"])


# Replace every original column with the tokenizer's output.
tokens = data.map(
    _tokenize_outputs,
    batched=True,
    remove_columns=data["train"].column_names,
)
block_size = 128


def group_texts(examples):
    """Concatenate a batch of token sequences and re-split it into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of sequences (one list per
            example). Must contain an "input_ids" column.

    Returns:
        A dict with the same keys as ``examples`` where each value is a list of
        chunks of ``block_size`` items, plus a "labels" key holding a shallow
        copy of the "input_ids" chunk list.

    Notes:
        When the concatenated length is at least ``block_size``, the trailing
        remainder that does not fill a whole chunk is dropped. A batch shorter
        than ``block_size`` is kept as a single short chunk.
    """
    # Flatten each column in a single pass; `sum(lists, [])` re-copies the
    # accumulated list on every addition and is quadratic in the batch size.
    concatenated_examples = {
        key: [item for sequence in examples[key] for item in sequence]
        for key in examples.keys()
    }
    total_length = len(next(iter(concatenated_examples.values())))
    # We drop the small remainder; padding could be added instead if the model
    # supported it. Customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        key: [tokens[i : i + block_size] for i in range(0, total_length, block_size)]
        for key, tokens in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# Regroup the tokenized splits into fixed-size blocks, one batch at a time.
labelled = tokens.map(function=group_texts, batched=True)

Map: 100%|███████████████████████████████████████████████████████████████| 95588/95588 [00:38<00:00, 2452.42 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████| 11949/11949 [00:04<00:00, 2496.04 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████| 11949/11949 [00:04<00:00, 2491.64 examples/s]


In [7]:
# Use the end-of-sequence token as the padding token so the collator can pad batches.
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
training_args = TrainingArguments(
    output_dir="causal-qa",
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    # Spelled out because the saved checkpoint name below encodes the epoch count.
    num_train_epochs=3,
)
trainer = Trainer(
    model=model,
    args=training_args,
    # Train and evaluate on subsets to keep the run short.
    train_dataset=labelled["train"].take(10000),
    eval_dataset=labelled["validation"].take(2000),
    data_collator=data_collator,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its replacement.
    processing_class=tokenizer,
)
trainer.train()
trainer.save_model("causal-qa-3epochs")

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,3.6392,3.451849
2,3.4196,3.357159
3,3.3559,3.334287


TrainOutput(global_step=3750, training_loss=3.509384716796875, metrics={'train_runtime': 199.8689, 'train_samples_per_second': 150.098, 'train_steps_per_second': 18.762, 'total_flos': 979862814720000.0, 'train_loss': 3.509384716796875, 'epoch': 3.0})

In [16]:
# Load the fine-tuned checkpoint saved by the training cell and sample a continuation.
generator = pipeline(task="text-generation", model="./causal-qa-3epochs")
prompt = "what to do if i have a headache"
generator(prompt)

Device set to use cuda:0


[{'generated_text': 'what to do if i have a headache or any other problem then i would suggest you not to worry much.  Chat Doctor.  Also if you have a similar feeling in motion or feel tired feel free to ask further questions. Hope to have been'}]