## Libraries importeren

In [56]:
import pandas as pd
import numpy as np
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer
from datasets import Dataset
from sklearn.model_selection import train_test_split

## Data pre-processen

In [64]:
df = pd.read_csv("Data/medquad.csv")
df['text'] = '[QUESTION] ' + df['question'] + '[ANSWER] ' + df['answer'] 
df['text'] = df['text'].astype(str)
df.head()

Unnamed: 0,question,answer,source,focus_area,text
0,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma,[QUESTION] What is (are) Glaucoma ?[ANSWER] Gl...
1,What causes Glaucoma ?,"Nearly 2.7 million people have glaucoma, a lea...",NIHSeniorHealth,Glaucoma,[QUESTION] What causes Glaucoma ?[ANSWER] Near...
2,What are the symptoms of Glaucoma ?,Symptoms of Glaucoma Glaucoma can develop in ...,NIHSeniorHealth,Glaucoma,[QUESTION] What are the symptoms of Glaucoma ?...
3,What are the treatments for Glaucoma ?,"Although open-angle glaucoma cannot be cured, ...",NIHSeniorHealth,Glaucoma,[QUESTION] What are the treatments for Glaucom...
4,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma,[QUESTION] What is (are) Glaucoma ?[ANSWER] Gl...


#### Train- en test data maken

In [66]:
train_data, test_data = train_test_split(df, test_size=0.2)
print(len(train_data))
print(len(test_data))

13129
3283


#### Laad het model en de tokenizer

In [67]:
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

#### Data tokenizen en omzetten naar hugging face dataset

In [68]:
def tokenize_data(data):
    tokenizer.pad_token = tokenizer.eos_token
    tokens = tokenizer(list(data['text']), padding='max_length', truncation=True, max_length=512, return_tensors='pt')
    return {
        "input_ids": tokens["input_ids"].squeeze(),
        "attention_mask": tokens["attention_mask"].squeeze()
    }

train_dataset = Dataset.from_pandas(train_data)
train_dataset = train_dataset.map(tokenize_data, batched=True, remove_columns=["text"])
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

test_dataset = Dataset.from_pandas(test_data)
test_dataset = test_dataset.map(tokenize_data, batched=True, remove_columns=["text"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

Map: 100%|██████████| 13129/13129 [00:31<00:00, 415.18 examples/s]
Map: 100%|██████████| 3283/3283 [00:08<00:00, 394.34 examples/s]


## Model trainen

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    seed=42
)

trainer = Trainer(
    model = model,
    args= training_args,
    train_dataset=train_data,
    eval_dataset=test_data
)

trainer.train()


KeyboardInterrupt

