## Libraries importeren

In [1]:
import torch
import pandas as pd
import numpy as np
import math
from tqdm import tqdm
from torch.utils.data import DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import Dataset
from sklearn.model_selection import train_test_split

## Data pre-processen

In [2]:
# Read the MedQuAD table and cast the three text columns used downstream
# (question, answer, focus_area) to str in a single astype call.
df = pd.read_csv("Data/medquad.csv").astype(
    {'question': str, 'answer': str, 'focus_area': str}
)
df.head()

Unnamed: 0,question,answer,source,focus_area
0,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma
1,What causes Glaucoma ?,"Nearly 2.7 million people have glaucoma, a lea...",NIHSeniorHealth,Glaucoma
2,What are the symptoms of Glaucoma ?,Symptoms of Glaucoma Glaucoma can develop in ...,NIHSeniorHealth,Glaucoma
3,What are the treatments for Glaucoma ?,"Although open-angle glaucoma cannot be cured, ...",NIHSeniorHealth,Glaucoma
4,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma


#### Train- en test data maken

In [3]:
# Hold out 20% of the rows for evaluation. The split is seeded so that a
# Restart & Run All reproduces the same train/test partition (and therefore
# comparable metrics); 42 matches the seed passed to TrainingArguments.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
print(len(train_data))
print(len(test_data))

13129
3283


#### Laad het model en de tokenizer

In [4]:
# Select the device first: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    component = torch.device("cuda")
else:
    component = torch.device("cpu")

# Load the pretrained "gpt2" checkpoint (moved straight onto the chosen
# device) together with its tokenizer.
model = GPT2LMHeadModel.from_pretrained("gpt2").to(component)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
print(f"Model runt op {component}")

Model runt op cpu


#### Data tokenizen en omzetten naar een Hugging Face-dataset

In [5]:
def tokenize_data(data):
    """Tokenize the question, answer and focus-area text of a batch of rows.

    Intended as the function passed to ``Dataset.map``. Questions and answers
    are padded/truncated to 512 tokens, focus areas to 128 tokens.

    Args:
        data: Mapping with ``'question'``, ``'answer'`` and ``'focus_area'``
            entries. With ``batched=True`` each entry is a list of strings;
            a single un-batched example (plain strings) is also accepted.

    Returns:
        dict with:
            ``input_ids``      - token ids of the question,
            ``attention_mask`` - attention mask of the question,
            ``labels``         - token ids of the answer,
            ``focus_area_ids`` - token ids of the focus area.
        For a batch every value keeps its leading batch axis; for a single
        example that axis is removed.

    Side effects:
        Sets ``tokenizer.pad_token`` to ``tokenizer.eos_token`` on the
        module-level tokenizer.
    """
    # Padding to max_length needs a pad token; EOS is reused for that.
    tokenizer.pad_token = tokenizer.eos_token
    tokenized_questions = tokenizer(data['question'], padding='max_length', truncation=True, max_length=512, return_tensors='pt')
    tokenized_answers = tokenizer(data['answer'], padding='max_length', truncation=True, max_length=512, return_tensors='pt')

    tokenized_focus_areas = tokenizer(data['focus_area'], padding='max_length', truncation=True, max_length=128, return_tensors='pt')

    # Only strip the leading axis for a single un-batched example. A bare
    # .squeeze() would also strip it from a batch that happens to contain
    # exactly one row, turning one example of N tokens into N one-token rows.
    single_example = isinstance(data['question'], str)

    def _finalize(tensor):
        return tensor.squeeze(0) if single_example else tensor

    # NOTE(review): the labels are the answer tokens, positionally unrelated
    # to the question tokens in input_ids, and padded positions hold the EOS
    # id rather than an ignore index - confirm this is the intended objective.
    return {
        "input_ids": _finalize(tokenized_questions["input_ids"]),
        "labels": _finalize(tokenized_answers["input_ids"]),
        "focus_area_ids": _finalize(tokenized_focus_areas["input_ids"]),
        "attention_mask": _finalize(tokenized_questions["attention_mask"])
    }
    
def _to_tokenized_dataset(frame):
    """Turn a pandas frame into a Hugging Face Dataset of tokenized columns.

    Args:
        frame: DataFrame with the columns ``tokenize_data`` reads.

    Returns:
        Dataset containing only the columns produced by ``tokenize_data``.
    """
    dataset = Dataset.from_pandas(frame)
    # Remove every source column rather than a hand-written list, so leftovers
    # such as "Unnamed: 0" (and presumably the pandas index column that
    # from_pandas carries over - TODO confirm) do not end up in the batches.
    return dataset.map(tokenize_data, batched=True, remove_columns=dataset.column_names)


train_dataset = _to_tokenized_dataset(train_data)
test_dataset = _to_tokenized_dataset(test_data)

# Batches of 64 test rows, in fixed order, for the manual evaluation loop.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=data_collator)

Map:   0%|          | 0/13129 [00:00<?, ? examples/s]

Map:   0%|          | 0/3283 [00:00<?, ? examples/s]

## Model training

#### Training loop starten (voer dit alsjeblieft niet uit op een pc zonder CUDA)

Training is vrij zwaar, ongeveer 30 minuten met een RTX 3080. Je kan ook de training eventueel overslaan en een opgeslagen model laden.

In [6]:
# Fine-tuning configuration: 5 epochs, batch size 4, AdamW with a cosine
# schedule (10% warmup), evaluation after every epoch, no intermediate
# checkpoints (the weights are saved manually after training).
training_args = TrainingArguments(
    output_dir="Getrainde_modellen/HouseMD_model",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    warmup_ratio=0.1,
    # Mixed precision is only requested when a CUDA device is present, so the
    # cell does not fail outright on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    eval_strategy="epoch",
    save_strategy="no",
    logging_dir="Getrainde_modellen/HouseMD_model/logs",
    logging_steps=50,
    seed=42
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
    )

trainer.train()
# Persist only the state dict; the "Model laden" cell restores it from this path.
torch.save(model.state_dict(), "Getrainde_modellen/HouseMD_model/HouseMD_Model_gewichten.pth")

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,3.391,3.207007
2,3.1924,3.061324
3,2.9981,2.97979
4,2.8702,2.977105
5,2.4203,3.025535


#### Model laden

In [6]:
# Restore the fine-tuned weights onto the active device. `component` is
# already a torch.device, so it is passed to map_location directly.
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state dict needs and avoids executing arbitrary pickled code.
model.load_state_dict(
    torch.load(
        "Getrainde_modellen/HouseMD_model/HouseMD_Model_gewichten.pth",
        map_location=component,
        weights_only=True,
    )
)

<All keys matched successfully>

## Model evaluatie

Oprecht de meest inefficiënte code die ik in een tijdje heb geschreven, maar het werkt, weet je.

In [7]:
# Score how often the token the model predicts at sequence position 0 equals
# the first token of the tokenized focus area.
# NOTE(review): this compares the model's next-token prediction after the
# first question token with the first focus-area token - confirm that this is
# the metric that was intended.
model.eval()
correct_voorspeld = 0
totaal_voorspeld = 0

with torch.no_grad():
    for data in tqdm(test_loader, desc="Evalueren", unit="batch"):

        # Only the logits at position 0 are used below. GPT-2 attention is
        # causal, so those logits depend on the first token alone; feeding
        # just that token avoids a full-length forward pass per row.
        input_ids = data["input_ids"][:, :1].to(component)
        focus_area_labels = data["focus_area_ids"].to(component)

        output = model(input_ids)
        logits = output.logits

        focus_token_pred = logits[:, 0, :]
        predicted_tokens_ids = torch.argmax(focus_token_pred, dim=-1)

        focus_token_true = focus_area_labels[:, 0]

        correct_voorspeld += (predicted_tokens_ids == focus_token_true).sum().item()
        totaal_voorspeld += focus_area_labels.size(0)

# Guard against an empty loader so the cell reports 0 instead of raising
# ZeroDivisionError.
accuratie = correct_voorspeld / totaal_voorspeld * 100 if totaal_voorspeld else 0.0
print(f"Model accuratie op focus area: {accuratie:.2f}")

Evalueren:   0%|          | 0/52 [00:39<?, ?batch/s]


KeyboardInterrupt: 