## Libraries importeren

In [1]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer
from datasets import Dataset
from sklearn.model_selection import train_test_split
from Google_drive.drive_manager import authorize, upload_naar_drive, download_bestand

## Data pre-processen

In [2]:
# Load the MedQuAD question/answer pairs from the local CSV file.
#
# Rows lacking a question or an answer are dropped *before* the string cast:
# `astype(str)` would otherwise turn a missing value into the literal text
# "nan", which would then be tokenized and trained on like a real sample.
# When no values are missing, the result is identical to a plain cast.
df = (
    pd.read_csv("Data/medquad.csv")
    .dropna(subset=["question", "answer"])
    .astype({"question": str, "answer": str})
)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,question,answer,source,focus_area
0,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma
1,What causes Glaucoma ?,"Nearly 2.7 million people have glaucoma, a lea...",NIHSeniorHealth,Glaucoma
2,What are the symptoms of Glaucoma ?,Symptoms of Glaucoma Glaucoma can develop in ...,NIHSeniorHealth,Glaucoma
3,What are the treatments for Glaucoma ?,"Although open-angle glaucoma cannot be cured, ...",NIHSeniorHealth,Glaucoma
4,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma


#### Train- en test data maken

In [3]:
# Hold out 20% of the rows for evaluation.
#
# The split is seeded so it is identical on every run. Without a fixed
# `random_state`, a model restored from saved weights would be evaluated on a
# different split than the one it was trained on, and rows it saw during
# training could end up in `test_data`.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
print(len(train_data))
print(len(test_data))

13129
3283


#### Laad het model en de tokenizer

In [6]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
component = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pretrained "gpt2" tokenizer and language-model weights; the model is moved
# to the selected device right after loading.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").to(component)

print(f"Model runt op {component}")

Model runt op cpu


#### Data tokenizen en omzetten naar een Hugging Face-dataset

In [7]:
def tokenize_data(data):
    """Tokenize the 'question' and 'answer' fields of `data` to a fixed length.

    Works both for a single example (each field is a string) and for a batch
    (each field is a list of strings), which is what `Dataset.map` passes when
    called with `batched=True`.

    Args:
        data: Mapping with a 'question' and an 'answer' entry.

    Returns:
        A dict with three entries, each padded/truncated to 512 tokens:
          - "input_ids": token ids of the question(s)
          - "labels": token ids of the answer(s)
          - "attention_mask": attention mask of the question(s)

    Side effects:
        Sets `tokenizer.pad_token` to `tokenizer.eos_token` on the shared
        tokenizer, since `padding='max_length'` needs a pad token.
    """
    tokenizer.pad_token = tokenizer.eos_token

    encode_options = dict(padding='max_length', truncation=True, max_length=512)
    tokenized_questions = tokenizer(data['question'], **encode_options)
    tokenized_answers = tokenizer(data['answer'], **encode_options)

    # The tokenizer's list output is returned as-is, without converting to
    # tensors and squeezing: a squeeze would also remove the batch axis when a
    # batch happens to contain exactly one example, so a batched `map` would
    # then receive a flat 512-vector instead of one row.
    #
    # NOTE(review): GPT2LMHeadModel shifts `labels` against the logits of
    # `input_ids` and only ignores label value -100. Here the labels are a
    # separately padded answer sequence, so positions do not line up with the
    # question tokens and pad positions count towards the loss. Confirm this
    # is intended; causal-LM fine-tuning usually feeds question + answer as
    # `input_ids` and uses (masked) copies of them as `labels`.
    return {
        "input_ids": tokenized_questions["input_ids"],
        "labels": tokenized_answers["input_ids"],
        "attention_mask": tokenized_questions["attention_mask"],
    }
    
# Turn each pandas split into a Hugging Face dataset and tokenize it in
# batches; the raw text columns are removed once they have been encoded.
train_dataset = Dataset.from_pandas(train_data).map(
    tokenize_data,
    batched=True,
    remove_columns=["question", "answer"],
)
test_dataset = Dataset.from_pandas(test_data).map(
    tokenize_data,
    batched=True,
    remove_columns=["question", "answer"],
)

# Fixed-order batches of 64 examples over the tokenized test split.
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=64)

Map:   0%|          | 0/13129 [00:00<?, ? examples/s]

Map:   0%|          | 0/3283 [00:00<?, ? examples/s]

## Model training

#### Training loop starten (doe dit alsjeblieft een non-cuda pc niet aan)

Training is vrij zwaar: ongeveer 30 minuten op een RTX 3080. Je kunt de training ook overslaan en in plaats daarvan een opgeslagen model laden (zie "Model laden" hieronder).

In [None]:
# Fine-tuning configuration: 5 epochs, batch size 4 per device, AdamW with a
# cosine learning-rate schedule and 10% warm-up. Evaluation runs after every
# epoch; the Trainer writes no checkpoints (`save_strategy="no"`), the final
# weights are saved explicitly at the end of this cell.
training_args = TrainingArguments(
    output_dir="Getrainde_modellen/HouseMD_model",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    warmup_ratio=0.1,
    # Mixed precision is only requested when a CUDA device is present; a
    # hard-coded `fp16=True` makes this cell fail on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    eval_strategy="epoch",
    save_strategy="no",
    logging_dir="Getrainde_modellen/HouseMD_model/logs",
    logging_steps=50,
    seed=42
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
    )

trainer.train()

# Persist only the weights (state dict) of the fine-tuned model. The private
# `_use_new_zipfile_serialization` argument is not passed: it is an internal
# switch and the zipfile format is already torch.save's default.
torch.save(model.state_dict(), "Getrainde_modellen/HouseMD_model/HouseMD_Model_gewichten.pth")

Epoch,Training Loss,Validation Loss
1,2.9437,3.199668
2,2.8392,3.007763
3,2.8068,2.931106
4,2.6825,2.916143
5,2.4354,2.971014


#### Model uploaden naar drive (omdat het een groot bestand is)

In [3]:
# Obtain an authorized Drive handle, then upload the saved weights file under
# the same file name it has locally.
drive = authorize()

upload_naar_drive(
    drive=drive,
    remote_name="HouseMD_Model_gewichten.pth",
    path="Getrainde_modellen/HouseMD_model/HouseMD_Model_gewichten.pth",
)

Credentials gevonden. Autorisatie succesvol.
Succesvol verbonden met Google Drive!
Bestand 'HouseMD_Model_gewichten.pth' bestaat al. Bestand wordt overschreven...
Bestand 'HouseMD_Model_gewichten.pth' geüpload naar Google Drive met ID: 1DenjgGSXCU-rLcjkgxCFRZCTYyDeNuf-


#### Model laden

In [7]:
# Restore the saved weights into `model`, mapping the tensors onto the device
# selected in `component`.
#
# `weights_only=True` restricts unpickling to tensors and plain containers:
# torch.load is pickle-based, and this file may have been downloaded rather
# than produced on this machine.
model.load_state_dict(
    torch.load(
        "Getrainde_modellen/HouseMD_model/HouseMD_Model_gewichten.pth",
        map_location=component,
        weights_only=True,
    )
)

<All keys matched successfully>

## Model evalueren