In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline  # Gebruik pipeline van imblearn voor SMOTE integratie

# Data inladen
data = pd.read_json('../../dataset.json')

# Pas data aan naar nieuwe kolomnaam
data['service'] = data['visit'].apply(lambda x: x['service'])

# Feature engineering
data['log_distance'] = np.log(data['distance'] + 1)  # Normalisatie van de afstand
data['log_driveTime'] = np.log(data['driveTime'] + 1)  # Normalisatie van rijtijd

# Selecteer features en target
features = ['rijksregisterNurse', 'log_distance', 'log_driveTime']
X = data[features]
y = data['service'].astype('category').cat.codes  # Converteer service types naar numerieke codes

# Train en test sets splitsen
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Column transformer voor het toepassen van verschillende preprocessing op verschillende kolommen
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['log_distance', 'log_driveTime']),
        ('cat', OneHotEncoder(), ['rijksregisterNurse'])
    ])

# Pipeline voor preprocessing en model
pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),  # Voeg SMOTE toe om de dataset te balanceren
    ('gbm', GradientBoostingClassifier(random_state=0))
])

# Hyperparameter tuning
param_grid = {
    'gbm__n_estimators': [100, 200, 300],
    'gbm__learning_rate': [0.01, 0.05, 0.1],
    'gbm__max_depth': [3, 4, 5],
    'gbm__subsample': [0.8, 0.9, 1.0]  # Voeg subsample toe voor stochastic gradient boosting
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, verbose=1, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Beste model
best_model = grid_search.best_estimator_

# Predicties en evaluatie
predictions = best_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))


Fitting 5 folds for each of 81 candidates, totalling 405 fits


Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
           ^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 480, in predict
    Xt = transform.transform(Xt)
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\ProgramData\anacond