In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Data inladen
data = pd.read_json('../../dataset.json')

# Extraheer de benodigde informatie uit geneste structuur
data['visitAmounts'] = data['visit'].apply(lambda x: x['visitAmounts'])
data['service'] = data['visit'].apply(lambda x: x['service'])

# Normaliseer afstand en rijtijd
data['log_distance'] = np.log(data['distance'] + 1)
data['log_driveTime'] = np.log(data['driveTime'] + 1)

# Statistieken voor dynamische drempels
average_visits = data['visitAmounts'].mean()
std_dev_visits = data['visitAmounts'].std()
high_risk_threshold = average_visits + std_dev_visits
medium_risk_threshold = average_visits

# Fraud detection op basis van dynamische drempels
def assign_fraud_risk_based_on_visits(row):
    if row['visitAmounts'] > high_risk_threshold:
        return 'High Risk'
    elif row['visitAmounts'] > medium_risk_threshold:
        return 'Medium Risk'
    else:
        return 'No Risk'

# Fraude risico functie toepassen
data['Fraud Risk'] = data.apply(assign_fraud_risk_based_on_visits, axis=1)
categories = pd.Categorical(data['Fraud Risk'])
data['Fraud Risk Code'] = categories.codes
category_mapping = dict(enumerate(categories.categories))

# Selecteer features en stel de target in
features = ['log_distance', 'log_driveTime', 'visitAmounts']
X = data[features]
y = data['Fraud Risk Code']

# Train en test sets splitsen
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(X, y, data.index, test_size=0.2, random_state=42)

# Preprocessing en model pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), features)
    ])

pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', GradientBoostingClassifier(random_state=0))
])

# Hyperparameter tuning
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__learning_rate': [0.01, 0.05, 0.1],
    'classifier__max_depth': [3, 4, 5],
    'classifier__subsample': [0.8, 0.9, 1.0]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, verbose=1, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Predicties uitvoeren
predictions = grid_search.predict(X_test)

# Bepaal de risicocode voor "Medium Risk" en "High Risk"
medium_risk_code = next(code for code, risk in category_mapping.items() if risk == 'Medium Risk')
high_risk_code = next(code for code, risk in category_mapping.items() if risk == 'High Risk')

# Gefilterde resultaten die alleen High en Medium Risk tonen
predicted_risk_indices = indices_test[np.isin(predictions, [medium_risk_code, high_risk_code])]
risk_data = data.loc[predicted_risk_indices]

# Print de gevonden risicovolle gevallen
for _, row in risk_data.iterrows():
    print(row.to_json())


Fitting 5 folds for each of 81 candidates, totalling 405 fits
{"rijksregisterPatient":93145149872,"rijksregisterNurse":79424894309,"visit":{"id":1965,"visitAmounts":10,"duration":"FIVE_MONTHS","service":"WALKING_ASSISTANCE","nurseLocation":{"latitude":49.892014,"longtitude":2.53947},"patientLocation":{"latitude":51.07179,"longtitude":5.326386}},"visit_timestamp":"2024-11-07T07:22:04Z","driveTime":46,"distance":1.981,"visitAmounts":10,"service":"WALKING_ASSISTANCE","log_distance":1.0922588147,"log_driveTime":3.8501476017,"Fraud Risk":"High Risk","Fraud Risk Code":0}
{"rijksregisterPatient":98292290159,"rijksregisterNurse":89146590450,"visit":{"id":5019,"visitAmounts":8,"duration":"TWELVE_MONTHS","service":"TEMPERATURE_MONITORING","nurseLocation":{"latitude":51.245365,"longtitude":5.330117},"patientLocation":{"latitude":49.60745,"longtitude":5.6064}},"visit_timestamp":"2024-08-29T18:06:06Z","driveTime":20,"distance":3.816,"visitAmounts":8,"service":"TEMPERATURE_MONITORING","log_distance"