# Init

In [1]:
# Import utils
import numpy as np
import pandas as pd
import math
import time
import json
import pyreadr
import pickle
from joblib import dump, load, Parallel, delayed
import os
import copy
import datetime as dt
from tqdm import tqdm


# Import ML models
import sklearn
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit

from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

from dddex.levelSetKDEx_univariate import LevelSetKDEx, LevelSetKDEx_NN
from dddex.wSAA import RandomForestWSAA, SampleAverageApproximation

# Weights & Biases
import wandb

# Import Gurobi
import gurobipy as gp
from gurobipy import GRB

# Optimization Module
from DataDrivenPatientScheduling import WeightsModel
from DataDrivenPatientScheduling import Experiment

In [2]:
# Global experiment configuration: data locations and optimization settings
experiment_setup = {

    # Directories for input data and result files
    'path_data': '/home/fesc/dddex/PatientScheduling/Data',
    'path_results': '/home/fesc/dddex/PatientScheduling/Data/Results',

    # Settings of the scheduling optimization
    'optimization_params': {

        # Solver settings handed to Gurobi
        'gurobi_params': {'LogToConsole': 0, 'Threads': 2},

        # Cost settings; in each entry CR equals c_waiting_time / (c_waiting_time + c_overtime)
        'cost_params': [
            {'CR': 0.10, 'c_waiting_time': 1, 'c_overtime': 9},
            {'CR': 0.25, 'c_waiting_time': 1, 'c_overtime': 3},
            {'CR': 0.50, 'c_waiting_time': 1, 'c_overtime': 1},
            {'CR': 0.75, 'c_waiting_time': 3, 'c_overtime': 1},
            {'CR': 0.90, 'c_waiting_time': 9, 'c_overtime': 1},
        ],

        # Number of scenarios
        'K': 10**3,

        # Time budget multiplier
        'alpha': 1.25,

        # Number of parallel jobs
        'n_jobs': 32,
    },
}

# Expose path_data, path_results and optimization_params as notebook variables
globals().update(experiment_setup)

# Model: LSx - LGBM

In [None]:
# Model configuration: LevelSetKDEx density estimator on top of a LightGBM point estimator
model_setup = {

    # Identifier, also used in the results file name
    'model_name': 'LSx_LGBM',

    # Point estimator
    'point_estimator_params': {

        # Constructor arguments of the regressor
        'model_params': {'random_state': 12345, 'n_jobs': 4, 'verbose': -1},

        # Arguments steering the hyper-parameter search
        'tuning_params': {
            'n_iter': 200,
            'scoring': {'MSE': 'neg_mean_squared_error'},
            'return_train_score': True,
            'refit': 'MSE',
            'random_state': 12345,
            'n_jobs': 8,
            'verbose': 0,
        },

        # Candidate values per hyper-parameter
        'hyper_params_grid': {
            'num_leaves': list(range(5, 500, 5)),
            'max_depth': [-1, *range(2, 11)],
            'min_child_samples': list(range(5, 500, 5)),
            'learning_rate': [pct/100 for pct in range(5, 25, 5)],
            'n_estimators': [100, 200, 300, 400, 500],
            'subsample': [pct/100 for pct in range(5, 100, 5)],
            'colsample_bytree': [pct/100 for pct in range(5, 100, 5)],
        },
    },

    # Density estimator
    'density_estimator_params': {

        # Constructor arguments
        'model_params': {'weightsByDistance': False},

        # Tuning arguments: sorted, de-duplicated union of nine named quantile levels and 0.01, ..., 0.99
        'tuning_params': {
            'probs': sorted(set(np.concatenate([
                np.array([0.005, 0.025, 0.165, 0.25, 0.50, 0.835, 0.75, 0.975, 0.995]),
                np.arange(1, 100, 1) / 100,
            ]))),
            'n_jobs': 8,
        },

        # Candidate bin sizes
        'hyper_params_grid': {'binSize': list(range(25, 500, 25))},
    },
}

# Expose model_name, point_estimator_params and density_estimator_params as notebook variables
globals().update(model_setup)

## Preprocessing

In [None]:
# Load targets, features and row identifiers from the configured data directory
# (path_data comes from experiment_setup, so the location is defined in one place only)
Y_data = pd.read_csv(os.path.join(path_data, 'Y_data.csv'))
X_data = pd.read_csv(os.path.join(path_data, 'X_data.csv'))
ID_data = pd.read_csv(os.path.join(path_data, 'ID_data.csv'))

In [None]:
# Keep only the rows flagged as 'train' in ID_data (targets flattened to 1-D)
ID_train = ID_data.loc[ID_data['train_test'] == 'train']
X_train = np.array(X_data.loc[ID_data['train_test'] == 'train'])
y_train = np.array(Y_data.loc[ID_data['train_test'] == 'train']).flatten()

In [None]:
# Three time-ordered CV folds over the positional indices of the training rows
tscv = TimeSeriesSplit(n_splits=3)
cv_folds = list(tscv.split(range(len(ID_train))))

In [None]:
# Helper objects from the DataDrivenPatientScheduling module (tuning and experiment runner)
weightsmodel, experiment = WeightsModel(), Experiment()

## Tune point estimator

In [None]:
# Expose model_params, tuning_params and hyper_params_grid of the point estimator
# NOTE: these names are rebound again by the density-estimator cell
globals().update(point_estimator_params)

# Tune a fresh LightGBM regressor by random search over the grid
# (presumably returns the tuned estimator; confirm in WeightsModel)
point_estimator = weightsmodel.tune_point_estimator(
    X_train, y_train, LGBMRegressor(**model_params), cv_folds,
    hyper_params_grid, tuning_params, random_search=True, print_time=True,
)

## Tune density estimator

In [None]:
# Expose model_params, tuning_params and hyper_params_grid of the density estimator
globals().update(density_estimator_params)

# Wrap the tuned point estimator in LevelSetKDEx and tune it over the full bin-size grid
density_estimator = weightsmodel.tune_density_estimator(
    X_train, y_train, LevelSetKDEx(estimator=point_estimator, **model_params), cv_folds,
    hyper_params_grid, tuning_params, random_search=False, print_time=True,
)

## Data-driven optimization

In [None]:
# Expose gurobi_params, cost_params, K, alpha and n_jobs as notebook variables
locals().update(optimization_params)

# Unique test dates in ascending order
dates = pd.Series(list(set(ID_data.loc[ID_data.train_test == 'test', 'date']))).sort_values()

# Convert the data once; inside the generator below this would be repeated for every date
X_full, y_full = np.array(X_data), np.array(Y_data)

# Timer
start_time, st_exec, st_cpu = dt.datetime.now().replace(microsecond=0), time.time(), time.process_time()

# Run the experiment for each date in the test horizon in parallel
with experiment.tqdm_joblib(tqdm(desc='Progress', total=len(dates))):
    results = Parallel(n_jobs=n_jobs)(delayed(experiment.run)(
        X = X_full,
        y = y_full,
        date = date,
        dates = ID_data['date'],
        areas = ID_data['area'],
        weightsModel = density_estimator,
        cost_params = cost_params,
        gurobi_params = gurobi_params,
        K = K,
        alpha = alpha,
        print_status = False) for date in dates)

# Finalize: stack the per-date result frames
results = pd.concat(results).reset_index(drop=True)

# Save results (file name carries model name and number of scenarios)
results.to_csv(f"{path_results}/{model_name}_K{K}.csv", sep=',', index=False)

# Time
print('Time:', dt.datetime.now().replace(microsecond=0) - start_time)
print('>> Execution time:', np.around(time.time()-st_exec, 0), "seconds")
print('>> CPU time:', np.around(time.process_time()-st_cpu, 0), "seconds")

# Model: LSx NN - LGBM

In [None]:
# Model configuration: LevelSetKDEx_NN density estimator on top of a LightGBM point estimator
model_setup = {

    # Identifier, also used in the results file name
    'model_name': 'LSx_LGBM_NN',

    # Point estimator
    'point_estimator_params': {

        # Constructor arguments of the regressor
        'model_params': {'random_state': 12345, 'n_jobs': 4, 'verbose': -1},

        # Arguments steering the hyper-parameter search
        'tuning_params': {
            'n_iter': 200,
            'scoring': {'MSE': 'neg_mean_squared_error'},
            'return_train_score': True,
            'refit': 'MSE',
            'random_state': 12345,
            'n_jobs': 8,
            'verbose': 0,
        },

        # Candidate values per hyper-parameter
        'hyper_params_grid': {
            'num_leaves': list(range(5, 500, 5)),
            'max_depth': [-1, *range(2, 11)],
            'min_child_samples': list(range(5, 500, 5)),
            'learning_rate': [pct/100 for pct in range(5, 25, 5)],
            'n_estimators': [100, 200, 300, 400, 500],
            'subsample': [pct/100 for pct in range(5, 100, 5)],
            'colsample_bytree': [pct/100 for pct in range(5, 100, 5)],
        },
    },

    # Density estimator
    'density_estimator_params': {

        # Constructor arguments
        'model_params': {'weightsByDistance': True},

        # Tuning arguments: sorted, de-duplicated union of nine named quantile levels and 0.01, ..., 0.99
        'tuning_params': {
            'probs': sorted(set(np.concatenate([
                np.array([0.005, 0.025, 0.165, 0.25, 0.50, 0.835, 0.75, 0.975, 0.995]),
                np.arange(1, 100, 1) / 100,
            ]))),
            'n_jobs': 8,
        },

        # Candidate bin sizes
        'hyper_params_grid': {'binSize': list(range(25, 500, 25))},
    },
}

# Expose model_name, point_estimator_params and density_estimator_params as notebook variables
globals().update(model_setup)

## Preprocessing

In [None]:
# Load targets, features and row identifiers from the configured data directory
# (path_data comes from experiment_setup, so the location is defined in one place only)
Y_data = pd.read_csv(os.path.join(path_data, 'Y_data.csv'))
X_data = pd.read_csv(os.path.join(path_data, 'X_data.csv'))
ID_data = pd.read_csv(os.path.join(path_data, 'ID_data.csv'))

In [None]:
# Keep only the rows flagged as 'train' in ID_data (targets flattened to 1-D)
ID_train = ID_data.loc[ID_data['train_test'] == 'train']
X_train = np.array(X_data.loc[ID_data['train_test'] == 'train'])
y_train = np.array(Y_data.loc[ID_data['train_test'] == 'train']).flatten()

In [None]:
# Three time-ordered CV folds over the positional indices of the training rows
tscv = TimeSeriesSplit(n_splits=3)
cv_folds = list(tscv.split(range(len(ID_train))))

In [None]:
# Helper objects from the DataDrivenPatientScheduling module (tuning and experiment runner)
weightsmodel, experiment = WeightsModel(), Experiment()

## Tune point estimator

In [None]:
# Expose model_params, tuning_params and hyper_params_grid of the point estimator
# NOTE: these names are rebound again by the density-estimator cell
globals().update(point_estimator_params)

# Tune a fresh LightGBM regressor by random search over the grid
# (presumably returns the tuned estimator; confirm in WeightsModel)
point_estimator = weightsmodel.tune_point_estimator(
    X_train, y_train, LGBMRegressor(**model_params), cv_folds,
    hyper_params_grid, tuning_params, random_search=True, print_time=True,
)

## Tune density estimator

In [None]:
# Expose model_params, tuning_params and hyper_params_grid of the density estimator
globals().update(density_estimator_params)

# Wrap the tuned point estimator in LevelSetKDEx_NN and tune it over the full bin-size grid
density_estimator = weightsmodel.tune_density_estimator(
    X_train, y_train, LevelSetKDEx_NN(estimator=point_estimator, **model_params), cv_folds,
    hyper_params_grid, tuning_params, random_search=False, print_time=True,
)

## Data-driven optimization

In [None]:
# Expose gurobi_params, cost_params, K, alpha and n_jobs as notebook variables
locals().update(optimization_params)

# Unique test dates in ascending order
dates = pd.Series(list(set(ID_data.loc[ID_data.train_test == 'test', 'date']))).sort_values()

# Convert the data once; inside the generator below this would be repeated for every date
X_full, y_full = np.array(X_data), np.array(Y_data)

# Timer
start_time, st_exec, st_cpu = dt.datetime.now().replace(microsecond=0), time.time(), time.process_time()

# Run the experiment for each date in the test horizon in parallel
with experiment.tqdm_joblib(tqdm(desc='Progress', total=len(dates))):
    results = Parallel(n_jobs=n_jobs)(delayed(experiment.run)(
        X = X_full,
        y = y_full,
        date = date,
        dates = ID_data['date'],
        areas = ID_data['area'],
        weightsModel = density_estimator,
        cost_params = cost_params,
        gurobi_params = gurobi_params,
        K = K,
        alpha = alpha,
        print_status = False) for date in dates)

# Finalize: stack the per-date result frames
results = pd.concat(results).reset_index(drop=True)

# Save results (file name carries model name and number of scenarios)
results.to_csv(f"{path_results}/{model_name}_K{K}.csv", sep=',', index=False)

# Time
print('Time:', dt.datetime.now().replace(microsecond=0) - start_time)
print('>> Execution time:', np.around(time.time()-st_exec, 0), "seconds")
print('>> CPU time:', np.around(time.process_time()-st_cpu, 0), "seconds")

# Model: wSAA - RF

In [None]:
# Setup the model: weighted SAA based on a random forest
model_setup = dict(

    # Model identifier, also used in the results file name
    model_name = 'wSAA_RF',

    ## Density estimator
    density_estimator_params = dict(

        # Meta parameters (constructor arguments)
        model_params = {
            'random_state': 12345,
            'n_jobs': 4,
            'verbose': 0
        },

        # Tuning meta params
        tuning_params = {
            # Sorted, de-duplicated union of nine named quantile levels and 0.01, ..., 0.99
            'probs': sorted(list(set(np.concatenate([np.array([0.005, 0.025, 0.165, 0.25, 0.50, 0.835, 0.75, 0.975, 0.995]),
                                                     np.arange(1, 100, 1) / 100])))),
            'nIter': 100,
            'random_state': 12345,
            'n_jobs': 16
        },

        # Hyper params search grid
        hyper_params_grid = {
            'n_estimators': [100, 200, 300, 400, 500],
            # NOTE(review): 2 is absent from max_depth; confirm that this is intended
            'max_depth': [1, 3, 4, 5, 6, 7, 8, 9, 10],
            'min_samples_split': [10, 20, 40, 80, 160, 320],
            'min_samples_leaf': [10, 20, 40, 80, 160, 320],
            'max_features': [x/100 for x in range(5, 100, 5)],
            'max_leaf_nodes': [10, 20, 40, 80, 160, 320],
            # Evenly spaced in steps of 0.01 (the fourth value was 0.3, which broke the sequence)
            'min_impurity_decrease': [0.0, 0.01, 0.02, 0.03, 0.04, 0.05],
            'bootstrap': [True],
            'max_samples': [0.80, 0.85, 0.90, 0.95, 1.00]
        },
    )
)

# Make all model variables visible locally
locals().update(model_setup)

## Preprocessing

In [None]:
# Load targets, features and row identifiers from the configured data directory
# (path_data comes from experiment_setup, so the location is defined in one place only)
Y_data = pd.read_csv(os.path.join(path_data, 'Y_data.csv'))
X_data = pd.read_csv(os.path.join(path_data, 'X_data.csv'))
ID_data = pd.read_csv(os.path.join(path_data, 'ID_data.csv'))

In [None]:
# Keep only the rows flagged as 'train' in ID_data (targets flattened to 1-D)
ID_train = ID_data.loc[ID_data['train_test'] == 'train']
X_train = np.array(X_data.loc[ID_data['train_test'] == 'train'])
y_train = np.array(Y_data.loc[ID_data['train_test'] == 'train']).flatten()

In [None]:
# Three time-ordered CV folds over the positional indices of the training rows
tscv = TimeSeriesSplit(n_splits=3)
cv_folds = list(tscv.split(range(len(ID_train))))

In [None]:
# Helper objects from the DataDrivenPatientScheduling module (tuning and experiment runner)
weightsmodel, experiment = WeightsModel(), Experiment()

## Tune density estimator

In [None]:
# Expose model_params, tuning_params and hyper_params_grid of the density estimator
globals().update(density_estimator_params)

# Tune a fresh random-forest wSAA model by random search over the grid
density_estimator = weightsmodel.tune_density_estimator(
    X_train, y_train, RandomForestWSAA(**model_params), cv_folds,
    hyper_params_grid, tuning_params, random_search=True, print_time=True,
)

## Data-driven optimization

In [None]:
# Expose gurobi_params, cost_params, K, alpha and n_jobs as notebook variables
locals().update(optimization_params)

# Unique test dates in ascending order
dates = pd.Series(list(set(ID_data.loc[ID_data.train_test == 'test', 'date']))).sort_values()

# Convert the data once; inside the generator below this would be repeated for every date
X_full, y_full = np.array(X_data), np.array(Y_data)

# Timer
start_time, st_exec, st_cpu = dt.datetime.now().replace(microsecond=0), time.time(), time.process_time()

# Run the experiment for each date in the test horizon in parallel
with experiment.tqdm_joblib(tqdm(desc='Progress', total=len(dates))):
    results = Parallel(n_jobs=n_jobs)(delayed(experiment.run)(
        X = X_full,
        y = y_full,
        date = date,
        dates = ID_data['date'],
        areas = ID_data['area'],
        weightsModel = density_estimator,
        cost_params = cost_params,
        gurobi_params = gurobi_params,
        K = K,
        # Pass the configured time budget multiplier explicitly so that this model is
        # evaluated under the same time budget as the other models
        alpha = alpha,
        print_status = False) for date in dates)

# Finalize: stack the per-date result frames
results = pd.concat(results).reset_index(drop=True)

# Save results (file name carries model name and number of scenarios)
results.to_csv(f"{path_results}/{model_name}_K{K}.csv", sep=',', index=False)

# Time
print('Time:', dt.datetime.now().replace(microsecond=0) - start_time)
print('>> Execution time:', np.around(time.time()-st_exec, 0), "seconds")
print('>> CPU time:', np.around(time.process_time()-st_cpu, 0), "seconds")

# Model: SAA

In [None]:
# Model configuration: plain SAA baseline (only a name, nothing to tune)
model_setup = {'model_name': 'SAA'}

# Expose model_name as a notebook variable
globals().update(model_setup)

## Preprocessing

In [None]:
# Load targets, features and row identifiers from the configured data directory
# (path_data comes from experiment_setup, so the location is defined in one place only)
Y_data = pd.read_csv(os.path.join(path_data, 'Y_data.csv'))
X_data = pd.read_csv(os.path.join(path_data, 'X_data.csv'))
ID_data = pd.read_csv(os.path.join(path_data, 'ID_data.csv'))

In [None]:
# Experiment runner from the DataDrivenPatientScheduling module
experiment = Experiment()

## Data-driven optimization

In [None]:
# Expose gurobi_params, cost_params, K, alpha and n_jobs as notebook variables
locals().update(optimization_params)

# Unique test dates in ascending order
dates = pd.Series(list(set(ID_data.loc[ID_data.train_test == 'test', 'date']))).sort_values()

# Convert the data once; inside the generator below this would be repeated for every date
X_full, y_full = np.array(X_data), np.array(Y_data)

# Timer
start_time, st_exec, st_cpu = dt.datetime.now().replace(microsecond=0), time.time(), time.process_time()

# Run the experiment for each date in the test horizon in parallel
with experiment.tqdm_joblib(tqdm(desc='Progress', total=len(dates))):
    results = Parallel(n_jobs=n_jobs)(delayed(experiment.run)(
        X = X_full,
        y = y_full,
        date = date,
        dates = ID_data['date'],
        areas = ID_data['area'],
        # A fresh, unfitted SAA model per date
        weightsModel = SampleAverageApproximation(),
        cost_params = cost_params,
        gurobi_params = gurobi_params,
        K = K,
        alpha = alpha,
        print_status = False) for date in dates)

# Finalize: stack the per-date result frames
results = pd.concat(results).reset_index(drop=True)

# Save results (file name carries model name and number of scenarios)
results.to_csv(f"{path_results}/{model_name}_K{K}.csv", sep=',', index=False)

# Time
print('Time:', dt.datetime.now().replace(microsecond=0) - start_time)
print('>> Execution time:', np.around(time.time()-st_exec, 0), "seconds")
print('>> CPU time:', np.around(time.process_time()-st_cpu, 0), "seconds")

# Evaluate

In [None]:
# NOTE(review): K = 10 here, whereas the experiment configuration uses K = 10**3; confirm
# which result files are meant to be evaluated. This also rebinds the notebook-wide K.
K = 10

# Load the results of each model and tag every frame with its model name
results_LSx_LGBM = pd.read_csv(f'{path_results}/LSx_LGBM_K{K}.csv').assign(model='LSx_LGBM')

results_LSx_LGBM_NN = pd.read_csv(f'{path_results}/LSx_LGBM_NN_K{K}.csv').assign(model='LSx_LGBM_NN')

# wSAA_RF results are currently excluded from the evaluation
#results_wSAA_RF = pd.read_csv(path_results+'/wSAA_RF_K'+str(K)+'.csv')
#results_wSAA_RF['model'] = 'wSAA_RF'

results_SAA = pd.read_csv(f'{path_results}/SAA_K{K}.csv').assign(model='SAA')

In [None]:
# Stack the model results and attach the SAA baseline of the same (date, area, CR)
# as columns cost_SAA, overtime_SAA and waiting_time_SAA (inner join)
results = pd.concat([
    results_LSx_LGBM[['model', 'date', 'area', 'CR', 'cost', 'overtime', 'waiting_time']],
    results_LSx_LGBM_NN[['model', 'date', 'area', 'CR', 'cost', 'overtime', 'waiting_time']] #,
    #results_wSAA_RF[['model', 'date', 'area', 'CR', 'cost', 'overtime', 'waiting_time']]
]).merge(
    results_SAA[['date', 'area', 'CR', 'cost', 'overtime', 'waiting_time']],
    on=['date', 'area', 'CR'],
    suffixes=('', '_SAA'),
)

In [None]:
# Cost of each model relative to the SAA cost; rows with identical costs
# (including the 0 / 0 case) are set to 1
results['pq'] = results['cost'].div(results['cost_SAA'])
results.loc[results['cost'].eq(results['cost_SAA']), 'pq'] = 1

In [None]:
# Save results; the file name carries the K of the loaded result files so that the
# summary cannot be labelled with a different number of scenarios than it contains
results.to_csv(path_results+"/results_summary_K"+str(K)+".csv", sep=',', index=False)

In [None]:
# Analyze: median pq per CR, area and model, rendered as a markdown table
# ('median' as string: passing np.median to groupby agg is deprecated in pandas)
results.groupby(['CR', 'area', 'model']).agg(median_pq=('pq', 'median')).reset_index().to_markdown()

In [None]:
# Analyze: median pq per area and model for CR = 0.50 only
# ('median' as string: passing np.median to groupby agg is deprecated in pandas)
results.loc[results.CR == 0.50].groupby(['CR', 'area', 'model']).agg(median_pq=('pq', 'median')).reset_index()

In [None]:
# Total SAA cost per area for CR = 0.50 on days with more than 5 patients
# ('sum' as string: passing the builtin sum to groupby agg is deprecated in pandas)
results_SAA.loc[(results_SAA.CR == 0.50) & (results_SAA.n_patients > 5)].groupby('area').agg({'cost': 'sum'})

In [None]:
# Total LSx_LGBM cost per area for CR = 0.50 on days with more than 5 patients
# ('sum' as string: passing the builtin sum to groupby agg is deprecated in pandas)
results_LSx_LGBM.loc[(results_LSx_LGBM.CR == 0.50) & (results_LSx_LGBM.n_patients > 5)].groupby('area').agg({'cost': 'sum'})

# Debugging

In [None]:
#### ... 
def predict_quantiles(X, y, date, dates, areas, weightsModel = None, 
             quantiles = [0.005, 0.025, 0.165, 0.250, 0.500, 0.750, 0.835, 0.900, 0.975, 0.995],
             print_status = False):

    """

    Predict quantiles for all cases of one test date, once with the given weights model
    (fitted on all rows before `date`) and once with an SAA model fitted per area on the
    targets of that area before `date`.

    Arguments:

        X: feature array, rows aligned with `dates` and `areas`
        y: target array, rows aligned with `X` (flattened internally)
        date: test date; rows with dates < date are used for training, rows with dates == date for testing
        dates: date of each row
        areas: area label of each row
        weightsModel = None: model offering fit(X, y) and predict(X, probs=..., outputAsDf=False); required
        quantiles = [0.005, 0.025, 0.165, 0.250, 0.500, 0.750, 0.835, 0.900, 0.975, 0.995]: probabilities to predict
        print_status = False: unused, kept for call compatibility

    Returns:

        results(pd.DataFrame): columns area, j, prob, q_hat, q_hat_saa, y, where j is the position
            of the case within its area on the test date; empty if there are no rows for `date`

    Raises:

        ValueError: if weightsModel is None

    """

    # Fail early with a clear message instead of an AttributeError on None.fit
    if weightsModel is None:
        raise ValueError("predict_quantiles requires a weightsModel, got None")

    # Copy so that the models can never modify the caller's list (or the shared default)
    probs = list(quantiles)

    # Train-test split
    is_train, is_test = dates < date, dates == date
    y_train, y_test = y[is_train].flatten(), y[is_test].flatten()
    X_train, X_test = X[is_train], X[is_test]
    areas_train, areas_test = areas[is_train], areas[is_test]

    # Nothing to predict for this date
    test_areas = list(set(areas_test))
    if not test_areas:
        return pd.DataFrame(columns=['area', 'j', 'prob', 'q_hat', 'q_hat_saa', 'y'])

    # Fit weights model
    weightsModel.fit(X_train, y_train)

    # Initialize
    results = {}

    # For each area 
    for area in test_areas:

        # Select test data for current area
        y_test_ = y_test[area == areas_test]
        X_test_ = X_test[area == areas_test]

        # Fit SAA on data of current area
        SAA = SampleAverageApproximation()
        SAA.fit(y_train[area == areas_train])

        # Predict quantiles for each patient (consumed below as {prob: values per case})
        q_hat = weightsModel.predict(X_test_, probs=probs, outputAsDf=False)
        q_hat_saa = SAA.predict(X_test_, probs=probs, outputAsDf=False)

        # Add to results: one row per (prob, case j); the observed y is attached on the SAA side
        results[area] = pd.merge(
            left = pd.concat(pd.DataFrame({'prob': p, 'q_hat': q}) for p, q in q_hat.items()).reset_index(names='j'),
            right = pd.concat(pd.DataFrame({'prob': p, 'q_hat_saa': q, 'y': y_test_}) for p, q in q_hat_saa.items()).reset_index(names='j'),
            on = ['prob', 'j']
        )
        
    # Finalize: the dict keys become the 'area' column, the inner index is dropped
    results = pd.concat(results).reset_index(names=['area', '']).drop(columns='')

    return results

In [None]:
# Expose gurobi_params, cost_params, K, alpha and n_jobs as notebook variables
locals().update(optimization_params)

# Unique test dates in ascending order
dates = pd.Series(list(set(ID_data.loc[ID_data.train_test == 'test', 'date']))).sort_values()

# Convert the data once; inside the generator below this would be repeated for every date
X_full, y_full = np.array(X_data), np.array(Y_data)

# Timer
start_time, st_exec, st_cpu = dt.datetime.now().replace(microsecond=0), time.time(), time.process_time()

# Predict quantiles for each date in the test horizon in parallel
# NOTE: uses whichever density_estimator was tuned last in this kernel session
with experiment.tqdm_joblib(tqdm(desc='Progress', total=len(dates))):
    results = Parallel(n_jobs=n_jobs)(delayed(predict_quantiles)(
        X = X_full,
        y = y_full,
        date = date,
        dates = ID_data['date'],
        areas = ID_data['area'],
        weightsModel = density_estimator,
        quantiles = [0.005, 0.025, 0.165, 0.250, 0.500, 0.750, 0.835, 0.975, 0.995],
        print_status = False) for date in dates)

# Finalize: the concat keys become the 'date' column, the inner index is dropped
results = pd.concat(results, keys=dates).reset_index(names=['date', '']).drop(columns='')

# Time
print('Time:', dt.datetime.now().replace(microsecond=0) - start_time)
print('>> Execution time:', np.around(time.time()-st_exec, 0), "seconds")
print('>> CPU time:', np.around(time.process_time()-st_cpu, 0), "seconds")

In [None]:
# Scaled Pinball Loss
def scaled_pinball_loss(p, q, q_saa, y, **kwargs):

    """

    Mean pinball loss of the model quantiles `q` divided by the mean pinball loss of
    the SAA quantiles `q_saa`.

    Arguments:

        p: quantile level, either a scalar or one value per observation
        q: model quantile predictions (flattened internally)
        q_saa: SAA quantile predictions (flattened internally)
        y: observed values (flattened internally)
        **kwargs: ignored

    Returns:

        1.0 if both losses are equal (this includes the case where both are 0),
        otherwise pl / pl_saa; inf if only the SAA loss is 0.

    """

    # Plain arrays, so that a pandas Series passed as p cannot bring index semantics into the arithmetic
    p = np.asarray(p, dtype=float)
    q = np.array(q).flatten()
    q_saa = np.array(q_saa).flatten()
    y = np.array(y).flatten()

    def _pinball(pred):
        # Under-prediction is weighted with p, over-prediction with (1 - p)
        return np.mean((y - pred) * p * (pred <= y) + (pred - y) * (1 - p) * (pred > y))

    # Pinball Loss Model
    pl = _pinball(q)

    # Pinball Loss SAA
    pl_saa = _pinball(q_saa)

    # Equal losses give 1 by definition. Decide this before dividing: with both losses at 0,
    # a masked sum such as 1.0 + False * (0 / 0) evaluates to NaN instead of 1.
    if pl == pl_saa:
        return np.float64(1.0)

    # Scaled Pinball Loss (a zero SAA loss yields inf without a warning)
    with np.errstate(divide='ignore'):
        spl = pl / pl_saa

    return spl

In [None]:
# Scaled pinball loss per area and quantile level, one row per (area, prob).
# The groups are iterated explicitly and the level is taken from the group key, because
# groupby.apply passing the grouping column 'prob' into the function is deprecated in pandas.
spl = pd.DataFrame(
    [
        {
            'area': area,
            'prob': prob,
            'spl': scaled_pinball_loss(p=prob, q=group.q_hat, q_saa=group.q_hat_saa, y=group.y),
        }
        for (area, prob), group in results.groupby(['area', 'prob'])
    ],
    columns=['area', 'prob', 'spl'],
)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Scaled pinball loss by quantile level, one line per area
# (grouping by the plain column name keeps the group keys scalar, so legend labels are not tuples)
plotData = spl.groupby('area')

# Plot
fig, ax = plt.subplots()
fig.set_size_inches(10, 5)

for area, d in plotData:
    ax.plot(d['prob'], d['spl'], marker='', linestyle='-', ms=2, linewidth=2, label=area)

# Reference line at a scaled loss of 1
ax.axhline(y = 1, color = 'grey', linestyle = '--', linewidth=1)
ax.set_title('Scaled pinball loss by quantile level')
ax.set_ylabel('scaled pinball loss')
ax.legend()
ax.get_xaxis().set_visible(False)
plt.show()

## LSx LGBM

In [3]:
# Model configuration: LevelSetKDEx density estimator on top of a LightGBM point estimator
model_setup = {

    # Identifier, also used in the results file name
    'model_name': 'LSx_LGBM',

    # Point estimator
    'point_estimator_params': {

        # Constructor arguments of the regressor
        'model_params': {'random_state': 12345, 'n_jobs': 4, 'verbose': -1},

        # Arguments steering the hyper-parameter search
        'tuning_params': {
            'n_iter': 200,
            'scoring': {'MSE': 'neg_mean_squared_error'},
            'return_train_score': True,
            'refit': 'MSE',
            'random_state': 12345,
            'n_jobs': 8,
            'verbose': 0,
        },

        # Candidate values per hyper-parameter
        'hyper_params_grid': {
            'num_leaves': list(range(5, 500, 5)),
            'max_depth': [-1, *range(2, 11)],
            'min_child_samples': list(range(5, 500, 5)),
            'learning_rate': [pct/100 for pct in range(5, 25, 5)],
            'n_estimators': [100, 200, 300, 400, 500],
            'subsample': [pct/100 for pct in range(5, 100, 5)],
            'colsample_bytree': [pct/100 for pct in range(5, 100, 5)],
        },
    },

    # Density estimator
    'density_estimator_params': {

        # Constructor arguments
        'model_params': {'weightsByDistance': False},

        # Tuning arguments: sorted, de-duplicated union of nine named quantile levels and 0.01, ..., 0.99
        'tuning_params': {
            'probs': sorted(set(np.concatenate([
                np.array([0.005, 0.025, 0.165, 0.25, 0.50, 0.835, 0.75, 0.975, 0.995]),
                np.arange(1, 100, 1) / 100,
            ]))),
            'n_jobs': 8,
        },

        # Candidate bin sizes
        'hyper_params_grid': {'binSize': list(range(25, 500, 25))},
    },
}

# Expose model_name, point_estimator_params and density_estimator_params as notebook variables
globals().update(model_setup)

## Preprocessing

In [4]:
# Load targets, features and row identifiers from the configured data directory
# (path_data comes from experiment_setup, so the location is defined in one place only)
Y_data = pd.read_csv(os.path.join(path_data, 'Y_data.csv'))
X_data = pd.read_csv(os.path.join(path_data, 'X_data.csv'))
ID_data = pd.read_csv(os.path.join(path_data, 'ID_data.csv'))

In [5]:
# Keep only the rows flagged as 'train' in ID_data (targets flattened to 1-D)
ID_train = ID_data.loc[ID_data['train_test'] == 'train']
X_train = np.array(X_data.loc[ID_data['train_test'] == 'train'])
y_train = np.array(Y_data.loc[ID_data['train_test'] == 'train']).flatten()

In [6]:
# Three time-ordered CV folds over the positional indices of the training rows
tscv = TimeSeriesSplit(n_splits=3)
cv_folds = list(tscv.split(range(len(ID_train))))

In [7]:
# Helper objects from the DataDrivenPatientScheduling module (tuning and experiment runner)
weightsmodel, experiment = WeightsModel(), Experiment()

## Tune point estimator

In [8]:
# Expose model_params, tuning_params and hyper_params_grid of the point estimator
# NOTE: these names are rebound again by the density-estimator cell
globals().update(point_estimator_params)

# Tune a fresh LightGBM regressor by random search over the grid
# (presumably returns the tuned estimator; confirm in WeightsModel)
point_estimator = weightsmodel.tune_point_estimator(
    X_train, y_train, LGBMRegressor(**model_params), cv_folds,
    hyper_params_grid, tuning_params, random_search=True, print_time=True,
)

Time: 0:00:08
>> Execution time: 8.0 seconds
>> CPU time: 1.0 seconds


## Tune density estimator

In [9]:
# Expose model_params, tuning_params and hyper_params_grid of the density estimator
globals().update(density_estimator_params)

# Wrap the tuned point estimator in LevelSetKDEx and tune it over the full bin-size grid
density_estimator = weightsmodel.tune_density_estimator(
    X_train, y_train, LevelSetKDEx(estimator=point_estimator, **model_params), cv_folds,
    hyper_params_grid, tuning_params, random_search=False, print_time=True,
)

Time: 0:00:03
>> Execution time: 3.0 seconds
>> CPU time: 0.0 seconds


## Data-driven optimization

In [10]:
# Expose gurobi_params, cost_params, K, alpha and n_jobs as notebook variables
globals().update(optimization_params)

# Unique test dates in ascending order
dates = pd.Series(list(set(ID_data.loc[ID_data['train_test'] == 'test', 'date']))).sort_values()

# Debug run: only the earliest test date, executed in-process (no joblib)
date = min(dates)

results = experiment.run(
    X=np.array(X_data), y=np.array(Y_data),
    date=date, dates=ID_data['date'], areas=ID_data['area'],
    weightsModel=density_estimator,
    cost_params=cost_params, gurobi_params=gurobi_params,
    K=K, alpha=alpha,
    print_status=False,
)

In [None]:
results

In [17]:
## in depth

In [149]:
# Rebuild, as notebook variables, the argument names used in the experiment.run calls above,
# so that the cells below can step through a run by hand.
X = np.array(X_data)
y = np.array(Y_data)
date = date  # no-op: keeps the date chosen in an earlier cell
dates = ID_data['date']
areas = ID_data['area']
#weightsModel = density_estimator

# Debug with the plain SAA model instead of the tuned density estimator
weightsModel = SampleAverageApproximation()


cost_params = cost_params  # no-op: keeps the configured value
gurobi_params = gurobi_params  # no-op: keeps the configured value
# NOTE: K and alpha deviate from the experiment configuration (K = 10**3, alpha = 1.25)
# and rebind the notebook-wide names
K = 10**4
alpha = 2
print_status = False

In [150]:
# Train-test split: rows before `date` train, rows on `date` test
y_train, y_test = y[dates < date].flatten(), y[dates == date].flatten()
X_train, X_test = X[dates < date], X[dates == date]
dates_train, dates_test = dates[dates < date], dates[dates == date]
areas_train, areas_test = areas[dates < date], areas[dates == date]

# Get time budget per area: median historical duration per area
# ('median' as string: passing np.median to groupby agg is deprecated in pandas)
hist_durations = pd.DataFrame({
    'duration': y_train, 
    'area': areas_train,
}).groupby('area').agg(
    median_duration=('duration', 'median')
).reset_index()

hist_durations = dict(zip(hist_durations.area, hist_durations.median_duration))

# LSx and wSAA: fitted once on all training data (SAA is fitted per area further below).
# isinstance does not depend on the exact module path appearing in the class repr.
if not isinstance(weightsModel, SampleAverageApproximation):

    # Fit weights model
    weightsModel.fit(X_train, y_train)

In [151]:
# Initialize
results = pd.DataFrame()

# Area inspected in this debugging walk-through
area = 'Bereich_Gastroskopie'


# Initialize
results_ = {}

# Progress
if print_status:
    print('=====================================================================================================')
    print('# Area:',area)

# Timer
weights_st_exec = time.time()
weights_st_cpu = time.process_time() 

# Select test data for current area
y_test_ = y_test[area == areas_test]
X_test_ = X_test[area == areas_test]

# Number of patient cases to schedule
M = len(y_test_)

# Set random sequence of patient cases
#patient_sequence = np.arange(M)
#np.random.shuffle(patient_sequence)

#y_test_ = y_test_[patient_sequence]
#X_test_ = X_test_[patient_sequence]

# Set time budget (based on median duration per area x number of cases on test day)
T = hist_durations[area] * M * alpha

# SAA: fitted on the historical durations of the current area only.
# isinstance does not depend on the exact module path appearing in the class repr.
if isinstance(weightsModel, SampleAverageApproximation):

    # Fit SAA on data of current area
    weightsModel.fit(y_train[area == areas_train])

# Get estimated conditional distribution
# (indexed below as conditionalDistribution[j][0] = weights, [j][1] = samples of case j)
conditionalDistribution = weightsModel.getWeights(X_test_, outputType='summarized')

# Timer
weights_exec_time_sec = time.time()-weights_st_exec
weights_cpu_time_sec = time.process_time()-weights_st_cpu


In [152]:
# Generate K scenarios: one column per patient, one row per scenario
scenario_columns = []

# Timer
scenarios_st_exec = time.time()
scenarios_st_cpu = time.process_time() 

# Draw all K values of a patient with a single call instead of K * M individual draws
# (same sampling distribution; no seed is set, so the draws differ between runs)
for j in range(M):

    # Weighted samples
    weights = conditionalDistribution[j][0]
    samples = conditionalDistribution[j][1]

    # Add the K scenario values for patient j
    scenario_columns.append(np.random.choice(samples.flatten(), size=K, p=weights.flatten()))

# Scenarios with shape (K, M); an empty day gives shape (K, 0)
scenarios = np.column_stack(scenario_columns) if scenario_columns else np.empty((K, 0))

# Timer
scenarios_exec_time_sec = time.time()-scenarios_st_exec
scenarios_cpu_time_sec = time.process_time()-scenarios_st_cpu



In [153]:
j

16

In [154]:
weights

array([0.0021645 , 0.00154607, 0.00309215, 0.00896722, 0.01948052,
       0.03463203, 0.04421769, 0.05009276, 0.0615337 , 0.06246135,
       0.08565244, 0.05998763, 0.06524428, 0.0445269 , 0.05040198,
       0.05163884, 0.02813853, 0.02628324, 0.02380952, 0.0170068 ,
       0.03741497, 0.01453309, 0.0126778 , 0.0170068 , 0.00711194,
       0.01638837, 0.00927644, 0.00927644, 0.00618429, 0.00742115,
       0.02380952, 0.00803958, 0.00463822, 0.00525665, 0.00371058,
       0.00618429, 0.00340136, 0.00401979, 0.00247372, 0.00494743,
       0.00401979, 0.0021645 , 0.00401979, 0.00247372, 0.00247372,
       0.00340136, 0.00092764, 0.00123686, 0.00061843, 0.00123686,
       0.00309215, 0.00030921, 0.00185529, 0.00092764, 0.00030921,
       0.00061843, 0.00061843, 0.00061843, 0.00030921, 0.00092764,
       0.00309215, 0.00061843, 0.00092764, 0.00092764, 0.00154607,
       0.00061843, 0.00061843, 0.00030921, 0.00092764, 0.00092764,
       0.00092764, 0.00030921, 0.00030921, 0.00092764, 0.00030

In [155]:
samples

array([  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,
        11.,  12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,
        22.,  23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,
        33.,  34.,  35.,  36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,
        44.,  45.,  46.,  47.,  48.,  49.,  50.,  51.,  52.,  53.,  54.,
        55.,  56.,  57.,  58.,  59.,  60.,  61.,  62.,  63.,  64.,  65.,
        66.,  68.,  69.,  70.,  71.,  73.,  74.,  75.,  78.,  79.,  80.,
        81.,  83.,  84.,  85.,  88.,  90.,  92.,  93.,  96.,  97., 100.,
       101., 103., 110., 116., 120., 122., 128., 130., 133., 137., 140.,
       145.])

In [156]:
# Count how often each value was drawn for the patient in scenario column 16
# ('sum' as string: passing the builtin sum to groupby agg is deprecated in pandas)
check = pd.DataFrame({'y': scenarios[:,16], 'n': 1}).groupby('y').agg({'n': 'sum'}).reset_index()

In [157]:
# Share of the K scenarios in which each value was drawn
check['percent'] = check['n'].div(K)

In [158]:
check

Unnamed: 0,y,n,percent
0,0.0,25,0.0025
1,1.0,12,0.0012
2,2.0,26,0.0026
3,3.0,93,0.0093
4,4.0,192,0.0192
...,...,...,...
93,130.0,3,0.0003
94,133.0,6,0.0006
95,137.0,2,0.0002
96,140.0,6,0.0006


In [159]:
cost_params

[{'CR': 0.1, 'c_waiting_time': 1, 'c_overtime': 9},
 {'CR': 0.25, 'c_waiting_time': 1, 'c_overtime': 3},
 {'CR': 0.5, 'c_waiting_time': 1, 'c_overtime': 1},
 {'CR': 0.75, 'c_waiting_time': 3, 'c_overtime': 1},
 {'CR': 0.9, 'c_waiting_time': 9, 'c_overtime': 1}]

In [160]:
from DataDrivenPatientScheduling import PatientScheduling

In [161]:
# Evaluate a single cost setting end-to-end:
#   1. solve the scheduling problem on the sampled `scenarios`,
#   2. replay the resulting schedule against the realized durations in `y`,
#   3. collect all metrics in `result`.
# This cell depends on names created by earlier cells: cost_params, print_status,
# T, gurobi_params, scenarios, M, y, date, area, K, hist_durations, alpha and the
# scenarios_*/weights_* timing variables.

# Index 2 selects the third entry of `cost_params`
cost_params_ = cost_params[2]

# Progress
if print_status:
    print('## Cost param setting:',cost_params_)

# Timer (wall-clock and CPU); stopped only after the cost has been computed,
# so the reported "optimization" times also include the evaluation loop below
opt_st_exec = time.time()
opt_st_cpu = time.process_time()

# Optimization
CR = cost_params_['CR']
c_waiting_time = cost_params_['c_waiting_time']
c_overtime = cost_params_['c_overtime']

# Initialize optimization model
ddps = PatientScheduling(c_waitingtime=c_waiting_time, c_overtime=c_overtime, T=T, **gurobi_params)

# Set up optimization model with scenarios
ddps.create(z=scenarios)

# Solve
schedule, times, status, solutions, gap = ddps.optimize()

# Waiting time
scheduled_start, scheduled_duration, actual_duration, waiting_time = [], [], [], []

for j in range(M):

    scheduled_start += [schedule[j]]
    scheduled_duration += [times[j]]
    # assumes y[j] holds exactly one value, so the lists stay aligned by index — TODO confirm
    actual_duration += list(y[j])

    if j == 0:
        # The first patient never waits
        waiting_time += [0]
    else:
        # Delay carried over from the previous patient, floored at zero
        waiting_time += [max(0, waiting_time[j-1] + actual_duration[j-1] - scheduled_duration[j-1])]

# Overtime: delay remaining after the last patient (index M-1), floored at zero.
# Indexed explicitly instead of through the leaked loop variable `j`, which is
# undefined (or stale from another cell) when M == 0.
if M > 0:
    overtime = max(0, waiting_time[M-1] + actual_duration[M-1] - scheduled_duration[M-1])
else:
    overtime = 0

# Cost
cost = c_waiting_time * sum(waiting_time) + c_overtime * overtime

# Timer
optimization_exec_time_sec = time.time()-opt_st_exec
optimization_cpu_time_sec = time.process_time()-opt_st_cpu

# Store results
result = {

    'date': date,
    'area': area,
    'n_patients': M,
    'n_scenarios': K,
    'CR': CR,
    'c_waiting_time': c_waiting_time,
    'c_overtime': c_overtime,
    'historical_time_budget': hist_durations[area] * M,
    'time_budget_multiplier': alpha,
    'total_time_budget': T,
    'waiting_time': sum(waiting_time),
    'overtime': overtime,
    'cost': cost,
    'scheduled_total_durations': sum(scheduled_duration),
    'actual_total_durations': sum(actual_duration),
    'optimization_exec_time_sec': optimization_exec_time_sec,
    'optimization_cpu_time_sec': optimization_cpu_time_sec,
    'scenarios_exec_time_sec': scenarios_exec_time_sec,
    'scenarios_cpu_time_sec': scenarios_cpu_time_sec,
    'weights_exec_time_sec': weights_exec_time_sec,
    'weights_cpu_time_sec': weights_cpu_time_sec

}



In [162]:
# SAA: show the `result` dict; the author's marker labels this output as the SAA run
result

{'date': '2019-10-23',
 'area': 'Bereich_Gastroskopie',
 'n_patients': 17,
 'n_scenarios': 10000,
 'CR': 0.5,
 'c_waiting_time': 1,
 'c_overtime': 1,
 'historical_time_budget': 221.0,
 'time_budget_multiplier': 2,
 'total_time_budget': 442.0,
 'waiting_time': 205.0,
 'overtime': 40.0,
 'cost': 245.0,
 'scheduled_total_durations': 442.0,
 'actual_total_durations': 424.0,
 'optimization_exec_time_sec': 76.44097471237183,
 'optimization_cpu_time_sec': 147.447834037,
 'scenarios_exec_time_sec': 2.7560224533081055,
 'scenarios_cpu_time_sec': 2.7710023000000064,
 'weights_exec_time_sec': 0.00980687141418457,
 'weights_cpu_time_sec': 0.01024515699999995}

In [146]:
# Show the `result` dict.
# NOTE(review): this cell's execution count (146) is lower than that of the '#SAA'
# result cell (162), so this output comes from an earlier run — presumably with a
# different weights model; confirm before comparing the two.
result

{'date': '2019-10-23',
 'area': 'Bereich_Gastroskopie',
 'n_patients': 17,
 'n_scenarios': 10000,
 'CR': 0.5,
 'c_waiting_time': 1,
 'c_overtime': 1,
 'historical_time_budget': 221.0,
 'time_budget_multiplier': 2,
 'total_time_budget': 442.0,
 'waiting_time': 326.0,
 'overtime': 41.0,
 'cost': 367.0,
 'scheduled_total_durations': 442.0,
 'actual_total_durations': 424.0,
 'optimization_exec_time_sec': 18.892744541168213,
 'optimization_cpu_time_sec': 32.028778798000005,
 'scenarios_exec_time_sec': 2.6017837524414062,
 'scenarios_cpu_time_sec': 2.6208765570000025,
 'weights_exec_time_sec': 0.001920461654663086,
 'weights_cpu_time_sec': 0.002570519999991916}

In [147]:
# Show `schedule` as it stood at execution count 147 (i.e. before the '#SAA' cells were run)
schedule

array([  0.,  21.,  49.,  69.,  84., 121., 140., 167., 187., 207., 238.,
       263., 281., 309., 339., 419., 442.])

In [148]:
# Show `times` as it stood at execution count 148 (i.e. before the '#SAA' cells were run)
times

[21.0,
 28.0,
 20.0,
 15.0,
 37.0,
 19.0,
 27.0,
 20.0,
 20.0,
 31.0,
 25.0,
 18.0,
 28.0,
 30.0,
 80.0,
 23.0,
 -0.0]

In [163]:
# SAA: show `schedule`; the author's marker labels this output as the SAA run
schedule

array([  0.,  20.,  47.,  74., 102., 131., 161., 190., 218., 247., 276.,
       305., 335., 364., 392., 418., 442.])

In [164]:
# SAA: show `times`; the author's marker labels this output as the SAA run
times

[20.0,
 27.0,
 27.0,
 28.0,
 29.0,
 30.0,
 29.0,
 28.0,
 29.0,
 29.0,
 29.0,
 30.0,
 29.0,
 28.0,
 26.0,
 24.0,
 -0.0]

In [78]:
# Show the time budget `T`.
# NOTE(review): executed at count 78; the value shown differs from the
# 'total_time_budget' stored in the `result` outputs, so `T` presumably changed
# between runs — confirm which value is intended.
T

276.25

In [96]:
# Total of the realized durations collected in `actual_duration`
sum(actual_duration)

424.0

In [12]:
# SAA baseline: run the experiment with SampleAverageApproximation as weights model

# Publish the entries of `optimization_params` as notebook-level names
# (per the setup cell these include cost_params, gurobi_params, K and alpha)
locals().update(optimization_params)

# Test dates: distinct dates of all rows flagged 'test', sorted ascending
dates = pd.Series(
    list(set(ID_data.loc[ID_data['train_test'] == 'test', 'date']))
).sort_values()

# Only the earliest test date is evaluated in this cell
date = min(dates)

results_saa = experiment.run(
    X=np.array(X_data),
    y=np.array(Y_data),
    date=date,
    dates=ID_data['date'],
    areas=ID_data['area'],
    weightsModel=SampleAverageApproximation(),
    cost_params=cost_params,
    gurobi_params=gurobi_params,
    K=K,
    alpha=alpha,
    print_status=False,
)

In [16]:
# Side-by-side comparison of `results` and `results_saa`, matched on date, area and CR;
# overlapping columns coming from `results_saa` receive the '_SAA' suffix
results[['date', 'area', 'CR', 'n_patients', 'cost', 'overtime', 'waiting_time']].merge(
    results_saa[['date', 'area', 'CR', 'n_patients', 'cost', 'overtime', 'waiting_time']],
    on=['date', 'area', 'CR'],
    suffixes=('', '_SAA'),
)

Unnamed: 0,date,area,CR,n_patients,cost,overtime,waiting_time,n_patients_SAA,cost_SAA,overtime_SAA,waiting_time_SAA
0,2019-10-23,Bereich_Koloskopie,0.1,7,141.0,9.0,60.0,7,149.0,9.0,68.0
1,2019-10-23,Bereich_Koloskopie,0.25,7,78.0,9.0,51.0,7,80.0,9.0,53.0
2,2019-10-23,Bereich_Koloskopie,0.5,7,54.0,9.0,45.0,7,52.0,9.0,43.0
3,2019-10-23,Bereich_Koloskopie,0.75,7,129.0,9.0,40.0,7,123.0,9.0,38.0
4,2019-10-23,Bereich_Koloskopie,0.9,7,351.0,9.0,38.0,7,342.0,9.0,37.0
5,2019-10-23,Bereich_Endosonographie,0.1,4,451.0,38.0,109.0,4,449.0,38.0,107.0
6,2019-10-23,Bereich_Endosonographie,0.25,4,211.0,38.0,97.0,4,213.0,38.0,99.0
7,2019-10-23,Bereich_Endosonographie,0.5,4,131.0,38.0,93.0,4,132.0,38.0,94.0
8,2019-10-23,Bereich_Endosonographie,0.75,4,305.0,38.0,89.0,4,308.0,38.0,90.0
9,2019-10-23,Bereich_Endosonographie,0.9,4,839.0,38.0,89.0,4,821.0,38.0,87.0
