# Init

In [1]:
# Import utils
import numpy as np
import pandas as pd
import math
import time
import json
import pyreadr
import pickle
from joblib import dump, load, Parallel, delayed
import os
import copy
import datetime as dt
from tqdm import tqdm


# Import ML models
import sklearn
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit

from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

from dddex.levelSetKDEx_univariate import LevelSetKDEx, LevelSetKDEx_NN
from dddex.wSAA import RandomForestWSAA, SampleAverageApproximation

# Weights & Biases
import wandb

# Import Gurobi
import gurobipy as gp
from gurobipy import GRB

# Data-Driven Patient Scheduling modules
from DataDrivenPatientScheduling.WeightsModel import WeightsModel
from DataDrivenPatientScheduling.Experiment import Experiment

In [2]:
# Central registry of data locations and model identifiers used throughout the notebook
directories_setup = {

    # Paths
    'path_data': '/home/fesc/dddex/PatientScheduling/Data',
    'path_models': '/home/fesc/dddex/PatientScheduling/Data/Models',
    'path_results': '/home/fesc/dddex/PatientScheduling/Data/Results',

    # Models (each name doubles as the file-name stem of the saved estimator / result file)
    'LSx_LGBM': 'LSx_LGBM',
    'LSx_NN_LGBM': 'LSx_NN_LGBM',
    'wSAA_RF': 'wSAA_RF',
    'SAA_by_area': 'SAA_by_area',
    'SAA': 'SAA',
}

# Expose every entry as a notebook-level variable. Cell code runs at module scope,
# where globals() is the very namespace the variables live in.
globals().update(directories_setup)

# Model training

## Pre-processing

In [None]:
# Load data
# The three input tables are read from the configured data directory (`path_data`, set in the
# directories cell) instead of repeating the absolute path, so relocating the data only
# requires changing that one setting.
y_data = pd.read_csv(path_data+'/y_data.csv')
X_data = pd.read_csv(path_data+'/X_data.csv')
ID_data = pd.read_csv(path_data+'/ID_data.csv')

In [None]:
# Train-test split (for initial model training)
# Rows flagged 'train' in ID_data select the matching rows of all three tables.
is_train = ID_data.train_test == 'train'

ID_train = ID_data.loc[is_train]
X_train = np.array(X_data.loc[is_train])

# Targets are flattened to a 1-D array
y_train = np.array(y_data.loc[is_train]).flatten()

In [None]:
# CV folds
# Materialize the index pairs of a 3-split TimeSeriesSplit over the training rows as a list,
# so the same folds can be handed to every tuning run.
tscv = TimeSeriesSplit(n_splits=3)
n_train_rows = len(ID_train)
cv_folds = [fold for fold in tscv.split(range(n_train_rows))]

## (a) LSx - LGBM

In [None]:
# Setup the model
# Settings for the LightGBM point estimator and for the LevelSetKDEx density estimator
# that is built on top of it in the tuning cells of this section.
model_setup = dict(

    ## Point estimator
    point_estimator_params = dict(

        # Meta parameters (unpacked into the LGBMRegressor constructor)
        model_params = {
            'random_state': 12345,
            'n_jobs': 4,
            'verbose': -1,
            # LightGBM only performs row subsampling when the bagging frequency is non-zero.
            # With the default of 0 the 'subsample' values searched below would be ignored.
            # NOTE(review): this changes the tuned model compared with earlier runs.
            'subsample_freq': 1
        },

        # Tuning meta params
        tuning_params = {
            'n_iter': 1000,
            'scoring': {'MSE': 'neg_mean_squared_error'},
            'return_train_score': True,
            'refit': 'MSE',
            'random_state': 12345,
            'n_jobs': 8,
            'verbose': 0
        },

        # Hyper params search grid
        hyper_params_grid = {
            'num_leaves': list(range(5, 500, 5)),
            'max_depth': [-1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            'min_child_samples': list(range(5, 500, 5)),
            'learning_rate': [x/100 for x in range(1, 20+1, 1)],
            'n_estimators': [100, 200, 300, 400, 500],
            'subsample': [x/100 for x in range(5, 100+1, 5)],
            'colsample_bytree': [x/100 for x in range(5, 100, 5)]
        },
    ),


    ## Density estimator
    density_estimator_params = dict(

        # Meta parameters (unpacked into the LevelSetKDEx constructor)
        model_params = {
            'weightsByDistance': False
        },

        # Tuning meta params
        tuning_params = {
            'probs': [0.005, 0.025, 0.165, 0.250, 0.500, 0.750, 0.835, 0.975, 0.995],
            'n_jobs': 8,
        },

        # Hyper params search grid
        hyper_params_grid = {
            'binSize': list(range(10, 500, 10))
        },

    )
)

# Make all model variables visible in the notebook namespace.
# globals() is used instead of locals(): writing to locals() is only reliable at module scope.
globals().update(model_setup)

# Initialize modules: the WeightsModel instance provides the tune_point_estimator and
# tune_density_estimator routines called in the tuning cells of this section
weightsmodel = WeightsModel()

### Tune point estimator

In [None]:
# Update params: bring the point-estimator settings into the notebook namespace
model_params = point_estimator_params['model_params']
tuning_params = point_estimator_params['tuning_params']
hyper_params_grid = point_estimator_params['hyper_params_grid']

# Point estimator (LightGBM regressor configured with the meta parameters)
point_estimator = LGBMRegressor(**model_params)

# Tune point estimator with a randomized search over the CV folds;
# the result replaces the untuned estimator
point_estimator = weightsmodel.tune_point_estimator(
    X_train, y_train, point_estimator, cv_folds,
    hyper_params_grid, tuning_params,
    random_search=True, print_time=True
)

### Tune density estimator

In [None]:
# Update params: bring the density-estimator settings into the notebook namespace
model_params = density_estimator_params['model_params']
tuning_params = density_estimator_params['tuning_params']
hyper_params_grid = density_estimator_params['hyper_params_grid']

# Density estimator wrapping the tuned point estimator
density_estimator = LevelSetKDEx(estimator=point_estimator, **model_params)

# Tune density estimator over the full grid (no random search)
density_estimator = weightsmodel.tune_density_estimator(
    X_train, y_train, density_estimator, cv_folds,
    hyper_params_grid, tuning_params,
    random_search=False, print_time=True
)

# Save the tuned estimator; it is loaded again by the optimization part of the notebook
save = dump(
    density_estimator,
    path_models+'/density_estimator_'+LSx_LGBM+'.joblib'
)

## (b) LSx NN - LGBM

In [None]:
# Setup the model
# Settings for the LightGBM point estimator and for the LevelSetKDEx_NN density estimator
# that is built on top of it in the tuning cells of this section.
model_setup = dict(

    ## Point estimator
    point_estimator_params = dict(

        # Meta parameters (unpacked into the LGBMRegressor constructor)
        model_params = {
            'random_state': 12345,
            'n_jobs': 4,
            'verbose': -1,
            # LightGBM only performs row subsampling when the bagging frequency is non-zero.
            # With the default of 0 the 'subsample' values searched below would be ignored.
            # NOTE(review): this changes the tuned model compared with earlier runs.
            'subsample_freq': 1
        },

        # Tuning meta params
        tuning_params = {
            'n_iter': 1000,
            'scoring': {'MSE': 'neg_mean_squared_error'},
            'return_train_score': True,
            'refit': 'MSE',
            'random_state': 12345,
            'n_jobs': 8,
            'verbose': 0
        },

        # Hyper params search grid
        hyper_params_grid = {
            'num_leaves': list(range(5, 500, 5)),
            'max_depth': [-1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            'min_child_samples': list(range(5, 500, 5)),
            'learning_rate': [x/100 for x in range(1, 20+1, 1)],
            'n_estimators': [100, 200, 300, 400, 500],
            'subsample': [x/100 for x in range(5, 100+1, 5)],
            'colsample_bytree': [x/100 for x in range(5, 100, 5)]
        },
    ),


    ## Density estimator
    density_estimator_params = dict(

        # Meta parameters (unpacked into the LevelSetKDEx_NN constructor)
        model_params = {
            'weightsByDistance': True
        },

        # Tuning meta params
        tuning_params = {
            'probs': [0.005, 0.025, 0.165, 0.250, 0.500, 0.750, 0.835, 0.975, 0.995],
            'n_jobs': 8,
        },

        # Hyper params search grid
        hyper_params_grid = {
            'binSize': list(range(10, 500, 10))
        },

    )
)

# Make all model variables visible in the notebook namespace.
# globals() is used instead of locals(): writing to locals() is only reliable at module scope.
globals().update(model_setup)

# Initialize modules: the WeightsModel instance provides the tune_point_estimator and
# tune_density_estimator routines called in the tuning cells of this section
weightsmodel = WeightsModel()

### Tune point estimator

In [None]:
# Update params: bring the point-estimator settings into the notebook namespace
model_params = point_estimator_params['model_params']
tuning_params = point_estimator_params['tuning_params']
hyper_params_grid = point_estimator_params['hyper_params_grid']

# Point estimator (LightGBM regressor configured with the meta parameters)
point_estimator = LGBMRegressor(**model_params)

# Tune point estimator with a randomized search over the CV folds;
# the result replaces the untuned estimator
point_estimator = weightsmodel.tune_point_estimator(
    X_train, y_train, point_estimator, cv_folds,
    hyper_params_grid, tuning_params,
    random_search=True, print_time=True
)

### Tune density estimator

In [None]:
# Update params: bring the density-estimator settings into the notebook namespace
model_params = density_estimator_params['model_params']
tuning_params = density_estimator_params['tuning_params']
hyper_params_grid = density_estimator_params['hyper_params_grid']

# Density estimator wrapping the tuned point estimator
density_estimator = LevelSetKDEx_NN(estimator=point_estimator, **model_params)

# Tune density estimator over the full grid (no random search)
density_estimator = weightsmodel.tune_density_estimator(
    X_train, y_train, density_estimator, cv_folds,
    hyper_params_grid, tuning_params,
    random_search=False, print_time=True
)

# Save the tuned estimator; it is loaded again by the optimization part of the notebook
save = dump(
    density_estimator,
    path_models+'/density_estimator_'+LSx_NN_LGBM+'.joblib'
)

## (c) wSAA - RF

In [None]:
# Setup the model
# Only a density estimator is configured here: RandomForestWSAA is tuned directly
# in the tuning cell of this section, without a separate point estimator.
model_setup = {

    ## Density estimator
    'density_estimator_params': {

        # Meta parameters (unpacked into the RandomForestWSAA constructor)
        'model_params': {
            'random_state': 12345,
            'n_jobs': 8,
            'verbose': 0,
        },

        # Tuning meta params
        'tuning_params': {
            'probs': [0.005, 0.025, 0.165, 0.250, 0.500, 0.750, 0.835, 0.975, 0.995],
            'nIter': 1000,
            'random_state': 12345,
            'n_jobs': 8,
        },

        # Hyper params search grid
        'hyper_params_grid': {
            'n_estimators': [100, 200, 300, 400, 500],
            'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            'min_samples_split': list(range(5, 500, 5)),
            'min_samples_leaf': list(range(5, 500, 5)),
            'max_features': list(range(5, 100, 5)),
            'max_leaf_nodes': list(range(5, 500, 5)),
            'min_impurity_decrease': [x/100 for x in range(1, 20+1, 1)],
            'bootstrap': [True],
            'max_samples': [x/100 for x in range(5, 100+1, 5)],
        },
    },
}

# Make all model variables visible in the notebook namespace
# (at cell/module scope globals() is the namespace the variables live in)
globals().update(model_setup)

# Initialize modules: the WeightsModel instance provides the tune_density_estimator
# routine called in the tuning cell of this section
weightsmodel = WeightsModel()

### Tune density estimator

In [None]:
# Update params: bring the density-estimator settings into the notebook namespace
model_params = density_estimator_params['model_params']
tuning_params = density_estimator_params['tuning_params']
hyper_params_grid = density_estimator_params['hyper_params_grid']

# Density estimator (random-forest based wSAA configured with the meta parameters)
density_estimator = RandomForestWSAA(**model_params)

# Tune density estimator with a randomized search over the CV folds
density_estimator = weightsmodel.tune_density_estimator(
    X_train, y_train, density_estimator, cv_folds,
    hyper_params_grid, tuning_params,
    random_search=True, print_time=True
)

# Save the tuned estimator; it is loaded again by the optimization part of the notebook
save = dump(
    density_estimator,
    path_models+'/density_estimator_'+wSAA_RF+'.joblib'
)

# Data-driven optimization

In [3]:
# Setup the experiment
# Every entry is forwarded to the experiment runs below, except n_jobs,
# which sizes the joblib worker pool.
optimization_params = {

    # Gurobi params
    'gurobi_params': {'LogToConsole': 0, 'Threads': 2},

    # Cost params: waiting-time cost fixed at 1, overtime cost varied over 1, 2, 3
    'cost_params': [
        {'c_waiting_time': 1, 'c_overtime': overtime_cost}
        for overtime_cost in (1, 2, 3)
    ],

    # Number of scenarios
    'K': [10**exponent for exponent in (2, 3, 4)],

    # Time budget multiplier
    'rho': [0.85, 1.00, 1.15],

    # n parallel jobs
    'n_jobs': 32,
}

# Make all experiment variables visible in the notebook namespace
# (at cell/module scope globals() is the namespace the variables live in)
globals().update(optimization_params)

In [4]:
# Type of run: the suffix selects which room_assignments_<suffix>.csv file is loaded
# and is appended to the name of every result file written below.
# Alternative variants (uncomment exactly one):
#run_suffix = 'xArea'
#run_suffix = 'xCapacity'
run_suffix = 'xCapacity_xArea'

## Pre-processing

In [5]:
# Load data
# All inputs are read from the configured data directory (`path_data`, set in the
# directories cell) instead of repeating the absolute path, so relocating the data only
# requires changing that one setting.
y_data = pd.read_csv(path_data+'/y_data.csv')
X_data = pd.read_csv(path_data+'/X_data.csv')
ID_data = pd.read_csv(path_data+'/ID_data.csv')

# Room assignments for the selected run variant
room_assignments = pd.read_csv(path_data+'/room_assignments_'+run_suffix+'.csv')

In [6]:
# Ensure room assignments exactly match date, area, patient_id (i.e., each case)
case_keys = ['date', 'area', 'patient_id']

# One vectorised left join replaces the previous row-by-row lookup, which scanned the whole
# room_assignments table once per case. Only the key columns of ID_data are used, so
# re-running the cell (when 'room' already exists) gives the same result.
# NOTE(review): unlike `==`, merge treats missing keys (NaN) as equal to each other;
# this assumes the key columns contain no missing values.
room_lookup = ID_data[case_keys].merge(
    room_assignments[case_keys + ['room']],
    on=case_keys,
    how='left',
    indicator=True
)

# Same guarantee the per-row `.item()` call gave: every case needs exactly one assignment,
# otherwise a ValueError is raised.
if len(room_lookup) != len(ID_data):
    raise ValueError('At least one case matches more than one room assignment')
if (room_lookup['_merge'] != 'both').any():
    raise ValueError('At least one case has no matching room assignment')

# A left join keeps the row order of ID_data, so the rooms can be attached positionally
rooms = room_lookup['room'].tolist()
ID_data['room'] = rooms

## (a) LSx - LGBM

In [7]:
# Initialize modules
experiment = Experiment()

# Load density estimator
density_estimator = load(path_models+'/density_estimator_'+LSx_LGBM+'.joblib')

# Test dates (unique, in ascending order)
is_test = ID_data.train_test == 'test'
dates = pd.Series(ID_data.loc[is_test, 'date'].unique()).sort_values()

# Convert the inputs once: they are identical for every date, so building the arrays
# inside the task generator repeated the DataFrame -> ndarray conversion for each task.
X_all = np.array(X_data)
y_all = np.array(y_data)

# Timer
start_time, st_exec, st_cpu = dt.datetime.now().replace(microsecond=0), time.time(), time.process_time()

# For each date in the test horizon (one parallel task per date)
with experiment.tqdm_joblib(tqdm(desc='Progress', total=len(dates))):
    results = Parallel(n_jobs=n_jobs)(delayed(experiment.run)(
        X = X_all,
        y = y_all,
        date = date,
        dates = ID_data['date'],
        areas = ID_data['area'],
        rooms = ID_data['room'],
        weightsModel = density_estimator,
        K = K,
        rho = rho,
        cost_params = cost_params,
        gurobi_params = gurobi_params,
        print_status = False) for date in dates)

# Finalize: stack the per-date result frames into one table
results = pd.concat(results).reset_index(drop=True)

# Save results
results.to_csv(path_results+'/'+LSx_LGBM+'_'+run_suffix+'.csv', sep=',', index=False)

# Time
# NOTE(review): time.process_time() only counts CPU time of this process; if joblib runs the
# tasks in worker processes (its default), their CPU time is not part of the figure below.
print('Time:', dt.datetime.now().replace(microsecond=0) - start_time)
print('>> Execution time:', np.around(time.time()-st_exec, 0), "seconds")
print('>> CPU time:', np.around(time.process_time()-st_cpu, 0), "seconds")

Progress: 100%|██████████| 64/64 [1:34:15<00:00, 88.36s/it] 


Time: 1:34:15
>> Execution time: 5655.0 seconds
>> CPU time: 4.0 seconds


## (b) LSx NN - LGBM

In [8]:
# Initialize modules
experiment = Experiment()

# Load density estimator
density_estimator = load(path_models+'/density_estimator_'+LSx_NN_LGBM+'.joblib')

# Test dates (unique, in ascending order)
is_test = ID_data.train_test == 'test'
dates = pd.Series(ID_data.loc[is_test, 'date'].unique()).sort_values()

# Convert the inputs once: they are identical for every date, so building the arrays
# inside the task generator repeated the DataFrame -> ndarray conversion for each task.
X_all = np.array(X_data)
y_all = np.array(y_data)

# Timer
start_time, st_exec, st_cpu = dt.datetime.now().replace(microsecond=0), time.time(), time.process_time()

# For each date in the test horizon (one parallel task per date)
with experiment.tqdm_joblib(tqdm(desc='Progress', total=len(dates))):
    results = Parallel(n_jobs=n_jobs)(delayed(experiment.run)(
        X = X_all,
        y = y_all,
        date = date,
        dates = ID_data['date'],
        areas = ID_data['area'],
        rooms = ID_data['room'],
        weightsModel = density_estimator,
        K = K,
        rho = rho,
        cost_params = cost_params,
        gurobi_params = gurobi_params,
        print_status = False) for date in dates)

# Finalize: stack the per-date result frames into one table
results = pd.concat(results).reset_index(drop=True)

# Save results
results.to_csv(path_results+'/'+LSx_NN_LGBM+'_'+run_suffix+'.csv', sep=',', index=False)

# Time
# NOTE(review): time.process_time() only counts CPU time of this process; if joblib runs the
# tasks in worker processes (its default), their CPU time is not part of the figure below.
print('Time:', dt.datetime.now().replace(microsecond=0) - start_time)
print('>> Execution time:', np.around(time.time()-st_exec, 0), "seconds")
print('>> CPU time:', np.around(time.process_time()-st_cpu, 0), "seconds")

Progress: 100%|██████████| 64/64 [1:45:05<00:00, 98.52s/it]   


Time: 1:45:06
>> Execution time: 6306.0 seconds
>> CPU time: 8.0 seconds


## (c) wSAA - RF

In [9]:
# Initialize modules
experiment = Experiment()

# Load density estimator
density_estimator = load(path_models+'/density_estimator_'+wSAA_RF+'.joblib')

# Test dates (unique, in ascending order)
is_test = ID_data.train_test == 'test'
dates = pd.Series(ID_data.loc[is_test, 'date'].unique()).sort_values()

# Convert the inputs once: they are identical for every date, so building the arrays
# inside the task generator repeated the DataFrame -> ndarray conversion for each task.
X_all = np.array(X_data)
y_all = np.array(y_data)

# Timer
start_time, st_exec, st_cpu = dt.datetime.now().replace(microsecond=0), time.time(), time.process_time()

# For each date in the test horizon (one parallel task per date)
with experiment.tqdm_joblib(tqdm(desc='Progress', total=len(dates))):
    results = Parallel(n_jobs=n_jobs)(delayed(experiment.run)(
        X = X_all,
        y = y_all,
        date = date,
        dates = ID_data['date'],
        areas = ID_data['area'],
        rooms = ID_data['room'],
        weightsModel = density_estimator,
        K = K,
        rho = rho,
        cost_params = cost_params,
        gurobi_params = gurobi_params,
        print_status = False) for date in dates)

# Finalize: stack the per-date result frames into one table
results = pd.concat(results).reset_index(drop=True)

# Save results
results.to_csv(path_results+'/'+wSAA_RF+'_'+run_suffix+'.csv', sep=',', index=False)

# Time
# NOTE(review): time.process_time() only counts CPU time of this process; if joblib runs the
# tasks in worker processes (its default), their CPU time is not part of the figure below.
print('Time:', dt.datetime.now().replace(microsecond=0) - start_time)
print('>> Execution time:', np.around(time.time()-st_exec, 0), "seconds")
print('>> CPU time:', np.around(time.process_time()-st_cpu, 0), "seconds")

Progress: 100%|██████████| 64/64 [1:44:03<00:00, 97.55s/it]  


Time: 1:44:03
>> Execution time: 6243.0 seconds
>> CPU time: 7.0 seconds


## (d) SAA by treatment area

In [10]:
# Initialize modules
experiment = Experiment()

# Test dates (unique, in ascending order)
is_test = ID_data.train_test == 'test'
dates = pd.Series(ID_data.loc[is_test, 'date'].unique()).sort_values()

# Convert the inputs once: they are identical for every date, so building the arrays
# inside the task generator repeated the DataFrame -> ndarray conversion for each task.
X_all = np.array(X_data)
y_all = np.array(y_data)

# Timer
start_time, st_exec, st_cpu = dt.datetime.now().replace(microsecond=0), time.time(), time.process_time()

# For each date in the test horizon (one parallel task per date).
# A fresh SampleAverageApproximation instance is deliberately created for every task.
with experiment.tqdm_joblib(tqdm(desc='Progress', total=len(dates))):
    results = Parallel(n_jobs=n_jobs)(delayed(experiment.run)(
        X = X_all,
        y = y_all,
        date = date,
        dates = ID_data['date'],
        areas = ID_data['area'],
        rooms = ID_data['room'],
        weightsModel = SampleAverageApproximation(),
        K = K,
        rho = rho,
        cost_params = cost_params,
        gurobi_params = gurobi_params,
        print_status = False) for date in dates)

# Finalize: stack the per-date result frames into one table
results = pd.concat(results).reset_index(drop=True)

# Save results
results.to_csv(path_results+'/'+SAA_by_area+'_'+run_suffix+'.csv', sep=',', index=False)

# Time
# NOTE(review): time.process_time() only counts CPU time of this process; if joblib runs the
# tasks in worker processes (its default), their CPU time is not part of the figure below.
print('Time:', dt.datetime.now().replace(microsecond=0) - start_time)
print('>> Execution time:', np.around(time.time()-st_exec, 0), "seconds")
print('>> CPU time:', np.around(time.process_time()-st_cpu, 0), "seconds")

Progress: 100%|██████████| 64/64 [1:43:32<00:00, 97.07s/it]  


Time: 1:43:33
>> Execution time: 6213.0 seconds
>> CPU time: 3.0 seconds


## (e) SAA

In [11]:
# Initialize modules
experiment = Experiment()

# Test dates (unique, in ascending order)
is_test = ID_data.train_test == 'test'
dates = pd.Series(ID_data.loc[is_test, 'date'].unique()).sort_values()

# Convert the inputs once: they are identical for every date, so building the arrays
# inside the task generator repeated the DataFrame -> ndarray conversion for each task.
X_all = np.array(X_data)
y_all = np.array(y_data)

# Timer
start_time, st_exec, st_cpu = dt.datetime.now().replace(microsecond=0), time.time(), time.process_time()

# For each date in the test horizon (one parallel task per date).
# No weights model is passed (weightsModel = None) for this SAA run.
with experiment.tqdm_joblib(tqdm(desc='Progress', total=len(dates))):
    results = Parallel(n_jobs=n_jobs)(delayed(experiment.run)(
        X = X_all,
        y = y_all,
        date = date,
        dates = ID_data['date'],
        areas = ID_data['area'],
        rooms = ID_data['room'],
        weightsModel = None,
        K = K,
        rho = rho,
        cost_params = cost_params,
        gurobi_params = gurobi_params,
        print_status = False) for date in dates)

# Finalize: stack the per-date result frames into one table
results = pd.concat(results).reset_index(drop=True)

# Save results
results.to_csv(path_results+'/'+SAA+'_'+run_suffix+'.csv', sep=',', index=False)

# Time
# NOTE(review): time.process_time() only counts CPU time of this process; if joblib runs the
# tasks in worker processes (its default), their CPU time is not part of the figure below.
print('Time:', dt.datetime.now().replace(microsecond=0) - start_time)
print('>> Execution time:', np.around(time.time()-st_exec, 0), "seconds")
print('>> CPU time:', np.around(time.process_time()-st_cpu, 0), "seconds")

Progress: 100%|██████████| 64/64 [1:17:46<00:00, 72.91s/it] 


Time: 1:17:47
>> Execution time: 4667.0 seconds
>> CPU time: 2.0 seconds
