# Init

In [34]:
# Import utils
import numpy as np
import pandas as pd
import math
import time
import json
import pyreadr
import pickle
from joblib import dump, load
import os
import copy
import datetime as dt


# Import ML models
import sklearn
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold

from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

from dddex.levelSetKDEx_univariate import LevelSetKDEx, LevelSetKDEx_NN
from dddex.wSAA import RandomForestWSAA, SampleAverageApproximation
from dddex.crossValidation import QuantileCrossValidation, QuantileCrossValidationLSx, groupedTimeSeriesSplit
from dddex.utils import generateFinalOutput

# Weights & Biases
import wandb

# Import Gurobi
import gurobipy as gp
from gurobipy import GRB

# Optimization Module
from DataDrivenPatientScheduling import PatientScheduling
from DataDrivenPatientScheduling import Experiment

In [35]:
# Setup the experiment
#
# All settings are collected in one dictionary and afterwards exported as
# notebook-level variables (see the `globals().update` call at the end of this cell).
experiment_setup = dict(

    # Set paths
    path_data = '/home/fesc/dddex/PatientScheduling/Data',
    path_results = '/home/fesc/dddex/PatientScheduling/Data/Results',

    # Set gurobi params
    # NOTE(review): a later cell assigns `gurobi_params` again (with 'Threads': 32)
    # before the optimization runs, so these values are overridden there - confirm
    # which setting is intended.
    gurobi_params = {
        'LogToConsole': 0,
        'Threads': 1
    },

    # Meta parameters (unpacked into every LGBMRegressor created in this notebook)
    model_params = {
        'random_state': 12345,
        'n_jobs': 4,
        'verbose': -1
    },

    # Hyper params search grid
    # NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0 and no
    # `subsample_freq` is set anywhere in this cell - confirm that bagging is active,
    # otherwise the `subsample` dimension of the search has no effect.
    hyper_params_grid = {
        'num_leaves': [10, 20, 40, 80, 160, 320],
        'max_depth': [-1, 3, 4, 5, 6, 7, 8, 9, 10],
        'min_child_samples': [10, 20, 40, 80, 160, 320],
        'learning_rate': [x/100 for x in range(5, 25, 5)],
        'n_estimators': [100, 200, 300, 400, 500],
        'subsample': [x/100 for x in range(5, 100, 5)],
        'colsample_bytree': [x/100 for x in range(5, 100, 5)]
    },

    # Tuning params (unpacked into the scikit-learn search object)
    tuning_params = {
        'n_iter': 100,
        'scoring': {'MSE': 'neg_mean_squared_error'},
        'return_train_score': True,
        'refit': 'MSE',
        'random_state': 12345,
        'n_jobs': 8,
        'verbose': 0
    },

    # Bin size grid
    binSizeGrid = {
        'binSize': list(range(100, 1000, 100))
    },

    # Tuning using random search
    random_search = True,

    # Status printing
    print_status = True,

    # Target quantiles
    quantiles = np.array([0.005, 0.025, 0.165, 0.250, 0.500, 0.750, 0.835, 0.900, 0.975, 0.995])

)

# Make all experiment variables visible as notebook-level names.
# `globals()` is used rather than `locals()`: writing to the `locals()` mapping only
# takes effect at module scope, whereas `globals()` states the intent explicitly and
# keeps working if this cell is ever moved into a function.
globals().update(experiment_setup)

# Preprocessing

In [36]:
# Load data: targets (Y), features (X) and row identifiers (ID).
# The files are read from the configured `path_data` folder instead of repeating the
# absolute path, so moving the data only requires changing the experiment setup.
Y_data = pd.read_csv(os.path.join(path_data, 'Y_data.csv'))
X_data = pd.read_csv(os.path.join(path_data, 'X_data.csv'))
ID_data = pd.read_csv(os.path.join(path_data, 'ID_data.csv'))

# Tuning point estimator

In [37]:
# Train-test split: select the rows flagged as 'train' in the ID table
train_mask = ID_data.train_test == 'train'

y_train = np.array(Y_data.loc[train_mask]).flatten()
X_train = np.array(X_data.loc[train_mask])
ID_train = ID_data.loc[train_mask]

In [38]:
# CV folds: three time-series splits over the positions of the training rows,
# materialized as a list so the same folds can be reused by several searches
tscv = TimeSeriesSplit(n_splits=3)
cv_folds = list(tscv.split(range(len(ID_train))))

In [39]:
# Regressor: LightGBM point predictor configured with the shared meta parameters only
# (random state, number of jobs, verbosity) - no tuned hyper parameters yet
estimator = LGBMRegressor(**model_params)

In [40]:
# Timer
start_time = dt.datetime.now().replace(microsecond=0)
st_exec = time.time()
st_cpu = time.process_time()

# Hyper params grid: work on a shallow copy so the shared experiment configuration is
# not modified when this cell is re-run, and keep only valid column-sampling fractions.
# `colsample_bytree` is a fraction of the features and has to lie in (0, 1]; it is not
# a feature count, so it must not be compared against the number of columns.
search_grid = dict(hyper_params_grid)
if 'colsample_bytree' in search_grid:
    search_grid['colsample_bytree'] = [
        fraction
        for fraction in search_grid['colsample_bytree']
        if 0 < fraction <= 1
    ]

# Base model
estimator = LGBMRegressor(**model_params)

# Tuning approach
if random_search:

    # Random search CV
    cv_search = RandomizedSearchCV(estimator=estimator,
                                   cv=cv_folds,
                                   param_distributions=search_grid,
                                   **tuning_params)

else:

    # Grid search CV: `n_iter` and `random_state` only exist for the randomized
    # search, passing them to GridSearchCV would raise a TypeError
    grid_search_params = {
        name: value
        for name, value in tuning_params.items()
        if name not in ('n_iter', 'random_state')
    }

    cv_search = GridSearchCV(estimator=estimator,
                             cv=cv_folds,
                             param_grid=search_grid,
                             **grid_search_params)

# Fit the cv search
cv_search.fit(X_train, y_train)

# Timer: wall-clock and CPU seconds spent since the start of this cell
exec_time_sec = time.time()-st_exec
cpu_time_sec = time.process_time()-st_cpu

# Status
if print_status:
    print('... took', dt.datetime.now().replace(microsecond=0) - start_time)

... took 0:00:05


In [41]:
# CV results: hyper parameter combination selected by the fitted search
# (the search is refit on the 'MSE' scorer, see `tuning_params`)
best_hyper_params = cv_search.best_params_

# Tuning density estimator

In [42]:
# Tuned point predictor: shared meta parameters plus the selected hyper parameters
tuned_point_estimator = LGBMRegressor(**model_params, **best_hyper_params)

# LSx model based on the tuned point predictor
LSKDEx = LevelSetKDEx(
    estimator=tuned_point_estimator,
    weightsByDistance=False,
)

# Set up bin size tuning: same CV folds as the point estimator search,
# candidates taken from the bin size grid, evaluated at the target quantiles
LSxCV = QuantileCrossValidation(
    estimator=LSKDEx,
    cvFolds=cv_folds,
    parameterGrid=binSizeGrid,
    probs=quantiles,
    n_jobs=8,
)

# Tune bin-size
LSxCV.fit(X_train, y_train)

# Data-driven optimization

For each patient $j = 1, \dots, M$ on the current day $t$, we have $N_t$ historical samples. Given patient $j$ and the associated feature vector $\boldsymbol{x}_{j,t}$, we obtain $N_t$ weights, one for each of these samples. Together, the weighted samples give an approximation of the empirical probability distribution of the surgery duration. We use this approximated distribution to draw $K$ scenarios for each patient $j = 1, \dots, M$.

In [43]:
# Model: identifier of the approach evaluated in this notebook
# (used as the prefix of the results file name when the results are saved)
model_name = 'LSx_LGBM'

In [44]:
# Test dates: distinct dates of all rows flagged as 'test', in ascending order
test_rows = ID_data.train_test == 'test'
dates = pd.Series(list(set(ID_data.loc[test_rows, 'date']))).sort_values()

In [45]:
# Initialize Experiment: its `run_ddps` method is called once per test date
# to run the data-driven optimization
exp = Experiment()

In [46]:
# Cost settings as (CR, waiting-time cost, overtime cost).
# In every row CR equals c_waiting_time / (c_waiting_time + c_overtime).
cost_params = [
    {'CR': cr, 'c_waiting_time': waiting_cost, 'c_overtime': overtime_cost}
    for cr, waiting_cost, overtime_cost in (
        (0.10, 1, 9),
        (0.25, 1, 3),
        (0.50, 1, 1),
        (0.75, 3, 1),
        (0.90, 9, 1),
    )
]

# Number of scenarios drawn per patient
K = 10 ** 4

# Gurobi settings used for the optimization runs
gurobi_params = dict(LogToConsole=0, Threads=32)

In [47]:
def _print_progress(done, total, started_at):
    """Print a single-line progress report that overwrites itself (ends with '\\r').

    Parameters
    ----------
    done : int
        Number of test dates processed so far.
    total : int
        Total number of test dates.
    started_at : datetime.datetime
        Start of the run, used to report the elapsed time.
    """
    # Round the percentage itself: truncating `np.around(share, 2) * 100` loses a
    # percent whenever the product is not exactly representable (0.29 * 100 -> 28).
    # An empty horizon is reported as complete instead of dividing by zero.
    percent = int(round(100 * done / total)) if total else 100
    elapsed = dt.datetime.now().replace(microsecond=0) - started_at
    print("Progress: "+str(percent)+"% ("+str(done)+"/"+str(total)+" days) |",
          "Current time: "+str(elapsed), end='\r')


# Initialize: the per-date results are collected in a list and concatenated once
# after the loop (growing a DataFrame with concat inside the loop is quadratic)
daily_results = []

# Timer
start_time = dt.datetime.now().replace(microsecond=0)
counter = 0

# Progress
_print_progress(counter, len(dates), start_time)

# For each date in the test horizon
for date in dates:

    # Progress
    counter += 1

    # Train-test split: everything strictly before the date is training data,
    # the rows of the date itself are the test data
    is_before_date = ID_data['date'] < date
    is_on_date = ID_data['date'] == date

    y_train, y_test = np.array(Y_data.loc[is_before_date]).flatten(), np.array(Y_data.loc[is_on_date]).flatten()
    X_train, X_test = np.array(X_data.loc[is_before_date]), np.array(X_data.loc[is_on_date])
    ID_train, ID_test = ID_data.loc[is_before_date], ID_data.loc[is_on_date]

    # Initialize weights model (best estimator from hyper param tuning)
    LSKDEx = LSxCV.bestEstimator

    # Fit weights model
    LSKDEx.fit(X_train, y_train)

    # Get time budget per area: median historical duration of each treatment.
    # The grouped `.median()` is used instead of passing `np.median` to `agg`,
    # which newer pandas versions warn about.
    historicalDurations = (
        pd.DataFrame({
            'duration': y_train,
            'treatment': ID_train['treatment']
        })
        .groupby('treatment')['duration']
        .median()
        .to_dict()
    )

    # Data-driven optimization
    result = exp.run_ddps(
        weightsModel = LSKDEx,
        X_test = X_test,
        y_test = y_test,
        date = date,
        dates = ID_test['date'],
        areas = ID_test['treatment'],
        hist_durations = historicalDurations,
        cost_params = cost_params,
        gurobi_params = gurobi_params,
        K = K,
        print_status = False
    )

    # Add result
    daily_results.append(result)

    # Progress
    _print_progress(counter, len(dates), start_time)

# Finalize: one concatenation over all dates; an empty horizon yields an empty frame
if daily_results:
    results = pd.concat(daily_results).reset_index(drop=True)
else:
    results = pd.DataFrame()

Progress: 100% (63/63 days) | Current time: 7:00:36

In [48]:
## Save results
# Make sure the results folder exists first, so that the long optimization run does
# not end with a missing-directory error when writing the file
os.makedirs(path_results, exist_ok=True)

# File name pattern: <model_name>_K<number of scenarios>.csv
results.to_csv(path_results+"/"+model_name+"_K"+str(K)+".csv", sep=',', index=False)