# Packages

In [1]:
import pandas as pd
import numpy as np
import itertools

from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

from dddex.levelSetKDEx_univariate import LevelSetKDEx, LevelSetKDEx_NN
from dddex.wSAA import RandomForestWSAA, SampleAverageApproximation
from dddex.crossValidation import QuantileCrossValidation, QuantileCrossValidationLSx, groupedTimeSeriesSplit
from dddex.utils import generateFinalOutput
# from generalFuncs.runModels import tunePredictSave, getCostsNV
# import generalFuncs

import wandb
import os
import re
import ipdb
import pickle
import shutil

KeyboardInterrupt: 

# Functions

## Main Function - Tune Predict Save

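NOTE: The `decisionData` and `costData` artifacts must already have been logged to W&B before `tunePredictSave` is called. This is required because `updateArtifact` currently extends an existing artifact version (it downloads `<name>:latest` and re-logs it with the new file added) and therefore cannot create the artifact from scratch.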

In [None]:
def tunePredictSave(quantileEstimator,
                    combinedCV,
                    model,
                    estimatorName):
    """
    Cross-validate a quantile estimator, predict the test quantiles and store the results.

    Two W&B runs are logged:
      1. `<model>_<estimatorName>[_combinedCV]`: decisions of `LSxCV.bestEstimator` for all probs.
      2. the same name with suffix `_SLTuning`: decisions of `LSxCV.bestEstimator_perProb[prob]`,
         i.e. a separately selected estimator for every prob.

    After each run the decisions and the costs per id are appended to the artifacts
    'decisionData' and 'costData' via `updateArtifact`.

    Parameters
    ----------
    quantileEstimator : estimator object handed to the cross-validation class.
    combinedCV : bool
        If True, `QuantileCrossValidationLSx` is used with `binSizeGrid` and the param grid of
        the point estimator; otherwise `QuantileCrossValidation` is used.
    model : str
        Model label; 'WSAA' switches the non-combined CV to a random search over `paramGridRF`.
    estimatorName : str
        Label of the point estimator; 'LGBM' selects the LGBM grids / params, anything else RF.

    Notes
    -----
    Reads the notebook globals project, weightsByDistance, kFolds, nIter, cvFolds, binSizeGrid,
    paramGridLGBM, paramGridRF, paramsLGBM, paramsRF, probs, XTrain, yTrain, XTest, scalingList,
    colnamesQuantile, data and costsPerID_SAA. Nothing is returned.
    """

    runName = model + "_" + estimatorName
    if combinedCV:
        runName = runName + "_" + "combinedCV"
    runName_SLTuning = runName + "_SLTuning"

    config = dict(isModellingRun = True,
                  model = model,
                  estimator = estimatorName,
                  weightsByDistance = weightsByDistance,
                  combinedCV = combinedCV,
                  SLTuning = False,
                  kFolds = kFolds,
                  nIter = nIter)

    wandb.init(project = project, name = runName, job_type = "modelling", config = config)

    #---

    usesLGBM = estimatorName == 'LGBM'

    if combinedCV:

        # The point estimator is tuned jointly with the bin size, hence no fixed estimator
        # params are attached to the output.
        paramsEstimatorForOutput = {}

        LSxCV = QuantileCrossValidationLSx(estimatorLSx = quantileEstimator,
                                           cvFolds = cvFolds,
                                           parameterGridLSx = binSizeGrid,
                                           parameterGridEstimator = paramGridLGBM if usesLGBM else paramGridRF,
                                           randomSearchEstimator = True,
                                           nIterEstimator = nIter,
                                           probs = probs,
                                           refitPerProb = True,
                                           n_jobs = 1)

    else:

        if model == 'WSAA':
            paramGrid, randomSearch, paramsEstimatorForOutput = paramGridRF, True, {}
        else:
            paramGrid, randomSearch = binSizeGrid, False
            paramsEstimatorForOutput = paramsLGBM if usesLGBM else paramsRF

        LSxCV = QuantileCrossValidation(estimator = quantileEstimator,
                                        cvFolds = cvFolds,
                                        parameterGrid = paramGrid,
                                        randomSearch = randomSearch,
                                        nIter = nIter,
                                        probs = probs,
                                        refitPerProb = True,
                                        n_jobs = 1)

    LSxCV.fit(X = XTrain,
              y = yTrain)

    #---

    # Results without Service-Level-Tuning
    bestEstimator = LSxCV.bestEstimator

    quantilesDf = bestEstimator.predict(X = XTest,
                                        probs = probs,
                                        outputAsDf = True,
                                        scalingList = scalingList)
    quantilesDf.columns = colnamesQuantile

    resDf = generateFinalOutput(dataOriginal = data,
                                dataDecisions = quantilesDf,
                                targetVariable = 'demand',
                                mergeOn = None,
                                variablesToAdd = ['dayIndex', 'date', 'scalingValue'],
                                scaleBy = 'scalingValue',
                                includeTraining = False,
                                sortBy = ['id', 'dayIndex'],
                                longFormat = True,
                                **paramsEstimatorForOutput,
                                **LSxCV.bestParams)

    costsPerID, costsPerSL = getCostsNV(resDf = resDf,
                                        costsPerID_SAA = costsPerID_SAA)

    wandb.log(costsPerSL)
    wandb.finish()

    # updateArtifact opens its own run, so the modelling run is closed beforehand.
    for artifactName, dataToAdd in (('decisionData', resDf), ('costData', costsPerID)):
        updateArtifact(name = artifactName,
                       dataToAdd = dataToAdd,
                       fileNameToAdd = runName)

    #---

    # Results with Service-Level-Tuning
    config['SLTuning'] = True
    wandb.init(project = project, name = runName_SLTuning, job_type = "modelling", config = config)

    def _decisionsForProb(prob):
        # Decisions of the estimator that was selected for this single prob.
        quantilesDf_prob = LSxCV.bestEstimator_perProb[prob].predict(X = XTest,
                                                                     probs = prob,
                                                                     outputAsDf = True,
                                                                     scalingList = scalingList)

        quantilesDf_prob.columns = ["quantile_" + str(int(prob * 1000))]

        return generateFinalOutput(dataOriginal = data,
                                   dataDecisions = quantilesDf_prob,
                                   targetVariable = 'demand',
                                   mergeOn = None,
                                   variablesToAdd = ['dayIndex', 'date', 'scalingValue'],
                                   scaleBy = 'scalingValue',
                                   includeTraining = False,
                                   longFormat = True,
                                   **paramsEstimatorForOutput,
                                   **LSxCV.bestParams_perProb[prob])

    resDf_SLTuning = pd.concat([_decisionsForProb(prob) for prob in probs], axis = 0)

    costsPerID_SLTuning, costsPerSL_SLTuning = getCostsNV(resDf = resDf_SLTuning,
                                                          costsPerID_SAA = costsPerID_SAA)

    wandb.log(costsPerSL_SLTuning)
    wandb.finish()

    for artifactName, dataToAdd in (('decisionData', resDf_SLTuning), ('costData', costsPerID_SLTuning)):
        updateArtifact(name = artifactName,
                       dataToAdd = dataToAdd,
                       fileNameToAdd = runName_SLTuning)

    # return resDf, costsPerID, costsPerSL

## Update Artifact

In [None]:
def updateArtifact(name,
                   dataToAdd,
                   fileNameToAdd,
                   alias = 'latest',
                   ):
    """
    Log a new version of an existing W&B artifact that additionally contains `dataToAdd`.

    The version `<name>:<alias>` is downloaded, every file in it is re-written into a fresh
    artifact with the same name, type, description and metadata, and `dataToAdd` is pickled
    into it as `<fileNameToAdd>.pkl`. The function opens (and closes) its own W&B run.

    Parameters
    ----------
    name : str
        Name of the artifact to extend. It must already have been logged.
    dataToAdd : object with a `to_pickle` method (e.g. a pandas DataFrame or Series).
    fileNameToAdd : str
        File name (without '.pkl') under which `dataToAdd` is stored.
    alias : str, default 'latest'
        Alias of the artifact version that serves as the starting point.

    Notes
    -----
    - A file named `<fileNameToAdd>.pkl` that is already part of the old version is NOT carried
      over, so that re-running a model replaces its previous result instead of clashing with
      the carried-over file of the same name.
    - The run is finished and the download folder removed even if an error occurs, so that a
      failed update does not leave an open run or stale files behind.
    - Reads the notebook global `project`.
    """

    newFileName = fileNameToAdd + '.pkl'
    artifactFolder = None

    run = wandb.init(project = project, name = 'updateArtifacts', job_type = 'updateArtifact', config = {'isModellingRun': False})

    try:
        artifactOld = run.use_artifact(name + ':' + alias)
        artifactFolder = artifactOld.download()

        artifactNew = wandb.Artifact(name = name,
                                     type = artifactOld.type,
                                     description = artifactOld.description,
                                     metadata = artifactOld.metadata)

        for fileOld in os.listdir(artifactFolder):

            # Superseded by the data added below
            if fileOld == newFileName:
                continue

            path = os.path.join(artifactFolder, fileOld)

            # NOTE(review): unpickling executes arbitrary code - only use with artifacts of
            # projects you trust.
            with open(path, 'rb') as file:
                dataOld = pd.read_pickle(file)

            with artifactNew.new_file(fileOld, mode = "wb") as file:
                dataOld.to_pickle(file)

        with artifactNew.new_file(newFileName, mode = "wb") as file:
            dataToAdd.to_pickle(file)

        wandb.log_artifact(artifactNew)

    finally:
        wandb.finish()

        if artifactFolder is not None:
            shutil.rmtree(artifactFolder, ignore_errors = True)

## Load Current Data To Continue after Break

In [None]:
def loadDataToContinue():
    """
    Restore the notebook state from the logged W&B artifacts to resume after an interruption.

    Downloads the latest versions of the artifacts 'predictorData', 'costData', 'modelData' and
    'cvData', loads their files into the module-level variables declared `global` below and
    deletes every download folder again. The W&B run opened for the download is finished at
    the end (also when loading fails), so that no run stays active after the call.

    Notes
    -----
    - Reads the notebook global `project`.
    - `weightsByDistance`, `model` and `estimatorName` are not restored by this function.
    - NOTE(review): all files are unpickled - only use with artifacts of projects you trust.
    """

    global costsPerID_SAA
    global estimatorLGBM
    global estimatorRF
    global paramsLGBM
    global paramsRF
    global data
    global XTrain
    global yTrain
    global XTest
    global yTest
    global scalingList
    global nIter
    global kFolds
    global probs
    global colnamesQuantile
    global cvFolds
    global binSizeGrid
    global paramGridLGBM
    global paramGridRF
    global paramGridRF_LGBM

    def _loadPickle(folder, fileName):
        # Unpickle `<folder>/<fileName>.pkl`
        with open(os.path.join(folder, fileName + ".pkl"), 'rb') as file:
            return pickle.load(file)

    #---

    run = wandb.init(project = project, name = "loadDataToContinue", job_type = "loadData", config = {'isModellingRun': False})

    try:

        # Tuned point predictors and their parameters
        predictorDataFolder = run.use_artifact('predictorData:latest').download()

        paramsLGBM = _loadPickle(predictorDataFolder, "paramsLGBM")
        paramsRF = _loadPickle(predictorDataFolder, "paramsRF")
        estimatorLGBM = _loadPickle(predictorDataFolder, "LGBM")
        estimatorRF = _loadPickle(predictorDataFolder, "RF")

        shutil.rmtree(predictorDataFolder)

        #---

        # Costs per id of SAA (reference for the cost ratios)
        costsDataFolder = run.use_artifact('costData:latest').download()

        costsPerID_SAA = pd.read_pickle(os.path.join(costsDataFolder, "SAA.pkl"))

        shutil.rmtree(costsDataFolder)

        #---

        # Full data set and train / test arrays
        modelDataFolder = run.use_artifact('modelData:latest').download()

        data = np.load(os.path.join(modelDataFolder, "data.pkl"), allow_pickle = True)
        scalingList = data['scalingValue'][data['label'] == 'test'].to_list()

        XTrain, yTrain, XTest, yTest = [np.load(os.path.join(modelDataFolder, fileName + ".pkl"), allow_pickle = True)
                                        for fileName in ["XTrain", "yTrain", "XTest", "yTest"]]

        shutil.rmtree(modelDataFolder)

        #---

        # Cross-validation settings, folds and param grids
        cvDataFolder = run.use_artifact('cvData:latest').download()

        cvHyperParameters = _loadPickle(cvDataFolder, "cvHyperParameters")
        nIter = cvHyperParameters["nIter"]
        kFolds = cvHyperParameters["kFolds"]

        probs = _loadPickle(cvDataFolder, "probs")
        colnamesQuantile = _loadPickle(cvDataFolder, "colnamesQuantile")
        cvFolds = _loadPickle(cvDataFolder, "cvFolds")
        binSizeGrid = _loadPickle(cvDataFolder, "binSizeGrid")
        paramGridLGBM = _loadPickle(cvDataFolder, "paramGridLGBM")
        paramGridRF = _loadPickle(cvDataFolder, "paramGridRF")
        paramGridRF_LGBM = _loadPickle(cvDataFolder, "paramGridRF_LGBM")

        shutil.rmtree(cvDataFolder)

    finally:
        # Every caller opens its own run afterwards, so the loading run must not stay active.
        wandb.finish()
    

## Get Newsvendor Costs

In [None]:
def getCostsNV(resDf,
               costsPerID_SAA = None):
    """
    Compute newsvendor costs of the decisions in `resDf`.

    The service level of a row is the first integer contained in its `decisionType`
    divided by 1000 (e.g. 'quantile_975' -> 0.975). With error = actuals - decisions the cost
    of a row is error * serviceLevel if error >= 0 and |error| * (1 - serviceLevel) otherwise.

    Parameters
    ----------
    resDf : pd.DataFrame
        Must contain the columns 'id', 'decisionType', 'actuals' and 'decisions'.
        A column 'costs' is added in place (callers store `resDf` including this column).
    costsPerID_SAA : pd.Series, optional
        Reference costs indexed by ('id', 'decisionType'). If given, the costs per id are
        divided by it, i.e. cost ratios are returned instead of absolute costs.

    Returns
    -------
    costsPerID : pd.Series
        Summed costs (or cost ratios) indexed by ('id', 'decisionType').
    costsPerSL : dict
        Mean of `costsPerID` over all ids for every decisionType.
    """

    decisionTypes = resDf['decisionType']

    # Parse every distinct decision type only once instead of running the regex per row
    levelPerType = {decisionType: int(re.findall('[0-9]+', decisionType)[0]) / 1000
                    for decisionType in decisionTypes.unique()}

    serviceLevels = np.array([levelPerType[decisionType] for decisionType in decisionTypes])

    errors = resDf['actuals'] - resDf['decisions']
    resDf['costs'] = np.where(errors >= 0, errors * serviceLevels, np.abs(errors) * (1 - serviceLevels))

    costsPerID = resDf.groupby(['id', 'decisionType'], sort = False)['costs'].sum()

    if costsPerID_SAA is not None:
        costsPerID = costsPerID / costsPerID_SAA

    # Grouping on the index level keeps this independent of the name of the Series, which is
    # lost by the division above whenever the reference Series is not called 'costs' as well.
    costsPerSL = costsPerID.groupby(level = 'decisionType', sort = False).mean().to_dict()

    return costsPerID, costsPerSL

# Setup Modelling

## Weights & Biases

### Environment Settings

In [None]:
# Environment variables read by W&B: name of this notebook and the silent flag
os.environ.update({'WANDB_NOTEBOOK_NAME': 'run_SID.ipynb',
                   'WANDB_SILENT': 'True'})

### Project Name

In [None]:
# Name of the W&B project; passed as `project` to every wandb.init call in this notebook
project = 'modellingSID'

### Load and Save Model Data

In [None]:
# Open the W&B run 'loadModelData' (not a modelling run)
config = dict(isModellingRun = False)

wandb.init(project = project,
           name = 'loadModelData',
           job_type = 'loadData',
           config = config)

In [None]:
# Read the complete data set (train and test rows are marked in the column 'label')
path = '/home/kagu/SID/data/dataSID.csv'
data = pd.read_csv(path)

# Optional: restrict the data to the first two ids for quick experiments
# ids = data.id.unique()[0:2]
# filtering = [ID in ids for ID in data.id]
# data = data[filtering]

# Features: all columns except the target and the identifying columns
X = np.array(data.drop(columns = ['demand', 'date', 'id', 'label']))
Y = np.array(data['demand'])

indicesTrain = data['label'] == 'train'
indicesTest = data['label'] == 'test'

XTrain, yTrain = X[indicesTrain], Y[indicesTrain]
XTest, yTest = X[indicesTest], Y[indicesTest]

dataTrain = data[indicesTrain]
dataTest = data[indicesTest]

# Scaling value of every test row, in row order
scalingList = dataTest['scalingValue'].to_list()

In [None]:
# Number of test rows of the id found at index label 0
firstID = data.id[0]
isTestOfFirstID = data[data.id == firstID].label == 'test'
testLength = sum(isTestOfFirstID)

# Number of distinct ids
numberOfIDs = len(pd.unique(data['id']))

In [None]:
# Artifact holding the complete data frame and the train/test arrays
modelData = wandb.Artifact(
    name = "modelData",
    type = "dataset",
    description = "Train/Test data used for modelling",
    metadata = {"testLength": testLength, "numberOfIDs": numberOfIDs},
)

with modelData.new_file("data.pkl", mode = "wb") as file:
    data.to_pickle(file)

# The arrays are stored with np.save, although the files carry the extension '.pkl'
arraysToSave = {"XTrain": XTrain,
                "yTrain": yTrain,
                "XTest": XTest,
                "yTest": yTest}

for name, dataset in arraysToSave.items():
    with modelData.new_file(name + ".pkl", mode = "wb") as file:
        np.save(file, dataset)

wandb.log_artifact(modelData)

### Set and Save Cross Validation Data

In [None]:
# Open the W&B run 'crossValidationData' (not a modelling run)
config = dict(isModellingRun = False)

wandb.init(project = project,
           name = 'crossValidationData',
           job_type = 'setCVData',
           config = config)

#### Probs of Interest

In [None]:
# 0.01, 0.02, ..., 0.99 plus six additional probabilities, in ascending order
additionalProbs = np.array([0.005, 0.025, 0.165, 0.835, 0.975, 0.995])
percentProbs = np.arange(1, 100) / 100

probs = np.sort(np.concatenate([additionalProbs, percentProbs]))
probs

In [None]:
# Column names 'quantile_<prob in permille>', e.g. 0.975 -> 'quantile_975'
probsPermille = np.around(probs * 1000, decimals = 0)
probsName = [str(int(permille)) for permille in probsPermille]

colnamesQuantile = ['quantile_' + permilleName for permilleName in probsName]

#### CV Hyperparameters

In [None]:
# nIter is used as the number of random search iterations, kFolds as the number of CV folds
nIter, kFolds = 120, 4

cvHyperParameters = dict(nIter = nIter, kFolds = kFolds)

#### Cross Validation Folds

In [None]:
# Time series split of the training data, grouped by 'id' and ordered by 'dayIndex'
cvFolds = groupedTimeSeriesSplit(
    data = dataTrain,
    kFolds = kFolds,
    testLength = testLength,
    groupFeature = 'id',
    timeFeature = 'dayIndex',
)

#### Param Grids

In [None]:
# Search space of the LGBM point predictor
paramGridLGBM = {'num_leaves': [10, 25, 40, 60, 80, 100, 150, 200, 250, 300],
                 'max_depth': [-1, 3, 4, 5, 6, 7],
                 'min_child_samples': [10, 30, 50, 75, 100, 150, 250, 400, 600, 800, 1000, 1500, 2000, 3000, 5000, 10000],
                 'learning_rate': [0.05, 0.1, 0.15, 0.2],
                 'n_estimators': [100, 200, 300, 400, 500, 600],
                 'subsample': [0.05, 0.1, 0.2, 0.3, 0.5, 0.75, 1],
                 'colsample_bytree': [0.05, 0.1, 0.2, 0.3, 0.5, 0.75, 1]}

# Search space with LGBM parameter names that is saved alongside the other grids
paramGridRF_LGBM = {'max_depth': [-1, 2, 3, 4, 5, 6, 7],
                    'min_child_samples': [10, 30, 50, 75, 100, 150, 250, 400, 600, 800, 1000, 1500, 2000, 3000, 5000, 10000],
                    'n_estimators': [100, 150, 200, 250, 300, 400, 500, 600],
                    'subsample': [0.1, 0.3, 0.5, 0.75],
                    'subsample_freq' : [1],
                    'colsample_bytree': [0.1, 0.3, 0.5, 0.75, 1]}

# Search space of the random forest: one grid with bootstrapping, one without.
# The upper bound of 'max_features' / 'max_samples' is written as 1.0 on purpose: scikit-learn
# interprets the integer 1 as "one feature" / "one sample" and only the float 1.0 as 100%.
paramGridRF = [{'max_depth': [2, 3, 4, 5, 6, 7, 8, 10],
                'min_samples_leaf': [10, 30, 50, 75, 100, 150, 250, 400, 600, 800, 1000, 1500, 2000, 3000, 5000, 10000],
                'max_features': [0.1, 0.3, 0.5, 0.75, 1.0],
                'n_estimators': [100, 150, 200, 250, 300, 400, 500, 600],
                'max_samples': [0.1, 0.3, 0.5, 0.75, 1.0],
                'bootstrap' : [True]},
               {'max_depth': [2, 3, 4, 5, 6, 7, 8, 10],
                'min_samples_leaf': [10, 30, 50, 75, 100, 150, 250, 400, 600, 800, 1000, 1500, 2000, 3000, 5000, 10000],
                'max_features': [0.1, 0.3, 0.5, 0.75, 1.0],
                'n_estimators': [100, 150, 200, 250, 300, 400, 500, 600],
                'max_samples': [None],
                'bootstrap' : [False]}]


#---

# Candidate bin sizes. The closing brace used to sit inside the trailing comment, which left
# the dict unclosed and made the cell a SyntaxError.
binSizeGrid = {'binSize': [200, 300, 400, 500, 750, 1000,
                           1500, 2000, 2500, 3000,
                           4000, 5000, 6000]}
                           # further candidates: 8000, 10000

# binSizeGrid = {'binSize': [200, 750, 1000]}

In [None]:
# Artifact with everything needed to repeat the cross-validation: probs, folds and grids
cvData = wandb.Artifact(
    name = "cvData",
    type = "model",
    description = "Cross-Validation Data - Contains Param Grids and Folds",
)

# File name (without extension) -> object to pickle
objectsToSave = {"probs": probs,
                 "colnamesQuantile": colnamesQuantile,
                 "cvHyperParameters": cvHyperParameters,
                 "cvFolds": cvFolds,
                 "paramGridLGBM": paramGridLGBM,
                 "paramGridRF_LGBM": paramGridRF_LGBM,
                 "paramGridRF": paramGridRF,
                 "binSizeGrid": binSizeGrid}

for name, grid in objectsToSave.items():
    with cvData.new_file(name + ".pkl", mode = "wb") as file:
        pickle.dump(grid, file)

wandb.log_artifact(cvData)

### Additional Artifacts Data

In [None]:
# Artifacts collecting the test decisions and the costs per id of all models.
# They are only created here; files are added and the artifacts logged in later cells.
decisionArtifact = wandb.Artifact(
    name = "decisionData",
    type = "decisions",
    description = "Test decisions for all models",
    metadata = {"testLength": testLength, "numberOfIDs": numberOfIDs},
)

costArtifact = wandb.Artifact(
    name = "costData",
    type = "costs",
    description = "Costs per id for all models",
    metadata = {"testLength": testLength, "numberOfIDs": numberOfIDs},
)

# Tune Point Predictors

## LGBM

In [None]:
# Open the W&B run for tuning the LGBM point predictor
config = dict(isModellingRun = False,
              estimator = 'LGBM',
              kFolds = kFolds,
              nIter = nIter)

wandb.init(project = project,
           name = "tuneLGBM",
           job_type = "tuneModels",
           config = config)

In [None]:
# Randomized search over paramGridLGBM on the predefined folds, refitting the best setting
searchSettings = dict(cv = cvFolds,
                      n_iter = nIter,
                      param_distributions = paramGridLGBM,
                      scoring = 'neg_mean_squared_error',
                      refit = True,
                      return_train_score = True,
                      n_jobs = 10,
                      random_state = 4444,
                      verbose = 0)

# The regressor itself runs single-threaded
estimator = LGBMRegressor(n_jobs = 1)

paramSearch = RandomizedSearchCV(estimator = estimator, **searchSettings)
paramSearch.fit(X = XTrain, y = yTrain)

estimatorLGBM, paramsLGBM = paramSearch.best_estimator_, paramSearch.best_params_

wandb.finish()

## RF

In [None]:
# Open the W&B run for tuning the random forest point predictor
config = dict(isModellingRun = False,
              estimator = 'RF',
              kFolds = kFolds,
              nIter = nIter)

wandb.init(project = project,
           name = "tuneRF",
           job_type = "tuneModels",
           config = config)

In [None]:
# Randomized search over paramGridRF on the predefined folds, refitting the best setting
searchSettings = dict(cv = cvFolds,
                      n_iter = nIter,
                      param_distributions = paramGridRF,
                      scoring = 'neg_mean_squared_error',
                      refit = True,
                      return_train_score = True,
                      n_jobs = 10,
                      random_state = 4444,
                      verbose = 0)

# The regressor itself runs single-threaded
estimator = RandomForestRegressor(n_jobs = 1)

paramSearch = RandomizedSearchCV(estimator = estimator, **searchSettings)
paramSearch.fit(X = XTrain, y = yTrain)

estimatorRF, paramsRF = paramSearch.best_estimator_, paramSearch.best_params_

wandb.finish()

## Save Models + Params

In [None]:
# Store the tuned point predictors and their parameters in the artifact 'predictorData'
config = dict(isModellingRun = False,
              kFolds = kFolds,
              nIter = nIter)

wandb.init(project = project, name = 'saveTunedPredictors', job_type = 'saveData', config = config)

predictorData = wandb.Artifact(
    name = "predictorData",
    type = "model",
    description = "Tuned and fitted point predictor models and their parameters.",
)

# File name -> object to pickle
predictorFiles = {"LGBM.pkl": estimatorLGBM,
                  "RF.pkl": estimatorRF,
                  "paramsLGBM.pkl": paramsLGBM,
                  "paramsRF.pkl": paramsRF}

for fileName, objectToSave in predictorFiles.items():
    with predictorData.new_file(fileName, mode = "wb") as file:
        pickle.dump(objectToSave, file)

wandb.log_artifact(predictorData)

wandb.finish()

# Run Models

## SAA 

In [None]:
# Open the modelling run of SAA
config = dict(isModellingRun = True,
              model = 'SAA',
              kFolds = kFolds,
              nIter = nIter)

wandb.init(project = project,
           name = "SAA",
           job_type = "modelling",
           config = config)

In [None]:
# SAA is fitted separately on the training demand of every id
IDs = data['id'].unique()

dataResultsList = list()

for ID in IDs:

    data_id = data[data['id'] == ID]

    indicesTrain_id = data_id['label'] == 'train'
    indicesTest_id = data_id['label'] == 'test'

    y_id = np.array(data_id['demand'])
    X_id = np.array(data_id.drop(columns = ['demand', 'id', 'label']))

    yTrain_id, XTrain_id = y_id[indicesTrain_id], X_id[indicesTrain_id]
    yTest_id, XTest_id = y_id[indicesTest_id], X_id[indicesTest_id]

    scalingList_id = data_id[indicesTest_id]['scalingValue'].tolist()

    #---

    SAA_id = SampleAverageApproximation()
    SAA_id.fit(y = yTrain_id)

    # SAA uses no features (X = None); its prediction is repeated once per test observation
    quantilesDfOneOb = SAA_id.predict(X = None,
                                      probs = probs,
                                      outputAsDf = True,
                                      scalingList = scalingList_id)

    nTest_id = XTest_id.shape[0]
    quantilesDf_id = pd.concat([quantilesDfOneOb] * nTest_id, axis = 0).reset_index(drop = True)
    quantilesDf_id.columns = colnamesQuantile

    resDf_id = generateFinalOutput(
        dataOriginal = data_id,
        dataDecisions = quantilesDf_id,
        targetVariable = 'demand',
        mergeOn = None,
        variablesToAdd = ['dayIndex', 'date', 'scalingValue'],
        scaleBy = 'scalingValue',
        includeTraining = False,
        sortBy = ['id', 'dayIndex'],
        longFormat = True,
    )

    dataResultsList.append(resDf_id)

#---

# Stack the results of all ids
resDf = pd.concat(dataResultsList, axis = 0).reset_index(drop = True)
                                          

In [None]:
# Absolute SAA costs: no reference is passed, so no ratio is computed
costsPerID_SAA, costsPerSL_SAA = getCostsNV(resDf = resDf)

# We need to reuse costsPerSL_SAA later on to compute the other cost ratios
costsPerSL_SAA = pd.Series(costsPerSL_SAA)

# SAA is the reference, hence its own cost ratio is 1 for every decision type
costsRatioPerSL_SAA = dict.fromkeys(costsPerSL_SAA.index, 1)

In [None]:
# wandb.log({'costsPerSL': wandb.Table(data = pd.DataFrame([costsRatioPerSL_SAA]))})
wandb.log(costsRatioPerSL_SAA)

# Add the SAA decisions and costs to the (not yet logged) artifacts
for artifact, dataToSave in ((decisionArtifact, resDf), (costArtifact, costsPerID_SAA)):
    with artifact.new_file("SAA.pkl", mode = "wb") as file:
        dataToSave.to_pickle(file)

The next step is necessary to be able to resume computation at any given point without rerunning SAA.

In [None]:
# costsSAA = wandb.Artifact(name = "costsSAA", 
#                           type = "dataset",
#                           description = "Costs per ID of SAA (fitted per ID)")

# with costsSAA.new_file("costsPerID_SAA.pkl", mode = "wb") as file:
#     costsPerID_SAA.to_pickle(file)

# wandb.log_artifact(costsSAA)
    
# wandb.finish()

## SAA Global

In [None]:
# Open the modelling run of the global SAA
config = dict(isModellingRun = True,
              model = 'SAA_global',
              kFolds = kFolds,
              nIter = nIter)

wandb.init(project = project,
           name = "SAA_global",
           job_type = "modelling",
           config = config)

In [None]:
# Global SAA: fitted once on the training demand of all ids together
SAA_global = SampleAverageApproximation()
SAA_global.fit(y = yTrain)

In [None]:
# Unscaled prediction (scalingList = None); it is repeated once per test observation
quantilesDfOneOb = SAA_global.predict(X = None,
                                      probs = probs,
                                      outputAsDf = True,
                                      scalingList = None)

nTest = XTest.shape[0]
quantilesDf = pd.concat([quantilesDfOneOb] * nTest, axis = 0).reset_index(drop = True)
quantilesDf.columns = colnamesQuantile

# Multiply row i by scalingList[i]
scalingArray = np.array(scalingList)
quantilesDf = (quantilesDf.T * scalingArray).T

In [None]:
# Combine the decisions with the test rows of the original data (long format)
resDf = generateFinalOutput(
    dataOriginal = data,
    dataDecisions = quantilesDf,
    targetVariable = 'demand',
    mergeOn = None,
    variablesToAdd = ['dayIndex', 'date', 'scalingValue'],
    scaleBy = 'scalingValue',
    includeTraining = False,
    sortBy = ['id', 'dayIndex'],
    longFormat = True,
)

In [None]:
# Costs relative to the per-id SAA costs
costsPerID, costsPerSL = getCostsNV(resDf = resDf, costsPerID_SAA = costsPerID_SAA)

In [None]:
wandb.log(costsPerSL)

# Add the global SAA decisions and cost ratios to the (not yet logged) artifacts
for artifact, dataToSave in ((decisionArtifact, resDf), (costArtifact, costsPerID)):
    with artifact.new_file("SAA_global.pkl", mode = "wb") as file:
        dataToSave.to_pickle(file)

In [None]:
# First logging of both artifacts; updateArtifact needs an existing version to extend
for artifact in (decisionArtifact, costArtifact):
    wandb.log_artifact(artifact)

wandb.finish()

## LSx Standard - LGBM

In [None]:
# Label of the point predictor used in this section; passed on to tunePredictSave
estimatorName = 'LGBM'

### weightsByDistance = False

In [None]:
# LevelSetKDEx on top of the tuned LGBM point predictor, weightsByDistance switched off
model, weightsByDistance = 'LSx', False

LSKDEx = LevelSetKDEx(estimator = estimatorLGBM, weightsByDistance = weightsByDistance)

In [None]:
# Tune the estimator on its own (no combined CV), predict and store the results
tunePredictSave(
    quantileEstimator = LSKDEx,
    combinedCV = False,
    model = model,
    estimatorName = estimatorName,
)

In [None]:
# tunePredictSave(quantileEstimator = LSKDEx,
#                 combinedCV = True,
#                 model = model,
#                 estimatorName = estimatorName)

### weightsByDistance = True

In [None]:
# LevelSetKDEx on top of the tuned LGBM point predictor, weightsByDistance switched on
model, weightsByDistance = 'LSx_distWeights', True

LSKDEx = LevelSetKDEx(estimator = estimatorLGBM, weightsByDistance = weightsByDistance)

In [None]:
# Tune the estimator on its own (no combined CV), predict and store the results
tunePredictSave(
    quantileEstimator = LSKDEx,
    combinedCV = False,
    model = model,
    estimatorName = estimatorName,
)

In [None]:
# tunePredictSave(quantileEstimator = LSKDEx,
#                 combinedCV = True,
#                 model = model,
#                 estimatorName = estimatorName)

## LSx Standard - RF

In [None]:
# Label of the point predictor used in this section; passed on to tunePredictSave
estimatorName = 'RF'

### weightsByDistance = False

In [None]:
# LevelSetKDEx on top of the tuned RF point predictor, weightsByDistance switched off
model, weightsByDistance = 'LSx', False

LSKDEx = LevelSetKDEx(estimator = estimatorRF, weightsByDistance = weightsByDistance)

In [None]:
# Tune the estimator on its own (no combined CV), predict and store the results
tunePredictSave(
    quantileEstimator = LSKDEx,
    combinedCV = False,
    model = model,
    estimatorName = estimatorName,
)

In [None]:
# tunePredictSave(quantileEstimator = LSKDEx,
#                 combinedCV = True,
#                 model = model,
#                 estimatorName = estimatorName)

### weightsByDistance = True

In [None]:
# LevelSetKDEx on top of the tuned RF point predictor, weightsByDistance switched on
model, weightsByDistance = 'LSx_distWeights', True

LSKDEx = LevelSetKDEx(estimator = estimatorRF, weightsByDistance = weightsByDistance)

In [None]:
# Tune the estimator on its own (no combined CV), predict and store the results
tunePredictSave(
    quantileEstimator = LSKDEx,
    combinedCV = False,
    model = model,
    estimatorName = estimatorName,
)

In [None]:
# tunePredictSave(quantileEstimator = LSKDEx,
#                 combinedCV = True,
#                 model = model,
#                 estimatorName = estimatorName)

## LSx NN - LGBM

In [None]:
# LevelSetKDEx_NN on top of the tuned LGBM point predictor.
# weightsByDistance is not an argument of the estimator here; it is only read by
# tunePredictSave for the run config.
estimatorName, model, weightsByDistance = 'LGBM', 'LSx_NN', True

LSKDEx = LevelSetKDEx_NN(estimator = estimatorLGBM)

In [None]:
# Tune the estimator on its own (no combined CV), predict and store the results
tunePredictSave(
    quantileEstimator = LSKDEx,
    combinedCV = False,
    model = model,
    estimatorName = estimatorName,
)

In [None]:
# tunePredictSave(quantileEstimator = LSKDEx,
#                 combinedCV = True,
#                 model = model,
#                 estimatorName = estimatorName)

## RF WSAA

In [None]:
# Random forest wSAA; with model = 'WSAA' tunePredictSave runs a random search over paramGridRF
estimatorName, model, weightsByDistance = 'RF', 'WSAA', False

RFWSAA = RandomForestWSAA()

#---

tunePredictSave(
    quantileEstimator = RFWSAA,
    combinedCV = False,
    model = model,
    estimatorName = estimatorName,
)

## RF WSAA - No Tuning

In [None]:
# Open the modelling run of the random forest wSAA without separate tuning
config = dict(isModellingRun = True,
              model = "WSAA",
              estimator = "RF",
              weightsByDistance = False,
              combinedCV = False,
              kFolds = kFolds,
              nIter = nIter)

wandb.init(project = project,
           name = "RF_standard",
           job_type = "modelling",
           config = config)

In [None]:
# Reuse the parameters found for the RF point predictor instead of tuning the wSAA itself
RFWSAA = RandomForestWSAA(**paramsRF)
RFWSAA.fit(X = XTrain, y = yTrain)

predictSettings = dict(X = XTest,
                       probs = probs,
                       outputAsDf = True,
                       scalingList = scalingList)

quantilesDf = RFWSAA.predict(**predictSettings)
quantilesDf.columns = colnamesQuantile

In [None]:
# Combine the decisions with the test rows of the original data (long format)
resDf = generateFinalOutput(
    dataOriginal = data,
    dataDecisions = quantilesDf,
    targetVariable = 'demand',
    mergeOn = None,
    variablesToAdd = ['dayIndex', 'date', 'scalingValue'],
    scaleBy = 'scalingValue',
    includeTraining = False,
    sortBy = ['id', 'dayIndex'],
    longFormat = True,
)

In [None]:
# Costs relative to the per-id SAA costs
costsPerID, costsPerSL = getCostsNV(resDf = resDf, costsPerID_SAA = costsPerID_SAA)

In [None]:
wandb.log(costsPerSL)

# Close the modelling run before updating the artifacts: updateArtifact opens and finishes
# its own run (same order as in tunePredictSave).
wandb.finish()

updateArtifact(name = 'decisionData',
               dataToAdd = resDf,
               fileNameToAdd = 'RF_standard')

# Same file name as for the decisions (was misspelled 'RF_stanard')
updateArtifact(name = 'costData',
               dataToAdd = costsPerID,
               fileNameToAdd = 'RF_standard')