In [None]:
# Import utils
import numpy as np
import pandas as pd
import copy
import time
import datetime as dt
import pickle
import json
import joblib
from joblib import dump, load
import os
import pyreadr

from sklearn.preprocessing import MinMaxScaler

In [None]:
import WeightsKernel
from WeightsKernel import RandomForestWeightsKernel
from WeightsKernel import PreProcessor

In [None]:
#### Reshaping based on rolling horizon (iid)
def reshape_data(ID_Data_train, X_Data_train, Y_Data_train, test_start, tau, iid = True):

    """

    Builds look-ahead demand vectors for the training data of a rolling horizon.

    For every row, the demand of the same SKU in the current row and the next tau rows is collected
    into the columns Y0, ..., Y<tau>. Rows are shifted within each SKU in the order in which they are
    passed in; nothing is sorted here, so the caller has to provide data that is ordered by
    sale_yearweek within each SKU. Rows whose demand vector contains NaN (look-ahead running over the
    end of the training data) are removed. If iid == True, only every (tau+1)'th sale_yearweek is kept,
    counting backwards from min(last remaining sale_yearweek, test_start-(tau+1)), so that the demand
    vectors of the remaining rows do not overlap.

    Inputs:

        ID_Data_train: data frame storing identifiers (columns 'SKU' and 'sale_yearweek' are used)
        X_Data_train: feature data frame (row-aligned with ID_Data_train)
        Y_Data_train: demand data frame with column 'Y' (row-aligned with ID_Data_train)
        test_start: start period of test horizon
        tau: look-ahead (non-negative integer)
        iid: True if data should be reshaped to iid

    Outputs: 

        id_train: reshaped data frame storing identifiers
        X_train: reshaped feature data as np.array 
        y_train: reshaped demand data as np.array (1-d if tau == 0, else one column per look-ahead)

    Raises:

        ValueError: if tau is negative


    """

    if tau < 0:
        raise ValueError('tau must be non-negative')

    # Combine IDs with Y data so that look-aheads can be taken per SKU
    data = pd.concat([ID_Data_train, Y_Data_train], axis=1)

    # Create look-aheads given rolling horizon (k is a separate name so that tau is not overwritten)
    demand_by_sku = data.groupby(['SKU'])['Y']
    Y_Data_train = pd.DataFrame({'Y'+str(k): demand_by_sku.shift(-k) for k in range(0, tau+1)})

    # Remove look-ahead that would run over training horizon (mask is computed once for all frames)
    complete_rows = ~np.isnan(np.array(Y_Data_train)).any(axis=1)
    ID_Data_train = ID_Data_train.loc[complete_rows]
    X_Data_train = X_Data_train.loc[complete_rows]
    Y_Data_train = Y_Data_train.loc[complete_rows]

    ## Reshape to iid
    if iid:

        # Last sale_yearweek whose demand vector still ends before the test horizon
        max_sale_yearweek = min(max(ID_Data_train.sale_yearweek), test_start-(tau+1))

        # Every (tau+1)'th sale_yearweek counting backwards from there, as long as it is positive
        slices = np.arange(max_sale_yearweek, 0, -(tau+1))

        # Apply slices
        keep_rows = ID_Data_train.sale_yearweek.isin(slices)
        Y_Data_train = Y_Data_train.loc[keep_rows]
        X_Data_train = X_Data_train.loc[keep_rows]
        ID_Data_train = ID_Data_train.loc[keep_rows]

    # Transform data to arrays
    id_train = ID_Data_train
    X_train = np.array(X_Data_train)
    y_train = np.array(Y_Data_train)
    y_train = y_train.flatten() if y_train.shape[1] == 1 else y_train

    # Return 
    return id_train, X_train, y_train

In [None]:
#### Reshaping based on rolling horizon (iid)
def reshape_data(data, timePeriods, maxTimePeriod, tau, iid = True):

    """

    Selects the rows of data that belong to every (tau+1)'th time period, counting backwards from
    min(max(timePeriods), maxTimePeriod-tau) for as long as the time period is positive. With
    look-ahead tau, the rolling-horizon windows starting at the selected time periods do not overlap.

    Note: this definition replaces the reshape_data defined in the previous cell (different
    signature). It only selects rows; it does not create look-ahead columns.

    Inputs:

        data: data frame to select rows from
        timePeriods: series with the time period of each row of data (row-aligned with data)
        maxTimePeriod: last time period that may be covered by a rolling-horizon window
        tau: look-ahead (non-negative integer)
        iid: True if data should be reshaped to iid; if False, data is returned unchanged

    Outputs:

        sliced_data: data frame with the selected rows of data

    Raises:

        ValueError: if tau is negative

    """

    if tau < 0:
        raise ValueError('tau must be non-negative')

    # Nothing to select without iid reshaping (previously this path failed with an unbound variable)
    if not iid:
        return data

    # Last time period whose look-ahead still ends at or before maxTimePeriod
    last_start = min(max(timePeriods), maxTimePeriod-tau)

    # Every (tau+1)'th time period counting backwards from there, as long as it is positive
    slices = np.arange(last_start, 0, -(tau+1))

    # Apply slices
    sliced_data = data.loc[timePeriods.isin(slices)]

    # Return 
    return sliced_data

In [None]:
def scale_variables(vars_to_scale, vars_to_scale_with, scaler):
    """
    Fits scaler on vars_to_scale_with and applies the fitted scaler to vars_to_scale.

    Inputs:

        vars_to_scale: data to be transformed
        vars_to_scale_with: data the scaler is fitted on
        scaler: object providing fit(...), which returns the fitted scaler, and transform(...)

    Outputs:

        vars_scaled: result of transforming vars_to_scale with the fitted scaler
        scaler_fitted: the object returned by scaler.fit(vars_to_scale_with)
    """
    fitted = scaler.fit(vars_to_scale_with)
    return fitted.transform(vars_to_scale), fitted

In [None]:
# Project folders. Names assigned at the top level of a cell are already global, so no
# `global` statement is needed. All folders are derived from PATH_BASE so that the project
# location only has to be changed in one place.
PATH_BASE = '/home/fesc/MM/'
os.chdir(PATH_BASE)

PATH_DATA = os.path.join(PATH_BASE, 'Data')            # '/home/fesc/MM/Data'
PATH_PARAMS = os.path.join(PATH_DATA, 'Params')        # '/home/fesc/MM/Data/Params'
PATH_KERNELS = os.path.join(PATH_DATA, 'Kernels')      # '/home/fesc/MM/Data/Kernels'
PATH_SAMPLES = os.path.join(PATH_DATA, 'Samples')      # '/home/fesc/MM/Data/Samples'
PATH_RESULTS = os.path.join(PATH_DATA, 'Results')      # '/home/fesc/MM/Data/Results'

In [None]:
# Planning horizon: periods t = 0, ..., T-1 and look-aheads tau = 0, ..., Tau-1
T, Tau = 13, 5
t_range, tau_range = range(T), range(Tau)

# SKU_range covers 1, ..., 460 (the end of a range is exclusive)
SKU_range = range(1, 461)

# Train/test split: first sale_yearweek of the test horizon
test_start = 114

In [None]:
# Read the identifier, feature, feature-description and demand tables from PATH_DATA
ID_Data, X_Data, X_Data_Columns, Y_Data = [
    pd.read_csv(PATH_DATA+csv_file)
    for csv_file in ('/ID_Data.csv', '/X_Data.csv', '/X_Data_Columns.csv', '/Y_Data.csv')
]

In [None]:
# Selection of training data: all rows whose sale_yearweek lies before the test horizon
in_training_horizon = ID_Data.sale_yearweek < test_start

ID_Data_train, X_Data_train, Y_Data_train = (
    frame.loc[in_training_horizon] for frame in (ID_Data, X_Data, Y_Data)
)

In [None]:
## Features

# Features flagged with Scale == 'YES' are scaled; 'ScaleWith' names the column whose
# training values the scaler is fitted on
scale_flag = X_Data_Columns.Scale == 'YES'
vars_to_scale_names = X_Data_Columns.loc[scale_flag, 'Feature'].values
vars_to_scale_with_names = X_Data_Columns.loc[scale_flag, 'ScaleWith'].values

# Transform all rows, but fit the scaler on the training rows only
vars_to_scale = np.array(X_Data[vars_to_scale_names])
vars_to_scale_with = np.array(X_Data_train[vars_to_scale_with_names])

# Scale and add back column names
scaler = MinMaxScaler()
vars_scaled, scaler_fitted = scale_variables(vars_to_scale, vars_to_scale_with, scaler)
vars_scaled = pd.DataFrame(vars_scaled, columns=vars_to_scale_names)

# Scaled features: copy of X_Data in which the scaled columns replace the original ones
X_Data_scaled = copy.deepcopy(X_Data)
for col, scaled_values in vars_scaled.items():
    X_Data_scaled[col] = scaled_values

In [None]:
## Demand

# Fresh scaler for demand: fitted on training demand, applied to all demand.
# Note that scaler, scaler_fitted and vars_scaled now refer to demand, no longer to the features.
scaler = MinMaxScaler()
vars_to_scale, vars_to_scale_with = np.array(Y_Data), np.array(Y_Data_train)

# Scale
vars_scaled, scaler_fitted = scale_variables(vars_to_scale, vars_to_scale_with, scaler)

# Scaled demand as data frame with a single column 'Y'
Y_Data_scaled = pd.DataFrame(vars_scaled, columns=['Y'])

In [None]:
### Reshaping for multi-period modelling

In [None]:
# Selection of training data for scaled data (rows before the test horizon)
in_training_horizon = ID_Data.sale_yearweek < test_start
X_Data_scaled_train = X_Data_scaled.loc[in_training_horizon]
Y_Data_scaled_train = Y_Data_scaled.loc[in_training_horizon]

In [None]:
# Reshape demand to multi-period: column 'Y'+tau holds the demand of the same SKU tau rows ahead
# (NaN where the look-ahead runs over the end of the training data).
# The training rows are taken from ID_Data / Y_Data_scaled rather than from Y_Data_scaled_train,
# because this cell overwrites Y_Data_scaled_train: reading it here made a second run of the cell
# fail with a KeyError for 'Y'.
in_training_horizon = ID_Data.sale_yearweek < test_start
data = pd.concat([ID_Data.loc[in_training_horizon], Y_Data_scaled.loc[in_training_horizon]], axis=1)

# Shifting within the training rows of each SKU only, so no test demand enters the look-aheads
demand_by_sku = data.groupby(['SKU'])['Y']
Y = {}
for tau in tau_range:
    Y['Y'+str(tau)] = demand_by_sku.shift(-tau)

Y_Data_scaled_train = pd.DataFrame(Y)

In [None]:
## Set paths and names

# CV results and fitted kernels are stored in the same folder
path_cv = path_kernel = '/home/fesc/MM/Data/Kernels'

# File name stems for CV results and fitted kernels
name_cv, name_kernel = 'cv_rfwk_global_scaled', 'rfwk_global_scaled'

In [None]:
## Set parameters to tune random forest weights kernels

# Fixed model parameters passed to the kernel during tuning
model_params = dict(
    oob_score=True,
    random_state=12345,
    n_jobs=4,
    verbose=0,
)

# Candidate values per hyper parameter (single-element lists are held fixed)
hyper_params_grid = {
    'n_estimators': [1000],
    'max_depth': [None],
    'min_samples_split': list(range(20, 1000, 20)),
    'min_samples_leaf': list(range(10, 1000, 10)),
    'max_features': list(range(8, 256, 8)),
    'max_leaf_nodes': [None],
    'min_impurity_decrease': [0.0],
    'bootstrap': [True],
    'max_samples': [0.75, 0.80, 0.85, 0.90, 0.95, 1.00],
}

# Settings of the search itself
tuning_params = dict(
    random_search=True,
    n_iter=100,
    scoring={'MSE': 'neg_mean_squared_error'},
    return_train_score=True,
    refit='MSE',
    random_state=12345,
    n_jobs=8,
    verbose=2,
)

In [None]:
## Tune random forest weights kernels

# For tau=0,...,4
for tau in tau_range:

    # Demand for look-aheads 0,...,tau as array
    y_train = np.array(Y_Data_scaled_train.iloc[:,0:(tau+1)])

    # Rows whose look-ahead does not run over the training horizon (no NaN in the demand vector)
    complete_rows = ~np.isnan(y_train).any(axis=1)

    # Keep only those rows in identifiers, features and demand
    id_train = ID_Data_train.loc[complete_rows,:]
    X_train = np.array(X_Data_scaled_train)[complete_rows,:]
    y_train = y_train[complete_rows,:]

    # Single-period demand is passed as 1-d array
    if y_train.shape[1] == 1:
        y_train = y_train.flatten()

    # Initialize
    prep = PreProcessor()
    kernel = RandomForestWeightsKernel()

    # CV folds
    cv_folds = prep.split_timeseries_cv(n_splits=3, timePeriods=id_train.sale_yearweek)

    # CV search
    cv_results = kernel.tune(
        X=X_train,
        y=y_train,
        cv_folds=cv_folds,
        model_params=model_params,
        tuning_params=tuning_params,
        hyper_params_grid=hyper_params_grid,
    )

    # Save (one file per look-ahead)
    kernel.save_cv_result(path=path_cv+'/'+name_cv+'_tau'+str(tau)+'.joblib')

In [None]:
## Fit random forest weights kernels and generate weights

# Largest look-ahead of the planning horizon. It is read once up front and the loop below uses
# its own range per period: overwriting the global tau_range inside the loop left a shrunken
# range behind for every cell that runs afterwards (and for a re-run of this cell).
max_tau = max(tau_range)

# Multi-period demand over all rows, used for the test targets: column 'Y'+k holds the demand
# of the same SKU k rows ahead (NaN where the data ends)
demand_by_sku = pd.concat([ID_Data, Y_Data_scaled], axis=1).groupby(['SKU'])['Y']
Y_Data_scaled_lookahead = pd.DataFrame({'Y'+str(k): demand_by_sku.shift(-k) for k in range(0, max_tau+1)})

# For period t=1,...,T
for t in t_range:

    # Selection of training data (all rows before the current test period)
    is_train = ID_Data.sale_yearweek < test_start + t
    ID_Data_train = ID_Data.loc[is_train]
    X_Data_scaled_train = X_Data_scaled.loc[is_train]

    # Multi-period training demand. Y_Data_scaled only has the single column 'Y', so selecting
    # rows from it gave single-period targets for every tau. The look-aheads are rebuilt here by
    # shifting within the training rows of each SKU only, which leaves NaN wherever a look-ahead
    # would run over the training horizon.
    train_demand_by_sku = pd.concat([ID_Data_train, Y_Data_scaled.loc[is_train]], axis=1).groupby(['SKU'])['Y']
    Y_Data_scaled_train = pd.DataFrame({'Y'+str(k): train_demand_by_sku.shift(-k) for k in range(0, max_tau+1)})

    # Selection of test data (rows of the current test period)
    is_test = ID_Data.sale_yearweek == test_start + t
    ID_Data_test = ID_Data.loc[is_test]
    X_Data_scaled_test = X_Data_scaled.loc[is_test]
    Y_Data_scaled_test = Y_Data_scaled_lookahead.loc[is_test]

    # Features as arrays (identical for all tau of this period)
    X_train_all, X_test = np.array(X_Data_scaled_train), np.array(X_Data_scaled_test)

    # For tau=0,...,4, adjusted to account of end of planning horizon
    for tau in range(0, min(max_tau, T-t-1)+1):

        # Print status
        start_time = dt.datetime.now().replace(microsecond=0)
        print('## Period = '+str(t+1)+', rolling horizon = '+str(tau+1)+': Fitting random forest weights kernel...')

        # Demand for look-aheads 0,...,tau as arrays
        y_train = np.array(Y_Data_scaled_train.iloc[:,0:(tau+1)])
        y_test = np.array(Y_Data_scaled_test.iloc[:,0:(tau+1)])

        # Remove look-ahead that would run over training horizon
        complete_rows = ~np.isnan(y_train).any(axis=1)
        id_train = ID_Data_train.loc[complete_rows,:]
        X_train = X_train_all[complete_rows,:]
        y_train = y_train[complete_rows,:]

        # Reshape y (single-period demand is passed as 1-d array)
        y_train = y_train.flatten() if y_train.shape[1] == 1 else y_train
        y_test = y_test.flatten() if y_test.shape[1] == 1 else y_test

        # Initialize weights kernel
        kernel = RandomForestWeightsKernel()

        # Load cv results (tuned per look-ahead)
        kernel.load_cv_result(path=path_cv+'/'+name_cv+'_tau'+str(tau)+'.joblib')

        # Fit random forest weights kernel
        kernel.fit(X=X_train, y=y_train, model_params={'n_jobs': 32, 'verbose': 1})

        # Save fit (one file per period and look-ahead)
        kernel.save_fit(path=path_kernel+'/'+name_kernel+'_t'+str(t+1)+'_tau'+str(tau)+'.joblib')

        # Print status
        print('## ... fit took', dt.datetime.now().replace(microsecond=0) - start_time)

        # Get weights

        # Get samples 

        # Get weights diagnostics (RMSSE, SPL, etc. ...?)

In [1]:
# Dry run of the period / rolling-horizon loop that only prints the status lines.
# Here Tau is the largest look-ahead itself (tau = 0,...,Tau), so Tau=4 gives the same
# look-aheads as range(0,5).
T, Tau = 13, 4

# For period t=1,...,T
for t in range(T):

    # Look-aheads are cut off at the end of the planning horizon (largest look-ahead is T-t-1)
    tau_range = range(min(Tau, T-t-1)+1)

    # For tau=0,...,4
    for tau in tau_range:

        # Status
        print('#### Period = '+str(t+1)+', rolling horizon = '+str(tau+1)+':')

#### Period = 1, rolling horizon = 1:
#### Period = 1, rolling horizon = 2:
#### Period = 1, rolling horizon = 3:
#### Period = 1, rolling horizon = 4:
#### Period = 1, rolling horizon = 5:
#### Period = 2, rolling horizon = 1:
#### Period = 2, rolling horizon = 2:
#### Period = 2, rolling horizon = 3:
#### Period = 2, rolling horizon = 4:
#### Period = 2, rolling horizon = 5:
#### Period = 3, rolling horizon = 1:
#### Period = 3, rolling horizon = 2:
#### Period = 3, rolling horizon = 3:
#### Period = 3, rolling horizon = 4:
#### Period = 3, rolling horizon = 5:
#### Period = 4, rolling horizon = 1:
#### Period = 4, rolling horizon = 2:
#### Period = 4, rolling horizon = 3:
#### Period = 4, rolling horizon = 4:
#### Period = 4, rolling horizon = 5:
#### Period = 5, rolling horizon = 1:
#### Period = 5, rolling horizon = 2:
#### Period = 5, rolling horizon = 3:
#### Period = 5, rolling horizon = 4:
#### Period = 5, rolling horizon = 5:
#### Period = 6, rolling horizon = 1:
#### Period 

In [None]:
## ToDo: add CPU timing, add options for iid reshaping yes/no, scaling yes/no
## tbc: extract/prep samples, add weights diagnostic (e.g., RMSSE, SPL)

In [None]:
# Get weights for the test features.
# NOTE(review): kernel and X_test are not defined in this cell. They are whatever the most recently
# executed fitting loop left behind, i.e. presumably the kernel and test features of its last
# (period, look-ahead) iteration - confirm that this is intended before relying on the result.
weights = kernel.apply(X_test)    

# Get weights diagnostics (RMSSE, SPL, etc. ...?) ...

# Save weights, samples, diagnostics ...

In [None]:
######################### IID #########################

In [None]:
## Set paths and names

# CV results and fitted kernels of the iid variant are stored in the same folder
path_cv = path_kernel = '/home/fesc/MM/Data/Kernels'

# File name stems for CV results and fitted kernels (suffix _iid separates them from the non-iid run)
name_cv, name_kernel = 'cv_rfwk_global_scaled_iid', 'rfwk_global_scaled_iid'

In [None]:
## Set parameters to tune random forest weights kernels

# Fixed model parameters passed to the kernel during tuning
model_params = dict(
    oob_score=True,
    random_state=12345,
    n_jobs=4,
    verbose=0,
)

# Candidate values per hyper parameter (single-element lists are held fixed)
hyper_params_grid = {
    'n_estimators': [1000],
    'max_depth': [None],
    'min_samples_split': list(range(20, 1000, 20)),
    'min_samples_leaf': list(range(10, 1000, 10)),
    'max_features': list(range(8, 256, 8)),
    'max_leaf_nodes': [None],
    'min_impurity_decrease': [0.0],
    'bootstrap': [True],
    'max_samples': [0.75, 0.80, 0.85, 0.90, 0.95, 1.00],
}

# Settings of the search itself
tuning_params = dict(
    random_search=True,
    n_iter=100,
    scoring={'MSE': 'neg_mean_squared_error'},
    return_train_score=True,
    refit='MSE',
    random_state=12345,
    n_jobs=8,
    verbose=2,
)

In [None]:
## Tune random forest weights kernels

# Time period and SKU ranges
T = 13
Tau = 5
t_range = range(0,T)
tau_range = range(0,Tau)
SKU_range = range(1,460+1)

# Train/test split
test_start = 114

# Selection of training data. The training frames are rebuilt here instead of being taken from
# the kernel state: after a fitting loop has run they hold the rows of its last period and a
# single-column demand, so tuning would use the wrong horizon and single-period targets.
is_train = ID_Data.sale_yearweek < test_start
ID_Data_train = ID_Data.loc[is_train]
X_Data_scaled_train = X_Data_scaled.loc[is_train]

# Multi-period training demand: column 'Y'+k holds the demand of the same SKU k rows ahead,
# shifted within the training rows only (NaN where the look-ahead runs over the training horizon)
train_demand_by_sku = pd.concat([ID_Data_train, Y_Data_scaled.loc[is_train]], axis=1).groupby(['SKU'])['Y']
Y_Data_scaled_train = pd.DataFrame({'Y'+str(k): train_demand_by_sku.shift(-k) for k in tau_range})

# For tau=0,...,4
for tau in tau_range:

    # Reshape (keep every (tau+1)'th sale_yearweek so that demand vectors do not overlap)
    id_train = reshape_data(data=ID_Data_train, timePeriods=ID_Data_train.sale_yearweek, maxTimePeriod=test_start-1, tau=tau, iid=True)
    X_train = reshape_data(data=X_Data_scaled_train, timePeriods=ID_Data_train.sale_yearweek, maxTimePeriod=test_start-1, tau=tau, iid=True)
    y_train = reshape_data(data=Y_Data_scaled_train, timePeriods=ID_Data_train.sale_yearweek, maxTimePeriod=test_start-1, tau=tau, iid=True)

    # Transform data to arrays
    X_train = np.array(X_train)
    y_train = np.array(y_train.iloc[:,0:(tau+1)])

    # Remove any remaining incomplete demand vectors (e.g., SKUs whose rows end early)
    complete_rows = ~np.isnan(y_train).any(axis=1)
    id_train = id_train.loc[complete_rows]
    X_train = X_train[complete_rows,:]
    y_train = y_train[complete_rows,:]

    # Single-period demand is passed as 1-d array
    y_train = y_train.flatten() if y_train.shape[1] == 1 else y_train

    # Initialize
    prep = PreProcessor()
    kernel = RandomForestWeightsKernel()

    # CV folds
    cv_folds = prep.split_timeseries_cv(n_splits=3, timePeriods=id_train.sale_yearweek)

    # CV search
    cv_results = kernel.tune(X=X_train, y=y_train, cv_folds=cv_folds, model_params=model_params, 
                             tuning_params=tuning_params, hyper_params_grid=hyper_params_grid)

    # Save (one file per look-ahead)
    kernel.save_cv_result(path=path_cv+'/'+name_cv+'_tau'+str(tau)+'.joblib')

In [None]:
## Fit random forest weights kernels and generate weights

# Time period and SKU ranges
T = 13
Tau = 5
t_range = range(0,T)
tau_range = range(0,Tau)
SKU_range = range(1,460+1)

# Train/test split
test_start = 114

# Largest look-ahead of the planning horizon. The loop below uses its own range per period
# instead of overwriting the global tau_range.
max_tau = max(tau_range)

# Multi-period demand over all rows, used for the test targets: column 'Y'+k holds the demand
# of the same SKU k rows ahead (NaN where the data ends)
demand_by_sku = pd.concat([ID_Data, Y_Data_scaled], axis=1).groupby(['SKU'])['Y']
Y_Data_scaled_lookahead = pd.DataFrame({'Y'+str(k): demand_by_sku.shift(-k) for k in range(0, max_tau+1)})

# Initialize
weights = {}
exec_time_sec = {}
cpu_time_sec = {}


# For period t=1,...,T
for t in t_range:

    # Selection of training data (all rows before the current test period)
    is_train = ID_Data.sale_yearweek < test_start + t
    ID_Data_train = ID_Data.loc[is_train]
    X_Data_scaled_train = X_Data_scaled.loc[is_train]

    # Multi-period training demand. Y_Data_scaled only has the single column 'Y', so selecting
    # rows from it gave single-period targets for every tau. The look-aheads are rebuilt here by
    # shifting within the training rows of each SKU only.
    train_demand_by_sku = pd.concat([ID_Data_train, Y_Data_scaled.loc[is_train]], axis=1).groupby(['SKU'])['Y']
    Y_Data_scaled_train = pd.DataFrame({'Y'+str(k): train_demand_by_sku.shift(-k) for k in range(0, max_tau+1)})

    # Selection of test data (rows of the current test period)
    is_test = ID_Data.sale_yearweek == test_start + t
    ID_Data_test = ID_Data.loc[is_test]
    X_Data_scaled_test = X_Data_scaled.loc[is_test]
    Y_Data_scaled_test = Y_Data_scaled_lookahead.loc[is_test]

    # Test features as array (identical for all tau of this period)
    X_test = np.array(X_Data_scaled_test)

    # Last period of the training horizon of period t. Passing test_start-1 for every t made the
    # reshaping select the same rows in each period, so the rolling training selection above
    # had no effect.
    train_end = test_start + t - 1

    # Initialize
    weights[t] = {}
    exec_time_sec[t] = {}
    cpu_time_sec[t] = {}

    # For tau=0,...,4, adjusted to account of end of planning horizon
    for tau in range(0, min(max_tau, T-t-1)+1):

        # Status
        print('#### Period = '+str(t+1)+', rolling horizon = '+str(tau+1)+':')
        print('# Fitting random forest weights kernel...')

        # Timer (wall-clock and CPU time; reshaping is included in the 'fit' timing)
        exec_time_sec[t][tau] = {}
        cpu_time_sec[t][tau] = {}
        start_time = dt.datetime.now().replace(microsecond=0)
        st_exec = time.time()
        st_cpu = time.process_time()  

        # Reshape (keep every (tau+1)'th sale_yearweek so that demand vectors do not overlap)
        id_train = reshape_data(data=ID_Data_train, timePeriods=ID_Data_train.sale_yearweek, maxTimePeriod=train_end, tau=tau, iid=True)
        X_train = reshape_data(data=X_Data_scaled_train, timePeriods=ID_Data_train.sale_yearweek, maxTimePeriod=train_end, tau=tau, iid=True)
        y_train = reshape_data(data=Y_Data_scaled_train, timePeriods=ID_Data_train.sale_yearweek, maxTimePeriod=train_end, tau=tau, iid=True)

        # Transform data to arrays
        X_train = np.array(X_train)
        y_train = np.array(y_train.iloc[:,0:(tau+1)])
        y_test = np.array(Y_Data_scaled_test.iloc[:,0:(tau+1)])

        # Remove any remaining incomplete demand vectors (e.g., SKUs whose rows end early)
        complete_rows = ~np.isnan(y_train).any(axis=1)
        id_train = id_train.loc[complete_rows]
        X_train = X_train[complete_rows,:]
        y_train = y_train[complete_rows,:]

        # Single-period demand is passed as 1-d array
        y_train = y_train.flatten() if y_train.shape[1] == 1 else y_train    
        y_test = y_test.flatten() if y_test.shape[1] == 1 else y_test

        # Initialize weights kernel
        kernel = RandomForestWeightsKernel()

        # Load cv results (tuned per look-ahead)
        kernel.load_cv_result(path=path_cv+'/'+name_cv+'_tau'+str(tau)+'.joblib')

        # Fit random forest weights kernel
        kernel.fit(X=X_train, y=y_train, model_params={'n_jobs': 32, 'verbose': 1})

        # Save fit (one file per period and look-ahead)
        kernel.save_fit(path=path_kernel+'/'+name_kernel+'_t'+str(t+1)+'_tau'+str(tau)+'.joblib')

        # Timer (elapsed time of reshaping, fitting and saving)
        elapsed = dt.datetime.now().replace(microsecond=0) - start_time
        exec_time_sec[t][tau]['fit'] = time.time()-st_exec
        cpu_time_sec[t][tau]['fit'] = time.process_time()-st_cpu

        # Status
        print('# ...done in', elapsed)         
        print('# Generating weights...')

        # Timer
        start_time = dt.datetime.now().replace(microsecond=0)
        st_exec = time.time()
        st_cpu = time.process_time()  

        # Get weights
        weights[t][tau] = kernel.apply(X_test, model_params={'n_jobs': 32, 'verbose': 0})    

        # Timer (elapsed time of generating the weights)
        elapsed = dt.datetime.now().replace(microsecond=0) - start_time
        exec_time_sec[t][tau]['weights'] = time.time()-st_exec
        cpu_time_sec[t][tau]['weights'] = time.process_time()-st_cpu

        # Status
        print('# ...done in', elapsed)     

# Save results (nested dicts keyed by period t and look-ahead tau)
joblib.dump(weights, path_kernel+'/'+name_kernel+'_weights.joblib')    
joblib.dump(exec_time_sec, path_kernel+'/'+name_kernel+'_weights_exec_time_sec.joblib')  
joblib.dump(cpu_time_sec, path_kernel+'/'+name_kernel+'_weights_cpu_time_sec.joblib')  