In [154]:
# Import utils
import numpy as np
import pandas as pd
import seaborn as sns
import copy
import time
import datetime as dt
import pickle
import pyreadr
import json
import pickle
import joblib
from joblib import Parallel, delayed
import os
import itertools
import contextlib
from tqdm import tqdm

In [155]:
## Gini coefficient for weights
def gini(array):
    """Calculate the Gini coefficient of a numpy array.

    Parameters
    ----------
    array : array-like
        Numeric values of any shape; they are flattened and all treated
        equally. The input itself is never modified.

    Returns
    -------
    float
        The Gini coefficient of the (shifted) values, or NaN if the input
        holds no values.
    """
    # based on bottom eq: http://www.statsdirect.com/help/content/image/stat0206_wmf.gif
    # from: http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm

    # Work on a private float copy: the in-place shifts below would raise a
    # casting error on integer input if the integer dtype were kept.
    values = np.array(array, dtype=float).flatten()

    # No values -> no coefficient (np.amin would raise on an empty array)
    if values.size == 0:
        return np.nan

    lowest = np.amin(values)
    if lowest < 0:
        values -= lowest  # values cannot be negative
    values += 0.0000001  # values cannot be 0
    values = np.sort(values)  # values must be sorted

    n = values.shape[0]  # number of array elements
    index = np.arange(1, n + 1)  # 1-based rank per array element

    return ((np.sum((2 * index - n - 1) * values)) / (n * np.sum(values)))  # Gini coefficient

In [156]:
## Multivariate CV
def _cvOfSample(xi, counts=None):
    """Coefficient of variation of the rows of xi, optionally frequency-weighted.

    xi is a 2-d array (rows = observations). counts, if given, holds one
    non-negative integer per row and has the same effect as repeating that
    row counts[i] times.
    """
    # No columns: nothing to compute
    if xi.shape[1] == 0:
        return np.nan

    # Univariate case: plain (population) std / mean, sign of the mean kept
    if xi.shape[1] == 1:
        values = xi[:, 0]
        mean = np.average(values, weights=counts)
        std = np.sqrt(np.average((values - mean) ** 2, weights=counts))
        return std / mean

    # Population covariance matrix and mean vector
    covMatrix = np.cov(xi, rowvar=False, bias=True, fweights=counts)
    sampleMean = np.average(xi, axis=0, weights=counts)

    # Multi-variate CV: sqrt(mu^T Sigma mu / (mu^T mu)^2)
    return np.sqrt(
        (sampleMean @ covMatrix @ sampleMean) /
        (sampleMean @ sampleMean) ** 2
    )


def multiVarCV(xi_hat, w_hat):
    """Multivariate coefficient of variation of samples, plain and weighted.

    CV = ((mu^T Sigma mu) / (mu^T mu)^2)^(1/2)

    where mu is the mean vector and Sigma the (population) covariance matrix
    of the rows of xi_hat. With a single column this reduces to std / mean.

    Parameters
    ----------
    xi_hat : 2-d array-like
        One sample per row.
    w_hat : 1-d array-like
        One weight per row of xi_hat. Rows with weight <= 0 are ignored for
        both results.

    Returns
    -------
    (cv, cv_weighted)
        cv treats every row with positive weight equally. cv_weighted
        counts each row int(w * 10**6) times, i.e. weights are resolved to
        1e-6; rows whose weight is below that resolution drop out of the
        weighted statistic. Either value is NaN when no row is left to
        compute it from.
    """
    xi_hat = np.asarray(xi_hat)
    w_hat = np.asarray(w_hat)

    # Reduce to positive weights
    positive = w_hat > 0
    xi_hat = xi_hat[positive, :]
    w_hat = w_hat[positive]

    if xi_hat.shape[0] == 0:
        return np.nan, np.nan

    # Integer repetition count per row. Using the counts as frequency weights
    # gives the same statistics as physically repeating each row that many
    # times, without building a ~10**6-row array.
    counts = (w_hat * 10**6).astype(int)
    counted = counts > 0

    ## CV
    cv = _cvOfSample(xi_hat)

    ## Weighted CV
    if counted.any():
        cv_weighted = _cvOfSample(xi_hat[counted, :], counts[counted])
    else:
        cv_weighted = np.nan

    return cv, cv_weighted

In [157]:
## Sample weights diagnostic
def getSampleWeightsDiagnostic(model,
                               model_params, 
                               weights,
                               y_samples, 
                               y_samples_SKU, 
                               ID_samples, 
                               ID_samples_SKU):
    """Collect the sample weights and the sample CV for one (SKU, horizon, t).

    Parameters
    ----------
    model : str
        'global' selects the rows of y_samples / ID_samples whose
        sale_yearweek is smaller than the sale_yearweek of row N of
        ID_samples_SKU; any other value selects the first N rows of
        y_samples_SKU.
    model_params : dict
        Reads the keys 'SKU', 'T_horizon_rolling', 'T', 't_current' and 'N'.
    weights : indexable
        weights[t_current + 1] must hold the weights for the selected
        samples (presumably one weight per selected row -- verify against
        the code that produces the weights).
    y_samples, y_samples_SKU : pandas.DataFrame
        Sample values; the first T columns are used.
    ID_samples, ID_samples_SKU : pandas.DataFrame
        Read for their columns sale_yearweek and SKU_API.

    Returns
    -------
    (weights_x_t, cv_x_t) : tuple of pandas.DataFrame
        weights_x_t has one row per weight; cv_x_t has a single row holding
        the results of multiVarCV.
    """
    N = model_params['N']
    T = model_params['T']

    # Samples of unknown variable (i,t)
    if model == 'global':

        # Row mask, computed once and shared by the values and their names
        isBeforeCurrent = list(ID_samples.sale_yearweek < 
                               ID_samples_SKU.loc[N].sale_yearweek)

        xi_hat = np.array(y_samples.iloc[isBeforeCurrent, 0:T])
        xi_hat_names = np.array(ID_samples.loc[isBeforeCurrent].SKU_API)

    else:

        xi_hat = np.array(y_samples_SKU.iloc[0:N, 0:T])

        # Single value (row N), broadcast over all rows of the frame below
        # NOTE(review): presumably every row of ID_samples_SKU carries the same
        # SKU_API, so one value is enough -- confirm.
        xi_hat_names = np.array(ID_samples_SKU.loc[N].SKU_API)

    # Sample weights (i)
    w_hat = np.array(weights[model_params['t_current']+1])

    # Dataframe storing weights x SKU
    weights_x_t = pd.DataFrame({
        'SKU': np.repeat(model_params['SKU'], w_hat.shape[0]),
        'SKU_API': np.repeat(ID_samples_SKU.SKU_API[0], w_hat.shape[0]),
        'x_SKU_API': xi_hat_names,
        'T_horizon_rolling': np.repeat(model_params['T_horizon_rolling'], w_hat.shape[0]),
        't': np.repeat(model_params['t_current'], w_hat.shape[0]),
        'w_hat': w_hat       
    })

    # Dataframe storing multi variate sample CV
    cv, cv_weighted = multiVarCV(xi_hat, w_hat)
    cv_x_t = pd.DataFrame({
        'SKU': [model_params['SKU']],
        'SKU_API': [ID_samples_SKU.SKU_API[0]],
        'T_horizon_rolling': [model_params['T_horizon_rolling']],
        't': [model_params['t_current']],
        'cv': [cv],
        'cv_weighted': [cv_weighted]
    })

    return weights_x_t, cv_x_t

In [158]:
## Function running sample weights diagnostic
def _giniPerHorizon(res_weights):
    """Gini coefficient of w_hat per (SKU, T_horizon_rolling, t) group."""
    return res_weights[
        ['SKU', 'T_horizon_rolling', 't', 'w_hat']].groupby(
        ['SKU', 'T_horizon_rolling', 't']).agg(
        gini_coeff = ('w_hat', gini),
        # A group without any positive weight has no positive-only Gini: NaN
        gini_coeff_pos = ('w_hat', lambda x: gini(x.loc[x>0]) if (x>0).any() else np.nan)
    ).reset_index()


def _weightsPerSourceSKU(res_weights):
    """Weights summed per source SKU and period t, then averaged over t."""
    per_t = res_weights.groupby(
        ['SKU', 'SKU_API', 'x_SKU_API', 'T_horizon_rolling', 't']
    ).agg(
        w_hat = ('w_hat', 'sum'),
        n_weights = ('w_hat', len),
        n_nonZeroWeights = ('w_hat', lambda w: (w>0).sum())
    )

    return per_t.groupby(
        ['SKU', 'SKU_API', 'x_SKU_API', 'T_horizon_rolling']
    ).agg(
        w_hat = ('w_hat', 'mean'),
        n_weights = ('n_weights', 'mean'),
        n_nonZeroWeights = ('n_nonZeroWeights', 'mean')
    ).reset_index()


def runSampleWeightsDiagnostic(SKU, PATH_SAMPLES, PATH_DATA,
                               T_horizon_rolling_range=range(1,5+1),
                               T_horizon=13):
    """Run the sample weights diagnostic for one SKU.

    For every rolling horizon and every period t_current in
    range(T_horizon), getSampleWeightsDiagnostic is called once for the
    'local' and once for the 'global' weights; the per-call frames are
    stacked and summarised.

    Parameters
    ----------
    SKU : int or str
        SKU identifier; str(SKU) is used to build the per-SKU folder name.
    PATH_SAMPLES : str
        Folder holding the per-SKU weights (pickle) and sample files (RDS).
    PATH_DATA : str
        Folder holding Y_Data_mv_NEW.RData and ID_Data_NEW.RData.
    T_horizon_rolling_range : iterable of int, optional
        Rolling horizons to evaluate (default 1..5). Must not be empty.
    T_horizon : int, optional
        Number of periods to iterate over (default 13). Must be positive.

    Returns
    -------
    tuple of pandas.DataFrame
        (local_weights, global_weights, local_weights_xSKU,
        global_weights_xSKU, local_weights_gini, global_weights_gini,
        local_cv, global_cv)

    Raises
    ------
    ValueError
        From pd.concat if no (horizon, period) combination was evaluated.
    """
    # Global samples and IDs do not depend on the rolling horizon: read once
    robj = pyreadr.read_r(PATH_DATA+'/Y_Data_mv_NEW.RData')
    y_samples = robj['Y_Data_mv']

    robj = pyreadr.read_r(PATH_DATA+'/ID_Data_NEW.RData')
    ID_samples = robj['ID_Data']

    # Per-call result frames, stacked once at the end (DataFrame.append is
    # gone in pandas >= 2.0 and re-copies the whole frame on every call)
    parts = {
        'local': {'weights': [], 'cv': []},
        'global': {'weights': [], 'cv': []},
    }

    # For each rolling horizon
    for T_horizon_rolling in T_horizon_rolling_range:

        # Get weights
        # NOTE: pickle.load can execute arbitrary code; these files are
        # assumed to be produced by this project's own pipeline.
        with open(PATH_SAMPLES+'/SKU'+str(SKU)+'/Static/Weights'+
                  str(T_horizon_rolling)+'/weights_local_ij.p', 'rb') as f:
            weights_local_ij = pickle.load(f)

        with open(PATH_SAMPLES+'/SKU'+str(SKU)+'/Static/Weights'+
                  str(T_horizon_rolling)+'/weights_global_ij.p', 'rb') as f:
            weights_global_ij = pickle.load(f)

        # Get samples
        robj = pyreadr.read_r(PATH_SAMPLES+'/SKU'+str(SKU)+'/Static/TmpFiles'+
                              str(T_horizon_rolling)+'/Y_samples_mv_k.RDS')
        y_samples_SKU = robj[None]

        robj = pyreadr.read_r(PATH_SAMPLES+'/SKU'+str(SKU)+'/Static/TmpFiles'+
                              str(T_horizon_rolling)+'/ID_samples_k.RDS')
        ID_samples_SKU = robj[None]

        # Get sampling
        robj = pyreadr.read_r(PATH_SAMPLES+'/SKU'+str(SKU)+'/Static/TmpFiles'+
                              str(T_horizon_rolling)+'/sampleUpTo_start.RDS')
        sampleUpTo_start = robj[None].iloc[0,0]

        ## Iterate over full time horizon
        for t_current in range(T_horizon):

            # Set current model params
            model_params = {
                'SKU': SKU,
                'T_horizon_rolling': T_horizon_rolling,
                # The look-ahead is cut off at the end of the horizon
                'T': min(T_horizon_rolling,T_horizon-t_current),
                't_current': t_current,
                'N': int(sampleUpTo_start+t_current)
            }

            # Diagnostic results - local, then global
            for model, weights_ij in (('local', weights_local_ij),
                                      ('global', weights_global_ij)):

                weights_x_t, cv_x_t = getSampleWeightsDiagnostic(
                    model,
                    model_params,
                    weights_ij,
                    y_samples,
                    y_samples_SKU,
                    ID_samples,
                    ID_samples_SKU
                )

                parts[model]['weights'].append(weights_x_t)
                parts[model]['cv'].append(cv_x_t)

    res_local_weights = pd.concat(parts['local']['weights'])
    res_global_weights = pd.concat(parts['global']['weights'])
    res_local_cv = pd.concat(parts['local']['cv'])
    res_global_cv = pd.concat(parts['global']['cv'])

    ## Summarise

    # Weights
    global_weights = res_global_weights[['SKU', 'T_horizon_rolling', 't', 'w_hat']]
    local_weights = res_local_weights[['SKU', 'T_horizon_rolling', 't', 'w_hat']]

    # Gini coefficient
    global_weights_gini = _giniPerHorizon(res_global_weights)
    local_weights_gini = _giniPerHorizon(res_local_weights)

    # Weights x SKU
    global_weights_xSKU = _weightsPerSourceSKU(res_global_weights)
    local_weights_xSKU = _weightsPerSourceSKU(res_local_weights)

    # Demand CV (freshly concatenated frames, not shared with anything else)
    global_cv = res_global_cv
    local_cv = res_local_cv

    # Return
    return (
        local_weights, 
        global_weights, 
        local_weights_xSKU,
        global_weights_xSKU, 
        local_weights_gini,
        global_weights_gini,
        local_cv, 
        global_cv
    )

In [159]:
## Parallel process tracker
@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
    """Route joblib batch completions into the given tqdm progress bar.

    While the context is active, joblib.parallel.BatchCompletionCallBack is
    replaced by a subclass that advances tqdm_object by the batch size before
    delegating to the original callback. On exit the original class is put
    back and the progress bar is closed. Yields tqdm_object.
    """
    parallel_module = joblib.parallel
    original_callback = parallel_module.BatchCompletionCallBack

    class _ProgressReportingCallback(original_callback):
        def __call__(self, *args, **kwargs):
            # Count the finished batch, then let joblib do its own bookkeeping
            tqdm_object.update(n=self.batch_size)
            return super().__call__(*args, **kwargs)

    parallel_module.BatchCompletionCallBack = _ProgressReportingCallback
    try:
        yield tqdm_object
    finally:
        # Always undo the patch, even if the parallel run raised
        parallel_module.BatchCompletionCallBack = original_callback
        tqdm_object.close()

In [160]:
## Run

# Folder names
PATH_DATA = '/home/fesc/Data'
PATH_PARAMS  = '/home/fesc/Data/Params'
PATH_SAMPLES = '/home/fesc/Data/Samples'
PATH_RESULTS = '/home/fesc/Data/Results'

# SKUs to process: 1..460 inclusive
SKU_range = range(1, 460+1)

# Maximum number of concurrently running joblib jobs
n_jobs = 32

# One diagnostic task per SKU; the progress bar advances as batches complete
with tqdm_joblib(tqdm(desc="Progress", total=len(SKU_range))) as progress_bar:
    results = Parallel(n_jobs=n_jobs)(
        delayed(runSampleWeightsDiagnostic)(SKU, PATH_SAMPLES, PATH_DATA)
        for SKU in SKU_range
    )

Progress: 100%|██████████| 460/460 [11:50<00:00,  1.55s/it] 


In [161]:
## Aggregate results
# Only the demand-CV tables are aggregated across SKUs. The weight-level
# outputs of each result (all weights, weights x SKU, Gini coefficients) are
# unpacked below but deliberately not collected.
local_cv_parts = []
global_cv_parts = []

for result in results:

    # Unpack
    (
        local_weights, 
        global_weights, 
        local_weights_xSKU,
        global_weights_xSKU, 
        local_weights_gini,
        global_weights_gini,
        local_cv, 
        global_cv
    ) = result

    ## Collect
    local_cv_parts.append(local_cv)
    global_cv_parts.append(global_cv)

# Stack once at the end: DataFrame.append no longer exists in pandas >= 2.0
# and re-copied the whole frame on every iteration. An empty run still yields
# empty frames (pd.concat raises on an empty list).
result_local_cv = pd.concat(local_cv_parts) if local_cv_parts else pd.DataFrame()
result_global_cv = pd.concat(global_cv_parts) if global_cv_parts else pd.DataFrame()

In [162]:
## Save results
# Only the demand-CV tables are written. The other diagnostics are not
# aggregated in this notebook; their exports are disabled and would go to
# WeightsDiagnostic_{local,global}_allWeights.csv,
# WeightsDiagnostic_{local,global}_weightsSKU.csv and
# WeightsDiagnostic_{local,global}_weightsGini.csv under PATH_RESULTS.

# Demand CV - local
result_local_cv.to_csv(PATH_RESULTS+'/WeightsDiagnostic_local_demandCV.csv', sep=',')

# Demand CV - global
result_global_cv.to_csv(PATH_RESULTS+'/WeightsDiagnostic_global_demandCV.csv', sep=',')