In [1]:
from datetime import date, timedelta, datetime, date
from dateutil.relativedelta import relativedelta
import time
import os, gc, re; 
from glob import glob
gc.enable();

import pandas as pd
import numpy as np
import math
from random import randint
from numba import jit
from contextlib import contextmanager

from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm
import xgboost as xgb

import unittest
import logging

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")
# List the CSV inputs under ./Data; as the last expression of the cell the list is echoed as output.
glob("./Data/*.csv")

['./Data/bd_gid_map_RP_202211041511.csv',
 './Data/RP_truedemand.csv',
 './Data/model_base_RP_PH_3category.csv']

In [2]:
# When True, 'qty' is converted while reading (log1p for positive values, 0.1 otherwise).
normalize = False


def _first_csv(tag):
    """Return the first path matched by ./Data/*.csv whose path string contains `tag`."""
    return [path for path in glob("./Data/*.csv") if tag in path][0]


# Demand history: the converter is only attached when normalisation is requested.
true_read_kwargs = {'parse_dates': ["report_date"]}
if normalize:
    true_read_kwargs['converters'] = {'qty': lambda u: np.log1p(float(u)) if float(u) > 0 else 0.1}
truedf = pd.read_csv(_first_csv('true'), **true_read_kwargs)

# Base forecast table and the PN -> gid mapping table.
basedf = pd.read_csv(_first_csv('base'), parse_dates=["fcst_date"])
gid_map = pd.read_csv(_first_csv('map'))


In [3]:
# Number of monthly steps forecast per simulation (used as walk_target and for the M1..Mn columns).
Forecate_months = 3
# Extra months added between the end of the validation window and the start of the forecast window.
preparation_period = 2
# Columns of the base table that preprocess() label-encodes.
base_car = ['CATEGORY','CATEGORY_DETAIL','PRDTLN','MODEL']
# Columns of the base table that preprocess() keeps; 'PN' is the join key.
base_var = ['PN','CATEGORY','CATEGORY_DETAIL','PRDTLN','STDCOST_USD','MODEL']

# Earliest demand date; .min() avoids sorting the whole column just to take its first element.
start_date = str(truedf.report_date.min())
# Compare Timestamp with Timestamp: comparing a pandas Timestamp against datetime.date is
# deprecated in pandas 1.x and raises TypeError in pandas 2.x.
simulation_cutoff = pd.Timestamp(2021, 10, 31)
# Distinct forecast dates after the cutoff, in chronological order.
simulation = sorted(d for d in set(basedf.fcst_date) if d > simulation_cutoff)
# Training history starts 21 months before the first simulation date.
train_period = simulation[0] + relativedelta(months=-21)
# Last date the demand matrix must cover: last simulation date + preparation + forecast horizon.
prediction_period = str(simulation[-1] + relativedelta(months=preparation_period) + relativedelta(months=Forecate_months))


In [4]:
# Lookup from an XGBoost objective name to the evaluation metric used with it.
reg_param = {
    'reg:squarederror': 'rmse',
    'reg:squaredlogerror': 'rmsle',
    'reg:pseudohubererror': 'mphe',
    'reg:gamma': 'gamma-deviance',
    'reg:tweedie': 'tweedie-nloglik',
    'count:poisson': 'poisson-nloglik',
}

class CFG:
    """Bundle of XGBoost training settings; every instance starts from the same defaults.

    __slots__ restricts instances to the listed attribute names.
    """
    # 'booster' is declared here but never assigned in __init__.
    __slots__ = [
        'booster', 'obj', 'huber_slope', 'rate_drop', 'max_depth', 'tree_method',
        'l_rate', 'subsample', 'col_sample', 'l2', 'num_threads', 'eval_metric',
        'importance_type', 'num_boost_round', 'early_stop', 'verbose_eval',
        'verbose', 'max_bin',
    ]

    def __init__(self):
        # Objective, plus the metric that reg_param pairs with it.
        self.obj = 'reg:pseudohubererror'
        self.eval_metric = reg_param[self.obj]
        self.huber_slope = 0.15

        # Tree construction.
        self.tree_method = 'gpu_hist'
        self.max_depth = 5
        self.max_bin = 512

        # Learning rate, sampling and regularisation.
        self.l_rate = 0.07
        self.subsample = 0.85
        self.col_sample = 0.75
        self.l2 = 0.9
        self.rate_drop = 0.3

        # Boosting schedule and early stopping.
        self.num_boost_round = 500000
        self.early_stop = 1000

        # Runtime, reporting and feature-importance settings.
        self.num_threads = 15
        self.verbose_eval = 5000
        self.verbose = 2
        self.importance_type = 'total_gain'

In [5]:
class TestMethods(unittest.TestCase):
    """Sanity checks on the loaded inputs and on the CFG defaults."""

    def test_isInstance(self):
        """The demand history must have been loaded as a DataFrame."""
        message = "given object is not instance of My definition."
        # Object under test first, expected class second (the arguments were swapped before,
        # which only verified that an empty DataFrame matched type(truedf)).
        self.assertIsInstance(truedf, pd.DataFrame, message)

    def test_trainingDate(self):
        """The training window must not start before the first demand record."""
        self.assertGreaterEqual(train_period, truedf.report_date.min())

    def test_isMember(self):
        """Every column listed in base_var must exist in the base table."""
        for i in base_var:
            self.assertIn(i, basedf.columns)

    def test_Params(self):
        """CFG exposes string objective/metric and asks for no more threads than CPUs."""
        test = CFG()
        self.assertIsInstance(test.obj, str)
        self.assertIsInstance(test.eval_metric, str)
        self.assertGreaterEqual(os.cpu_count(), test.num_threads)

# Run the tests inside the notebook: the dummy argv keeps unittest from parsing the kernel's
# own command-line arguments, and exit=False stops it from calling sys.exit() afterwards.
if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

....
----------------------------------------------------------------------
Ran 4 tests in 0.004s

OK


In [6]:
@contextmanager
def timer(name:str):
    """Context manager that prints how long the wrapped block took.

    Two lines are printed on exit: the elapsed time in seconds (3 decimals) and in minutes.
    The report is emitted from a ``finally`` clause, so it also appears when the wrapped
    block raises; the exception then propagates unchanged.

    Args:
        name: Label placed at the start of both printed lines.
    """
    # perf_counter is monotonic, so the measurement is immune to system clock adjustments.
    t0 = time.perf_counter()
    try:
        yield
    finally:
        # Sample the clock once so both lines describe the same duration.
        elapsed = time.perf_counter() - t0
        print(f'{name} done in {elapsed:.3f} sec')
        print(f'{name} done in {elapsed/60} min')
    
@jit
def smape(y_true, y_pred):
    """Return 200 * sum(|y_true - y_pred|) / sum(|y_true| + |y_pred|) / len(y_true).

    The builtin ``sum`` iterates over the first axis of its argument.
    NOTE(review): the ratio of the two sums is additionally divided by len(y_true);
    confirm this scaling is the intended SMAPE definition.
    """
    abs_error = np.abs(y_true - y_pred)
    abs_scale = np.abs(y_true) + np.abs(y_pred)
    return 200 * sum(abs_error) / sum(abs_scale) / len(y_true)
@jit
def mape_val(y_true, y_pred):
    """Return sum(|y_true - y_pred|) / sum(y_true), i.e. total absolute error over total actuals.

    The builtin ``sum`` iterates over the first axis of its argument.
    """
    total_abs_error = sum(np.abs(y_true - y_pred))
    total_actual = sum(y_true)
    return total_abs_error / total_actual

def preprocess(base:pd.DataFrame, truedf:pd.DataFrame, base_car:list):
    """Build the per-PN demand matrix and attach the encoded base attributes.

    Reads the notebook-level names ``base_var``, ``start_date`` and ``prediction_period``.

    Args:
        base: Base table. Its 'PN' column holds one or more part numbers separated by '|'
            and/or whitespace. The caller's frame is left unmodified.
        truedf: Demand history with 'report_date' and 'qty' columns and a 'pn' (or 'PN')
            key column. Side effect kept from the original: 'pn' is renamed to 'PN' in place.
        base_car: Columns of ``base`` to label-encode.

    Returns:
        (df, df2): ``df`` is indexed by PN with one column per calendar day between
        ``start_date`` and ``prediction_period``; monthly 'qty' sums sit on the labels produced
        by the '1M' grouper and every other cell is 0. ``df2`` is ``df`` joined with the
        encoded attributes.

    Raises:
        AssertionError: if joining the attributes changes the number of rows.
    """
    # Copy first: the original wrote the list-valued 'PN' column into the caller's frame,
    # so running the cell a second time re-split the stringified lists and corrupted the keys.
    base = base.copy()
    base['PN'] = base['PN'].apply(lambda x: str(x).replace("|", " ").split())
    base = base.explode('PN')  # one row per individual part number
    base['STDCOST_USD'] = base['STDCOST_USD'].apply(int)

    # Explicit copy so the encoded columns are written to an owned frame, not a slice of `base`.
    base_pn = base[base_var].copy()
    le = LabelEncoder()
    for col in base_car:
        base_pn[col] = le.fit_transform(base_pn[col].values)
    base_pn = base_pn.drop_duplicates().set_index('PN')

    truedf.rename(columns={'pn': 'PN'}, inplace=True)
    dates = pd.date_range(start=start_date, end = prediction_period, freq='1D')

    # Left-join demand onto the attribute rows, then sum qty per PN and month.
    df = base_pn.merge(truedf, on='PN', how='left')
    df = df.groupby(['PN', pd.Grouper(key='report_date', freq='1M')])['qty'].sum().unstack(level=-1).fillna(0)
    # Re-label the columns on the full daily range; dates without data become 0.
    df = df.T.reindex(dates).T.fillna(0)
    df2 = df.join(base_pn)
    assert df.shape[0] == df2.shape[0],"shape should be the same"

    return df, df2

def post_pro(test_pred, df, t_val_begin, Forecate_months, walk_target, normalize = False):
    """Turn per-step predictions into one row per PN with columns PN, M1..Mn and fsct_date.

    Args:
        test_pred: Sequence of ``walk_target`` prediction arrays, one per forecast step, each
            with one value per row of ``df``.
        df: Frame whose index supplies the row labels (the PNs).
        t_val_begin: Start of the validation window; ``fsct_date`` is this date plus
            ``walk_target`` months, rendered with ``str``.
        Forecate_months: Number of month columns to name (M1..M<Forecate_months>); must equal
            ``walk_target`` or the column assignment fails.
        walk_target: Number of forecast steps contained in ``test_pred``.
        normalize: When True the predictions are mapped back with ``expm1`` and clipped at 0.

    Returns:
        DataFrame with columns ``PN``, ``M1``..``Mn`` and ``fsct_date``.
    """
    # Negative predictions are clipped to 0; rows become PNs, columns become forecast steps.
    sub = np.array(test_pred).clip(0.).transpose()
    # The date labels are only placeholders (they are replaced by M1..Mn below), so they are
    # derived from the t_val_begin argument instead of the notebook global t_fcst_begin that
    # the function used to read; the output is unchanged and the hidden dependency is gone.
    step_labels = pd.date_range(t_val_begin, periods=walk_target, freq = '1M')
    df_sub = pd.DataFrame(sub, index=df.index, columns=step_labels).stack().to_frame("qty")
    if normalize:
        df_sub["qty"] =np.expm1(df_sub["qty"]).clip(0.)

    # Back to wide format: one row per PN, one column per step.
    dd = df_sub.unstack(level=-1)
    dd.columns = dd.columns.get_level_values(1)
    dd = dd.reset_index()
    # First column (the former index) is named M0 and immediately renamed to PN.
    dd.columns=[ "M"+str(i) for i in range(0, Forecate_months + 1)]
    dd.rename(columns={'M0':'PN'},inplace=True)
    fsct_label = str(t_val_begin  +  relativedelta(months=walk_target))
    dd['fsct_date'] = fsct_label
    print(f"Datas fsct_date { fsct_label}")

    return dd
   
def simu(gid_map, result, postfix='baby', forecast_months=None):
    """Aggregate PN-level forecasts to gid level and write them to ``Test_sim_<postfix>.csv``.

    Args:
        gid_map: Mapping table with a 'gid' column and a 'pn' (or 'PN') key column.
            Side effect kept from the original: 'pn' is renamed to 'PN' in place.
        result: Forecast table with 'PN', 'fsct_date' and the month columns M1..Mn.
        postfix: Suffix used in the output file name.
        forecast_months: Number of month columns (n). Defaults to the notebook-level
            ``Forecate_months`` when omitted, which is what the function always used before.

    Returns:
        None. The CSV is written to the current working directory, overwriting any existing file.
    """
    if forecast_months is None:
        forecast_months = Forecate_months

    gid_map.rename(columns={'pn':'PN'},inplace=True)

    keys = ['gid', 'fsct_date']
    month_cols = ['M' + str(i) for i in range(1, forecast_months + 1)]
    sub_final = gid_map.merge(result, on='PN')[keys + month_cols]

    # One groupby sums every month column at once; this replaces a separate groupby + merge
    # per column, which produced the same table with repeated passes over the data.
    sim = sub_final.groupby(keys)[month_cols].sum().reset_index()
    sim[month_cols] = sim[month_cols].round(4)

    sim.to_csv(f'Test_sim_{postfix}.csv', float_format='%.4f', index=False, mode='w')

In [7]:
def get_timespan(df, dt, minus, periods, freq ='1M'):
    """Select the columns of ``df`` labelled by a date range starting ``minus`` months before ``dt``.

    The range has ``periods`` entries spaced by ``freq``; every generated label must exist
    among the columns of ``df``.
    """
    window_start = dt - relativedelta(months=minus)
    window_labels = pd.date_range(window_start, periods=periods, freq=freq)
    return df[window_labels]


def prepare_dataset(df, t_begin, is_train=True, name_prefix=None, target=2):
    """Build the feature frame (and optionally the targets) for the window starting at t_begin.

    All features are computed from date-labelled columns of ``df`` that lie before ``t_begin``,
    selected through ``get_timespan``. Features are added to ``X`` in a fixed order, which
    determines the column order of the returned frame.

    Args:
        df: Frame with one row per item and date-labelled columns.
        t_begin: Reference date; look-back windows end just before it and targets start at it.
        is_train: When True, also return the target matrix ``y``.
        name_prefix: Optional prefix for the feature names; only applied when ``is_train`` is
            False, because the training branch returns before the renaming.
        target: Number of monthly target columns taken from ``t_begin`` onwards.

    Returns:
        ``(X, y)`` when ``is_train`` is True, otherwise ``X``.
    """
    X = {}
    
    # Autocorrelation features for the single-month columns 1, 4, 7 and 10 months back.
    # NOTE(review): `curr` holds one value per row of df, so the autocorrelation runs across
    # rows rather than across time — confirm this is intended.
    for i in range(1, 13, 3):    
        curr =  get_timespan(df, t_begin, i, 1).values.ravel()
        X[f'AC_{i}'] = sm.tsa.acf(curr, nlags= (len(curr)-1))
            
    # Raw value of each of the 12 months preceding t_begin.
    for i in range(1, 13):
        X[f'mon_{i}'] = get_timespan(df, t_begin, i, 1).values.ravel()
    
    # Summary statistics over the trailing 3/6/9/12-month windows.
    for i in [ 3, 6, 9, 12]:
        tmp = get_timespan(df, t_begin, i, i)
        X[f'diff_{i}_mean'] = tmp.diff(axis=1).mean(axis=1).values 
        # Weighted sum (not a mean): the most recent column has weight 1, older ones 0.9**k.
        X[f'mean_{i}_decay'] = (tmp * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values
        X[f'mean_{i}'] = tmp.mean(axis=1).values
        X[f'median_{i}'] = tmp.median(axis=1).values
        X[f'min_{i}'] = tmp.min(axis=1).values
        X[f'max_{i}'] = tmp.max(axis=1).values
        X[f'std_{i}'] = tmp.std(axis=1).values
        # Unlike the other features this one stays a pandas Series (no .values).
        # NOTE(review): presumably this is what gives X the index of df when pd.DataFrame(X)
        # is built; verify against callers that align X on its index before adding .values.
        X[f'zero_{i}'] = (tmp == 0).astype(int).sum(axis=1)
        

    
    # Sales-activity features over the same trailing windows.
    for i in [ 3, 6, 9, 12]:
        tmp = get_timespan(df, t_begin, i, i) 
        # Number of columns in the window with a positive value.
        X[f'has_sales_days_in_last_{i}'] = (tmp > 0).sum(axis=1).values
        # i minus the largest 0-based position holding a positive value.
        X[f'last_has_sales_day_in_last_{i}' ] = i - ((tmp > 0) * np.arange(i)).max(axis=1).values
        # Largest countdown value (i for the oldest column, 1 for the newest) with a positive value.
        X[f'first_has_sales_day_in_last_{i}'] = ((tmp > 0) * np.arange(i, 0, -1)).max(axis=1).values

    # Means over columns sampled at 3-, 6- and 12-month spacing, for three different offsets.
    for i in range(3):
        X[f'mean_3M_sea_{i}'] = get_timespan(df, t_begin, 12-i, 4, freq='3M').mean(axis=1).values
        X[f'mean_6M_sea_{i}'] = get_timespan(df, t_begin, 12-i, 2, freq='6M').mean(axis=1).values
        X[f'mean_12M_sea_{i}'] = get_timespan(df, t_begin, 24-i, 2, freq='12M').mean(axis=1).values

    X = pd.DataFrame(X)
    gc.collect();   
    
    if is_train:
        # Targets: the `target` monthly columns starting at t_begin.
        y = df[pd.date_range(t_begin, periods = target, freq = '1M')].values
        return X, y
    
    if name_prefix is not None:
        X.columns = ['%s_%s' % (name_prefix, c) for c in X.columns]
    return X

In [8]:
# Build the demand matrix (df) and its attribute-augmented version (df2), timing the call.
with timer('processing'):
    df, df2 = preprocess(basedf, truedf, base_car)
    
class TestMethods(unittest.TestCase):
    """Checks that preprocess() returned DataFrames.

    NOTE(review): this reuses the name of the TestMethods class defined in an earlier cell and
    therefore replaces it in the notebook namespace; only the test below runs from here on.
    """

    def test_isInstance(self):
        """Both preprocess() outputs must be DataFrames."""
        message = "given object is not instance of My definition."
        # Object under test first, expected class second (the arguments were swapped before).
        self.assertIsInstance(df, pd.DataFrame, message)
        self.assertIsInstance(df2, pd.DataFrame, message)
        
# Run the tests inside the notebook: the dummy argv keeps unittest from parsing the kernel's
# own command-line arguments, and exit=False stops it from calling sys.exit() afterwards.
if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False)       

.

processing done in 0.609 sec
processing done in 0.010150444507598878 min



----------------------------------------------------------------------
Ran 1 test in 0.001s

OK


In [None]:
# Wall-clock time shown at the start of the run.
# NOTE(review): the fixed +8h shift presumably converts the host clock to UTC+8 — confirm.
current_time = (datetime.now() + relativedelta(hours = 8)).strftime("%H:%M:%S")
print("Current Time =", current_time)

with timer('Training Total Execution time'):
    print("Preparing dataset...")

    gc.enable()
    # Static per-PN attributes (base_var without the 'PN' key) appended to every feature frame.
    df_concat = df2[base_var[1:]]

    interval = 1
    walk_target = Forecate_months
    
    t_begin = train_period

    # Configure logging once: basicConfig does nothing when the root logger already has
    # handlers, so calling it again for every simulation had no effect.
    logging.basicConfig(filename='myprepare.log',  level=logging.DEBUG)

    frames_alpha = []
    score_mse = []
    for si in simulation:
        print("=" * 80)
        print(f"Simulation {str(si)}" )
        print("=" * 80)
        # Whole months between the training start and this simulation date, minus the horizon.
        span = relativedelta(si, t_begin)
        num_months = span.years*12 + span.months - walk_target
        t_val_begin = t_begin + relativedelta(months=(num_months)*interval ) 
        t_fcst_begin = t_val_begin + relativedelta(months= walk_target + preparation_period )

        logging.info('Started')
        # Build every monthly window once. The refit ("a-enet") set uses all
        # num_months + 1 + walk_target windows and the training set is its first
        # num_months + 1 windows; previously those were computed a second time in a separate loop.
        X_2, y_2 = [], []
        for k in range(num_months + 1 + walk_target): 
            delta_bat = relativedelta(months=interval * k ) 
            X_batmp, y_batmp = prepare_dataset(df, t_begin + delta_bat, target = walk_target)
            X_batmp = pd.concat([X_batmp, df_concat], join='inner', axis=1)
            X_2.append(X_batmp)
            y_2.append(y_batmp)

        n_train_windows = max(num_months + 1, 0)
        X_l, y_l = X_2[:n_train_windows], y_2[:n_train_windows]
        # Offset of the last training window (used in the report below).
        delta = relativedelta(months=interval * num_months)

        # NOTE(review): the last training window starts at t_begin + num_months months, which
        # is exactly t_val_begin, so the validation rows are also part of the training set and
        # early stopping is evaluated on data the model has seen. Confirm whether the training
        # range should stop before the validation window.
        X_train = pd.concat(X_l, axis=0)
        y_train = np.concatenate(y_l, axis=0)
        print(f'Training set start date: {t_begin}')
        print(f'Training set   end date: {t_begin + delta}')

        X_val, y_val = prepare_dataset(df, t_val_begin, target = walk_target )
        X_val = pd.concat([X_val, df_concat], join='inner', axis=1)

        print(f'Validation set start date: {t_val_begin}')
        print(f'Validation set   end date: {t_val_begin  +  relativedelta(months=walk_target)}')

        X_test = prepare_dataset(df, t_fcst_begin, is_train=False, target = walk_target)
        X_test = pd.concat([X_test, df_concat], join='inner', axis=1)
        print(f'Forecasting set start date: {t_fcst_begin}')
        print(f'Forecasting set   end date: {t_fcst_begin  +  relativedelta(months=walk_target)}')
        
        X_bat = pd.concat(X_2, axis=0)
        y_bat = np.concatenate(y_2, axis=0)
        print(f'a-enet set start date: {t_begin}')
        print(f'a-enet set   end date: {t_begin + delta_bat }\n')
        assert X_train.shape[0] != X_bat.shape[0],"shape should Not be the same"
        print(X_train.shape,  X_bat.shape, X_val.shape)
        del X_l, y_l, X_2, y_2; gc.collect();
        logging.info('Finished')
        
        par = CFG()
        # NOTE(review): par.huber_slope and par.rate_drop are defined on CFG but are not
        # forwarded to XGBoost here — confirm whether 'huber_slope' should be passed.
        params = {
            'objective': par.obj,
            'max_depth': par.max_depth,
            'max_bin':par.max_bin,
            'tree_method': par.tree_method,
            'learning_rate': par.l_rate,
            'subsample': par.subsample,
            'colsample_bytree':par.col_sample,
            'lambda':par.l2,
            'eval_metric': par.eval_metric,
            'nthread': par.num_threads,
            'verbosity': par.verbose
        }
        

        val_pred = []
        test_alpha_pred = []
        cate_vars = base_car
        # One model per forecast step: column `walk` of the target matrix is the label.
        for walk in range(walk_target):
            print("=" * 80)
            print(f"Step {walk + 1}")
            print("=" * 80)
            
            early_stop  = xgb.callback.EarlyStopping(
                            rounds=par.early_stop,
                            metric_name=par.eval_metric,
                            maximize = False,
                            save_best = False,
                            min_delta = 1e-5,
                            data_name = "eval")
            
            # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
            dtrain = xgb.DMatrix(X_train, label=y_train[:, walk], missing=np.nan, enable_categorical=True) 
            dval = xgb.DMatrix(X_val, label=y_val[:, walk], missing=np.nan, enable_categorical=True)
            dtest = xgb.DMatrix(X_test, missing=np.nan, enable_categorical=True)
            bst = xgb.train( params, dtrain, num_boost_round=par.num_boost_round,
                            evals=[(dtrain,'train'), (dval,'eval')], callbacks = [early_stop],
                            verbose_eval = par.verbose_eval)

            # Print the nine most important features, highest score first.
            fea_imp = {k: v for k, v in sorted(bst.get_score(importance_type=par.importance_type).items(), key=lambda item: item[1], reverse=True)}
            print("\n".join((f"{k}: {np.round(v, 3)}" ) for ind,(k, v) in enumerate(fea_imp.items()) if ind < 9))

            val_pred.append(bst.predict(dval))
            del dtrain, dval, early_stop;
            gc.collect();
            
            with timer('aNet Inspiring'):
                print("aNet Inspiring & Prediction ...")
                dbatrain = xgb.DMatrix(X_bat, label=y_bat[:, walk], missing=np.nan, enable_categorical=True) 
                # best_iteration is a 0-based index, so the matching number of boosting rounds
                # is best_iteration + 1 (passing the index itself trained one round too few,
                # and zero rounds when the first round was the best).
                battle = xgb.train( params, dbatrain, bst.best_iteration + 1)
                test_alpha_pred.append(battle.predict(dtest))
            del dbatrain, bst, battle;
            gc.collect();
            
        # Validation predictions as a (rows, steps) matrix, clipped at 0.
        val_matrix = np.array(val_pred).clip(0.).transpose()
        if normalize:
            fa_score = np.expm1(mape_val(y_val, val_matrix))
            print(f"Validation mse : {mean_squared_error(np.expm1(y_val), np.expm1(val_matrix))}")
            print(f'SMAPE : {np.expm1(smape(y_val, val_matrix))}')
            print(f'F A   : {fa_score}\n\n')
        else:
            fa_score = mape_val(y_val, val_matrix)
            print(f"Validation mse: {mean_squared_error( y_val, val_matrix)}")
            print(f'SMAPE : {smape(y_val, val_matrix)}')
            print(f'F A: {fa_score}\n\n')
        score_mse.append(fa_score)
            
        dd_alpha = post_pro(test_alpha_pred, df, t_val_begin, Forecate_months, walk_target, normalize)
        frames_alpha.append(dd_alpha)
        
    result_alpha = pd.concat(frames_alpha)
    
    # NOTE(review): score_mse collects the "F A" (mape_val) scores, not MSE values, so the
    # label below is misleading; it is left unchanged in case the output is parsed elsewhere.
    print(f"Final MSE: {np.average(score_mse, axis=0)}")

with timer('Simulation'):
    simu(gid_map, result_alpha, postfix='xa')

Current Time = 12:35:37
Preparing dataset...
Simulation 2021-11-30 00:00:00
Training set start date: 2020-02-29 00:00:00
Training set   end date: 2021-08-29 00:00:00
Validation set start date: 2021-08-29 00:00:00
Validation set   end date: 2021-11-29 00:00:00
Forecasting set start date: 2022-01-29 00:00:00
Forecasting set   end date: 2022-04-29 00:00:00
a-enet set start date: 2020-02-29 00:00:00
a-enet set   end date: 2021-11-29 00:00:00

(14155, 74) (16390, 74) (745, 74)
Step 1
[0]	train-mphe:1.33329	eval-mphe:1.68842
[5000]	train-mphe:0.11438	eval-mphe:0.03154
