In [1]:
import pandas as pd
import numpy as np
from datetime import date, datetime, time, timedelta
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.metrics import mean_squared_error

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder


%config InlineBackend.figure_format = 'retina' 
%matplotlib inline

#models
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.ensemble import AdaBoostRegressor
from catboost import CatBoostRegressor

 
mpl.rcParams['lines.linewidth'] = 2

figsize=(12,9)

np.random.seed(238746)


import warnings
warnings.filterwarnings("ignore")

In [2]:
SplitTestDate = "2018-12-02"

In [3]:
models = {
    #"linear" : LinearRegression(),
    "xgb" : xgb.XGBRegressor(n_estimators=100, learning_rate=0.5),
    "forest" : RandomForestRegressor(n_estimators=60,random_state=1234),
    #"ada": AdaBoostRegressor(random_state=123, n_estimators=500)
    #"cat": CatBoostRegressor(iterations=500,learning_rate=0.5,eval_metric='MAPE')
}

features_for_model = {

    "xgb" : ['scaled_quarter','scaled_month','scaled_year','scaled_dayofyear','scaled_dayofmonth','scaled_weekofyear',
             'scaled_price_diff1','scaled_price',
             "BRAND2","BRAND4",
             'scaled_sales1','scaled_diff1','scaled_diff2','percentage_diff1',
             "scaled_rolling1","scaled_rolling2","scaled_rolling3","scaled_rolling4","scaled_rolling5",
             'scaled_promo'],
    "forest": ['scaled_quarter','scaled_month','scaled_year','scaled_dayofyear','scaled_dayofmonth','scaled_weekofyear',
             'scaled_price_diff1','scaled_price',
            "BRAND2","BRAND4",
             'scaled_sales1','scaled_sales2','scaled_diff1','scaled_diff2','percentage_diff1',
             "scaled_rolling2","scaled_rolling3",
             'scaled_promo'],
    "cat": [
             'scaled_price_diff1','scaled_price',
            "BRAND2","BRAND4",
             'scaled_sales1','scaled_sales2','scaled_sales3','scaled_diff1','scaled_diff2','percentage_diff1',
            ]
}

finalreg = xgb.XGBRegressor(n_estimators=500, learning_rate=0.5)

In [4]:
quantile_transformer = preprocessing.QuantileTransformer(output_distribution='normal', random_state=123)
min_max_scaler = preprocessing.MinMaxScaler()

def scale(feature, scaler = min_max_scaler):
    size = len(feature)
    return scaler.fit_transform(np.array([feature]).reshape(size, 1)).T[0]
def unscale(scaled, original, scaler = min_max_scaler):
    size2 = len(scaled)
    size1 = len(original)
    return scaler.fit(np.array([original]).reshape(size1, 1)).inverse_transform(np.array([scaled]).reshape(size2, 1)).T[0]

In [5]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def plot_results(ytrue, prediction, label):
    df_ytrue = pd.DataFrame(ytrue)
    df_ytrue['predicted'] =  prediction 
    plt.plot(df_ytrue[label],label='data')
    plt.plot(df_ytrue['predicted'],label='predicted')
    plt.xticks(rotation=45)
    mape = mean_absolute_percentage_error(df_ytrue[label],df_ytrue['predicted'])

    plt.title("Forecasting on Test Set MAPE=%.3f"%mape)
    plt.legend();
    plt.show()
    return mape

def scaled_mape(prediction, y_test, rescale):
    results = pd.DataFrame(columns = ["sku","target","prediction"])

    for sku in y_test["sku"].unique():

        pred_sku = prediction[np.where(y_test["sku"] == sku)]
        originals = rescale.loc[rescale["sku"] == sku]["target"]
        pred = unscale(pred_sku,originals)

        results = pd.concat([results, pd.DataFrame({
            "date" : y_test[y_test["sku"] == sku]["date"],
            "sku" : y_test[y_test["sku"] == sku]["sku"],
            "scaled_target" : y_test[y_test["sku"] == sku]["scaled_target"],
            "target" : y_test[y_test["sku"] == sku]["target"],
            "scaled_prediction" : pred_sku,
            "prediction" : pred
        })])
    results = results.dropna().set_index("date")
    return mean_absolute_percentage_error(results["target"],results["prediction"])

    
    
    
def plot_scaled_results(prediction, y_test, rescale):
    results = pd.DataFrame(columns = ["sku","target","prediction"])

    for sku in y_test["sku"].unique():

        pred_sku = prediction[np.where(y_test["sku"] == sku)]
        originals = rescale.loc[rescale["sku"] == sku]["target"]
        pred = unscale(pred_sku,originals)

        results = pd.concat([results, pd.DataFrame({
            "date" : y_test[y_test["sku"] == sku]["date"],
            "sku" : y_test[y_test["sku"] == sku]["sku"],
            "scaled_target" : y_test[y_test["sku"] == sku]["scaled_target"],
            "target" : y_test[y_test["sku"] == sku]["target"],
            "scaled_prediction" : pred_sku,
            "prediction" : pred
        })])
    results = results.dropna().set_index("date")

    return plot_results(results["target"], results["prediction"], "target"), results

In [6]:
def predict_training(model,X,y,label = "scaled_target",span = 24):
    
    #k = number of folds
    k = int(len(X_train)/span)
    
    prediction = pd.DataFrame(columns = ["date","sku","pred"])    
    
    for fold in range(0,k):
        if fold != k - 1:
            validation = X_train.iloc[fold*span:(fold+1)*span]
        else:
            validation = X_train.iloc[fold*span:]
        
        training = X_train[~X_train.index.isin(validation.index)]
        y_tr  = y_train[~y_train.index.isin(validation.index)]
        y_val  = y_train[y_train.index.isin(validation.index)]

        model.fit(training, y_tr[label])
        partial_pred = pd.DataFrame()
        partial_pred["date"] = y_val["date"]
        partial_pred["sku"] = y_val["sku"]
        partial_pred["pred"] = model.predict(validation)
        partial_pred.set_index(validation.index)
        prediction = prediction.append(partial_pred)
        
    return prediction

In [7]:
def StackingPred(models,final_model, X_train, y_train, X_test,y_test,features_for_model,rescale = None, span = 24, label = "scaled_target"):
    final_prediction = []
    for week in y_test.date.unique():
        print(week)
        test = X_test.loc[y_test.date == week]
        #TRAIN SET PREDICTION WITH MODELS
        results_dict = {}
        for name, model in models.items():
            X = X_train[features_for_model[name]]
            results_dict.update({name : predict_training(model, X, y_train,"scaled_target",span)})

        train_result = pd.DataFrame(index = y_train.index)
        for name,r in results_dict.items():
            train_result["date"] = r["date"]
            train_result["sku"] = r["sku"]
            train_result[name] = r["pred"]

        #TEST SET PREDICTION WITH MODELS
        prediction_df = pd.DataFrame()
        for name, model in models.items():
            X = X_train[features_for_model[name]]
            model = model.fit(X, y_train["scaled_target"]) 
            prediction_df[name] = model.predict(test[features_for_model[name]])
        prediction_df.index = test.index

        #DUMMY VARIABLES FOR FINAL TRAIN AND TEST
        onehot_encoder = OneHotEncoder(sparse=False)
        onehot_encoded = (pd.get_dummies(train_result["sku"]) > 0)
        onehot = pd.DataFrame(onehot_encoded, index = train_result.index)

        final_train = pd.concat([onehot,train_result[models.keys()]],axis = 1)

        onehot_encoded = pd.get_dummies(y_test["sku"]) > 0
        onehot = pd.DataFrame(onehot_encoded, index = y_test.index).iloc[test.index]
        final_test = pd.concat([onehot,prediction_df[models.keys()]],axis = 1)

        #TRAIN ON TRAIN PREDICTIONS AND PREDICTION ON TEST PREDICTIONS
        final_model.fit(final_train, y_train["scaled_target"])
        p =  final_model.predict(final_test)
        final_prediction = np.append(final_prediction,p)
        if rescale is not None:
            print("MAPE %.5f" % scaled_mape(final_prediction, y_test.loc[y_test.date == week], rescale))
        X_train = pd.concat([X_train,test])
        y_train = pd.concat([y_train,y_test.loc[y_test.date == week]])
    return final_prediction

In [8]:
df_train = pd.read_csv('processed_train.csv',index_col = 0)
df_final_test = pd.read_csv('processed_test.csv',index_col = 0)

In [9]:
df_final_test.date = df_final_test.date.apply(lambda x:datetime.strptime(x, '%Y-%m-%d'))
df_train.date = df_train.date.apply(lambda x:datetime.strptime(x, '%Y-%m-%d'))
#df_train = df_train.loc[df_train.scope == 1]
group1 = [688, 1058, 549, 546, 1027, 554, 1035, 1206, 1065]
group2 = [144, 686, 1051]
df_train = df_train.loc[df_train["sku"].isin(group2)]

In [10]:
rescale_df = pd.concat([df_train,df_final_test])[["target","sku"]].dropna()

In [11]:
simple_df = df_train[['scaled_quarter','scaled_month','scaled_year','scaled_dayofyear','scaled_dayofmonth','scaled_weekofyear',
             'scaled_price_diff1','scaled_price',
            "BRAND2","BRAND4",
             'scaled_sales1','scaled_sales2','scaled_sales3','scaled_diff1','scaled_diff2','percentage_diff1',
             "scaled_rolling1","scaled_rolling2","scaled_rolling3","scaled_rolling4","scaled_rolling5",
             'scaled_promo',
                     "date","sku","scaled_target","target"]]

In [12]:
simple_df = simple_df.dropna().sort_values(by=["date","sku"]).set_index("date")
df_train = simple_df[:SplitTestDate].reset_index()
df_test = simple_df[SplitTestDate:].reset_index()

In [13]:
df_test

Unnamed: 0,date,scaled_quarter,scaled_month,scaled_year,scaled_dayofyear,scaled_dayofmonth,scaled_weekofyear,scaled_price_diff1,scaled_price,BRAND2,...,percentage_diff1,scaled_rolling1,scaled_rolling2,scaled_rolling3,scaled_rolling4,scaled_rolling5,scaled_promo,sku,scaled_target,target
0,2018-12-08,1.000000,1.000000,0.666667,0.933518,0.233333,0.941176,-0.000000,0.174672,True,...,-0.347963,0.551038,0.442491,0.312314,0.248750,0.222983,0.997109,144,0.492325,67124.0
1,2018-12-08,1.000000,1.000000,0.666667,0.933518,0.233333,0.941176,-0.000000,0.116822,False,...,-0.290672,0.549143,0.459704,0.329330,0.265112,0.235700,1.000000,686,0.523888,81178.0
2,2018-12-08,1.000000,1.000000,0.666667,0.933518,0.233333,0.941176,-0.000000,0.116822,False,...,-0.312015,0.561546,0.459901,0.334009,0.271925,0.240194,1.000000,1051,0.555072,39461.0
3,2018-12-15,1.000000,1.000000,0.666667,0.952909,0.466667,0.960784,-0.165939,0.340611,True,...,0.103882,0.492325,0.521681,0.459103,0.357317,0.297465,0.985455,144,0.272244,40986.0
4,2018-12-15,1.000000,1.000000,0.666667,0.952909,0.466667,0.960784,-0.210280,0.327103,False,...,0.042795,0.523888,0.536516,0.481099,0.377970,0.316867,1.000000,686,0.247234,43122.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,2019-06-15,0.333333,0.454545,1.000000,0.445983,0.466667,0.450980,0.004673,0.822430,False,...,1.782033,0.097377,0.243170,0.495447,0.617814,0.624760,0.013526,686,0.096883,22440.0
83,2019-06-15,0.333333,0.454545,1.000000,0.445983,0.466667,0.450980,-0.004673,0.817757,False,...,1.479149,0.104440,0.248235,0.480095,0.587022,0.570971,0.000000,1051,0.116144,12610.0
84,2019-06-22,0.333333,0.454545,1.000000,0.465374,0.700000,0.470588,-0.000000,0.825328,True,...,-0.001997,0.066291,0.066152,0.169012,0.358033,0.476649,0.000000,144,0.046024,14119.0
85,2019-06-22,0.333333,0.454545,1.000000,0.465374,0.700000,0.470588,-0.004673,0.827103,False,...,0.003030,0.096883,0.097130,0.194408,0.395806,0.513628,0.008066,686,0.065936,18183.0


In [14]:
X_train = df_train[['scaled_quarter','scaled_month','scaled_year','scaled_dayofyear','scaled_dayofmonth','scaled_weekofyear',
             'scaled_price_diff1','scaled_price',
            "BRAND2","BRAND4",
             'scaled_sales1','scaled_sales2','scaled_sales3','scaled_diff1','scaled_diff2','percentage_diff1',
             "scaled_rolling1","scaled_rolling2","scaled_rolling3","scaled_rolling4","scaled_rolling5",
             'scaled_promo']]

In [15]:
y_train = df_train[["date","sku","scaled_target","target"]]

In [16]:
X_test = df_test[['scaled_quarter','scaled_month','scaled_year','scaled_dayofyear','scaled_dayofmonth','scaled_weekofyear',
             'scaled_price_diff1','scaled_price',
            "BRAND2","BRAND4",
             'scaled_sales1','scaled_sales2','scaled_sales3','scaled_diff1','scaled_diff2','percentage_diff1',
             "scaled_rolling1","scaled_rolling2","scaled_rolling3","scaled_rolling4","scaled_rolling5",
             'scaled_promo']]

In [17]:
y_test = df_test[["date","sku","scaled_target","target"]]

In [18]:
 y_test.loc[y_test.date == "2018-12-15"]

Unnamed: 0,date,sku,scaled_target,target
3,2018-12-15,144,0.272244,40986.0
4,2018-12-15,686,0.247234,43122.0
5,2018-12-15,1051,0.276637,22428.0


In [None]:
res = StackingPred(models,finalreg, X_train, y_train, X_test,y_test,features_for_model,rescale = rescale_df, span = 9, label = "scaled_target")


2018-12-08T00:00:00.000000000
MAPE 15.12635
2018-12-15T00:00:00.000000000
MAPE 102.06860
2018-12-22T00:00:00.000000000
MAPE 292.03043
2018-12-29T00:00:00.000000000
MAPE 615.32272
2019-01-05T00:00:00.000000000
MAPE 383.97156
2019-01-12T00:00:00.000000000
MAPE 15.61011
2019-01-19T00:00:00.000000000
MAPE 23.69227
2019-01-26T00:00:00.000000000
MAPE 28.02574
2019-02-02T00:00:00.000000000
MAPE 33.36502
2019-02-09T00:00:00.000000000
MAPE 211.16223
2019-02-16T00:00:00.000000000
MAPE 311.69210
2019-02-23T00:00:00.000000000
MAPE 413.08348
2019-03-02T00:00:00.000000000
MAPE 381.94972
2019-03-09T00:00:00.000000000
MAPE 45.77577
2019-03-16T00:00:00.000000000


In [None]:
#res = res.set_index(res["i"])
res

In [None]:
mape,results = plot_scaled_results(res, y_test, rescale_df)