# Credits to

[1] https://www.kaggle.com/mayer79/m5-forecast-keras-with-categorical-embeddings-v2 

[2] https://www.kaggle.com/ragnar123/very-fst-model

[3] https://www.kaggle.com/mayer79/m5-forecast-poisson-loss

[4] https://www.kaggle.com/lnovakovi/copy-m5-darker-magic-difflgbm

# Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import gc
import os
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
import time
import pickle
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_poisson_deviance

import os, sys, gc, time, warnings, pickle, psutil, random

# custom imports
from multiprocessing import Pool        # Multiprocess Runs

warnings.filterwarnings('ignore')

# Reduce Memory Usage 

Run this function on dataframes to reduce the memory usage.

In [None]:
'''
Got from: [1]
'''
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that fits.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column is left untouched. Integer columns are
    cast to the smallest of int8/int16/int32/int64 whose range contains the
    column's min and max (bounds inclusive). Float columns are cast to
    float16 or float32 when the min and max fit, otherwise to float64.

    NOTE: the float check looks at range only, so values cast to float16 or
    float32 can lose precision.

    Args:
        df: DataFrame to shrink. Its columns are reassigned in place.
        verbose: When true, print the resulting memory usage and reduction.

    Returns:
        The same ``df`` object with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column spanning exactly [-128, 127] fits int8.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # NaN/inf extremes (or values beyond float32) fail every
                # comparison above and end up here.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Upload data

In [None]:

'''
Got from: [1]
'''
'''
path = "../input/m5-forecasting-accuracy"

calendar = pd.read_csv(os.path.join(path, "calendar.csv"))
selling_prices = pd.read_csv(os.path.join(path, "sell_prices.csv"))
sample_submission = pd.read_csv(os.path.join(path, "sample_submission.csv"))
sales = pd.read_csv(os.path.join(path, "sales_train_validation.csv"))
'''

# Helper functions from [4]

* Get data by store - the function in which we upload our dataframe with features 
* Get base test - the function that returns the preprocessed test set. It has all the features that the author of [4] created (rolling means 7 ... 60 I think, lag features, mean encoding, etc.). We can always drop the features we don't want to use.


In [None]:
#################################################################################
## Seeder
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=0):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

    
## Multiprocess Runs
def df_parallelize_run(func, t_split):
    """Apply ``func`` to each item of ``t_split`` in a process pool.

    The per-item results are concatenated column-wise (``axis=1``).

    Args:
        func: Callable passed to ``Pool.map``; called once per item.
        t_split: Sequence of work items. Its length caps the pool size.

    Returns:
        The column-wise concatenation of all results.
    """
    # Builtin min gives Pool a plain int; never start more workers than items.
    num_cores = min(N_CORES, len(t_split))
    pool = Pool(num_cores)
    try:
        df = pd.concat(pool.map(func, t_split), axis=1)
    finally:
        # Shut the workers down even if a task or the concat raised.
        pool.close()
        pool.join()
    return df


# Read data
def get_data_by_store(store):
    """Return one store's rows of the global ``sales`` frame and its feature names.

    Reads the module-level ``sales``, ``remove_features``, ``TARGET`` and
    ``START_TRAIN``.

    Args:
        store: Value compared against the ``store_id`` column.

    Returns:
        A ``(df, features)`` tuple. ``df`` holds the columns ``id``, ``d``,
        ``TARGET`` followed by ``features``, restricted to rows with
        ``d >= START_TRAIN`` and with a fresh 0..n-1 index. ``features`` is
        the list of column names not listed in ``remove_features``.
    """
    # The notebook in [4] assembled the grid from separate pickle files
    # (basic calendar/price features first):
    #
    #   df = pd.concat([pd.read_pickle(ID),
    #                   pd.read_pickle(INITIAL).iloc[:,2:],
    #                   pd.read_pickle(PART2).iloc[:,2:]],
    #                  axis=1)
    #
    # Here the melted ``sales`` frame already carries the features, so only
    # the rows of the requested store are kept.
    store_rows = sales[sales['store_id'] == store]

    # In [4], lag / rolling / mean-encoding features were read separately
    # because of memory limits. The feature grids were index-aligned, so the
    # index was used to keep only the current store's rows, and concat was
    # preferred because it uses less memory than merge:
    #
    #   df2 = pd.read_pickle(ROLLING)
    #   df2 = df2[df2.index.isin(df.index)]
    #
    #   df3 = pd.read_pickle(LAGS).iloc[:,3:]
    #   df3 = df3[df3.index.isin(df.index)]
    #
    #   df = pd.concat([df, df2], axis=1)
    #   del df2 # to not reach memory limit
    #
    #   df = pd.concat([df, df3], axis=1)
    #   del df3 # to not reach memory limit
    #
    # Note: functions that create new features dynamically could go here.

    # Feature list: every column that is not in ``remove_features``
    # (defined in the cell with the other global variables).
    features = [name for name in store_rows.columns if name not in remove_features]
    selected = store_rows[['id', 'd', TARGET] + features]

    # Skip the first rows (days before START_TRAIN).
    keep = selected['d'] >= START_TRAIN
    df = selected[keep].reset_index(drop=True)
    return df, features

# Recombine Test set after training
def get_base_test():
    """Rebuild the combined test frame from the per-store pickle files.

    For every id in the module-level ``STORES_IDS``, reads
    ``test_<store_id>.pkl`` from the working directory and sets its
    ``store_id`` column to that id.

    Returns:
        All per-store frames stacked row-wise with a fresh 0..n-1 index, or
        an empty DataFrame when ``STORES_IDS`` is empty.
    """
    store_frames = []
    for store_id in STORES_IDS:
        #temp_df = pd.read_pickle('../input/m5-aux-models/test_'+store_id+'.pkl')
        temp_df = pd.read_pickle('test_'+str(store_id)+'.pkl')
        temp_df['store_id'] = store_id
        store_frames.append(temp_df)

    if not store_frames:
        return pd.DataFrame()

    # One concat at the end avoids re-copying the growing frame per store.
    return pd.concat(store_frames).reset_index(drop=True)

Helper functions from [4] for creating features dynamically

In [None]:
# makes dynamically features (used in the prediction part I think
# but if we don't have pickle files we can use them in training I think as well)

def make_lag(LAG_DAY):
    """Build a per-``id`` lag of ``TARGET`` from the global ``base_test``.

    Args:
        LAG_DAY: Number of rows to shift the target within each ``id`` group.

    Returns:
        A single-column float16 DataFrame named ``sales_lag_<LAG_DAY>``,
        indexed like ``base_test``.
    """
    col_name = 'sales_lag_'+str(LAG_DAY)
    # Build the column as a standalone Series instead of assigning onto a
    # frame sliced from base_test; GroupBy.shift avoids a lambda per group.
    lagged = base_test.groupby(['id'])[TARGET].shift(LAG_DAY).astype(np.float16)
    return lagged.to_frame(col_name)


def make_lag_roll(LAG_DAY):
    """Build a per-``id`` shifted rolling mean of ``TARGET`` from ``base_test``.

    Args:
        LAG_DAY: Two-item sequence ``(shift_day, roll_wind)``: rows to shift
            the target within each ``id`` group, then the rolling window
            length over which the mean is taken.

    Returns:
        A single-column DataFrame named
        ``rolling_mean_tmp_<shift_day>_<roll_wind>``, indexed like
        ``base_test``.
    """
    shift_day = LAG_DAY[0]
    roll_wind = LAG_DAY[1]
    col_name = 'rolling_mean_tmp_'+str(shift_day)+'_'+str(roll_wind)
    # Build the column as a standalone Series instead of assigning onto a
    # frame sliced from base_test.
    rolled = base_test.groupby(['id'])[TARGET].transform(
        lambda x: x.shift(shift_day).rolling(roll_wind).mean()
    )
    return rolled.to_frame(col_name)

# Model parameters

In [None]:
## model LGBM+Fourier+Features3(Cyril'sBest)+Categorical
'''
import lightgbm as lgb
lgb_params = {
                    'metric': 'rmse',
                    'objective': 'poisson',
                    'seed': 200,
                    'force_row_wise' : True,
                    'learning_rate' : 0.01,
                    'lambda': 0.1,
                    'num_leaves': 200,
                    'sub_row' : 0.7,
                    'bagging_freq' : 1,
                    'colsample_bytree': 0.77
    }

'''

# taken from [4]
import lightgbm as lgb
# LightGBM training parameters (taken from [4]); the 'seed' entry is added
# later, in the global-variables cell.
lgb_params = dict(
    boosting_type='gbdt',
    objective='tweedie',
    tweedie_variance_power=1.1,
    metric='rmse',
    subsample=0.5,
    subsample_freq=1,
    learning_rate=0.031,
    num_leaves=2**11 - 1,
    min_data_in_leaf=2**12 - 1,
    feature_fraction=0.5,
    max_bin=100,
    n_estimators=1400,
    boost_from_average=False,
    verbose=-1,
)

# Create file to store encoders

In [None]:
# Create (or truncate) the binary file 'encoders' in the working directory.
# The only visible consumer is the commented-out prep_calendar(...) call, and
# no close() is visible in this part of the notebook.
# TODO confirm the handle is used and closed elsewhere.
encoder_file = open('encoders', 'wb')

# Prepare Dataframes

Everything here is the same as what we used before.

In [None]:
'''
Got from: [1]
- The columns "Date" and "Weekday" are dropped as they contain redundant information.
- Normally, the column "d" is like "d_1,d_2,...". Make it "1,2,.." and the type integer
- If there is no event (I think), there is NA. We will replace them with "NoEvent" string. Originally, it was replaced with
  "missing", but I don't think it makes sense as I don't think there is missing information, I think they just left 
  the days without any event as NA.
- We enumerate most of the columns:
    - We do not enumerate "d" and "wm_yr_wk" because we will use these columns for joins.
    - Why do we enumerate month and day? I think it is because they start from 1, not 0.
    - Originally, the binary columns "snap_X" were also enumerated. I don't think it is necessary. The only necessary step
      was to convert their type from int64 to int as it uses less space; but reduce_mem_usage will take care of that.
- I would suggest saving the OrdinalEncoder in case we need to reverse the transformations
'''
#def prep_calendar(df,encoder_file):
#    df = df.drop(["date", "weekday"], axis=1)  
#    df = df.assign(d = df.d.str[2:].astype(int))
#    df = df.fillna("NoEvent")
#    cols = list(set(df.columns) - {"wm_yr_wk", "d"}) 
#    oe = OrdinalEncoder(dtype="int")
#    df[cols] = oe.fit_transform(df[cols])
#    pickle.dump(oe,encoder_file)
#    df = reduce_mem_usage(df)
#    return df
#
#calendar = prep_calendar(calendar,encoder_file)

'''
Got from: [1]
Originally, there were features added in this part. I excluded them until we decide whether to use those or not.
'''
#def prep_selling_prices(df):
#    df = reduce_mem_usage(df)
#    return df
#
#selling_prices = prep_selling_prices(selling_prices)


'''
Got from: [1]
- We drop the first "drop_d" days. Originally, this is set to 1000. When it is set to this value,
  the shape we get 29,544,810 rows. When we don't set it, we get 60,034,810 rows. I think for now 
  we can keep this functionality, as it may be useful if we would like to discard some of the days.
- In some id's, we have "_validation". Those are deleted.
- reindex: Conform DataFrame to new index with optional filling logic (obtained from pandas doc). 
  We add days 1914+2*28 to prepare data from submission
- We have to melt the sales dataframe since days are contained as columns.
- assign: Returns a new object with all original columns in addition to new ones. Existing columns 
  that are re-assigned will be overwritten (obtained from pandas doc). Again, we make the values 
  "d_1, d-2,..." to "1,2,..."
'''
##We have to melt sales for sure because the days are columns, which is not desirable.
#def reshape_sales(df, drop_d = None):
#    if drop_d is not None:
#        df = df.drop(["d_" + str(i + 1) for i in range(drop_d)], axis=1)
#    #df = df.assign(id=df.id.str.replace("_validation", ""))
#    df = df.reindex(columns=df.columns.tolist() + ["d_" + str(1913 + i + 1) for i in range(2 * 28)])
#    df = df.melt(id_vars=["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"],
#                 var_name='d', value_name='demand')
#    df = df.assign(d=df.d.str[2:].astype("int16"))
#    return df
#
#sales = reshape_sales(sales)

# Merge dataframes

In [None]:
'''
Got from: [1]
- Merge all the dataframes and delete the unnecessary ones
- time.sleep() added to make sure garbage collector finishes its job before the next merge
'''
#sales = sales.merge(calendar, how="left", on="d")
#del calendar
#gc.collect()
#time.sleep(5)
#sales = sales.merge(selling_prices, how="left", on=["wm_yr_wk", "store_id", "item_id"])
#del selling_prices
#sales.drop(["wm_yr_wk"], axis=1, inplace=True)
#gc.collect()
#time.sleep(5)
#sales.head()
#
#sales = reduce_mem_usage(sales)

# Feature Engineering

We will add the features at the beginning as pickle files. The idea is from [4]. This also helps solve memory issues.

Read pickle files

In [None]:
# Column names, in the order the corresponding objects are stored in the
# 'initial features' pickle file.
initial_features = [
    'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'clusters',
    'd', 'demand',
    'wday', 'month', 'year',
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    'snap_CA', 'snap_TX', 'snap_WI',
    'daytype',
    'sell_price', 'sell_price_rel_diff',
]

# For each lag (7, 28): the plain lag column followed by the rolling means
# of that lag over windows of 7, 14, 28, 60 and 90.
rolling_windows = [
    name
    for lag in (7, 28)
    for name in ['lag_t%d' % lag]
    + ['rolling_mean_lag%d_w%d' % (lag, window) for window in (7, 14, 28, 60, 90)]
]

# Price statistics plus the item count, in 'SimpleFE1' pickle order.
simple_fe_features = [
    'price_' + suffix for suffix in ('max', 'min', 'std', 'mean', 'norm', 'nunique')
] + ['item_nunique']

In [None]:
# Start from an empty frame and fill it column by column from pickle files.
sales = pd.DataFrame()

# The 'id column' file holds several pickled objects back to back: keep
# loading until the stream is exhausted.
ids = []
with open('../input/clgfeatures/id column', "rb") as openfile:
    try:
        while True:
            ids.append(pickle.load(openfile))
    except EOFError:
        pass

# Flatten the loaded chunks into one list of ids.
ids = [entry for chunk in ids for entry in chunk]
sales["id"] = ids
del ids
gc.collect()
print("added ids")

# Same multi-object layout: the idx-th loaded object becomes the column
# named initial_features[idx].
init_f = []
with open('../input/clgfeatures/initial features', "rb") as openfile:
    try:
        while True:
            init_f.append(pickle.load(openfile))
    except EOFError:
        pass

for idx, feat in enumerate(initial_features):
    sales[feat] = init_f[idx]

# Drop the raw objects before downcasting to keep peak memory down.
del init_f, initial_features
sales = reduce_mem_usage(sales)
gc.collect()

print("added initial features" )



In [None]:
# 'SimpleFE1' holds several pickled objects back to back; the idx-th one
# becomes the column named simple_fe_features[idx].
part = []
with open('../input/clgfeatures/SimpleFE1', "rb") as openfile:
    try:
        while True:
            part.append(pickle.load(openfile))
    except EOFError:
        pass

for idx, p in enumerate(simple_fe_features):
    sales[p] = part[idx]

# Drop the raw objects before downcasting to keep peak memory down.
del part, simple_fe_features
sales = reduce_mem_usage(sales)
gc.collect()

print("added simple fe")

# Same procedure for the lag / rolling-window columns.
roll = []
with open('../input/clgfeatures/rolling windows', "rb") as openfile:
    try:
        while True:
            roll.append(pickle.load(openfile))
    except EOFError:
        pass

for idx, rol in enumerate(rolling_windows):
    sales[rol] = roll[idx]

del roll, rolling_windows
sales = reduce_mem_usage(sales)
gc.collect()
print("add roll")

In [None]:
# Column names, in the order the objects are stored in the 'SimpleFE2' file.
simple_fe_features2 = ['price_momentum', 'price_momentum_m', 'price_momentum_y']

# The file holds several pickled objects back to back: load until exhausted.
f2 = []
with open('../input/clgfeatures/SimpleFE2', "rb") as openfile:
    try:
        while True:
            f2.append(pickle.load(openfile))
    except EOFError:
        pass

for idx, fe in enumerate(simple_fe_features2):
    sales[fe] = f2[idx]

# Drop the raw objects before downcasting to keep peak memory down.
del f2, simple_fe_features2
sales = reduce_mem_usage(sales)
gc.collect()
print("add simple f2")

In [None]:

# Column groups for which the mean and std of the target are added.
icols = [
    ['state_id'],
    ['store_id'],
    ['cat_id'],
    ['dept_id'],
    ['state_id', 'cat_id'],
    ['state_id', 'dept_id'],
    ['store_id', 'cat_id'],
    ['store_id', 'dept_id'],
    ['item_id'],
    ['item_id', 'state_id'],
    ['item_id', 'store_id'],
]
TARGET = 'demand'

for col in icols:
    print('Encoding', col)
    col_name = '_'+'_'.join(col)+'_'
    # Adds the float16 columns enc_<cols>_mean and enc_<cols>_std.
    for stat in ('mean', 'std'):
        sales['enc'+col_name+stat] = (
            sales.groupby(col)[TARGET].transform(stat).astype(np.float16)
        )

In [None]:
# Show the first 50 rows of `sales` as the cell output.
sales.head(50)

# Global variables

Our target is demand, but in his features the target was sales

Taken from [4]

**Note**: if we want to use mean encoding features, then we need to copy mean_features list from the Dark Magic notebook

In [None]:
#################################################################################
#       Global settings shared by the helpers and the training loop, [4]

VER = 1                              # Our model version
SEED = 42                            # One fixed seed so that everything
seed_everything(SEED)                # is as deterministic
lgb_params['seed'] = SEED            # as possible
N_CORES = psutil.cpu_count()         # Available CPU cores

# Limits and constants
TARGET = 'demand'                    # Our target (was sales)
START_TRAIN = 0                      # We can skip some rows (Nans/faster training)
END_TRAIN = 1913                     # End day of our train set
P_HORIZON = 28                       # Prediction horizon
USE_AUX = False                      # Use or not pretrained models

# AUX(pretrained) Models paths
#AUX_MODELS = '../input/m5-aux-models/'

# Columns excluded from the model features: the ones that lead to overfit
# and the ones that are not in the test set
remove_features = [
    'id', 'state_id', 'store_id',
    'date', 'wm_yr_wk', 'd', 'wday', 'year', 'month',
    TARGET,
]

# Store ids
ORIGINAL = '../input/m5-forecasting-accuracy/'
#STORES_IDS =  pd.read_csv(ORIGINAL+'sales_train_validation.csv')['store_id']
#STORES_IDS = list(STORES_IDS.unique())

# Paths of the pickled feature files
ID = '../input/clgfeatures/id column'
INITIAL = '../input/clgfeatures/initial features'
PART2 = '../input/clgfeatures/part2'
ROLLING = '../input/clgfeatures/rolling windows'

# Splits for lags creation
SHIFT_DAY = 28
N_LAGS = 15
# Lag days SHIFT_DAY .. SHIFT_DAY+N_LAGS-1
LAGS_SPLIT = list(range(SHIFT_DAY, SHIFT_DAY+N_LAGS))
# Every [shift, window] pair for the rolling features
ROLS_SPLIT = [[i, j] for i in [1, 7, 14] for j in [7, 14, 30, 60]]

In [None]:
# Distinct values of the 'store_id' column of `sales`; get_base_test and the
# training loop iterate over this list.
STORES_IDS = list(sales['store_id'].unique())

In [None]:
# Train models


# Train one LightGBM model per store. For each store this writes
# 'test_<store_id>.pkl' (rows kept for later prediction) and
# 'lgb_model_<store_id>_v<VER>.bin' (the pickled booster).
for store_id in STORES_IDS:
    print('Train', store_id)

    # Get grid for current store
    grid_df, features_columns = get_data_by_store(store_id)

    # Optional dynamic feature creation (currently disabled):
    #   grid_df, features_columns = make_rolling_fe(grid_df)
    #   grid_df = impute_Na(grid_df)
    print("made grid")

    # train_mask: all days up to and including END_TRAIN
    # valid_mask: the last P_HORIZON days of the train range - a subset of
    #             the training rows, so not a real validation set
    # preds_mask: all days after END_TRAIN-100, i.e. the prediction range
    #             plus a 100-day gap for recursive features
    train_mask = grid_df['d'] <= END_TRAIN
    valid_mask = train_mask & (grid_df['d'] > (END_TRAIN-P_HORIZON))
    preds_mask = grid_df['d'] > (END_TRAIN-100)

    # Apply masks and save lgb dataset as bin
    # to reduce memory spikes during dtype conversions
    # https://github.com/Microsoft/LightGBM/issues/1032
    # "To avoid any conversions, you should always use np.float32"
    # or save to bin before start training
    # https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/53773
    train_data = lgb.Dataset(grid_df[train_mask][features_columns],
                             label=grid_df[train_mask][TARGET])

    train_data.save_binary('train_data.bin')
    train_data = lgb.Dataset('train_data.bin')

    valid_data = lgb.Dataset(grid_df[valid_mask][features_columns],
                             label=grid_df[valid_mask][TARGET])

    # Save the part of the dataset needed for later predictions, without the
    # '_tmp_' features that have to be calculated recursively
    grid_df = grid_df[preds_mask].reset_index(drop=True)
    keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
    grid_df = grid_df[keep_cols]
    grid_df.to_pickle('test_'+str(store_id)+'.pkl')
    del grid_df

    # Launch seeder again to make lgb training 100% deterministic
    # with each "code line" np.random "evolves"
    # so we need (may want) to "reset" it
    seed_everything(SEED)
    estimator = lgb.train(lgb_params,
                          train_data,
                          valid_sets = [valid_data],
                          verbose_eval = 100,
                          )

    lgb.plot_importance(estimator, importance_type="gain", precision=0, height=0.5, figsize=(6, 10));

    # Save model - it's not real '.bin' but a pickle file.
    # Why pickle? Reloading with lgb.Booster(model_file='model.txt') can only
    # predict with the best (or the saved) iteration, while the pickled
    # estimator keeps the flexibility of
    # estimator.predict(TEST, num_iteration=100)
    # (num_iteration NULL or <= 0 means use best iteration).
    model_name = 'lgb_model_'+str(store_id)+'_v'+str(VER)+'.bin'
    # Context manager so the file is flushed and closed before the next store.
    with open(model_name, 'wb') as model_file:
        pickle.dump(estimator, model_file)

    # Remove temporary files and objects
    # to free some hdd space and ram memory
    # (os.remove instead of the IPython-only `!rm` shell escape)
    os.remove('train_data.bin')
    del train_data, valid_data, estimator
    gc.collect()

    # "Keep" models features for predictions
    MODEL_FEATURES = features_columns

In [None]:

## taken from [4]
'''
ORIGINAL = '../input/m5-forecasting-accuracy/'

submission = pd.read_csv(ORIGINAL+'sample_submission.csv')[['id']]
submission = submission.merge(all_preds, on=['id'], how='left').fillna(0)
submission.to_csv('submission_v'+str(VER)+'.csv', index=False)
'''