##  M5 Forecasting-Accuracy(Estimate the unit sales of  Walmart retail goods) 


## Importing required packages

In [12]:
import pandas as pd
import numpy as np
from downcast import reduce
from tqdm import tqdm
import pickle
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error

In [3]:
sales = pd.read_csv("sales_train_evaluation.csv") #sales_train_evaluation.csv contains data regarding daily sales of a particular product.
sales1 = pd.read_csv("sales_train_validation.csv") #sales_train_evaluation.csv contains data regarding daily sales of a particular product.
prices = pd.read_csv("sell_prices.csv") #sell_prices.csv contains data regarding selling price of a product for a particular week.
cal = pd.read_csv("calendar.csv") #calendar.csv contains data regarding each day events held and other basic details of the day.

## Function 1

In [16]:
def final1(test):
    
    '''
    This function is used to predict sales of a particular product for next 28 days.
    '''
    
    product_id = test['id']
    test = pd.DataFrame(test).T
    
    start_day = 1942
    end_day=1942+28
    
    for day in range(start_day,end_day):
        test['d_' + str(day)] = 0

    test=test.fillna(-1)
    # reframing the sales(sales_train_evaluation.csv) dataframe in such a way that it can be merged with other csv files.
    data = pd.melt(test, id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],var_name='day', value_name='sales')
    # merging cal dataframe with data dataframe
    data = data.merge(cal, left_on='day', right_on='d')
    # merging prices dataframe with data dataframe
    data = data.merge(prices,on=['store_id','item_id', 'wm_yr_wk'], how='left')
    #mean imputation on sell_price column values
    data['sell_price'].fillna(data.groupby('id')['sell_price'].transform('mean'), inplace=True)
    # we are stripping the 'd_' from day column to make it an integer feature
    data['day'] = data['day'].apply(lambda x: x.split('_')[1]).astype(int)
    #since weekday's are represented as wday with numbers and d is a duplicate column.
    data.drop(['d','weekday','date'], axis=1, inplace=True)
    data=data.fillna(-1)
    
    # We need to use the same categorical labels as used while feature engineering the train data. 
    # So loading the dictionary which contains categories as key and its corresponding encoded value as item.
    cat_encoder = np.load('cat_encoder.npy',allow_pickle='TRUE').item()
    n1 = np.load('name1.npy',allow_pickle='TRUE').item()
    t1 = np.load('type1.npy',allow_pickle='TRUE').item()
    n2 = np.load('name2.npy',allow_pickle='TRUE').item()
    t2 = np.load('type2.npy',allow_pickle='TRUE').item()
    
    data = reduce(data)
    categorical_features=[]
    for i in data.columns:
        if str(data[i].dtype)=='category':
            categorical_features.append(str(i))     
    
    for i in categorical_features[:-4]:
        data[i]=data[i].map(cat_encoder)
        
    data[categorical_features[6]]=data[categorical_features[6]].map(n1)
    data[categorical_features[7]]=data[categorical_features[7]].map(t1)
    data[categorical_features[8]]=data[categorical_features[8]].map(n2)
    data[categorical_features[9]]=data[categorical_features[9]].map(t2)
    data=data.astype(int)
    
    #adding lag features
    lags = [28,30,35,42,49,56]
    for lag in lags:
        data["lag_" + str(lag)] = data.groupby("id")["sales"].shift(lag)

    data=data[(data['day']>=1914) & (data['day']<=1941)]

    datax = data.drop('sales',axis=1)
    datay = data['sales']
    #feature engineering for the given data is done
    # The best model we got is the stacking regression model with 8 base learners.
    
    #loading base models and meta models
    base_models =pickle.load(open('base_models.sav', 'rb'))
    meta_model =pickle.load(open('meta_model.sav', 'rb'))

    #getting prediction out of base models
    base_predictions = {}
    for i in range(len(base_models)):
        cname = 'M' + str(i)
        prediction = base_models[i].predict(datax)
        base_predictions[cname] = prediction
    base_test_predictions_df =  pd.DataFrame(base_predictions)
    
    ytest_pred = meta_model.predict(base_test_predictions_df)

    val_pred = np.reshape(ytest_pred, (-1, 28),order = 'F')
    val_pred=val_pred.T

    list_val=[]
    for i in val_pred:
        list_val.append(i[0])
    list_val.insert(0,product_id)

    submission = pd.read_csv("sample_submission.csv")
    cols=submission.columns

    result = pd.DataFrame(list_val)
    result=result.T
    result.columns = cols
    print("The forecasted sales for product with ID "+product_id+" from day 1942 to day 1969 (evaluation data) is as follows")
    return result

In [5]:
print(final1.__doc__)


    This function is used to predict sales of a particular product for next 28 days.
    


In [17]:
test1=sales.iloc[100]
final1(test1)

The forecasted sales for product with ID HOBBIES_1_105_CA_1_evaluation from day 1942 to day 1969 (evaluation data) is as follows


Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_105_CA_1_evaluation,0.343445,0.237474,0.512945,0.266856,0.511792,0.989615,0.647443,0.527588,0.422948,...,0.438457,0.454321,0.786112,0.577783,0.443834,0.566946,0.266715,0.396021,0.512205,0.737277


In [18]:
test2=sales.iloc[2500]
final1(test2)

The forecasted sales for product with ID FOODS_3_276_CA_1_evaluation from day 1942 to day 1969 (evaluation data) is as follows


Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_3_276_CA_1_evaluation,0.825102,0.488259,0.523475,0.65274,0.590019,1.529162,0.984523,0.841081,0.513084,...,0.606984,0.663384,0.666237,0.526014,0.484779,0.495336,0.512127,0.589653,0.68749,0.668121


## Function 2

In [10]:
def final2(test):
    '''
    This function is used to predict sales of a particular product for next 28 days and also displays the performance measure.
    ''' 
    product_id = test['id']
    test = pd.DataFrame(test).T
    start_day = 1914
    end_day=1914+28
    
    for day in range(start_day,end_day):
        test['d_' + str(day)] = 0

    test=test.fillna(-1)
    # reframing the sales(sales_train_evaluation.csv) dataframe in such a way that it can be merged with other csv files.
    data = pd.melt(test, id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],var_name='day', value_name='sales')
    # merging cal dataframe with data dataframe
    data = data.merge(cal, left_on='day', right_on='d')
    # merging prices dataframe with data dataframe
    data = data.merge(prices,on=['store_id','item_id', 'wm_yr_wk'], how='left')
    #mean imputation on sell_price column values
    data['sell_price'].fillna(data.groupby('id')['sell_price'].transform('mean'), inplace=True)
    # we are stripping the 'd_' from day column to make it an integer feature
    data['day'] = data['day'].apply(lambda x: x.split('_')[1]).astype(int)
    #since weekday's are represented as wday with numbers and d is a duplicate column.
    data.drop(['d','weekday','date'], axis=1, inplace=True)
    data=data.fillna(-1)
    
    # We need to use the same categorical labels as used while feature engineering the train data. 
    # So loading the dictionary which contains categories as key and its corresponding encoded value as item.
    id_encoder = np.load('id_encoder.npy',allow_pickle='TRUE').item()
    cat_encoder = np.load('cat_encoder.npy',allow_pickle='TRUE').item()
    n1 = np.load('name1.npy',allow_pickle='TRUE').item()
    t1 = np.load('type1.npy',allow_pickle='TRUE').item()
    n2 = np.load('name2.npy',allow_pickle='TRUE').item()
    t2 = np.load('type2.npy',allow_pickle='TRUE').item()
    data = reduce(data)
    categorical_features=[]
    for i in data.columns:
        if str(data[i].dtype)=='category':
            categorical_features.append(str(i))     
    
    for i in categorical_features[1:-4]:
        data[i]=data[i].map(cat_encoder)
     
    data[categorical_features[0]]=data[categorical_features[0]].map(id_encoder)
    data[categorical_features[6]]=data[categorical_features[6]].map(n1)
    data[categorical_features[7]]=data[categorical_features[7]].map(t1)
    data[categorical_features[8]]=data[categorical_features[8]].map(n2)
    data[categorical_features[9]]=data[categorical_features[9]].map(t2)
    data=data.astype(int)
    
    #adding lag features
    lags = [28,30,35,42,49,56]
    for lag in lags:
        data["lag_" + str(lag)] = data.groupby("id")["sales"].shift(lag)

    data=data[(data['day']>=1914) & (data['day']<=1941)]

    datax = data.drop('sales',axis=1)
    datay = data['sales']
    #feature engineering for the given data is done
    # The best model we got is the stacking regression model with 8 base learners.
    
    #loading base models and meta models
    base_models =pickle.load(open('base_models.sav', 'rb'))
    meta_model =pickle.load(open('meta_model.sav', 'rb'))

    #getting prediction out of base models
    base_predictions = {}
    for i in range(len(base_models)):
        cname = 'M' + str(i)
        prediction = base_models[i].predict(datax)
        base_predictions[cname] = prediction
    base_test_predictions_df =  pd.DataFrame(base_predictions)
    
    ytest_pred = meta_model.predict(base_test_predictions_df)
    mse1 = mean_squared_error(datay, ytest_pred)
    print("RMSE Score is ",mse1)
    print("-----------------------------------")
    print("The forecasted sales for product with id "+product_id+" from day 1914 to day 1941 (validation data) is as follows")
    val_pred = np.reshape(ytest_pred, (-1, 28),order = 'F')
    val_pred=val_pred.T

    list_val=[]
    for i in val_pred:
        list_val.append(i[0])
    list_val.insert(0,product_id)

    submission = pd.read_csv("sample_submission.csv")
    cols=submission.columns

    result = pd.DataFrame(list_val)
    result=result.T
    result.columns = cols
    return result

In [8]:
print(final2.__doc__)


    This function is used to predict sales of a particular product for next 28 days and also displays the performance measure.
    


In [13]:
test3=sales1.iloc[100]
final2(test3)

RMSE Score is  0.26095789577981676
-----------------------------------
The forecasted sales for product with id HOBBIES_1_105_CA_1_validation from day 1914 to day 1941 (validation data) is as follows


Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_105_CA_1_validation,0.343445,0.237474,0.512945,0.266856,0.511792,0.989615,0.647443,0.527588,0.422948,...,0.438457,0.454321,0.786112,0.577783,0.443834,0.566946,0.266715,0.396021,0.512205,0.737277


In [14]:
test4=sales1.iloc[2500]
final2(test4)

RMSE Score is  0.45646288639149896
-----------------------------------
The forecasted sales for product with id FOODS_3_276_CA_1_validation from day 1914 to day 1941 (validation data) is as follows


Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_3_276_CA_1_validation,0.825102,0.488259,0.523475,0.65274,0.590019,1.529162,0.984523,0.841081,0.513084,...,0.606984,0.663384,0.666237,0.526014,0.484779,0.495336,0.512127,0.589653,0.68749,0.668121
