<h1><b>M5 Forecasting: First Cut</b></h1>



# **Contents**

<h3> 1. Load and Extract Data </h3> 

<h3> 2. Reading the Data </h3> 

<h3> 3. Data Wrangling</h3>  

<h3> 4. Define functions to compute weights and metrics</h3> 

<h3> 5. Simple Models</h3> 

<h3> 6. Summary</h3> 

<h3> 7. References</h3> 

In [None]:
import pandas as pd
from zipfile import ZipFile
import numpy as np
import seaborn as sns
from tqdm import tqdm
import random 
import datetime
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor

# 1. Load and Extract Data 

## 1.1 Loading Data from Kaggle

In [None]:
# Download the competition archive into the current working directory as 'm5-forecasting-accuracy.zip'
# (`-O` sets the output file name, `-c` continues a partially downloaded file).
# NOTE(review): the URL is a signed link carrying an `Expires=1664974595` query parameter, so it
# presumably stops working once that timestamp has passed — regenerate the link before re-running.
!wget --header="Host: storage.googleapis.com" --header="User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36" --header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9" --header="Accept-Language: en-US,en;q=0.9" --header="Referer: https://www.kaggle.com/" "https://storage.googleapis.com/kaggle-competitions-data/kaggle-v2/18599/1236839/bundle/archive.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1664974595&Signature=LOGBjPsrRoXOCJPNBYXooVsLidYPTmAYClAvftoysnvphQq%2B39DtWKZCxAQm2ThWzCR7jFjxwtp23lCxb9nhE7s5mckbr6AJRsymVZFBVSSgh1D%2BiDa8JqQs%2BaCHDHP6Ne8aTit50VxfmDwOyr1N5%2BfThjdzBjXGuRAgpawK9Gi8HNkoZI6DnVcMF9grIQN4ETRtXSkouZ%2FsKjy35tODy1gpewEYTpPq62HUFrIK80aik%2FlEqVUGol4KQATmxfGsA71RpirRrYsc8GNiQxHZQUJ9zCMHDdPTTxWJPC4bRNkQpVGmVqdhOiYssr9eqxaFWGQeb5H%2BxUT2zpxkJZ%2B9Jw%3D%3D&response-content-disposition=attachment%3B+filename%3Dm5-forecasting-accuracy.zip" -c -O 'm5-forecasting-accuracy.zip'

--2022-10-02 12:57:05--  https://storage.googleapis.com/kaggle-competitions-data/kaggle-v2/18599/1236839/bundle/archive.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1664974595&Signature=LOGBjPsrRoXOCJPNBYXooVsLidYPTmAYClAvftoysnvphQq%2B39DtWKZCxAQm2ThWzCR7jFjxwtp23lCxb9nhE7s5mckbr6AJRsymVZFBVSSgh1D%2BiDa8JqQs%2BaCHDHP6Ne8aTit50VxfmDwOyr1N5%2BfThjdzBjXGuRAgpawK9Gi8HNkoZI6DnVcMF9grIQN4ETRtXSkouZ%2FsKjy35tODy1gpewEYTpPq62HUFrIK80aik%2FlEqVUGol4KQATmxfGsA71RpirRrYsc8GNiQxHZQUJ9zCMHDdPTTxWJPC4bRNkQpVGmVqdhOiYssr9eqxaFWGQeb5H%2BxUT2zpxkJZ%2B9Jw%3D%3D&response-content-disposition=attachment%3B+filename%3Dm5-forecasting-accuracy.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.2.128, 2607:f8b0:4023:c0d::80
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.2.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 48009163 (46M) [application/zip]
Saving to: ‘m5-forecasting-accuracy.zip’


2022-10-02 12

## 1.2 Extracting Data from Zip

In [None]:
# Unpack every member of the downloaded archive into the current working directory.
# The relative path is the same one the download cell writes to (-O 'm5-forecasting-accuracy.zip'),
# so this cell does not require the working directory to be /content.
with ZipFile('m5-forecasting-accuracy.zip') as archive:
    archive.extractall()

# 2. Reading the Data

In [None]:
# Load the three competition tables from the current working directory, which is where
# ZipFile.extractall() (called without a target path) unpacks them.
df_sales = pd.read_csv('sales_train_evaluation.csv')
df_price = pd.read_csv('sell_prices.csv')
# 'date' is parsed into datetimes at load time
df_cal = pd.read_csv('calendar.csv', parse_dates=['date'])

In [None]:
# Keep an independent copy of the sales table as loaded (DataFrame.copy defaults to a deep copy).
# NOTE(review): no later cell visible in this notebook reads `sales` — confirm it is still needed.
sales = df_sales.copy() 

# 3. Data Wrangling

In [None]:
# Numeric day index: the integer that follows the underscore in the calendar's "d" label.
df_cal["d_"] = df_cal["d"].str.split("_").str[1].astype("int64")
# Series key for each price row: "<item_id>_<store_id>_evaluation".
df_price["id"] = df_price["item_id"].str.cat(df_price["store_id"], sep="_") + "_evaluation"

# 4. Define functions to compute weights and metrics


## 4.1 Compute Weights

In [None]:
# Grouping columns for aggregation levels 2-11, keyed by level number.
level_groupings = {
    2: ["state_id"],
    3: ["store_id"],
    4: ["cat_id"],
    5: ["dept_id"],
    6: ["state_id", "cat_id"],
    7: ["state_id", "dept_id"],
    8: ["store_id", "cat_id"],
    9: ["store_id", "dept_id"],
    10: ["item_id"],
    11: ["item_id", "state_id"],
}

def calculate_weightsL12(sales, cal, price, last28):
    ''' Calculate weights for level 12 (Product-Store) series' from the sales revenue of a window of days.

    Args:
        sales: DataFrame with an "id" column and one "d_<day>" units-sold column per day in the window.
        cal: calendar DataFrame with the numeric day column "d_" and the week id column "wm_yr_wk".
        price: price DataFrame with "wm_yr_wk", "id" and "sell_price" columns.
        last28: (first_day, stop_day) pair; days in range(first_day, stop_day) are used,
            i.e. stop_day itself is excluded.

    Returns:
        A new DataFrame holding the columns of `sales` plus a "weight" column equal to
        (1/12) * (series revenue / total revenue). Rows come from successive inner merges with
        the weekly prices, so an id without a price in any week of the window is dropped and
        the index is the one produced by the merge. The frame passed in is not modified.
    '''
    #flow of execution : day-> week id-> sell price of ids on the day-> sales revenue of ids for the day
    for day in tqdm(range(last28[0], last28[1])):
        #get the week id corresponding to the day
        #.iloc[0] extracts the scalar explicitly: calling int() directly on a one-element Series
        #is deprecated in recent pandas versions
        week_id = int(cal.loc[cal["d_"] == day, "wm_yr_wk"].iloc[0])

        #get the week price for each of the items corresponding to the week id
        week_price = price[price["wm_yr_wk"] == week_id]

        #inner join: only the ids present in both dataframes are retained after merging
        sales = sales.merge(week_price[["sell_price", "id"]], on=["id"], how='inner')

        #sales revenue for the day = sell_price * units_sold
        sales["sales_revenue_d_" + str(day)] = sales["sell_price"] * sales["d_" + str(day)]

        #the price column is only needed for this day's product
        sales = sales.drop(columns=["sell_price"])

    #sum of sales revenue of each id over all days in the window
    sales_revenue_cols = [x for x in sales.columns if x.startswith("sales_revenue")]
    revenue_alldays = sales[sales_revenue_cols].sum(axis=1)

    #drop() returns a new frame, so the caller's dataframe is never written to,
    #even when the day range is empty and no merge took place
    weighted = sales.drop(columns=sales_revenue_cols)

    #compute weights for each Level 12 time series
    weighted['weight'] = (1/12)*(revenue_alldays/revenue_alldays.sum())

    return weighted

def calculate_weightsALL(sales, levels):
    ''' Build the aggregated series (levels 1-11) together with their weights.

    Args:
        sales: level 12 DataFrame containing the id columns ("item_id", "dept_id", "cat_id",
            "store_id", "state_id"), the "d_<day>" / "F_<day>" value columns and a "weight" column.
        levels: mapping of level number -> list of columns to group by for that level.

    Returns:
        DataFrame with one row per aggregated series. The value columns and "weight" are summed
        within each group; id columns that are not part of the grouping are set to 'all'; a
        "level" column records the level number. The single level 1 row sums every series and
        has weight 1/12. Row indices of the individual levels are kept as they are (not reset).
    '''
    id_cols = ["item_id", "dept_id", "cat_id", "store_id", "state_id"]
    value_cols = [x for x in sales.columns if x.startswith("d_") or x.startswith("F_")]

    #weights for level 1 : 'all'
    top = pd.DataFrame(sales[value_cols].sum()).transpose()
    for col in id_cols:
        top[col] = 'all'
    top["level"] = 1
    top["weight"] = 1/12
    column_order = top.columns

    #weights for the rest of the levels; iterate the mapping that was passed in
    frames = [top]
    for level in tqdm(levels):
        #numeric_only=True keeps the string id columns out of the sum; otherwise recent pandas
        #concatenates them, which also prevents the 'all' placeholder below from being written
        level_df = sales.groupby(by=levels[level]).sum(numeric_only=True).reset_index()
        level_df["level"] = level

        for c in column_order:
            if c not in level_df.columns:
                level_df[c] = 'all'

        frames.append(level_df[column_order])

    #DataFrame.append was removed in pandas 2.0; a single concat is the replacement
    return pd.concat(frames)

## 4.2 Compute WRMSSE

In [None]:
def get_day_splits(sales, train_start, train_end, val_start, val_end):
    '''Pick the column names used for train, validation and forecast when calculating RMSSE.

    Train/validation columns are the "d_<k>" columns whose day number k lies in the given
    inclusive range; forecast columns are all columns starting with "F_". Every list keeps
    the column order of `sales`.
    '''
    def day_columns(first, last):
        wanted = range(first, last + 1)
        return [col for col in sales.columns
                if col.startswith("d_") and int(col.split("_")[1]) in wanted]

    train_days = day_columns(train_start, train_end)
    val_days = day_columns(val_start, val_end)
    forecast_days = [col for col in sales.columns if col.startswith("F_")]
    return train_days, val_days, forecast_days

def RMSSE(ground_truth, forecast, train, n, h):
    ''' Row-wise RMSSE for a batch of series.

    ground_truth / forecast: 2-D arrays (one row per series) compared element-wise.
    train: 2-D array of the training history, one row per series.
    n: divisor term; the summed squared one-step differences of `train` are scaled by 1/(n-1).
    h: divisor term; the summed squared forecast errors are scaled by 1/h.
    Returns a 1-D array with one score per row.
    '''
    # scale: squared day-to-day changes of the training history, summed per series
    step_sq_sum = np.square(np.diff(train, axis=1)).sum(axis=1)
    scale = 1/(n-1) * step_sq_sum

    # squared forecast errors, summed per series
    sq_err_sum = np.square(ground_truth - forecast).sum(axis=1)

    return (1/h * sq_err_sum/scale) ** 0.5

In [None]:
def WRMSSE(sales, agg, train_days, val_days, forecast_days, n, h):
    ''' Weighted RMSSE of a forecast over the bottom-level and aggregated series.

    Both `sales` and `agg` must contain the listed day/forecast columns and a "weight" column.
    As a side effect, "rmsse" and "wrmsse" columns are written to both frames.
    Returns the sum of the "wrmsse" columns of the two frames.
    '''
    def series_scores(frame):
        # pull the three column groups out as plain arrays and score every row
        actual = np.array(frame[val_days])
        predicted = np.array(frame[forecast_days])
        history = np.array(frame[train_days])
        return RMSSE(actual, predicted, history, n, h)

    bottom_scores = series_scores(sales)
    aggregate_scores = series_scores(agg)

    sales["rmsse"] = bottom_scores
    agg["rmsse"] = aggregate_scores

    sales["wrmsse"] = sales["weight"] * sales["rmsse"]
    agg["wrmsse"] = agg["weight"] * agg["rmsse"]

    return sales["wrmsse"].sum() + agg["wrmsse"].sum()

In [None]:
# Names of the indicator columns that one-hot encoding the four categorical id columns produces.
encoded_cols = list(
    pd.get_dummies(df_sales[["dept_id", "cat_id", "store_id", "state_id"]]).columns
)

In [None]:
# One-hot encode the categorical id columns and place the indicators after the daily sales columns.
# The frame is rebuilt from explicitly selected columns (ids, "d_*" days, fresh indicators), so
# re-running this cell yields the same frame instead of appending a second set of indicator columns.
df_sales = pd.concat(
    [
        df_sales[["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]],
        df_sales[[c for c in df_sales.columns if c.startswith("d_")]],
        pd.get_dummies(df_sales[["dept_id", "cat_id", "store_id", "state_id"]]),
    ],
    axis=1,
)
df_sales.head(1)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,store_id_CA_4,store_id_TX_1,store_id_TX_2,store_id_TX_3,store_id_WI_1,store_id_WI_2,store_id_WI_3,state_id_CA,state_id_TX,state_id_WI
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


# 5. Simple Models

1. ExtraTrees Regressor
2. DecisionTrees Regressor
3. RandomForest Regressor

- **Function to evaluate a model instance**

In [None]:
def evaluate_model_instance(model, train_start, train_end, df, df_c, df_p, levels, return_dfs=False):
    ''' Fit `model` on one window, forecast the following 28 days and score the forecast with WRMSSE.

    Window layout, e.g. train_start = 1200, train_end = 1857:
        X_train = days 1200..1857 (+ encoded columns), y_train = days 1858..1885
        X_val   = days 1228..1885 (+ encoded columns), forecast = days 1886..1913
    The score returned here is what the hyperparameter search compares.

    Args:
        model: estimator exposing fit(X, y) and predict(X) with a 28-column target.
        train_start, train_end: first and last day number (inclusive) of the training inputs.
        df: sales DataFrame with "d_<day>" columns and the columns listed in the module-level
            `encoded_cols`.
        df_c, df_p: calendar and price DataFrames, passed on to calculate_weightsL12.
        levels: aggregation-level mapping, passed on to calculate_weightsALL.
        return_dfs: when True, also return the scored dataframe and the forecast dataframe.

    Returns:
        wrmsse, or (df, y_pred, wrmsse) when return_dfs is True.

    Note: `model` is fitted in place.
    '''
    horizon = 28

    #day number of every "d_<k>" column, in frame order
    day_number = {c: int(c.split("_")[1]) for c in df.columns if c.startswith("d_")}

    def day_cols(first, last):
        #columns whose day number lies in [first, last]
        return [c for c, k in day_number.items() if first <= k <= last]

    #select days/columns to create X_train and y_train
    X_train = df[day_cols(train_start, train_end) + encoded_cols].values
    y_train = df[day_cols(train_end + 1, train_end + horizon)].values

    #fit the model on train data
    model.fit(X_train, y_train)

    #the validation inputs are the training window shifted forward by one horizon
    X_val = df[day_cols(train_start + horizon, train_end + horizon) + encoded_cols].values

    #forecast columns are named F_<day> for the 28 days that follow the validation inputs;
    #the frame reuses df's index so the column-wise concat below lines the rows up even when
    #df does not have a default RangeIndex
    forecast_cols = [f"F_{d}" for d in range(train_end + horizon + 1, train_end + 2 * horizon + 1)]
    y_pred = pd.DataFrame(model.predict(X_val), index=df.index, columns=forecast_cols)

    #concatenate the predicted data with original dataframe
    df = pd.concat([df, y_pred], axis=1)

    #calculate weights for all levels from the 28 days preceding the forecast window
    print('calculating weights')
    df = calculate_weightsL12(df, df_c, df_p, (train_end + 1, train_end + horizon + 1))
    agg_df = calculate_weightsALL(df, levels)

    #compute WRMSSE score; the scaling history runs from train_start up to the day before the forecast
    train_days, val_days, forecast_days = get_day_splits(
        df, train_start, train_end + horizon, train_end + horizon + 1, train_end + 2 * horizon
    )
    wrmsse = WRMSSE(df, agg_df, train_days, val_days, forecast_days,
                    n=((train_end + horizon + 1) - train_start), h=horizon)

    if return_dfs:
        return df, y_pred, wrmsse

    return wrmsse

## 5.1 ExtraTrees Regressor

In [None]:
train_start = 1200
train_end = 1857

# Seed the hyperparameter sampler so a fresh Restart & Run All draws the same candidates
# (the estimators themselves are already pinned with random_state=42).
random.seed(42)

n_est_list = []
max_depth_list = []
wrmsse_scores = []

# Random search: three trials, each with a sampled (n_estimators, max_depth) pair.
for _ in tqdm(range(3)):
    rand_n_est = random.randint(20, 50)
    rand_max_depth = random.randint(10, 30)

    regressor = ExtraTreesRegressor(n_estimators=rand_n_est, max_depth=rand_max_depth, random_state=42)

    wrmsse = evaluate_model_instance(regressor, train_start, train_end, df_sales, df_cal, df_price, level_groupings)

    n_est_list.append(rand_n_est)
    max_depth_list.append(rand_max_depth)
    wrmsse_scores.append(wrmsse)

  0%|          | 0/3 [00:00<?, ?it/s]

calculating weights



  0%|          | 0/28 [00:00<?, ?it/s][A
  4%|▎         | 1/28 [00:00<00:23,  1.17it/s][A
  7%|▋         | 2/28 [00:01<00:13,  1.96it/s][A
 11%|█         | 3/28 [00:01<00:09,  2.52it/s][A
 14%|█▍        | 4/28 [00:01<00:08,  2.89it/s][A
 18%|█▊        | 5/28 [00:01<00:07,  3.11it/s][A
 21%|██▏       | 6/28 [00:02<00:06,  3.27it/s][A
 25%|██▌       | 7/28 [00:02<00:06,  3.40it/s][A
 29%|██▊       | 8/28 [00:02<00:05,  3.49it/s][A
 32%|███▏      | 9/28 [00:03<00:05,  3.55it/s][A
 36%|███▌      | 10/28 [00:03<00:04,  3.65it/s][A
 39%|███▉      | 11/28 [00:03<00:04,  3.68it/s][A
 43%|████▎     | 12/28 [00:03<00:04,  3.69it/s][A
 46%|████▋     | 13/28 [00:04<00:04,  3.72it/s][A
 50%|█████     | 14/28 [00:04<00:03,  3.75it/s][A
 54%|█████▎    | 15/28 [00:04<00:03,  3.79it/s][A
 57%|█████▋    | 16/28 [00:04<00:03,  3.74it/s][A
 61%|██████    | 17/28 [00:05<00:02,  3.74it/s][A
 64%|██████▍   | 18/28 [00:05<00:02,  3.79it/s][A
 68%|██████▊   | 19/28 [00:05<00:02,  3.80it/s]

calculating weights



  0%|          | 0/28 [00:00<?, ?it/s][A
  4%|▎         | 1/28 [00:00<00:06,  3.91it/s][A
  7%|▋         | 2/28 [00:00<00:06,  3.74it/s][A
 11%|█         | 3/28 [00:00<00:06,  3.76it/s][A
 14%|█▍        | 4/28 [00:01<00:06,  3.74it/s][A
 18%|█▊        | 5/28 [00:01<00:06,  3.79it/s][A
 21%|██▏       | 6/28 [00:01<00:05,  3.80it/s][A
 25%|██▌       | 7/28 [00:01<00:05,  3.83it/s][A
 29%|██▊       | 8/28 [00:02<00:05,  3.75it/s][A
 32%|███▏      | 9/28 [00:02<00:05,  3.77it/s][A
 36%|███▌      | 10/28 [00:02<00:04,  3.76it/s][A
 39%|███▉      | 11/28 [00:02<00:04,  3.73it/s][A
 43%|████▎     | 12/28 [00:03<00:04,  3.73it/s][A
 46%|████▋     | 13/28 [00:03<00:04,  3.73it/s][A
 50%|█████     | 14/28 [00:03<00:03,  3.72it/s][A
 54%|█████▎    | 15/28 [00:03<00:03,  3.74it/s][A
 57%|█████▋    | 16/28 [00:04<00:03,  3.70it/s][A
 61%|██████    | 17/28 [00:04<00:02,  3.70it/s][A
 64%|██████▍   | 18/28 [00:04<00:02,  3.74it/s][A
 68%|██████▊   | 19/28 [00:05<00:02,  3.72it/s]

calculating weights



  0%|          | 0/28 [00:00<?, ?it/s][A
  4%|▎         | 1/28 [00:00<00:06,  3.87it/s][A
  7%|▋         | 2/28 [00:00<00:06,  3.85it/s][A
 11%|█         | 3/28 [00:00<00:06,  3.89it/s][A
 14%|█▍        | 4/28 [00:01<00:06,  3.87it/s][A
 18%|█▊        | 5/28 [00:01<00:06,  3.81it/s][A
 21%|██▏       | 6/28 [00:01<00:05,  3.78it/s][A
 25%|██▌       | 7/28 [00:01<00:05,  3.80it/s][A
 29%|██▊       | 8/28 [00:02<00:05,  3.82it/s][A
 32%|███▏      | 9/28 [00:02<00:05,  3.78it/s][A
 36%|███▌      | 10/28 [00:02<00:04,  3.79it/s][A
 39%|███▉      | 11/28 [00:02<00:04,  3.81it/s][A
 43%|████▎     | 12/28 [00:03<00:04,  3.81it/s][A
 46%|████▋     | 13/28 [00:03<00:03,  3.78it/s][A
 50%|█████     | 14/28 [00:03<00:03,  3.77it/s][A
 54%|█████▎    | 15/28 [00:03<00:03,  3.78it/s][A
 57%|█████▋    | 16/28 [00:04<00:03,  3.76it/s][A
 61%|██████    | 17/28 [00:04<00:02,  3.75it/s][A
 64%|██████▍   | 18/28 [00:04<00:02,  3.74it/s][A
 68%|██████▊   | 19/28 [00:05<00:02,  3.75it/s]

In [None]:
# Pick the trial with the smallest WRMSSE (first one on ties) and report its settings.
lowest_wrmsse, best_max_depth, best_n_est = min(
    zip(wrmsse_scores, max_depth_list, n_est_list), key=lambda trial: trial[0]
)
print('Lowest WRMSSE=', lowest_wrmsse)
print('Lowest WRMSSE achieved on max_depth=',best_max_depth)
print('Lowest WRMSSE achieved on n_estimators=',best_n_est)

Lowest WRMSSE= 0.7018430102743918
Lowest WRMSSE achieved on max_depth= 28
Lowest WRMSSE achieved on n_estimators= 26


In [None]:
train_start = 1200
train_end = 1885

# Evaluate the model built from the best sampled hyperparameters on the later window.
# The estimator passed below must be best_ETregressor: `regressor` is merely the last
# instance created by the search loop, not the best one.
best_ETregressor = ExtraTreesRegressor(n_estimators=best_n_est, max_depth=best_max_depth, random_state=42)
concat_ETdf, preds_ETdf, wrmsse_ET = evaluate_model_instance(best_ETregressor, train_start, train_end, df_sales, df_cal, df_price, level_groupings, return_dfs=True)

print("WRMSSE Score (ExtraTrees):", wrmsse_ET)

calculating weights


100%|██████████| 28/28 [00:07<00:00,  3.78it/s]
100%|██████████| 10/10 [00:07<00:00,  1.35it/s]


WRMSSE Score (ExtraTrees): 0.78616539276217


  del sys.path[0]


## 5.2 DecisionTrees Regressor

In [None]:
train_start = 1200
train_end = 1857

# Seed the hyperparameter sampler so a fresh Restart & Run All draws the same candidates.
random.seed(42)

max_depth_list = []
wrmsse_scores = []

# Random search over max_depth only: a decision tree has no n_estimators, so nothing is
# recorded for it (the earlier version appended a stale value left over from another cell).
for _ in tqdm(range(3)):
    rand_max_depth = random.randint(10, 30)

    regressor = DecisionTreeRegressor(max_depth=rand_max_depth, random_state=42)

    wrmsse = evaluate_model_instance(regressor, train_start, train_end, df_sales, df_cal, df_price, level_groupings)

    max_depth_list.append(rand_max_depth)
    wrmsse_scores.append(wrmsse)

  0%|          | 0/3 [00:00<?, ?it/s]

calculating weights



  0%|          | 0/28 [00:00<?, ?it/s][A
  4%|▎         | 1/28 [00:00<00:07,  3.62it/s][A
  7%|▋         | 2/28 [00:00<00:07,  3.68it/s][A
 11%|█         | 3/28 [00:00<00:06,  3.64it/s][A
 14%|█▍        | 4/28 [00:01<00:06,  3.66it/s][A
 18%|█▊        | 5/28 [00:01<00:06,  3.67it/s][A
 21%|██▏       | 6/28 [00:01<00:05,  3.69it/s][A
 25%|██▌       | 7/28 [00:01<00:05,  3.65it/s][A
 29%|██▊       | 8/28 [00:02<00:05,  3.60it/s][A
 32%|███▏      | 9/28 [00:02<00:05,  3.64it/s][A
 36%|███▌      | 10/28 [00:02<00:04,  3.63it/s][A
 39%|███▉      | 11/28 [00:03<00:04,  3.65it/s][A
 43%|████▎     | 12/28 [00:03<00:04,  3.64it/s][A
 46%|████▋     | 13/28 [00:03<00:04,  3.65it/s][A
 50%|█████     | 14/28 [00:03<00:03,  3.63it/s][A
 54%|█████▎    | 15/28 [00:04<00:03,  3.60it/s][A
 57%|█████▋    | 16/28 [00:04<00:03,  3.63it/s][A
 61%|██████    | 17/28 [00:04<00:03,  3.65it/s][A
 64%|██████▍   | 18/28 [00:04<00:02,  3.55it/s][A
 68%|██████▊   | 19/28 [00:05<00:02,  3.55it/s]

calculating weights



  0%|          | 0/28 [00:00<?, ?it/s][A
  4%|▎         | 1/28 [00:00<00:06,  3.90it/s][A
  7%|▋         | 2/28 [00:00<00:06,  3.79it/s][A
 11%|█         | 3/28 [00:00<00:06,  3.66it/s][A
 14%|█▍        | 4/28 [00:01<00:06,  3.63it/s][A
 18%|█▊        | 5/28 [00:01<00:06,  3.64it/s][A
 21%|██▏       | 6/28 [00:01<00:06,  3.60it/s][A
 25%|██▌       | 7/28 [00:01<00:05,  3.59it/s][A
 29%|██▊       | 8/28 [00:02<00:05,  3.63it/s][A
 32%|███▏      | 9/28 [00:02<00:05,  3.62it/s][A
 36%|███▌      | 10/28 [00:02<00:04,  3.62it/s][A
 39%|███▉      | 11/28 [00:03<00:04,  3.59it/s][A
 43%|████▎     | 12/28 [00:03<00:04,  3.61it/s][A
 46%|████▋     | 13/28 [00:03<00:04,  3.61it/s][A
 50%|█████     | 14/28 [00:03<00:03,  3.60it/s][A
 54%|█████▎    | 15/28 [00:04<00:03,  3.60it/s][A
 57%|█████▋    | 16/28 [00:04<00:03,  3.61it/s][A
 61%|██████    | 17/28 [00:04<00:03,  3.60it/s][A
 64%|██████▍   | 18/28 [00:04<00:02,  3.59it/s][A
 68%|██████▊   | 19/28 [00:05<00:02,  3.57it/s]

calculating weights



  0%|          | 0/28 [00:00<?, ?it/s][A
  4%|▎         | 1/28 [00:00<00:06,  3.89it/s][A
  7%|▋         | 2/28 [00:00<00:06,  3.78it/s][A
 11%|█         | 3/28 [00:00<00:06,  3.71it/s][A
 14%|█▍        | 4/28 [00:01<00:06,  3.69it/s][A
 18%|█▊        | 5/28 [00:01<00:06,  3.68it/s][A
 21%|██▏       | 6/28 [00:01<00:05,  3.69it/s][A
 25%|██▌       | 7/28 [00:01<00:05,  3.66it/s][A
 29%|██▊       | 8/28 [00:02<00:05,  3.63it/s][A
 32%|███▏      | 9/28 [00:02<00:05,  3.61it/s][A
 36%|███▌      | 10/28 [00:02<00:05,  3.59it/s][A
 39%|███▉      | 11/28 [00:03<00:04,  3.56it/s][A
 43%|████▎     | 12/28 [00:03<00:04,  3.58it/s][A
 46%|████▋     | 13/28 [00:03<00:04,  3.58it/s][A
 50%|█████     | 14/28 [00:03<00:03,  3.60it/s][A
 54%|█████▎    | 15/28 [00:04<00:03,  3.56it/s][A
 57%|█████▋    | 16/28 [00:04<00:03,  3.55it/s][A
 61%|██████    | 17/28 [00:04<00:03,  3.58it/s][A
 64%|██████▍   | 18/28 [00:04<00:02,  3.57it/s][A
 68%|██████▊   | 19/28 [00:05<00:02,  3.58it/s]

In [None]:
# Pick the trial with the smallest WRMSSE (first one on ties) and report its depth.
lowest_wrmsse, best_max_depth = min(
    zip(wrmsse_scores, max_depth_list), key=lambda trial: trial[0]
)
print('Lowest WRMSSE=', lowest_wrmsse)
print('Lowest WRMSSE achieved on max_depth=',best_max_depth)

Lowest WRMSSE= 0.7705162271756554
Lowest WRMSSE achieved on max_depth= 14


In [None]:
train_start = 1200
train_end = 1885

# Evaluate the model built from the best sampled depth on the later window.
# The estimator passed below must be best_DTregressor: `regressor` is merely the last
# instance created by the search loop, not the best one.
best_DTregressor = DecisionTreeRegressor(max_depth=best_max_depth, random_state=42)
concat_DTdf, preds_DTdf, wrmsse_DT = evaluate_model_instance(best_DTregressor, train_start, train_end, df_sales, df_cal, df_price, level_groupings, return_dfs=True)

print("WRMSSE Score (DecisionTrees):", wrmsse_DT)

calculating weights


100%|██████████| 28/28 [00:07<00:00,  3.51it/s]
100%|██████████| 10/10 [00:08<00:00,  1.21it/s]


WRMSSE Score (DecisionTrees): 0.8541675327917617


  del sys.path[0]


## 5.3 RandomForest Regressor 

In [None]:
train_start = 1200
train_end = 1857

# Seed the hyperparameter sampler so a fresh Restart & Run All draws the same candidates
# (the estimators themselves are already pinned with random_state=42).
random.seed(42)

n_est_list = []
max_depth_list = []
wrmsse_scores = []

# Random search: three trials, each with a sampled (n_estimators, max_depth) pair.
for _ in tqdm(range(3)):
    rand_n_est = random.randint(20, 50)
    rand_max_depth = random.randint(10, 30)

    regressor = RandomForestRegressor(n_estimators=rand_n_est, max_depth=rand_max_depth, random_state=42)

    wrmsse = evaluate_model_instance(regressor, train_start, train_end, df_sales, df_cal, df_price, level_groupings)

    n_est_list.append(rand_n_est)
    max_depth_list.append(rand_max_depth)
    wrmsse_scores.append(wrmsse)

  0%|          | 0/3 [00:00<?, ?it/s]

calculating weights



  0%|          | 0/28 [00:00<?, ?it/s][A
  4%|▎         | 1/28 [00:00<00:16,  1.67it/s][A
  7%|▋         | 2/28 [00:00<00:10,  2.36it/s][A
 11%|█         | 3/28 [00:01<00:09,  2.73it/s][A
 14%|█▍        | 4/28 [00:01<00:08,  2.94it/s][A
 18%|█▊        | 5/28 [00:01<00:07,  3.09it/s][A
 21%|██▏       | 6/28 [00:02<00:06,  3.17it/s][A
 25%|██▌       | 7/28 [00:02<00:06,  3.19it/s][A
 29%|██▊       | 8/28 [00:02<00:06,  3.23it/s][A
 32%|███▏      | 9/28 [00:03<00:05,  3.25it/s][A
 36%|███▌      | 10/28 [00:03<00:05,  3.25it/s][A
 39%|███▉      | 11/28 [00:03<00:05,  3.25it/s][A
 43%|████▎     | 12/28 [00:03<00:04,  3.26it/s][A
 46%|████▋     | 13/28 [00:04<00:04,  3.28it/s][A
 50%|█████     | 14/28 [00:04<00:04,  3.30it/s][A
 54%|█████▎    | 15/28 [00:04<00:03,  3.34it/s][A
 57%|█████▋    | 16/28 [00:05<00:03,  3.35it/s][A
 61%|██████    | 17/28 [00:05<00:03,  3.33it/s][A
 64%|██████▍   | 18/28 [00:05<00:03,  3.32it/s][A
 68%|██████▊   | 19/28 [00:06<00:02,  3.33it/s]

calculating weights



  0%|          | 0/28 [00:00<?, ?it/s][A
  4%|▎         | 1/28 [00:01<00:40,  1.51s/it][A
  7%|▋         | 2/28 [00:01<00:22,  1.17it/s][A
 11%|█         | 3/28 [00:02<00:15,  1.61it/s][A
 14%|█▍        | 4/28 [00:02<00:12,  1.95it/s][A
 18%|█▊        | 5/28 [00:02<00:10,  2.21it/s][A
 21%|██▏       | 6/28 [00:03<00:09,  2.40it/s][A
 25%|██▌       | 7/28 [00:03<00:08,  2.55it/s][A
 29%|██▊       | 8/28 [00:03<00:07,  2.67it/s][A
 32%|███▏      | 9/28 [00:04<00:06,  2.73it/s][A
 36%|███▌      | 10/28 [00:04<00:06,  2.79it/s][A
 39%|███▉      | 11/28 [00:04<00:05,  2.86it/s][A
 43%|████▎     | 12/28 [00:05<00:05,  2.88it/s][A
 46%|████▋     | 13/28 [00:05<00:05,  2.90it/s][A
 50%|█████     | 14/28 [00:06<00:04,  2.92it/s][A
 54%|█████▎    | 15/28 [00:06<00:04,  2.89it/s][A
 57%|█████▋    | 16/28 [00:06<00:04,  2.88it/s][A
 61%|██████    | 17/28 [00:07<00:03,  2.89it/s][A
 64%|██████▍   | 18/28 [00:07<00:03,  2.90it/s][A
 68%|██████▊   | 19/28 [00:07<00:03,  2.90it/s]

calculating weights



  0%|          | 0/28 [00:00<?, ?it/s][A
  4%|▎         | 1/28 [00:00<00:09,  2.86it/s][A
  7%|▋         | 2/28 [00:00<00:08,  2.93it/s][A
 11%|█         | 3/28 [00:01<00:08,  2.98it/s][A
 14%|█▍        | 4/28 [00:01<00:08,  2.93it/s][A
 18%|█▊        | 5/28 [00:01<00:07,  2.93it/s][A
 21%|██▏       | 6/28 [00:02<00:07,  2.96it/s][A
 25%|██▌       | 7/28 [00:02<00:07,  2.95it/s][A
 29%|██▊       | 8/28 [00:02<00:06,  2.90it/s][A
 32%|███▏      | 9/28 [00:03<00:06,  2.92it/s][A
 36%|███▌      | 10/28 [00:03<00:06,  2.91it/s][A
 39%|███▉      | 11/28 [00:03<00:05,  2.93it/s][A
 43%|████▎     | 12/28 [00:04<00:05,  2.90it/s][A
 46%|████▋     | 13/28 [00:04<00:05,  2.90it/s][A
 50%|█████     | 14/28 [00:04<00:04,  2.88it/s][A
 54%|█████▎    | 15/28 [00:05<00:04,  2.88it/s][A
 57%|█████▋    | 16/28 [00:05<00:04,  2.89it/s][A
 61%|██████    | 17/28 [00:05<00:03,  2.87it/s][A
 64%|██████▍   | 18/28 [00:06<00:03,  2.88it/s][A
 68%|██████▊   | 19/28 [00:06<00:03,  2.85it/s]

In [None]:
# Pick the trial with the smallest WRMSSE (first one on ties) and report its settings.
lowest_wrmsse, best_max_depth, best_n_est = min(
    zip(wrmsse_scores, max_depth_list, n_est_list), key=lambda trial: trial[0]
)
print('Lowest WRMSSE=', lowest_wrmsse)
print('Lowest WRMSSE achieved on max_depth=',best_max_depth)
print('Lowest WRMSSE achieved on n_estimators=',best_n_est)

Lowest WRMSSE= 0.7107109606288111
Lowest WRMSSE achieved on max_depth= 29
Lowest WRMSSE achieved on n_estimators= 46


In [None]:
train_start = 1200
train_end = 1885

# Evaluate the model built from the best sampled hyperparameters on the later window.
# The estimator passed below must be best_RFregressor: `regressor` is merely the last
# instance created by the search loop, not the best one.
best_RFregressor = RandomForestRegressor(n_estimators=best_n_est, max_depth=best_max_depth, random_state=42)
concat_RFdf, preds_RFdf, wrmsse_RF = evaluate_model_instance(best_RFregressor, train_start, train_end, df_sales, df_cal, df_price, level_groupings, return_dfs=True)

print("WRMSSE Score (RandomForest):", wrmsse_RF)

calculating weights


100%|██████████| 28/28 [00:09<00:00,  2.95it/s]
100%|██████████| 10/10 [00:13<00:00,  1.32s/it]


WRMSSE Score (RandomForest): 0.8438456480026252


  del sys.path[0]


# 6. Summary

In [None]:
from prettytable import PrettyTable

# Build the table from the scores computed in section 5 instead of hand-copied numbers,
# so the summary always matches the latest run. Scores are rounded to two decimals.
summary_table = PrettyTable()

summary_table.field_names = ["Model", "WRMSSE"]
summary_table.add_rows(
    [
        ["ExtraTrees", round(wrmsse_ET, 2)],
        ["DecisionTrees", round(wrmsse_DT, 2)],
        ["RandomForest", round(wrmsse_RF, 2)],
    ]
)

print(summary_table)

+---------------+--------+
|     Model     | WRMSSE |
+---------------+--------+
|   ExtraTrees  |  0.78  |
| DecisionTrees |  0.85  |
|  RandomForest |  0.84  |
+---------------+--------+


The ExtraTrees model achieves the lowest WRMSSE of the three (lower is better), so it performs best on this validation window. 

# 7. References

- code reference - https://youtu.be/tMtqHZk6zUE