# Evaluate the model on the `test set`

In [1]:
from joblib import load
from pathlib import Path
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.pipeline import Pipeline
# from sklearn.compose import ColumnTransformer
# from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.linear_model import LinearRegression
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import cross_val_score
# from sklearn.metrics import mean_squared_error
# from sklearn.model_selection import GridSearchCV

Enter the name of the file that contains the `test set` (the file must be located in the `data` folder).

In [2]:
# Name of the CSV file, inside the `data` folder, that is evaluated below.
# NOTE(review): this is "train.csv" although the notebook evaluates the
# test set -- confirm this is the intended file.
filename = "train.csv"

# Load the test set

In [3]:
# Load the dataset
def load_ds(path: Path, filename: str) -> pd.DataFrame:
    """Load the CSV file ``filename`` found in folder ``path``.

    Parameters
    ----------
    path : Path
        Folder that contains the file.
    filename : str
        Name of the CSV file, including its extension.

    Returns
    -------
    pd.DataFrame
        The parsed content of the file.
    """
    csv_file = path / filename
    frame = pd.read_csv(csv_file)
    return frame

# The data folder sits next to the notebook: resolve it from the current
# working directory, then read the file chosen above.
dataset_path = Path().absolute().joinpath("data")
X_y_test = load_ds(path=dataset_path, filename=filename)

# Report (rows, columns) of the raw file.
print(f"Shape: {X_y_test.shape}")

Shape: (637774, 9)


  return pd.read_csv(path / filename)


In [4]:
# Per-store attributes, read from the `data` folder relative to the
# current working directory.
store = pd.read_csv(Path("data") / "store.csv")

In [5]:
# Attach the store attributes to every row; an inner join keeps only the
# rows whose "Store" value appears in both frames.
X_y_test = pd.merge(X_y_test, store, how="inner", on="Store")

In [6]:
# Keep only rows with a usable target: "Sales" must be present and
# different from zero (the metric below divides by the actual sales).
X_y_test = X_y_test.loc[
    ~X_y_test["Sales"].isna() & (X_y_test["Sales"] != 0), :
]

In [7]:
# Separate the features from the target. The target is copied so that it
# does not share data with X_y_test.
X_test = X_y_test.drop(columns=["Sales"])
y_test = X_y_test.loc[:, "Sales"].copy()

# The labels name the variables actually printed (they previously said
# "X_train" / "y_train", which was misleading in this evaluation notebook).
print(f"shape X_test: {X_test.shape}")
print(f"shape y_test: {y_test.shape}")

shape X_train: (497376, 17)
shape y_train: (497376,)


# Pipeline

In [8]:

# TransformerMixin: adds the method ".fit_transform()"
# BaseEstimator: adds the methods ".get_params()" and ".set_params()"
# A transformer needs 3 methods:
# 1) .fit()
# 2) .transform()
# 3) .fit_transform() (provided by "TransformerMixin")
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Add mean-encoded features and a binned competition distance.

    ``fit`` learns, from a frame that contains the target column "Sales",
    the mean of "Sales" per "Month", per "Store", per "DayOfWeek" and per
    ("Promo", "Store") pair. ``transform`` left-joins those means onto its
    input, adds a binned version of "CompetitionDistance" and drops the raw
    columns (and "Sales" when it is present).

    ``TransformerMixin`` provides ``fit_transform``; ``BaseEstimator``
    provides ``get_params`` and ``set_params``.
    """

    # avoid "*args" or "**kwargs" in "__init__"
    def __init__(self):
        # Lookup tables filled by fit(): one row per category (or pair of
        # categories) holding the mean of "Sales".
        self.mean_Month = pd.DataFrame()
        self.mean_Store = pd.DataFrame()
        self.mean_DayOfWeek = pd.DataFrame()
        self.mean_Promo_Store = pd.DataFrame()

    # fit is needed later for the pipeline
    def fit(self, X, y=None):
        """Learn the mean-encoding lookup tables from ``X``.

        Parameters
        ----------
        X : pd.DataFrame
            Must contain "Date" (formatted as "%Y-%m-%d"), "Store",
            "DayOfWeek", "Promo" and the target column "Sales".
        y : ignored
            Not used; the target is read from ``X["Sales"]``.

        Returns
        -------
        CombinedAttributesAdder
            ``self``, with the four ``mean_*`` tables filled.
        """
        # ``assign`` returns a new frame, so the caller's frame does not
        # silently gain a "Month" column.
        X = X.assign(Month=self._month_of(X["Date"]))

        self.mean_Month = self.mean_encode(X, "Month", "Sales")
        self.mean_Store = self.mean_encode(X, "Store", "Sales")
        self.mean_DayOfWeek = self.mean_encode(X, "DayOfWeek", "Sales")
        # Promo (separately for each Store)
        self.mean_Promo_Store = self.mean_encode_2(X, "Promo", "Store", "Sales")

        return self

    def transform(self, X):
        """Return the engineered feature frame for ``X``.

        Parameters
        ----------
        X : pd.DataFrame
            Must contain every column read or dropped below; "Sales" is
            optional and is removed when present.

        Returns
        -------
        pd.DataFrame
            A new frame (the input is left untouched) with the columns
            "Month_mean", "Store_mean", "DayOfWeek_mean", "PromoStore_mean"
            and "CD_clip_bins_clip" added and the raw columns listed in
            ``cols_to_drop`` removed. Categories that were not seen by
            ``fit`` get NaN in the corresponding ``*_mean`` column because
            the lookups are left joins.
        """
        # Date -> month; work on a new frame so the caller's data is not mutated.
        X = X.assign(Month=self._month_of(X["Date"]))

        # Left-join every lookup table learned in fit().
        X = pd.merge(X, self.mean_Month, how="left", on="Month")
        X = pd.merge(X, self.mean_Store, how="left", on="Store")
        X = pd.merge(X, self.mean_DayOfWeek, how="left", on="DayOfWeek")
        X = pd.merge(X, self.mean_Promo_Store, how="left", on=["Promo", "Store"])

        # SchoolHoliday: replace the string "0" by the number 0.0 in that
        # column only. The previous ``X.loc[mask, :] = 0.0`` overwrote every
        # column of the matching rows.
        X.loc[X["SchoolHoliday"] == "0", "SchoolHoliday"] = 0.0

        # StoreType, Assortment, Promo2: kept without transformation.

        # CompetitionDistance: cap at ``clip_upper`` and cut into ``nb``
        # equal-width bins labelled 0 .. nb - 1.
        # NOTE(review): the bin edges are derived from the data passed to
        # this call, not from the data seen in fit() -- confirm this is
        # intended before changing it, since it alters the features.
        nb = 10  # number of bins
        clip_upper = 10000
        cd_clipped = X["CompetitionDistance"].clip(upper=clip_upper)
        cd_bins = pd.to_numeric(
            pd.cut(cd_clipped, bins=nb, labels=list(range(nb)))
        )
        # NOTE(review): the labels never exceed nb - 1, so this clip changes
        # nothing; it is kept so the column is produced exactly as before.
        X["CD_clip_bins_clip"] = cd_bins.clip(upper=clip_upper)

        # Drop the raw columns that were encoded above or are not used.
        cols_to_drop = [
            "Date", "Month", "Store", "DayOfWeek", "Customers", "Open", "Promo",
            "StateHoliday", "CompetitionDistance",
            "CompetitionOpenSinceMonth", "CompetitionOpenSinceYear", "Promo2SinceWeek",
            "Promo2SinceYear", "PromoInterval"]
        X = X.drop(columns=cols_to_drop)

        # Drop the target when the caller passed it along with the features.
        if "Sales" in X.columns:
            X = X.drop(columns=["Sales"])

        return X

    def mean_encode(self, df: pd.DataFrame, feature: str, target: str) -> pd.DataFrame:
        """Return the mean of ``target`` for every category of ``feature``.

        Parameters
        ----------
        df : pd.DataFrame
            Frame with the ``feature`` and ``target`` columns.
        feature : str
            Column whose categories are encoded.
        target : str
            Column that is averaged.

        Returns
        -------
        pd.DataFrame
            Two columns: ``feature`` and ``feature + "_mean"``.
        """
        return self._group_mean(df, [feature], target, feature + "_mean")

    def mean_encode_2(
        self, df: pd.DataFrame, feature1: str, feature2: str, target: str
    ) -> pd.DataFrame:
        """Same as ``mean_encode`` but grouped by a pair of features.

        Parameters
        ----------
        df : pd.DataFrame
            Frame with the ``feature1``, ``feature2`` and ``target`` columns.
        feature1, feature2 : str
            Columns whose combinations are encoded.
        target : str
            Column that is averaged.

        Returns
        -------
        pd.DataFrame
            Three columns: ``feature1``, ``feature2`` and
            ``feature1 + feature2 + "_mean"``.
        """
        return self._group_mean(
            df, [feature1, feature2], target, feature1 + feature2 + "_mean"
        )

    @staticmethod
    def _month_of(dates: pd.Series) -> pd.Series:
        """Parse "%Y-%m-%d" date strings and return their month number."""
        return pd.to_datetime(dates, format="%Y-%m-%d").dt.month

    @staticmethod
    def _group_mean(
        df: pd.DataFrame, keys: list, target: str, new_col_name: str
    ) -> pd.DataFrame:
        """Average ``target`` per combination of ``keys``; keys become columns."""
        return (
            # select only the columns that are needed
            df.loc[:, [*keys, target]]
            .groupby(keys)
            # the string "mean" replaces ``np.mean``: recent pandas versions
            # warn when a NumPy callable is passed to ``agg``
            .agg(**{new_col_name: (target, "mean")})
            # group keys back as regular columns
            .reset_index()
        )

     

# Load the model

In [9]:
# Load the persisted model from the `models` folder.
# NOTE(review): joblib.load unpickles the file, so only load model files
# that come from a trusted source.
# NOTE(review): presumably the stored object references
# CombinedAttributesAdder, which would be why the class is declared in
# this notebook before loading -- confirm.
mod = load("models/random_forest_final")

# Make predictions

In [10]:
# Predict the target for every row of the test features.
y_pred = mod.predict(X_test)

In [11]:
def metric(preds, actuals):
    """Return 100 * ||(actuals - preds) / actuals|| / sqrt(n).

    ``n`` is the length of the first axis of ``preds``. For 1-D inputs this
    is the root mean square of the relative errors, expressed in percent.

    Parameters
    ----------
    preds : array with a ``shape`` attribute
        Predicted values.
    actuals : array with a ``shape`` attribute
        Observed values; must have the same shape as ``preds``.

    Returns
    -------
    float
        The error in percent.
    """
    assert preds.shape == actuals.shape
    relative_error = (actuals - preds) / actuals
    n_obs = preds.shape[0]
    # Scale before dividing, in the same order as a single left-to-right
    # expression, so the floating-point result is unchanged.
    scaled_norm = 100 * np.linalg.norm(relative_error)
    return scaled_norm / np.sqrt(n_obs)

In [12]:
# Score the predictions against the observed sales; the target Series is
# converted to a NumPy array first.
metric(preds=y_pred, actuals=np.array(y_test))

23.251948432052366