# Evaluate the model on the `test set`

In [1]:
from joblib import load
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

Enter the name of the file that contains the `test set` (the file must be located in the `data` folder).

In [2]:
# Name of the csv file to evaluate on; it is looked up inside the `data` folder.
# NOTE(review): the instructions above ask for the test-set file, but this value
# points at "train.csv" — confirm this is the intended evaluation file.
filename = "train.csv"

# Load the test set

In [3]:
# Load the dataset
def load_ds(path: Path, filename: str) -> pd.DataFrame:
    """Load the csv file ``filename`` located in the directory ``path``.

    Parameters
    ----------
    path : Path
        Directory that contains the csv file.
    filename : str
        Name of the csv file inside ``path``.

    Returns
    -------
    pd.DataFrame
        The file content as parsed by ``pd.read_csv`` with default options.
    """
    csv_file = path / filename
    frame = pd.read_csv(csv_file)
    return frame

# The csv files live in the `data` folder next to the current working directory
dataset_path = Path().absolute() / "data"
# Features and target are still in the same frame at this point
X_y_test = load_ds(path=dataset_path, filename=filename)

print(f"Shape: {X_y_test.shape}")

Shape: (637774, 9)


  return pd.read_csv(path / filename)


In [4]:
# Per-store attributes; read through the same helper and folder as the main
# dataset so that both files are always resolved against `dataset_path`.
store = load_ds(dataset_path, "store.csv")

In [5]:
# Attach the store attributes to every row (join on `Store`), then parse `Date`
df = (
    X_y_test
    .merge(store, on='Store')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)

In [6]:
# Build a new frame instead of mutating `df` in place: the cell can then be
# re-run safely (the in-place version raised a KeyError on the second run
# because `Customers` and `Open` had already been removed).
df = (
    df.dropna(subset=['Promo', 'StateHoliday', 'SchoolHoliday', 'DayOfWeek'])
    .loc[lambda frame: frame['Sales'] > 0]  # Only keep days when sales happened
    .drop(columns=['Customers', 'Open'], errors='ignore')  # tolerate files without these columns
)
df.shape

(440048, 16)

In [7]:
# Calendar features derived from the parsed `Date` column
df = df.assign(
    Year=df['Date'].dt.year,
    Month=df['Date'].dt.month,
)

In [8]:
# Split the cleaned frame into the feature matrix and the `Sales` target
X_test = df.drop(["Sales"], axis=1)
y_test = df.loc[:, "Sales"].copy()

# The labels name the variables that are actually printed (the evaluation split)
print(f"shape X_test: {X_test.shape}")
print(f"shape y_test: {y_test.shape}")

shape X_train: (440048, 17)
shape y_train: (440048,)


In [9]:
# Column dtypes and non-null counts of the cleaned frame (target column still included)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 440048 entries, 1 to 618470
Data columns (total 18 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   Date                       440048 non-null  datetime64[ns]
 1   Store                      440048 non-null  float64       
 2   DayOfWeek                  440048 non-null  float64       
 3   Sales                      440048 non-null  float64       
 4   Promo                      440048 non-null  float64       
 5   StateHoliday               440048 non-null  object        
 6   SchoolHoliday              440048 non-null  float64       
 7   StoreType                  440048 non-null  object        
 8   Assortment                 440048 non-null  object        
 9   CompetitionDistance        438903 non-null  float64       
 10  CompetitionOpenSinceMonth  300254 non-null  float64       
 11  CompetitionOpenSinceYear   300254 non-null  float64      

# Pipeline

In [10]:
class MultipleMeanEncoder(BaseEstimator, TransformerMixin):
    """Target-mean encoder for combinations of columns.

    For every list of column names in ``columns_arrays`` the mean of ``Sales``
    per distinct combination of values is learned in ``fit``. ``transform``
    appends one column per list, named ``''.join(cols) + 'Mean'``.

    Parameters
    ----------
    columns_arrays : list of list of str
        Each inner list names the columns whose value combination is encoded.
    """

    def __init__(self, columns_arrays):
        self.columns_arrays = columns_arrays
        # Attribute name and initial value are kept so that objects created
        # before this revision expose the same state.
        self.means = []

    def fit(self, X, y):
        """Learn the mean of ``Sales`` for every configured column combination.

        ``y`` is joined to ``X`` on the index, so both must share the same
        index labels, and ``y`` must provide a column named ``Sales``.

        Returns
        -------
        self
        """
        # Start from a clean slate: appending to the result of a previous fit
        # would leave transform() reading the stale means at the front.
        self.means = []
        X = X.merge(y, left_index=True, right_index=True)
        for cols in self.columns_arrays:
            mean = X.groupby(cols, dropna=False)['Sales'].mean().rename(''.join(cols) + 'Mean')
            self.means.append(mean)
        return self

    def transform(self, X):
        """Append the learned mean columns to a copy of ``X``.

        The number of rows, their order and the index of ``X`` are preserved.
        Combinations that were not seen during ``fit`` get ``NaN`` in the
        corresponding mean column instead of having their row dropped.
        """
        original_index = X.index
        X = X.copy()
        for cols, mean in zip(self.columns_arrays, self.means):
            # Left join on plain columns: every input row is kept, in input
            # order. The group keys are unique, so no row is duplicated.
            X = X.merge(mean.reset_index(), how='left', on=cols)
        # merge() hands back a fresh RangeIndex; restore the caller's labels so
        # later index-based joins (e.g. with y) still line up.
        X.index = original_index
        return X

class MeanEncoder(BaseEstimator, TransformerMixin):
    """Target-mean encoder for individual columns.

    For every column in ``columns`` the mean of ``Sales`` per distinct value
    is learned in ``fit``. ``transform`` appends one column per encoded
    column, named ``col + 'Mean'``.

    Parameters
    ----------
    columns : list of str
        Names of the columns to encode.
    """

    def __init__(self, columns):
        self.columns = columns
        # Attribute name and initial value are kept so that objects created
        # before this revision expose the same state.
        self.means = {}

    def fit(self, X, y):
        """Learn the mean of ``Sales`` for every value of every configured column.

        ``y`` is joined to ``X`` on the index, so both must share the same
        index labels, and ``y`` must provide a column named ``Sales``.

        Returns
        -------
        self
        """
        # Drop whatever an earlier fit stored so no stale entry survives.
        self.means = {}
        X = X.merge(y, left_index=True, right_index=True)
        for col in self.columns:
            self.means[col] = X.groupby(col, dropna=False)['Sales'].mean().rename(col + 'Mean')
        return self

    def transform(self, X):
        """Append the learned mean columns to a copy of ``X``.

        The number of rows, their order and the index of ``X`` are preserved.
        Values that were not seen during ``fit`` get ``NaN`` in the
        corresponding mean column instead of having their row dropped.
        """
        original_index = X.index
        X = X.copy()
        for col in self.columns:
            # Left join on a plain column: every input row is kept, in input
            # order. The group keys are unique, so no row is duplicated.
            X = X.merge(self.means[col].reset_index(), how='left', on=col)
        # merge() hands back a fresh RangeIndex; restore the caller's labels so
        # later index-based joins (e.g. with y) still line up.
        X.index = original_index
        return X

class ColumnSelection(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the configured columns.

    Parameters
    ----------
    columns : list of str
        Names of the columns to keep.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by ``self.columns``."""
        wanted = self.columns
        selected = X[wanted]
        return selected

class ColumnDrop(BaseEstimator, TransformerMixin):
    """Pipeline step that removes the configured columns.

    Parameters
    ----------
    columns : list of str
        Names of the columns to remove.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return a new frame without the columns listed in ``self.columns``."""
        unwanted = self.columns
        remaining = X.drop(unwanted, axis=1)
        return remaining

# Load the model

In [11]:
# joblib.load unpickles the file, which can execute arbitrary code: only load
# model files that come from a trusted source.
# NOTE(review): presumably the stored model references the transformer classes
# defined in the Pipeline section, in which case that cell must run first — confirm.
mod = load("models/final_model")

In [12]:
# Predict with the loaded model on the feature frame built above
y_pred = mod.predict(X_test)

In [13]:
# Display the predictions
y_pred

array([7046.60034115, 7046.60034115, 7046.60034115, ..., 6434.05996083,
       6552.85607336, 6106.47552753])

In [14]:
# Display the true `Sales` values as a NumPy array, for a visual comparison with the predictions
np.array(y_test)

array([3697., 4297., 4540., ..., 3335., 4724., 5398.])

In [15]:
def metric(preds, actuals):
    """Root mean square percentage error (RMSPE), expressed in percent.

    Computes ``100 * sqrt(mean(((actuals - preds) / actuals) ** 2))``.

    Parameters
    ----------
    preds : array-like
        Predicted values.
    actuals : array-like
        True values, same shape as ``preds``. Entries equal to 0 are ignored
        because the percentage error is undefined for them.

    Returns
    -------
    float
        The RMSPE over the entries with a non-zero actual value, or ``nan``
        when there is no such entry.

    Raises
    ------
    AssertionError
        If ``preds`` and ``actuals`` do not have the same shape.
    """
    # Plain float arrays: list inputs work, and Series inputs are compared by
    # position instead of being re-aligned on their index.
    preds = np.asarray(preds, dtype=float)
    actuals = np.asarray(actuals, dtype=float)
    assert preds.shape == actuals.shape, (
        f"shape mismatch: preds {preds.shape} vs actuals {actuals.shape}"
    )

    # Inputs are treated as flat vectors of observations.
    preds = preds.ravel()
    actuals = actuals.ravel()

    # A zero target would make the relative error infinite: leave it out.
    scored = actuals != 0
    if not scored.any():
        return np.float64(np.nan)

    relative_error = (actuals[scored] - preds[scored]) / actuals[scored]
    return 100 * np.linalg.norm(relative_error) / np.sqrt(relative_error.shape[0])

In [16]:
# Score the predictions against the true `Sales` values.
# NOTE(review): the two arrays are compared position by position; this assumes
# mod.predict returns one prediction per row of X_test, in the same order — confirm.
metric(y_pred, np.array(y_test))

64.42929981686805