# Evaluate the model on the `test set`

In [1]:
from joblib import load
from pathlib import Path
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# Load the test set

In [8]:
# Load the dataset
def load_ds(path: Path, filename: str) -> pd.DataFrame:
    """Load a CSV dataset into a pandas DataFrame.

    Args:
        path: Directory that contains the file.
        filename: Name of the CSV file inside ``path``.

    Returns:
        The parsed contents of ``path / filename``.
    """
    csv_file = path / filename
    frame = pd.read_csv(csv_file)
    return frame

# Read the held-out split from the "data" folder of the working directory
filename = "X_y_test.csv"
dataset_path = Path.cwd() / "data"
X_y_test = load_ds(path=dataset_path, filename=filename)

# Report (rows, columns) as a quick sanity check
print(f"Shape: {X_y_test.shape}")

Shape: (99476, 18)


In [23]:
# "Sales" stays inside X_test on purpose: CombinedAttributesAdder() needs it
# for the mean encoding and drops it itself.
# Take an independent copy (not an alias) so that any in-place change made
# while transforming X_test cannot leak into X_y_test, and this cell can be
# re-run safely.
X_test = X_y_test.copy()
y_test = X_y_test.loc[:, "Sales"].copy()

# Pipeline

In [10]:

# TransformerMixin: adds the method ".fit_transform()"
# BaseEstimator: adds the methods ".get_params()" and ".set_params()"
# A custom transformer therefore needs 3 methods:
# 1) .fit()
# 2) .transform()
# 3) .fit_transform() (provided by "TransformerMixin")
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Feature-engineering step that turns the raw sales frame into model inputs.

    ``transform`` mean-encodes "Month", "Store", "DayOfWeek" and the per-store
    "Promo" against the target "Sales", bins a clipped "CompetitionDistance",
    and finally drops the raw columns and the target itself.

    The helpers ``mean_encode`` and ``mean_encode_2`` are not defined in this
    class: they must already exist at module level (defined or imported)
    before ``transform`` is called, otherwise it raises ``NameError``.
    """

    # avoid "*args" or "**kwargs" in "__init__"
    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn nothing and return ``self``; needed later for the pipeline."""
        return self

    def transform(self, X):
        """Return the engineered feature frame built from ``X``.

        Args:
            X: DataFrame holding the raw columns used below, INCLUDING the
                target "Sales" (the mean encoding is computed from it).

        Returns:
            A new DataFrame without the raw/unused columns and without
            "Sales". The caller's ``X`` is left untouched.

        Raises:
            KeyError: If one of the expected columns is missing.
            NameError: If ``mean_encode`` / ``mean_encode_2`` are not defined.
        """
        # Work on a copy: the steps below add and drop columns, which must
        # not modify the frame owned by the caller.
        X = X.copy()

        # NOTE(review): the encodings are computed from whatever frame is
        # passed in, so on a test set they are presumably derived from the
        # test-set "Sales" - confirm this is intended (possible leakage).

        # Date -> Month, then mean-encode Month
        X["Month"] = pd.to_datetime(X["Date"], format="%Y-%m-%d").dt.month
        X = mean_encode(X, "Month", "Sales")

        # Store
        X = mean_encode(X, "Store", "Sales")

        # DayOfWeek
        X = mean_encode(X, "DayOfWeek", "Sales")

        # Promo (separately for each Store)
        X = mean_encode_2(X, "Promo", "Store", "Sales")

        # SchoolHoliday: replace the string "0" by the number 0.0.
        # Only the "SchoolHoliday" column is written; selecting every column
        # (":") would zero out the whole row, features and target included.
        X.loc[X["SchoolHoliday"] == "0", "SchoolHoliday"] = 0.0

        # StoreType, Assortment, Promo2: kept, no transformation

        # CompetitionDistance: clip the long tail, then cut into "nb" bins.
        # With an integer "bins", pd.cut derives equal-width bins from the
        # range of the values it is given.
        nb = 10  # number of bins
        clip_upper = 10000
        X["CD_clip"] = X["CompetitionDistance"].clip(upper=clip_upper)
        CD_clip_bins = pd.cut(X["CD_clip"], bins=nb, labels=list(range(nb)))
        X["CD_clip_bins"] = pd.to_numeric(CD_clip_bins)
        # The bin labels are 0..nb-1, so this clip leaves them unchanged; it
        # is kept so that the output column name stays the same.
        X["CD_clip_bins_clip"] = X["CD_clip_bins"].clip(upper=clip_upper)

        # Drop the raw/unused columns and, last, the target
        cols_to_drop = [
            "Date", "Month", "Store", "DayOfWeek", "Customers", "Open", "Promo",
            "StateHoliday", "CompetitionDistance", "CD_clip", "CD_clip_bins",
            "CompetitionOpenSinceMonth", "CompetitionOpenSinceYear", "Promo2SinceWeek",
            "Promo2SinceYear", "PromoInterval"]
        target_to_drop = ["Sales"]
        X = X.drop(columns=cols_to_drop + target_to_drop)

        return X

# Load the model

In [20]:
# Read the saved model object from disk with joblib's load().
# Presumably a fitted GridSearchCV, given the later use of "best_estimator_".
mod = load("models/GridSearch_2")

In [18]:
import pickle

# Read the stored model with the standard pickle module.
# NOTE(security): unpickling can execute arbitrary code contained in the
# file; only load model files that come from a trusted source.
# The "with" block closes the file even if unpickling raises.
with open("models/GridSearch_2", "rb") as file:
    mod = pickle.load(file)

# Make predictions

In [21]:
# GridSearchCV has no ".prediction()" method; the method is ".predict()".
# Predict on the test set: it is the only split loaded in this notebook
# ("X_train" is never defined here).
mod.predict(X_test)

In [24]:
# Predict on the test set with the search's "best_estimator_".
# NOTE(review): "mean_encode" (and "mean_encode_2") must be defined in this
# notebook before this call - CombinedAttributesAdder.transform() uses them.
mod.best_estimator_.predict(X_test)

NameError: name 'mean_encode' is not defined