In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [13]:
store = pd.read_csv('data/store.csv')
train = pd.read_csv('data/train.csv')

  train = pd.read_csv('data/train.csv')


In [14]:

df = train.merge(store, on='Store')
df['Date'] = pd.to_datetime(df['Date'])
if 'Sales' in df.columns:
    df = df.loc[df['Sales'] > 0] # Only keep days when sales happened

In [15]:
df.dropna(inplace=True, subset=['Sales', 'Promo', 'StateHoliday', 'SchoolHoliday', 'DayOfWeek'])
df.drop(columns=['Customers', 'Open'], inplace=True)
df.shape

(440048, 16)

In [16]:
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('Sales', axis=1),
    df['Sales'],
    test_size=0.2,
    random_state=42
)

In [18]:
df.isna().sum()

Date                              0
Store                             0
DayOfWeek                         0
Sales                             0
Promo                             0
StateHoliday                      0
SchoolHoliday                     0
StoreType                         0
Assortment                        0
CompetitionDistance            1145
CompetitionOpenSinceMonth    139794
CompetitionOpenSinceYear     139794
Promo2                            0
Promo2SinceWeek              217001
Promo2SinceYear              217001
PromoInterval                217001
Year                              0
Month                             0
dtype: int64

In [19]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

class MultipleMeanEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns_arrays):
        self.columns_arrays = columns_arrays
        self.means = []

    def fit(self, X, y):
        X = X.merge(y, left_index=True, right_index=True)
        for cols in self.columns_arrays:
            mean = X.groupby(cols, dropna=False)['Sales'].mean().rename(''.join(cols) + 'Mean')
            self.means.append(mean)
        return self
    
    def transform(self, X):
        X = X.copy()
        for i, cols in enumerate(self.columns_arrays):
            X = X.merge(self.means[i], on=cols)
        return X

class MeanEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        self.means = {}

    def fit(self, X, y):
        X = X.merge(y, left_index=True, right_index=True)
        for col in self.columns:
            self.means[col] = X.groupby(col, dropna=False)['Sales'].mean().rename(col + 'Mean')
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            X = X.merge(self.means[col], on=col)
        return X

class ColumnSelection(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.columns]

class ColumnDrop(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.drop(columns=self.columns)

In [20]:
from sklearn.ensemble import RandomForestRegressor

mean_columns = [
    'DayOfWeek',
    'Month',
    # 'Year',
    # 'Assortment',
    'StoreType',
    'StateHoliday',
    # 'SchoolHoliday',
    # 'Promo2',
    # 'CompetitionOpenSinceMonth',
    # 'CompetitionOpenSinceYear',
]
multiple_mean_columns = [['Promo', 'Store']]
multiple_mean_columns_flatten = [col for arr in multiple_mean_columns for col in arr]
all_columns = mean_columns + multiple_mean_columns_flatten

preprocessor = Pipeline([
    ('column_selection', ColumnSelection(all_columns)),
    ('multiple_mean_encoder', MultipleMeanEncoder(multiple_mean_columns)),
    ('mean_encoder', MeanEncoder(mean_columns)),
    ('column_drop', ColumnDrop(mean_columns + multiple_mean_columns_flatten))
])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('rf', RandomForestRegressor(n_estimators=100, min_samples_leaf=4, min_samples_split=8)),
])

In [21]:
pipeline.fit(X_train, y_train)

In [22]:
def metric(preds, actuals):
    assert preds.shape == actuals.shape
    return 100 * np.linalg.norm((actuals - preds) / actuals) / np.sqrt(preds.shape[0])

In [23]:
y_pred = pipeline.predict(X_test)

In [24]:
metric(y_pred, y_test)

64.01342760105412

Biggest fails, difference:
- Specific Date: 11/02/2013 03/03/2014 + surrounding days

In [25]:
import pickle

In [26]:
# open a file, where you ant to store the data
file = open('models/final_model', 'wb')

# dump information to that file
pickle.dump(pipeline, file)

# close the file
file.close()