In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
store = pd.read_csv('data/store.csv')
train = pd.read_csv('data/train.csv')

  train = pd.read_csv('data/train.csv')


In [3]:

df = train.merge(store, on='Store')
df['Date'] = pd.to_datetime(df['Date'])
if 'Sales' in df.columns:
    df = df.loc[df['Sales'] > 0] # Only keep days when sales happened

In [4]:
df.dropna(inplace=True, subset=['Sales', 'Promo', 'StateHoliday', 'SchoolHoliday', 'DayOfWeek'])
df.drop(columns=['Customers', 'Open'], inplace=True)
df.shape

(440048, 16)

In [5]:
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month

In [135]:
df.groupby('Month')['Sales'].mean()

Month
1     6398.490721
2     6558.403736
3     6927.117868
4     6891.002368
5     7008.660195
6     6835.139902
7     6919.412084
8     6593.521809
9     6366.471611
10    6476.182723
11    6901.433270
12    8593.712216
Name: Sales, dtype: float64

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('Sales', axis=1),
    df['Sales'],
    test_size=0.2,
    random_state=42
)

In [7]:
df.isna().sum()

Date                              0
Store                             0
DayOfWeek                         0
Sales                             0
Promo                             0
StateHoliday                      0
SchoolHoliday                     0
StoreType                         0
Assortment                        0
CompetitionDistance            1145
CompetitionOpenSinceMonth    139794
CompetitionOpenSinceYear     139794
Promo2                            0
Promo2SinceWeek              217001
Promo2SinceYear              217001
PromoInterval                217001
Year                              0
Month                             0
dtype: int64

In [213]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

class MultipleMeanEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns_arrays):
        self.columns_arrays = columns_arrays
        self.means = []

    def fit(self, X, y):
        X = X.merge(y, left_index=True, right_index=True)
        for cols in self.columns_arrays:
            mean = X.groupby(cols, dropna=False)['Sales'].mean().rename(''.join(cols) + 'Mean')
            self.means.append(mean)
        return self
    
    def transform(self, X):
        for i, cols in enumerate(self.columns_arrays):
            X = X.reset_index().merge(self.means[i], how="left", on=cols).set_index('index')
        return X

class MeanEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        self.means = {}

    def fit(self, X, y):
        X = X.merge(y, left_index=True, right_index=True)
        for col in self.columns:
            self.means[col] = X.groupby(col, dropna=False)['Sales'].mean().rename(col + 'Mean')
        return self

    def transform(self, X):
        for col in self.columns:
            X = X.reset_index().merge(self.means[col], how="left", on=col).set_index('index')
        return X

class ColumnSelection(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.columns]

class ColumnDrop(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.drop(columns=self.columns)

In [214]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

mean_columns = [
    'DayOfWeek',
    'Month',
    # 'Year',
    # 'Assortment',
    'StoreType',
    'StateHoliday',
    'SchoolHoliday',
    'Promo2',
    # 'CompetitionOpenSinceMonth',
    # 'CompetitionOpenSinceYear',
]
multiple_mean_columns = [['Promo', 'Store']]
multiple_mean_columns_flatten = [col for arr in multiple_mean_columns for col in arr]
all_columns = mean_columns + multiple_mean_columns_flatten

preprocessor = Pipeline([
    ('column_selection', ColumnSelection(all_columns)),
    ('multiple_mean_encoder', MultipleMeanEncoder(multiple_mean_columns)),
    ('mean_encoder', MeanEncoder(mean_columns)),
    ('column_drop', ColumnDrop(mean_columns + multiple_mean_columns_flatten))
])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # ('lin', LinearRegression()),
    ('rf', RandomForestRegressor(n_estimators=100, min_samples_leaf=16, min_samples_split=16, max_depth=20)),
])

In [215]:
X_transfrom = preprocessor.fit_transform(X_train, y_train)
X_transfrom.head()

Unnamed: 0_level_0,PromoStoreMean,DayOfWeekMean,MonthMean,StoreTypeMean,StateHolidayMean,SchoolHolidayMean,Promo2Mean
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
235760,8125.597701,6893.285887,6930.537308,6662.04246,6761.534741,6780.596565,6442.878192
373048,5303.385542,5784.096858,6904.56929,6837.627138,6841.079961,6780.596565,7236.639543
514991,8409.6,8083.938518,6827.907329,6662.04246,6841.079961,6780.596565,7236.639543
451638,8393.391534,5784.096858,6904.56929,6837.627138,6841.079961,6780.596565,7236.639543
86346,5129.898477,6893.285887,6578.053982,6837.627138,6841.079961,7067.378642,6442.878192


In [216]:
pipeline.fit(X_train, y_train)

In [217]:
def metric(preds, actuals):
    assert preds.shape == actuals.shape
    return 100 * np.linalg.norm((actuals - preds) / actuals) / np.sqrt(preds.shape[0])

In [218]:
y_pred = pipeline.predict(X_test)

In [208]:
metric(y_pred, y_test)

23.508982217759737

In [219]:
y_pred = pipeline.predict(X_train)
metric(y_pred, y_train)

27.04090193516522

Biggest fails, difference:
- Specific Date: 11/02/2013 03/03/2014 + surrounding days

In [18]:
import os
import pickle

if not os.path.exists('models'):
    os.makedirs('models')

# open a file, where you ant to store the data
file = open('models/final_model', 'wb')

# dump information to that file
pickle.dump(pipeline, file)

# close the file
file.close()