In [226]:
import pandas as pd

Firstly, we need to load the data.

In [227]:
train_data = pd.read_csv('../data/train.csv')
test_data = pd.read_csv('../data/test.csv')
oil = pd.read_csv('../data/oil.csv')
stores = pd.read_csv('../data/stores.csv')
transaction = pd.read_csv('../data/transactions.csv')
holidays = pd.read_csv('../data/holidays_events.csv')

Then we need to view the shape and the columns contained in each file / dataframe so we can better recognise how to merge the data, and what we can actually use.

In [228]:
print(f"Train data:\n {train_data.head()}")
print(f"Test data:\n {test_data.head()}")
print(f"Oil data:\n {oil.head()}")
print(f"Store data:\n {stores.head()}")
print(f"Transaction data:\n {transaction.head()}")
print(f"Holidays data:\n {holidays.head()}")

Train data:
    id        date  store_nbr      family  sales  onpromotion
0   0  2013-01-01          1  AUTOMOTIVE    0.0            0
1   1  2013-01-01          1   BABY CARE    0.0            0
2   2  2013-01-01          1      BEAUTY    0.0            0
3   3  2013-01-01          1   BEVERAGES    0.0            0
4   4  2013-01-01          1       BOOKS    0.0            0
Test data:
         id        date  store_nbr      family  onpromotion
0  3000888  2017-08-16          1  AUTOMOTIVE            0
1  3000889  2017-08-16          1   BABY CARE            0
2  3000890  2017-08-16          1      BEAUTY            2
3  3000891  2017-08-16          1   BEVERAGES           20
4  3000892  2017-08-16          1       BOOKS            0
Oil data:
          date  dcoilwtico
0  2013-01-01         NaN
1  2013-01-02       93.14
2  2013-01-03       92.97
3  2013-01-04       93.12
4  2013-01-07       93.20
Store data:
    store_nbr           city                           state type  cluster
0

So, the distinctive data we can use, so that we can get the best possible prediction results, is if we use as an index the store and the date where it is possible. We now need to merge the date and make sure we have a consistent time frame, so we want our input data to have a 1 day difference, between 2 data rows, we have to make sure our data is ordered by date.

In [229]:
train_data['date'] = pd.to_datetime(train_data['date'])
test_data['date'] = pd.to_datetime(test_data['date'])
oil['date'] = pd.to_datetime(oil['date'])
transaction['date'] = pd.to_datetime(transaction['date'])
holidays['date'] = pd.to_datetime(holidays['date'])

train_data = train_data.merge(
    oil, how='left', on='date'
)


train_data = train_data.merge(
    stores,
    how='left',
    on=['store_nbr']
)

test_data = test_data.merge(
    oil, how='left', on='date'
)

test_data = test_data.merge(
    stores,
    how='left',
    on=['store_nbr']
)

We did not merge all the available data yet. If we pay attention to the dataframes, we notice that the transaction data have no available data about the dates we want to predict the value of, if we just try to merge those tables, in the test data, we will just a get a column filled with nan / null values, the only thing this can do is throw off our predictions. The data we get from this dataset though is highly valuable, so we cannot just not use it. What we can actually do is work with lags and window frames. The logic for that is to actually match future dates with past values. For the lag data, we can agree on a consistent lag time frame, and just use the say average of the last 7 days as the prediction for the current date.

In [230]:
# âœ… CORRECT: Use transactions (known in advance) to predict sales
train_data = train_data.merge(transaction, how='left', on=['date', 'store_nbr'])
test_data = test_data.merge(transaction, how='left', on=['date', 'store_nbr'])

# Combine for lag feature creation
combined = pd.concat([train_data, test_data], ignore_index=True)
combined = combined.sort_values(['store_nbr', 'family', 'date'])

# Create transaction lags (NOT sales lags!)
combined['transactions_lag_7'] = (
    combined
    .groupby(['store_nbr', 'family'])['transactions']
    .shift(7)
)

combined['transactions_roll_mean_7'] = (
    combined
    .groupby(['store_nbr', 'family'])['transactions']
    .shift(1)
    .rolling(7, min_periods=1)
    .mean()
)

# Split back
train_data = combined[combined['sales'].notna()].copy()
test_data = combined[combined['sales'].isna()].copy()
combined['transactions_lag_7'] = combined.groupby(['store_nbr'])['transactions'].shift(7)


So, the last dataset we have to combine to our testing and training set, is the holiday dataset. To do that we have we have to create a column for each type of holiday and just assign true / false values according to the dataset. One thing we have to keep in mind is if the holiday was transferred, if it was then there is no reason for that info to be passed on the features.

In [231]:
national_holidays = holidays[(holidays['locale'] == 'National') & (holidays['transferred'] == False)]
regional_holidays = holidays[(holidays['locale'] == 'Regional') & (holidays['transferred'] == False)]
local_holidays = holidays[(holidays['locale'] == 'Local') & (holidays['transferred'] == False)]

test_data['is_national_holiday'] = test_data['date'].isin(national_holidays)
train_data['is_national_holiday'] = train_data['date'].isin(national_holidays)

test_data['is_regional_holiday'] = (
    test_data
    .merge(
        regional_holidays[['date', 'locale_name']].assign(is_regional_holiday=True),
        left_on=['date', 'state'],
        right_on=['date', 'locale_name'],
        how='left'
    )['is_regional_holiday']
    .fillna(False)
)
train_data['is_regional_holiday'] = (
    train_data
    .merge(
        regional_holidays[['date', 'locale_name']].assign(is_regional_holiday=True),
        left_on=['date', 'state'],
        right_on=['date', 'locale_name'],
        how='left'
    )['is_regional_holiday']
    .fillna(False)
)

test_data['is_local_holiday'] = (
    test_data
    .merge(
        local_holidays[['date', 'locale_name']].assign(is_local_holiday=True),
        left_on=['date', 'state'],
        right_on=['date', 'locale_name'],
        how='left'
    )['is_local_holiday']
    .fillna(False)
)
train_data['is_local_holiday'] = (
    train_data
    .merge(
        local_holidays[['date', 'locale_name']].assign(is_local_holiday=True),
        left_on=['date', 'state'],
        right_on=['date', 'locale_name'],
        how='left'
    )['is_local_holiday']
    .fillna(False)
)

print(f"Test data:\n {test_data.head()}")
print(f"Train data:\n {train_data.head()}")

  .fillna(False)
  .fillna(False)
  .fillna(False)


Test data:
               id       date  store_nbr      family  sales  onpromotion  \
3000888  3000888 2017-08-16          1  AUTOMOTIVE    NaN            0   
3002670  3002670 2017-08-17          1  AUTOMOTIVE    NaN            0   
3004452  3004452 2017-08-18          1  AUTOMOTIVE    NaN            0   
3006234  3006234 2017-08-19          1  AUTOMOTIVE    NaN            0   
3008016  3008016 2017-08-20          1  AUTOMOTIVE    NaN            0   

         dcoilwtico   city      state type  cluster  transactions  \
3000888       46.80  Quito  Pichincha    D       13           NaN   
3002670       47.07  Quito  Pichincha    D       13           NaN   
3004452       48.59  Quito  Pichincha    D       13           NaN   
3006234         NaN  Quito  Pichincha    D       13           NaN   
3008016         NaN  Quito  Pichincha    D       13           NaN   

         transactions_lag_7  transactions_roll_mean_7  is_national_holiday  \
3000888              1766.0               1278.000

  .fillna(False)


We also have info about an earthquake that happened in Ecuador on April 16, 2016, so we can also add that as an additional column, we also know that workers are paid every 15th and last day of the month, so we can also get some info out of that.

In [232]:
train_data['is_natural_disaster'] = (
    (train_data['date'] == pd.to_datetime('2016-04-16')) &
    (train_data['city'] == 'Ecuador')
)

test_data['is_natural_disaster'] = (
    (test_data['date'] == pd.to_datetime('2016-04-16')) &
    (test_data['city'] == 'Ecuador')
)


train_data['is_payday'] = (
    (pd.to_datetime(train_data['date']).dt.day == 15) |
    (pd.to_datetime(train_data['date']).dt.day ==
     pd.to_datetime(train_data['date']).dt.days_in_month)
)

test_data['is_payday'] = (
    (pd.to_datetime(test_data['date']).dt.day == 15) |
    (pd.to_datetime(test_data['date']).dt.day ==
     pd.to_datetime(test_data['date']).dt.days_in_month)
)

And now we are ready to train our model and get predictions

In [233]:
from catboost import CatBoostRegressor

train_data = train_data.drop(columns=['date', 'city', 'state', 'type'])
test_data = test_data.drop(columns=['date', 'city', 'state', 'type'])

categorical_features = train_data.select_dtypes(exclude=['bool', 'number', 'datetime']).columns

print(categorical_features)

train_data = pd.get_dummies(train_data, columns=categorical_features)
test_data = pd.get_dummies(test_data, columns=categorical_features)

y_train = train_data['sales']
X_train = train_data.drop('sales', axis=1)
X_test = test_data

model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=3,
    random_seed=42,
    verbose=100,
    task_type='CPU',
    loss_function='RMSE'
)

print("Training model...")
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

submission = pd.DataFrame({
    'id': X_test['id'],
    'sales': y_pred
})

submission.to_csv('../data/catboost_submission_final.csv', index=False)

Index(['family'], dtype='object')
Training model...
0:	learn: 1059.3484663	total: 293ms	remaining: 4m 52s
100:	learn: 373.7553261	total: 24.9s	remaining: 3m 41s
200:	learn: 333.3221804	total: 49.1s	remaining: 3m 15s
300:	learn: 309.7873226	total: 1m 12s	remaining: 2m 49s
400:	learn: 294.0422762	total: 1m 36s	remaining: 2m 24s
500:	learn: 283.2038853	total: 1m 59s	remaining: 1m 59s
600:	learn: 274.7507314	total: 2m 24s	remaining: 1m 35s
700:	learn: 267.8579714	total: 2m 46s	remaining: 1m 11s
800:	learn: 261.4140938	total: 3m 10s	remaining: 47.2s
900:	learn: 256.1646113	total: 3m 35s	remaining: 23.7s
999:	learn: 251.4467666	total: 3m 59s	remaining: 0us
