In [1]:
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from lightgbm import LGBMRegressor

Firstly, we will need to load the data from the csv files.

In [2]:
oil = pd.read_csv('../data/oil.csv')
holiday_events = pd.read_csv('../data/holidays_events.csv')
stores = pd.read_csv('../data/stores.csv')
transactions = pd.read_csv('../data/transactions.csv')
test_data = pd.read_csv('../data/test.csv')
train_data = pd.read_csv('../data/train.csv')

Then, we will need to check on the data, the data need to be in chronological order, we will need to check on the time period that the data was sampled.

In [3]:
oil.head()

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [4]:
holiday_events.head()

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False


In [5]:
stores.head()

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


In [6]:
transactions.head()

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


In [7]:
train_data.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


In [8]:
test_data.head()

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0


Now we will need to combine the data.

Firstly, let's combine the training and testing data with the oil data. The data will be joined on the date column.

In [9]:
train_data = train_data.merge(
    oil[['date', 'dcoilwtico']],
    on='date',
    how='left'
).rename(columns={'dcoilwtico': 'Oil Price'})

test_data = test_data.merge(
    oil[['date', 'dcoilwtico']],
    on='date',
    how='left'
).rename(columns={'dcoilwtico': 'Oil Price'})

In [10]:
train_data.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,Oil Price
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,
1,1,2013-01-01,1,BABY CARE,0.0,0,
2,2,2013-01-01,1,BEAUTY,0.0,0,
3,3,2013-01-01,1,BEVERAGES,0.0,0,
4,4,2013-01-01,1,BOOKS,0.0,0,


In [199]:
test_data.head()

Unnamed: 0,id,date,store_nbr,family,onpromotion,Oil Price
0,3000888,2017-08-16,1,AUTOMOTIVE,0,46.8
1,3000889,2017-08-16,1,BABY CARE,0,46.8
2,3000890,2017-08-16,1,BEAUTY,2,46.8
3,3000891,2017-08-16,1,BEVERAGES,20,46.8
4,3000892,2017-08-16,1,BOOKS,0,46.8


Now, we will need to merge the training and testing data with the holiday events data, this will also be merged on the date column, adding null values if a day does not have a holiday.

In [200]:
train_data = train_data.merge(
    stores,
    on = ['store_nbr'],
    how = 'left'
)

test_data = test_data.merge(
    stores,
    on = ['store_nbr'],
    how = 'left'
)

In [201]:
train_data.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,Oil Price,city,state,type,cluster
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,,Quito,Pichincha,D,13
1,1,2013-01-01,1,BABY CARE,0.0,0,,Quito,Pichincha,D,13
2,2,2013-01-01,1,BEAUTY,0.0,0,,Quito,Pichincha,D,13
3,3,2013-01-01,1,BEVERAGES,0.0,0,,Quito,Pichincha,D,13
4,4,2013-01-01,1,BOOKS,0.0,0,,Quito,Pichincha,D,13


In [202]:
test_data.head()

Unnamed: 0,id,date,store_nbr,family,onpromotion,Oil Price,city,state,type,cluster
0,3000888,2017-08-16,1,AUTOMOTIVE,0,46.8,Quito,Pichincha,D,13
1,3000889,2017-08-16,1,BABY CARE,0,46.8,Quito,Pichincha,D,13
2,3000890,2017-08-16,1,BEAUTY,2,46.8,Quito,Pichincha,D,13
3,3000891,2017-08-16,1,BEVERAGES,20,46.8,Quito,Pichincha,D,13
4,3000892,2017-08-16,1,BOOKS,0,46.8,Quito,Pichincha,D,13


Now we can merge our initial data with the transaction data on the store nbr and data.But we have no transaction data for the test data set. What we can do is create new features in the data set taking into account past data. We can use a lag feature to get lagged values, we can use a rolling window average.

In [203]:
train_data = train_data.merge(transactions, on=['store_nbr', 'date'], how='left')
test_data = test_data.merge(transactions, on=['store_nbr', 'date'], how='left')

combined = pd.concat([train_data, test_data], ignore_index=True)
combined = combined.sort_values(['store_nbr', 'date'])

for lag in [7, 14, 30, 365]:
    combined[f'transactions_lag_{lag}'] = combined.groupby('store_nbr')['transactions'].shift(lag)

for window in [7, 14, 30]:
    combined[f'transactions_roll_mean_{window}'] = combined.groupby('store_nbr')['transactions'].transform(
        lambda x: x.shift(1).rolling(window, min_periods=1).mean()
    )
    combined[f'transactions_roll_std_{window}'] = combined.groupby('store_nbr')['transactions'].transform(
        lambda x: x.shift(1).rolling(window, min_periods=1).std()
    )

combined['transactions_vs_7day_avg'] = combined['transactions'] / combined['transactions_roll_mean_7']

combined['transactions_growth_7d'] = (
    combined['transactions'] / combined['transactions_lag_7'] - 1
)

train_data = combined[combined['sales'].notna()].copy()
test_data = combined[combined['sales'].isna()].copy()

MemoryError: Unable to allocate 343. MiB for an array with shape (15, 3000888) and data type float64

In [181]:
train_data.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,Oil Price,city,state,type,...,transactions_lag_30,transactions_lag_365,transactions_roll_mean_7,transactions_roll_std_7,transactions_roll_mean_14,transactions_roll_std_14,transactions_roll_mean_30,transactions_roll_std_30,transactions_vs_7day_avg,transactions_growth_7d
0,0,2013-01-01,1,AUTOMOTIVE,0.000000,0,,Quito,Pichincha,D,...,,,,,,,,,,
1,1,2013-01-01,1,BABY CARE,0.000000,0,,Quito,Pichincha,D,...,,,,,,,,,,
2,2,2013-01-01,1,BEAUTY,0.000000,0,,Quito,Pichincha,D,...,,,,,,,,,,
3,3,2013-01-01,1,BEVERAGES,0.000000,0,,Quito,Pichincha,D,...,,,,,,,,,,
4,4,2013-01-01,1,BOOKS,0.000000,0,,Quito,Pichincha,D,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3593,3593,2013-01-03,1,PREPARED FOODS,63.000000,0,92.97,Quito,Pichincha,D,...,2111.0,,1833.0,0.0,1833.0,0.0,1842.266667,50.755624,1.000000,0.000000
3594,3594,2013-01-03,1,PRODUCE,0.000000,0,92.97,Quito,Pichincha,D,...,1833.0,,1833.0,0.0,1833.0,0.0,1833.000000,0.000000,1.000000,0.000000
3595,3595,2013-01-03,1,SCHOOL AND OFFICE SUPPLIES,0.000000,0,92.97,Quito,Pichincha,D,...,1833.0,,1833.0,0.0,1833.0,0.0,1833.000000,0.000000,1.000000,0.000000
3596,3596,2013-01-03,1,SEAFOOD,17.366001,0,92.97,Quito,Pichincha,D,...,1833.0,,1833.0,0.0,1833.0,0.0,1833.000000,0.000000,1.000000,0.000000


In [182]:
test_data.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,Oil Price,city,state,type,...,transactions_lag_30,transactions_lag_365,transactions_roll_mean_7,transactions_roll_std_7,transactions_roll_mean_14,transactions_roll_std_14,transactions_roll_mean_30,transactions_roll_std_30,transactions_vs_7day_avg,transactions_growth_7d
3000888,3000888,2017-08-16,1,AUTOMOTIVE,,0,46.80,Quito,Pichincha,D,...,1693.0,1847.0,1693.0,0.0,1693.0,0.0,1693.0,0.0,,
3000889,3000889,2017-08-16,1,BABY CARE,,0,46.80,Quito,Pichincha,D,...,1693.0,1847.0,1693.0,0.0,1693.0,0.0,1693.0,0.0,,
3000890,3000890,2017-08-16,1,BEAUTY,,2,46.80,Quito,Pichincha,D,...,1693.0,1251.0,1693.0,0.0,1693.0,0.0,1693.0,0.0,,
3000891,3000891,2017-08-16,1,BEVERAGES,,20,46.80,Quito,Pichincha,D,...,1693.0,1251.0,1693.0,0.0,1693.0,0.0,1693.0,0.0,,
3000892,3000892,2017-08-16,1,BOOKS,,0,46.80,Quito,Pichincha,D,...,1693.0,1251.0,1693.0,0.0,1693.0,0.0,1693.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3004481,3004481,2017-08-18,1,PREPARED FOODS,,0,48.59,Quito,Pichincha,D,...,,1665.0,,,,,,,,
3004482,3004482,2017-08-18,1,PRODUCE,,5,48.59,Quito,Pichincha,D,...,,1665.0,,,,,,,,
3004483,3004483,2017-08-18,1,SCHOOL AND OFFICE SUPPLIES,,0,48.59,Quito,Pichincha,D,...,,1665.0,,,,,,,,
3004484,3004484,2017-08-18,1,SEAFOOD,,7,48.59,Quito,Pichincha,D,...,,1665.0,,,,,,,,


Finally, we have to merge the data with the holiday data. WE have to pay attention to the type of holiday, if the holiday is national, then all stores are closed, if it is regional then only the regional ones, and accordingly for locale and local ones.

In [183]:
def create_holiday_features(df, holiday_events):
    """Create comprehensive holiday features"""
    df = df.copy()

    # 1. ACTUAL holidays (where they're actually celebrated)
    # National holidays
    national_actual = holiday_events[
        (holiday_events['locale'] == 'National') &
        (holiday_events['transferred'] == False)
    ]['date']
    df['is_national_holiday'] = df['date'].isin(national_actual).astype(int)

    # 2. TRANSFERRED holidays (the original date that was moved)
    national_transferred = holiday_events[
        (holiday_events['locale'] == 'National') &
        (holiday_events['transferred'] == True)
    ]['date']
    df['is_transferred_holiday'] = df['date'].isin(national_transferred).astype(int)

    # 3. Regional holidays (actual)
    regional_actual = holiday_events[
        (holiday_events['locale'] == 'Regional') &
        (holiday_events['transferred'] == False)
    ]
    df = df.merge(
        regional_actual[['date', 'locale_name']],
        left_on=['date', 'state'],
        right_on=['date', 'locale_name'],
        how='left',
        indicator='_regional'
    )
    df['is_regional_holiday'] = (df['_regional'] == 'both').astype(int)
    df = df.drop(['_regional', 'locale_name'], axis=1, errors='ignore')

    # Regional transferred
    regional_transferred = holiday_events[
        (holiday_events['locale'] == 'Regional') &
        (holiday_events['transferred'] == True)
    ]
    df = df.merge(
        regional_transferred[['date', 'locale_name']],
        left_on=['date', 'state'],
        right_on=['date', 'locale_name'],
        how='left',
        indicator='_regional_trans'
    )
    df['is_regional_transferred'] = (df['_regional_trans'] == 'both').astype(int)
    df = df.drop(['_regional_trans', 'locale_name'], axis=1, errors='ignore')

    # 4. Local holidays (actual)
    local_actual = holiday_events[
        (holiday_events['locale'] == 'Local') &
        (holiday_events['transferred'] == False)
    ]
    df = df.merge(
        local_actual[['date', 'locale_name']],
        left_on=['date', 'city'],
        right_on=['date', 'locale_name'],
        how='left',
        indicator='_local'
    )
    df['is_local_holiday'] = (df['_local'] == 'both').astype(int)
    df = df.drop(['_local', 'locale_name'], axis=1, errors='ignore')

    # Local transferred
    local_transferred = holiday_events[
        (holiday_events['locale'] == 'Local') &
        (holiday_events['transferred'] == True)
    ]
    df = df.merge(
        local_transferred[['date', 'locale_name']],
        left_on=['date', 'city'],
        right_on=['date', 'locale_name'],
        how='left',
        indicator='_local_trans'
    )
    df['is_local_transferred'] = (df['_local_trans'] == 'both').astype(int)
    df = df.drop(['_local_trans', 'locale_name'], axis=1, errors='ignore')

    # 5. Combined features
    df['is_any_holiday'] = (
        df['is_national_holiday'] |
        df['is_regional_holiday'] |
        df['is_local_holiday']
    ).astype(int)

    df['is_any_transferred'] = (
        df['is_transferred_holiday'] |
        df['is_regional_transferred'] |
        df['is_local_transferred']
    ).astype(int)

    return df

# Apply to both train and test
train_data = create_holiday_features(train_data, holiday_events)
test_data = create_holiday_features(test_data, holiday_events)


In [184]:
train_data.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,Oil Price,city,state,type,...,transactions_vs_7day_avg,transactions_growth_7d,is_national_holiday,is_transferred_holiday,is_regional_holiday,is_regional_transferred,is_local_holiday,is_local_transferred,is_any_holiday,is_any_transferred
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,,Quito,Pichincha,D,...,,,1,0,0,0,0,0,1,0
1,1,2013-01-01,1,BABY CARE,0.0,0,,Quito,Pichincha,D,...,,,1,0,0,0,0,0,1,0
2,2,2013-01-01,1,BEAUTY,0.0,0,,Quito,Pichincha,D,...,,,1,0,0,0,0,0,1,0
3,3,2013-01-01,1,BEVERAGES,0.0,0,,Quito,Pichincha,D,...,,,1,0,0,0,0,0,1,0
4,4,2013-01-01,1,BOOKS,0.0,0,,Quito,Pichincha,D,...,,,1,0,0,0,0,0,1,0


In [185]:
test_data.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,Oil Price,city,state,type,...,transactions_vs_7day_avg,transactions_growth_7d,is_national_holiday,is_transferred_holiday,is_regional_holiday,is_regional_transferred,is_local_holiday,is_local_transferred,is_any_holiday,is_any_transferred
0,3000888,2017-08-16,1,AUTOMOTIVE,,0,46.8,Quito,Pichincha,D,...,,,0,0,0,0,0,0,0,0
1,3000889,2017-08-16,1,BABY CARE,,0,46.8,Quito,Pichincha,D,...,,,0,0,0,0,0,0,0,0
2,3000890,2017-08-16,1,BEAUTY,,2,46.8,Quito,Pichincha,D,...,,,0,0,0,0,0,0,0,0
3,3000891,2017-08-16,1,BEVERAGES,,20,46.8,Quito,Pichincha,D,...,,,0,0,0,0,0,0,0,0
4,3000892,2017-08-16,1,BOOKS,,0,46.8,Quito,Pichincha,D,...,,,0,0,0,0,0,0,0,0


A final thing we should is get dummies for our categorical data / columns, the model we will b e using does not accept type objects as input but only int, float or boolean values, so we sill transform those string type data / columns to boolean ones.

In [188]:
train_data.drop('date', inplace=True)
train_data.drop('city', inplace=True)
train_data.drop('state', inplace=True)

test_data.drop('date', inplace=True)
test_data.drop('city', inplace=True)
test_data.drop('state', inplace=True)

categorical_features = train_data.select_dtypes(exclude=['int', 'float', 'boolean', 'datetime64']).columns
train_data = pd.get_dummies(train_data, columns=categorical_features)
test_data = pd.get_dummies(test_data, columns=categorical_features)

KeyError: "['date'] not found in axis"

Now, let us create X_train, X_test and y_train data

In [143]:
y_train = train_data['sales']
X_train = train_data.drop(columns=['sales'])
X_test = test_data

Now, we can create the model, fit it to our data, generate the final result for the test data and create our final submission file.

In [144]:
model = LGBMRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

submission = pd.DataFrame({
    'id': X_test['id'],
    'sales': y_pred
})

submission.to_csv('../data/LGBMRegressor.csv', index=False)

ValueError: pandas dtypes must be int, float or bool.
Fields with bad pandas dtypes: date: object, family: object, city: object, state: object, type: object