In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/store-sales-time-series-forecasting/oil.csv
/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv
/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv
/kaggle/input/store-sales-time-series-forecasting/stores.csv
/kaggle/input/store-sales-time-series-forecasting/train.csv
/kaggle/input/store-sales-time-series-forecasting/test.csv
/kaggle/input/store-sales-time-series-forecasting/transactions.csv


In [2]:
import pandas as pd
import numpy as np
from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier
from sklearn.linear_model import LinearRegression

# 1. Load Data
path = '/kaggle/input/store-sales-time-series-forecasting/'
train_data = pd.read_csv(path + 'train.csv', parse_dates=['date'])
oil_data = pd.read_csv(path + 'oil.csv', parse_dates=['date'])
oil_data = oil_data.set_index('date').resample('D').ffill().reset_index()

# 2. Prepare Target (y)
y = train_data.set_index(['store_nbr','family', 'date']).sort_index().unstack(['store_nbr','family'])['sales']
y = y.fillna(0.0)
y.index = y.index.to_period('D')

# 3. Trend and Seasonality
fourier = CalendarFourier(freq='W', order=4)
dp = DeterministicProcess(index=y.index, constant=True, order=1, additional_terms=[fourier], drop=True)
X = dp.in_sample()

# 4. ANSWERING ADDITIONAL NOTES (Adding Features) 
X['is_payday'] = (X.index.day == 15) | (X.index.day == X.index.days_in_month)
X['earthquake'] = ((X.index >= '2016-04-16') & (X.index <= '2016-05-15')).astype(int)
X = X.join(oil_data.set_index('date').to_period('D')['dcoilwtico']).ffill().fillna(0)

# 5. THE TIME SERIES METHOD: ADDING LAGS 
# We take the average sales across all stores and shift them by 16 days.
# This tells the model: "What was the general market mood 16 days ago?"
X['lag_16'] = y.mean(axis=1).shift(16).fillna(0)

# 6. Forecast Features
X_forecast = dp.out_of_sample(steps=16)
X_forecast['is_payday'] = (X_forecast.index.day == 15) | (X_forecast.index.day == X_forecast.index.days_in_month)
X_forecast['earthquake'] = 0 
X_forecast = X_forecast.join(oil_data.set_index('date').to_period('D')['dcoilwtico']).ffill().fillna(0)

# The lag for the forecast is the LAST 16 days of known training data
X_forecast['lag_16'] = y.mean(axis=1).iloc[-16:].values

# Ensure columns match perfectly
X_forecast = X_forecast.reindex(columns=X.columns, fill_value=0)

# 7. Train Model
model = LinearRegression(fit_intercept=False)
model.fit(X, y)

# 8. Predict and Format
predictions = model.predict(X_forecast)
predictions_df = pd.DataFrame(predictions, index=X_forecast.index, columns=y.columns)

submission = predictions_df.stack(['store_nbr', 'family']).reset_index()
submission.rename(columns={'level_0': 'date', 0: 'sales'}, inplace=True)

test_data = pd.read_csv(path + 'test.csv', parse_dates=['date'])
submission['date'] = submission['date'].dt.to_timestamp()

final_submission = test_data.merge(submission, on=['date', 'store_nbr', 'family'], how='left')
final_submission['sales'] = final_submission['sales'].clip(lower=0.0)
final_submission[['id', 'sales']].to_csv('submission.csv', index=False)

print("Lag 16 added! Ready to submit.")

Lag 16 added! Ready to submit.


  submission = predictions_df.stack(['store_nbr', 'family']).reset_index()


In [3]:
file= pd.read_csv('submission.csv')
print(file.head())
print(file.shape)

        id        sales
0  3000888     4.690706
1  3000889     0.000000
2  3000890     3.679359
3  3000891  2618.261384
4  3000892     0.556999
(28512, 2)
