In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

# ✅ Load Training Data
try:
    store_sales = pd.read_csv(
        "/kaggle/input/store-sales-time-series-forecasting-2/train.csv",
        usecols=['store_nbr', 'family', 'date', 'sales'],
        dtype={'store_nbr': 'category', 'family': 'category', 'sales': 'float32'},
        parse_dates=['date'],
    )
    store_sales['date'] = store_sales.date.dt.to_period('D')
    store_sales = store_sales.set_index(['store_nbr', 'family', 'date']).sort_index()
    print("📌 Training Data Sample:\n", store_sales.head())
except FileNotFoundError as e:
    print(f"Error loading train.csv: {e}")

# Prepare target (y) - unstack to have store-family combinations as columns
y = store_sales['sales'].unstack(['store_nbr', 'family']).loc["2017"]

# Calculate average sales for lag feature
avg_sales = store_sales['sales'].groupby('date').mean().loc["2016":"2017"]  # Include 2016 for lag availability
lag_sales = avg_sales.shift(1).loc["2017"].rename('lag_sales')  # 1-day lag for training

# ✅ Load Holiday Data
try:
    holidays_events = pd.read_csv(
        "/kaggle/input/store-sales-time-series-forecasting-2/holidays_events.csv",
        dtype={'type': 'category', 'locale': 'category', 'locale_name': 'category', 'description': 'category', 'transferred': 'bool'},
        parse_dates=['date'],
    )
    holidays_events['date'] = holidays_events.date.dt.to_period('D')
    print("Loaded holidays_events.csv successfully")
except FileNotFoundError as e:
    print(f"Error loading holidays_events.csv: {e}")

# Filter national and regional holidays
holidays = holidays_events.query("locale in ['National', 'Regional']").loc['2017':'2017-08-31', ['description']]
holidays = holidays.assign(description=lambda x: x.description.cat.remove_unused_categories())

# ✅ Load Oil Data
try:
    oil = pd.read_csv(
        "/kaggle/input/store-sales-time-series-forecasting-2/oil.csv",
        parse_dates=['date'],
    ).set_index('date').to_period('D')
    oil['dcoilwtico'] = oil['dcoilwtico'].ffill().bfill()
    print("Loaded oil.csv successfully")
except FileNotFoundError as e:
    print(f"Error loading oil.csv: {e}")

# ✅ Prepare Training Features
fourier = CalendarFourier(freq='ME', order=4)  # Monthly seasonality
dp = DeterministicProcess(
    index=y.index,
    constant=True,
    order=1,           # Linear trend
    seasonal=True,     # Weekly seasonality
    additional_terms=[fourier],
    drop=True,
)
X = dp.in_sample()

# Add New Year indicator
X['NewYear'] = (X.index.dayofyear == 1)

# Add holiday features
X_holidays = pd.get_dummies(holidays, columns=['description'], dtype=float)
X = X.join(X_holidays, on='date').fillna(0.0)

# Add oil price feature
X = X.join(oil['dcoilwtico'], on='date').ffill().bfill()

# Add lagged sales feature
X = X.join(lag_sales, on='date').fillna(avg_sales.mean())  # Fill NaN (e.g., Jan 1) with mean

# Verify no NaNs remain
if X.isnull().any().any():
    print("Warning: NaNs found in X after preprocessing:")
    print(X.isnull().sum())
    raise ValueError("NaNs still present in training features")

print("📌 Final Training Features (X):\n", X.head())

# ✅ Train Model
model = Ridge(alpha=1.0)  # Fixed alpha as baseline
model.fit(X, y)

# Check in-sample predictions
y_pred = pd.DataFrame(model.predict(X), index=X.index, columns=y.columns)
print("📌 Sample Training Predictions (y_pred):\n", y_pred.head())

# ✅ Load Test Data While Preserving 'id'
try:
    df_test = pd.read_csv(
        "/kaggle/input/store-sales-time-series-forecasting-2/test.csv",
        dtype={'store_nbr': 'category', 'family': 'category'},
        parse_dates=['date'],
    )
    df_test_id = df_test[['id', 'store_nbr', 'family', 'date']].copy()
    df_test['date'] = df_test.date.dt.to_period('D')
    df_test = df_test.set_index(['store_nbr', 'family', 'date']).sort_index()
    print("Loaded test.csv successfully")
except FileNotFoundError as e:
    print(f"Error loading test.csv: {e}")

# ✅ Generate Test Features with Iterative Lag
test_dates = df_test.index.get_level_values('date').unique()
X_test = dp.out_of_sample(steps=len(test_dates))
X_test.index = test_dates
X_test.index.name = 'date'

# Add New Year indicator
X_test['NewYear'] = (X_test.index.dayofyear == 1)

# Add holiday features
X_test = X_test.join(X_holidays, on='date').fillna(0.0)

# Add oil price feature
X_test = X_test.join(oil['dcoilwtico'], on='date').ffill().bfill()

# Add lagged sales feature for test period iteratively
last_train_sales = avg_sales.loc['2017-08-15']  # Last known average sales
y_submit = pd.DataFrame(index=X_test.index, columns=y.columns)  # Initialize submission predictions

for i, date in enumerate(test_dates):
    X_test_day = X_test.loc[[date]].copy()  # Features for current day
    if i == 0:
        # First test day (2017-08-16) uses last training day (2017-08-15)
        X_test_day['lag_sales'] = last_train_sales
    else:
        # Subsequent days use previous day's predicted average sales
        prev_date = test_dates[i - 1]
        prev_pred = y_submit.loc[prev_date].mean()  # Mean of previous day's predictions
        X_test_day['lag_sales'] = prev_pred
    
    # Ensure feature consistency
    X_test_day = X_test_day[X.columns]
    
    # Predict for current day
    y_submit.loc[date] = model.predict(X_test_day)

# Stack predictions for submission
y_submit = y_submit.stack(['store_nbr', 'family'], future_stack=True).reset_index(name='sales')

# ✅ Fix ID Loss Issue
df_test_id['date'] = df_test_id['date'].astype(str)
y_submit['date'] = y_submit['date'].astype(str)
y_submit = df_test_id.merge(y_submit, on=['store_nbr', 'family', 'date'], how='left')
y_submit['sales'] = y_submit['sales'].fillna(0)
y_submit = y_submit[['id', 'sales']]

# ✅ Verify and Save Submission
print("\n📌 Sample of submission.csv with All Test IDs Preserved:")
print(y_submit.head(10))
y_submit.to_csv('/kaggle/working/submission.csv', index=False)
print("📌 Submission file generated successfully! 📌")

📌 Training Data Sample:
                                  sales
store_nbr family     date             
1         AUTOMOTIVE 2013-01-01    0.0
                     2013-01-02    2.0
                     2013-01-03    3.0
                     2013-01-04    3.0
                     2013-01-05    5.0
Loaded holidays_events.csv successfully
Loaded oil.csv successfully
📌 Final Training Features (X):
             const  trend  s(2,7)  s(3,7)  s(4,7)  s(5,7)  s(6,7)  s(7,7)  \
date                                                                       
2017-01-01    1.0    1.0     0.0     0.0     0.0     0.0     0.0     0.0   
2017-01-02    1.0    2.0     1.0     0.0     0.0     0.0     0.0     0.0   
2017-01-03    1.0    3.0     0.0     1.0     0.0     0.0     0.0     0.0   
2017-01-04    1.0    4.0     0.0     0.0     1.0     0.0     0.0     0.0   
2017-01-05    1.0    5.0     0.0     0.0     0.0     1.0     0.0     0.0   

            sin(1,freq=ME)  cos(1,freq=ME)  sin(2,freq=ME)  cos(2,fre

  y_submit['sales'] = y_submit['sales'].fillna(0)
