In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the four raw tables used throughout the notebook.
# Paths are relative to the notebook's working directory; adjust as necessary.
print("Loading data...")
try:
    train_df = pd.read_csv('train.csv', parse_dates=['date'])
    stores_df = pd.read_csv('stores.csv')
    holidays_df = pd.read_csv('holidays_events.csv', parse_dates=['date'])
    oil_df = pd.read_csv('oil.csv', parse_dates=['date'])
except FileNotFoundError as e:
    print(f"Error loading files. Ensure all CSVs are in the correct directory. Details: {e}")
    # Re-raise rather than calling exit(): exit() tears down the notebook kernel,
    # whereas raising stops "Run All" at this cell with a normal traceback.
    raise

In [None]:
# Data Cleaning and Merging
print("Cleaning and merging data...")

# Give each table's 'type' column a distinct name so the merged frame is unambiguous.
holidays_df = holidays_df.rename(columns={'type': 'holiday_type'})
# The encoding step further down expects a 'store_type' column.
# NOTE(review): assumes stores.csv names it 'type' — rename() ignores missing keys,
# so this is a no-op if the column is already called 'store_type'.
stores_df = stores_df.rename(columns={'type': 'store_type'})

# Attach store attributes to every training row.
df = train_df.merge(stores_df, on='store_nbr', how='left')

# Attach holiday information.
# A date can carry several holiday records; merging them as-is would duplicate the
# matching training rows (and silently break row-based lags later). Keep a single
# record per date so the merge is strictly many-to-one.
holidays_by_date = holidays_df.drop_duplicates(subset='date', keep='first')
df = df.merge(holidays_by_date, on='date', how='left', validate='m:1')

In [None]:
# Merge oil price information
# Oil prices are forward-filled (last known price carried forward). A back-fill is
# applied afterwards only to cover a gap at the very start of the series.
oil_prices = (
    oil_df.sort_values('date')
    .drop_duplicates(subset='date', keep='last')
    .set_index('date')['dcoilwtico']
)
# Extend the price series to every date present in the training data, so that dates
# missing from oil.csv receive a price instead of NaN after the merge.
calendar = oil_prices.index.union(pd.DatetimeIndex(df['date'].unique()))
oil_daily = (
    oil_prices.reindex(calendar)
    .ffill()
    .bfill()
    .rename_axis('date')
    .reset_index()
)
df = df.merge(oil_daily, on='date', how='left', validate='m:1')

# Fill NaNs created by the holiday merge (i.e. the day was not a holiday)
df['holiday_type'] = df['holiday_type'].fillna('None')
df['locale'] = df['locale'].fillna('None')

In [None]:
# 3. Feature Engineering: Time-Based Features
print("Creating time-based features...")

# Order rows chronologically within each store and renumber the index from zero.
# The row-based lag/rolling features computed later depend on this ordering.
df = df.sort_values(['store_nbr', 'date'], ignore_index=True)

In [None]:
# Extract Core Temporal Features
# All calendar parts are derived from the 'date' column through one shared accessor.
date_parts = df['date'].dt
day_of_week = date_parts.dayofweek  # Monday=0, Sunday=6

df = df.assign(
    year=date_parts.year,
    month=date_parts.month,
    day=date_parts.day,
    dayofweek=day_of_week,
    dayofyear=date_parts.dayofyear,
    weekofyear=date_parts.isocalendar().week.astype(int),
    weekend=(day_of_week >= 5).astype(int),  # 1 if Saturday/Sunday
)

In [None]:
# 4. Feature Engineering: Lags (For ML Models)
# Lag features expose each series' own recent history to the model.
print("Creating lag features...")


def _lagged_rolling_mean(series):
    """Mean over a 30-row window of the series shifted by 7 rows."""
    return series.shift(7).rolling(30).mean()


# One group per (store, product family) so a shift never crosses into another series.
sales_by_series = df.groupby(['store_nbr', 'family'])['sales']

# Sales value 7 rows earlier within the same series
df['sales_lag_7'] = sales_by_series.shift(7)

# 30-row rolling mean computed on the 7-row-lagged sales
df['sales_rolling_mean_30'] = sales_by_series.transform(_lagged_rolling_mean)

In [None]:
#  5. Feature Engineering: Categorical Encoding (For ML Models)
print("Encoding categorical features...")

# One-hot encode the categorical columns; drop_first=False keeps every level.
categorical_cols = ['store_nbr', 'family', 'store_type', 'cluster', 'holiday_type', 'locale']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=False)

# NOTE: 'date' and 'id' are deliberately not dropped in this cell. The final
# data-preparation cell that follows drops them; doing it here as well made that
# second drop fail with KeyError on a clean top-to-bottom run.

In [None]:
# 6. Final Data Preparation
# Drop the original 'date' column (its features have been extracted) and the row 'id'.
# errors='ignore' keeps the cell safe if the columns were already removed earlier
# or the cell is re-run.
df = df.drop(columns=['date', 'id'], errors='ignore')

# Remove only the rows where the lag/rolling features are undefined (the start of
# each series). A bare dropna() would also discard every row that has a NaN in any
# other column — e.g. the holiday text columns, which are empty on non-holiday days.
lag_feature_cols = ['sales_lag_7', 'sales_rolling_mean_30']
df = df.dropna(subset=lag_feature_cols)

print("\n✅ Data Preparation Complete.")
print(f"Final dataset shape: {df.shape}")

In [None]:
# 7. Define Features and Target (XGBoost)
feature_frame = df.drop(columns='sales')

# Keep numeric/boolean columns only: XGBoost rejects object-dtype (free-text) columns,
# and any text column that was not one-hot encoded above would otherwise reach fit().
X_xgb = feature_frame.select_dtypes(include=['number', 'bool'])
y_xgb = df['sales']

skipped_cols = feature_frame.columns.difference(X_xgb.columns)
if len(skipped_cols) > 0:
    print(f"Excluded non-numeric columns: {list(skipped_cols)}")
print(f"XGBoost Feature Count: {X_xgb.shape[1]}")

In [None]:
# 8. Time-Based Train/Test Split (XGBoost)
# The frame is ordered by store first, so slicing it positionally would hold out the
# last stores rather than the most recent dates. Re-order rows chronologically using
# the 'year' and 'dayofyear' features before cutting.
# np.lexsort treats the LAST key as the primary one and is stable.
chronological_order = np.lexsort(
    (X_xgb['dayofyear'].to_numpy(), X_xgb['year'].to_numpy())
)
X_chrono = X_xgb.iloc[chronological_order]
y_chrono = y_xgb.iloc[chronological_order]

# Earliest 90% of rows for training, latest 10% for testing.
split_point = int(len(X_chrono) * 0.9)
X_train_xgb, X_test_xgb = X_chrono.iloc[:split_point], X_chrono.iloc[split_point:]
y_train_xgb, y_test_xgb = y_chrono.iloc[:split_point], y_chrono.iloc[split_point:]

print(f"XGBoost Train/Test Split: {X_train_xgb.shape} / {X_test_xgb.shape}")

ML MODEL (XGBOOST)
## PART 1: Machine Learning Implementation (XGBoost)

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Progress marker for the start of the XGBoost section.
print("\n Starting XGBoost Training ")

In [None]:
# Configure the XGBoost regressor (training happens in the next cell).
xgb_params = {
    'objective': 'reg:squarederror',  # squared-error regression loss
    'n_estimators': 500,
    'learning_rate': 0.05,
    'n_jobs': -1,
    'random_state': 42,               # fixed seed for reproducible runs
}
xgb_model = xgb.XGBRegressor(**xgb_params)

In [None]:
# Fit the regressor on the training split.
# No evaluation set is passed here, so all configured boosting rounds are trained.
xgb_model.fit(X_train_xgb, y_train_xgb)

In [None]:
# Predict sales for the held-out test split.
y_pred_xgb = xgb_model.predict(X_test_xgb)

In [None]:
# Evaluate: root-mean-squared error between held-out sales and the predictions.
mse_xgb = mean_squared_error(y_test_xgb, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
print(f"✅ XGBoost RMSE: {rmse_xgb:.4f}")

## PART 2: Deep Learning Implementation (LSTM & CNN)

In [None]:
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Progress marker for the start of the deep-learning data preparation.
print("\n Starting Deep Learning Preparation")

In [None]:
# The frame built for XGBoost has been one-hot encoded and had rows removed, which
# is unsuitable for building sequences. Start again from the raw CSV files; no
# pre-encoding copy of the merged frame was kept earlier in the notebook.
train_df, holidays_df, oil_df = (
    pd.read_csv(csv_path, parse_dates=['date'])
    for csv_path in ('train.csv', 'holidays_events.csv', 'oil.csv')
)
stores_df = pd.read_csv('stores.csv')

# Same rename as in the first merge: distinguish the holiday 'type' column.
holidays_df = holidays_df.rename(columns={'type': 'holiday_type'})

In [None]:
# Create a master DataFrame for DL
df_dl = train_df.merge(stores_df, on='store_nbr', how='left')

# A date can carry several holiday records; keep one per date so the merge cannot
# duplicate training rows (duplicates would corrupt the row-based sequences below).
holidays_by_date = holidays_df.drop_duplicates(subset='date', keep='first')
df_dl = df_dl.merge(holidays_by_date, on='date', how='left', validate='m:1')

# Carry the last known oil price forward. Series.ffill() replaces the deprecated
# fillna(method='ffill') spelling.
oil_df['dcoilwtico'] = oil_df['dcoilwtico'].ffill()
df_dl = df_dl.merge(oil_df, on='date', how='left')

In [None]:
# Drop identifier/categorical columns that are not manually engineered or encoded.
# errors='ignore' is required because not every listed name is guaranteed to exist:
# the freshly reloaded store table is never renamed, so 'store_type' may be absent
# (NOTE(review): the store column is presumably called 'type' — confirm against stores.csv).
unused_cols = ['id', 'family', 'store_type', 'type', 'cluster',
               'holiday_type', 'locale', 'locale_name', 'description']
df_dl = df_dl.drop(columns=unused_cols, errors='ignore')

# Remove any remaining non-numeric column as well (the date column is kept), so the
# frame really contains numeric features only, as this step intends.
leftover_text_cols = df_dl.select_dtypes(exclude=['number', 'datetime', 'bool']).columns
df_dl = df_dl.drop(columns=leftover_text_cols)

# Chronological order within each store; the fill and windowing steps rely on it.
df_dl = df_dl.sort_values(by=['store_nbr', 'date']).reset_index(drop=True)

In [None]:
# Impute the oil prices that are still missing after the merge.
# The frame is sorted by store and then date, so the fill is done per store: a global
# fill would copy a price across the boundary between two stores' series.
# Forward-fill first (last known price); back-fill only covers a leading gap.
# .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
df_dl['dcoilwtico'] = (
    df_dl.groupby('store_nbr')['dcoilwtico']
    .transform(lambda prices: prices.ffill().bfill())
)

#  Feature Scaling (CRITICAL for DL)
# NOTE(review): the scaler is fitted on every row, including rows that later end up
# in the test split, so the min/max leak test information — consider fitting on the
# training portion only.
scaler = MinMaxScaler()
# Scale the target variable 'sales' into [0, 1]
df_dl['sales_scaled'] = scaler.fit_transform(df_dl[['sales']])

In [None]:
# Sequence Creation (Windowing)
# Define sequence parameters
SEQUENCE_LENGTH = 30 # Use the past 30 days of data
FORECAST_HORIZON = 1 # Predict 1 day into the future

def create_sequences(data, sequence_length, target_col=None, forecast_horizon=None):
    """Turn one time series into sliding input windows and their targets.

    Args:
        data: Array-like series ordered in time (first axis is time).
        sequence_length: Number of consecutive steps in each input window (>= 1).
        target_col: Unused; accepted only for backward compatibility with
            existing callers.
        forecast_horizon: How many steps after the end of the window the target
            lies (>= 1). Defaults to the module-level FORECAST_HORIZON.

    Returns:
        (X, y): X has shape (n_windows, sequence_length, ...) and y has shape
        (n_windows, ...). When the series is too short for a single window, both
        are empty but X still has the sequence_length axis.

    Raises:
        ValueError: If sequence_length or the horizon is smaller than 1.
    """
    horizon = FORECAST_HORIZON if forecast_horizon is None else forecast_horizon
    if sequence_length < 1 or horizon < 1:
        raise ValueError("sequence_length and forecast_horizon must both be >= 1")

    values = np.asarray(data)
    n_windows = len(values) - sequence_length - horizon + 1
    if n_windows <= 0:
        # Keep the array ranks stable so callers can still inspect/reshape the result.
        tail_shape = values.shape[1:]
        return (
            np.empty((0, sequence_length) + tail_shape, dtype=values.dtype),
            np.empty((0,) + tail_shape, dtype=values.dtype),
        )

    # Window i covers steps [i, i + sequence_length).
    X = np.array([values[i:i + sequence_length] for i in range(n_windows)])
    # Its target lies `horizon` steps after the last step of the window; the
    # previous version always took the very next step, whatever the horizon.
    y = np.array([values[i + sequence_length + horizon - 1] for i in range(n_windows)])
    return X, y

In [None]:
# We will focus on one store for simplicity in this example
df_single_store = df_dl[df_dl['store_nbr'] == 1].copy()

# The store's rows hold one record per product family per date, so consecutive rows
# are not consecutive days. Collapse to a single value per date first; the mean of
# the min-max scaled values equals the scaled mean sales, so it stays in [0, 1].
# NOTE(review): dates with no records are simply absent, so a window may span a
# calendar gap — confirm this is acceptable.
daily_sales_scaled = (
    df_single_store.groupby('date')['sales_scaled'].mean().sort_index()
)

# Create sequences for the scaled sales target
X_seq, y_seq = create_sequences(daily_sales_scaled.to_numpy(), SEQUENCE_LENGTH, 'sales_scaled')
if len(X_seq) == 0:
    raise ValueError(
        f"Not enough daily observations ({len(daily_sales_scaled)}) to build "
        f"sequences of length {SEQUENCE_LENGTH}."
    )

# Split sequences into train/test sets (chronological: earliest 90% for training)
split_seq_point = int(len(X_seq) * 0.9)
X_train_seq, X_test_seq = X_seq[:split_seq_point], X_seq[split_seq_point:]
y_train_seq, y_test_seq = y_seq[:split_seq_point], y_seq[split_seq_point:]

# Reshape input for LSTM/CNN: [samples, timesteps, features]
# Only one feature ('sales_scaled') is used, so a trailing feature axis is added.
X_train_seq = X_train_seq.reshape(X_train_seq.shape[0], X_train_seq.shape[1], 1)
X_test_seq = X_test_seq.reshape(X_test_seq.shape[0], X_test_seq.shape[1], 1)

print(f"DL Train/Test Split (Sequence Shape): {X_train_seq.shape} / {X_test_seq.shape}")