In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the four raw CSV inputs into DataFrames.
# Paths are relative to the working directory; adjust them as necessary.
print("Loading data...")
try:
    # 'date' is parsed to datetime for every table that is later merged on it.
    train_df = pd.read_csv('train.csv', parse_dates=['date'])
    stores_df = pd.read_csv('stores.csv')
    holidays_df = pd.read_csv('holidays_events.csv', parse_dates=['date'])
    oil_df = pd.read_csv('oil.csv', parse_dates=['date'])
except FileNotFoundError as e:
    print(f"Error loading files. Ensure all CSVs are in the correct directory. Details: {e}")
    # Re-raise rather than calling exit(): the original exception and its
    # traceback are preserved and the running session is not terminated.
    raise

In [None]:
# Data Cleaning and Merging
print("Cleaning and merging data...")

# Rename the holidays 'type' column so it cannot clash with a same-named
# column coming from another table in the merges below.
holidays_df = holidays_df.rename(columns={'type': 'holiday_type'})

# Attach store attributes to every training row (left join keeps all rows
# of train_df, matched on store_nbr).
df = train_df.merge(stores_df, on='store_nbr', how='left')

# Merge holidays information.
# Some dates have multiple holiday rows; a left join on 'date' would emit one
# output row per matching holiday and silently duplicate training rows.
# Collapse to a single row per date (the first one listed) before merging so
# the row count of the training data is preserved.
# NOTE(review): keeping only the first event per date discards the others, and
# the join is on date alone, so an event is attached to every store on that
# date — confirm this simplification is acceptable for the model.
holidays_by_date = holidays_df.drop_duplicates(subset='date', keep='first')
df = df.merge(holidays_by_date, on='date', how='left')

# Both joins are expected to be many-to-one; fail loudly if either one
# multiplied rows (e.g. duplicate store_nbr values in stores_df).
assert len(df) == len(train_df), (
    f"Merge changed the row count: {len(train_df)} -> {len(df)}"
)

In [None]:
# Merge oil price information
# Forward-fill missing oil prices: each NaN takes the closest preceding
# non-missing value in oil_df's row order. NaNs before the first observed
# price have nothing to copy from and stay NaN.
# (.ffill() replaces the deprecated fillna(method='ffill') spelling.)
oil_df['dcoilwtico'] = oil_df['dcoilwtico'].ffill()
df = df.merge(oil_df, on='date', how='left')
# NOTE(review): dates in df that have no row in oil_df still receive NaN in
# 'dcoilwtico' from the left join — confirm whether they are filled later.

# Fill NaNs created by the holiday merge (e.g., if a day wasn't a holiday)
# with the literal string 'None' so the columns contain no missing values.
df['holiday_type'] = df['holiday_type'].fillna('None')
df['locale'] = df['locale'].fillna('None')

In [None]:
# 3. Feature Engineering: Time-Based Features
print("Creating time-based features...")

# Order rows by store, then chronologically within each store, and renumber
# the index 0..n-1 so it follows the new row order.
sort_keys = ['store_nbr', 'date']
df = df.sort_values(sort_keys, ignore_index=True)

In [None]:
# Derive calendar features from the 'date' column.
date_parts = df['date'].dt

# Plain datetime components; each becomes a column of the same name.
# dayofweek is encoded Monday=0 ... Sunday=6.
for attr in ('year', 'month', 'day', 'dayofweek', 'dayofyear'):
    df[attr] = getattr(date_parts, attr)

# ISO-8601 week number, cast to a plain integer dtype.
df['weekofyear'] = date_parts.isocalendar()['week'].astype(int)

# Weekend flag: 1 for Saturday/Sunday (dayofweek 5 or 6), otherwise 0.
df['weekend'] = (df['dayofweek'] >= 5).astype(int)