In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tensorflow import keras
from tensorflow.keras import layers


# 1. Data Ingestion & Merging

print("Loading datasets...")
# Read the four raw CSV inputs; they are expected in the working directory.
df_train, df_stores, df_oil, df_holidays = [
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'stores.csv', 'oil.csv', 'holidays_events.csv')
]

# Both frames joined on 'date' need a real datetime column for the key.
df_train['date'] = pd.to_datetime(df_train['date'])
df_oil['date'] = pd.to_datetime(df_oil['date'])

# Build the master frame in one chain:
#   1. left-join the oil prices on 'date',
#   2. left-join the store attributes on 'store_nbr',
#   3. keep only store number 1 so the rest of the pipeline stays small
#      enough to run efficiently on CPU.
df = (
    df_train
    .merge(df_oil, on='date', how='left')
    .merge(df_stores, on='store_nbr', how='left')
    .loc[lambda merged: merged['store_nbr'] == 1]
    .copy()
)

# Rows left without an oil price after the join are filled from the previous
# row first, then from the next row for any leading gap.
oil_price = df['dcoilwtico']
df['dcoilwtico'] = oil_price.ffill().bfill()

print(f"Data Loaded. Shape: {df.shape}")



# 2. Exploratory Data Analysis (EDA)

# 2.1 Sales over time: one line showing sales summed per date.
fig, ax = plt.subplots(figsize=(12, 6))
daily_sales = df.groupby('date')['sales'].sum()
daily_sales.plot(ax=ax, color='purple')
ax.set_title('Total Sales Over Time (Store 1)')
ax.set_ylabel('Sales')
ax.set_xlabel('Date')
ax.grid(True, linestyle='--', alpha=0.6)
plt.show()

# 2.2 Distribution of Sales: histogram of the raw target values.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(df['sales'], bins=50, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Sales Target')
ax.set_xlabel('Sales')
ax.set_ylabel('Frequency')
plt.show()

# 2.3 Sales by Product Family: the ten families with the largest total sales,
# drawn as horizontal bars (totals on x, family names on y).
fig, ax = plt.subplots(figsize=(12, 6))
family_totals = df.groupby('family')['sales'].sum()
top_families = family_totals.sort_values(ascending=False).head(10)
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` -- confirm against the installed seaborn version.
sns.barplot(x=top_families.values, y=top_families.index, palette='viridis', ax=ax)
ax.set_title('Top 10 Product Families by Sales')
plt.show()

# 2.4 Correlation Matrix: pairwise correlations between numeric columns only.
numeric_df = df.select_dtypes(include=[np.number])
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Feature Correlation Heatmap')
plt.show()



# 2. Preprocessing & Feature Engineering

# Extract Date Features (Crucial for Time Series)
df['day'] = df['date'].dt.day
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year
df['dayofweek'] = df['date'].dt.dayofweek

# Define features (X) and target (y)
X = df.drop(['sales', 'id', 'date'], axis=1)
y = df['sales']

# Identify column types. Columns of X that appear in neither list are dropped
# by the ColumnTransformer below (its default remainder behaviour).
categorical_features = ['family', 'city', 'state', 'type', 'cluster']
numerical_features = ['onpromotion', 'dcoilwtico', 'day', 'month', 'year', 'dayofweek']

# Create Preprocessor: standardise numeric columns, one-hot encode categorical
# ones. handle_unknown='ignore' keeps transform() from failing on a category
# that was not present when the encoder was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Data Splitting: 80% Train, 20% Test.
# The split is done BEFORE fitting the preprocessor so that the scaler's
# mean/std and the encoder's categories are learned from training rows only;
# fitting on the full dataset would leak test-set statistics into training.
# Same test_size and random_state as before, so the row partition is unchanged.
X_train_df, X_test_df, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training rows only, then apply the fitted transform everywhere.
X_train_raw = preprocessor.fit_transform(X_train_df)
X_test_raw = preprocessor.transform(X_test_df)
# Full transformed matrix, kept so code relying on `X_processed` still works.
X_processed = preprocessor.transform(X)

# Convert to dense arrays (Required for Neural Networks); the transformer may
# return a sparse matrix because of the one-hot encoded columns.
if hasattr(X_train_raw, 'toarray'):
    X_train_raw = X_train_raw.toarray()
if hasattr(X_test_raw, 'toarray'):
    X_test_raw = X_test_raw.toarray()
if hasattr(X_processed, 'toarray'):
    X_processed = X_processed.toarray()



# 3A. ANN BASELINE MODEL (REQUIRED)

# Feed-forward regressor: input sized to the processed feature matrix, two
# ReLU hidden layers (128 then 64 units), and one linear output unit.
n_features = X_train_raw.shape[1]
ann_layers = [
    layers.Input(shape=(n_features,)),
    *(layers.Dense(units, activation='relu') for units in (128, 64)),
    layers.Dense(1),
]
ann_model = keras.Sequential(ann_layers)

# Optimise mean squared error with Adam; MAE is tracked as an extra metric.
ann_model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

print("Training ANN Model...")
# validation_split holds out 20% of the training data for per-epoch validation.
fit_options = dict(validation_split=0.2, epochs=20, batch_size=32, verbose=1)
history_ann = ann_model.fit(X_train_raw, y_train, **fit_options)

# ANN Evaluation on the held-out test set.
ann_pred = ann_model.predict(X_test_raw)

ann_mae = mean_absolute_error(y_test, ann_pred)
ann_mse = mean_squared_error(y_test, ann_pred)
ann_rmse = np.sqrt(ann_mse)  # RMSE is the square root of the test MSE

print("\n--- ANN Evaluation ---")
print(f"ANN MAE : {ann_mae:.2f}")
print(f"ANN RMSE: {ann_rmse:.2f}")

# 4. Evaluation & Visualization

# Plot Loss Curves
# The training run is stored in `history_ann` (the value returned by
# ann_model.fit); no variable named `history` is defined in this script, so
# reading `history.history` raised a NameError.
plt.figure(figsize=(12, 5))
# First panel of a 1x2 grid: per-epoch training vs. validation loss.
plt.subplot(1, 2, 1)
plt.plot(history_ann.history['loss'], label='Training MSE', color='blue')
plt.plot(history_ann.history['val_loss'], label='Validation MSE', color='orange')
plt.title('Training & Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('MSE')
plt.legend()
