# Project 2: Rossmann Store Sales (Regression)

## Objective
Predict daily sales for Rossmann stores using historical data.
This notebook covers:
1. Data Loading
2. Exploratory Data Analysis (EDA)
3. Preprocessing (Feature Engineering)
4. Building a Deep ANN (7-8 Layers)
5. Hyperparameter Tuning
6. Training the Best Model
7. Evaluation and Visualization
8. Saving the Model

In [None]:
!pip install -q keras-tuner

In [None]:
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import keras_tuner as kt

# Seed every global RNG this notebook can draw from so that stochastic steps
# start from a fixed state on a fresh kernel.
SEED = 42
random.seed(SEED)         # Python stdlib RNG (was previously left unseeded)
np.random.seed(SEED)      # NumPy global RNG
tf.random.set_seed(SEED)  # TensorFlow global seed

## 1. Data Loading

In [None]:
# Remote CSV holding the training data; it is read straight from the URL.
url = "https://raw.githubusercontent.com/RPI-DATA/tutorials-intro/master/rossmann-store-sales/rossmann-store-sales/train.csv"
# low_memory=False: parse the file in one pass so pandas infers a single dtype
# per column instead of guessing chunk by chunk.
df = pd.read_csv(url, low_memory=False)
# Quick sanity check on the number of rows and columns loaded.
print(f"Dataset Shape: {df.shape}")

## 2. Exploratory Data Analysis (EDA)

In [None]:
# Show the first rows to check which columns are present and how they parsed.
df.head()

In [None]:
# Histogram of the raw 'Sales' target (50 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['Sales'], bins=50, kde=True, ax=ax)
ax.set_title('Sales Distribution')
plt.show()

In [None]:
# Time series of sales for one example store (Store == 1).
# The 'Date' column is converted to datetime in place and stays that way on df.
df['Date'] = pd.to_datetime(df['Date'])
is_store_1 = df['Store'] == 1
store_1 = df.loc[is_store_1].sort_values('Date')

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(store_1['Date'], store_1['Sales'])
ax.set_title('Sales over Time (Store 1)')
plt.show()

## 3. Preprocessing

In [None]:
# Calendar features.
# Parse 'Date' here instead of relying on an earlier plotting cell having
# converted it; pd.to_datetime also accepts a column that is already datetime.
dates = pd.to_datetime(df['Date'])
df['Year'] = dates.dt.year
df['Month'] = dates.dt.month
df['Day'] = dates.dt.day
# Monday=0 ... Sunday=6. This overwrites any existing 'DayOfWeek' column.
df['DayOfWeek'] = dates.dt.dayofweek

# 'Date' is now represented by the derived columns above.
# 'Customers' is dropped because footfall is normally not known at prediction time.
df = df.drop(columns=['Date', 'Customers'])

# Encode StateHoliday as integer codes. Cast to str first so every value has
# the same type before LabelEncoder assigns one code per distinct label.
df['StateHoliday'] = df['StateHoliday'].astype(str)

le = LabelEncoder()
df['StateHoliday'] = le.fit_transform(df['StateHoliday'])

# Keep only days on which the store was open and recorded sales above zero.
df = df[(df['Open'] == 1) & (df['Sales'] > 0)]

# After the filter 'Open' is always 1, so it carries no information.
df = df.drop(columns=['Open'])

print(f"Processed Shape: {df.shape}")

In [None]:
# Target is 'Sales'; every other remaining column is used as a feature.
y = df['Sales']
X = df.drop(columns=['Sales'])

# Hold out 20% of the rows for the final test; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise features using statistics learned from the training rows only,
# then apply the same transform to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 4 & 5. Build ANN & Hyperparameter Tuning

In [None]:
def build_model(hp):
    """Build and compile a tunable fully connected regression network.

    Args:
        hp: KerasTuner hyperparameter container. This function registers
            ``num_layers``, ``units_{i}``, ``dropout_{i}`` and ``lr`` on it.

    Returns:
        A compiled ``keras.Sequential`` model with a single linear output,
        optimised for mean squared error and reporting mean absolute error.

    Note:
        The input width is read from the notebook-level ``X_train_scaled``,
        so that array must exist before this function is called.
    """
    n_features = X_train_scaled.shape[1]
    net = keras.Sequential()
    net.add(layers.Input(shape=(n_features,)))

    # 7 or 8 hidden Dense+Dropout pairs; width and dropout rate are tuned per layer.
    depth = hp.Int('num_layers', 7, 8)
    for i in range(depth):
        width = hp.Int(f'units_{i}', min_value=64, max_value=512, step=64)
        net.add(layers.Dense(units=width, activation='relu'))
        drop_rate = hp.Float(f'dropout_{i}', 0.0, 0.3, step=0.1)
        net.add(layers.Dropout(drop_rate))

    # Single linear unit: unbounded real-valued prediction for regression.
    net.add(layers.Dense(1, activation='linear'))

    # Learning rate is sampled on a log scale between 1e-4 and 1e-2.
    learning_rate = hp.Float('lr', 1e-4, 1e-2, sampling='log')
    net.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss='mean_squared_error',
        metrics=['mean_absolute_error'],
    )
    return net

In [None]:
# Random search over the space declared in build_model, ranked by validation loss.
# A fixed `seed` makes the sampled hyperparameter combinations repeatable.
# NOTE: trial results are stored under my_dir/rossmann_tuning. If that folder
# already exists the tuner reloads it instead of searching again; delete the
# folder (or pass overwrite=True) to force a fresh search.
tuner = kt.RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=3,
    executions_per_trial=1,
    seed=42,
    directory='my_dir',
    project_name='rossmann_tuning'
)

# Abort a trial once val_loss has failed to improve for 3 consecutive epochs.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

# validation_split=0.2 holds out part of the training data for validation during each trial.
tuner.search(X_train_scaled, y_train, epochs=10, validation_split=0.2, callbacks=[stop_early])

In [None]:
# Take the top-ranked hyperparameter set from the finished search.
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
# Report the chosen depth and learning rate (both registered in build_model).
print(f"Best Layers: {best_hps.get('num_layers')}")
print(f"Best LR: {best_hps.get('lr')}")

## 6. Train Best Model

In [None]:
# Build a fresh network from the winning hyperparameters.
model = tuner.hypermodel.build(best_hps)

# Stop once val_loss has not improved for 10 epochs and restore the weights
# from the best epoch seen.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

history = model.fit(
    X_train_scaled,
    y_train,
    validation_split=0.2,
    epochs=100,
    callbacks=[early_stop],
)

## 7. Evaluation & Visualization

In [None]:
# Training vs. validation loss per epoch, taken from the fit history.
fig, ax = plt.subplots(figsize=(12, 5))
for key, label in (('loss', 'Train Loss'), ('val_loss', 'Val Loss')):
    ax.plot(history.history[key], label=label)
ax.set_title('Model Loss (MSE)')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

In [None]:
# Score the trained model on the held-out test rows. evaluate() returns the
# loss first, then the compiled metric: MSE and MAE as set up in build_model.
test_loss, test_mae = model.evaluate(X_test_scaled, y_test)
print(f"Test MSE: {test_loss:.2f}")
print(f"Test MAE: {test_mae:.2f}")

## 8. Save Model

In [None]:
# Write the trained network to disk as 'model_2.h5'.
# NOTE(review): the fitted `scaler` and `le` are not saved in this cell;
# predicting on new data would need the same transforms, so confirm they are
# persisted somewhere.
model.save('model_2.h5')
print("Model saved as model_2.h5")