# UCI Occupancy Detection - ARIMA Time Series Forecasting

This notebook demonstrates time series forecasting on the official UCI Occupancy Detection dataset: an ARIMA model is fitted to hourly-averaged light sensor readings, which are used here as a proxy for energy consumption.

## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from statsmodels.tsa.arima.model import ARIMA
from datetime import timedelta
from ucimlrepo import fetch_ucirepo
import warnings
warnings.filterwarnings('ignore')

## 2. Load and Clean UCI Occupancy Dataset

In [None]:
# Download the Occupancy Detection dataset (id 357) from the UCI ML Repository
occupancy_detection = fetch_ucirepo(id=357)

# Features and targets arrive as two separate pandas DataFrames
X = occupancy_detection.data.features
y = occupancy_detection.data.targets

# Glue them side by side into a single frame
df = pd.concat([X, y], axis=1)

print(f"Loaded {len(df)} rows of data")
print("Columns:", df.columns.tolist())

# Cleaning pass 1: keep only rows whose 'date' text begins with '20' (a 20xx year);
# every other row is counted as corrupted and dropped.
print("\nCleaning dataset...")
has_date_prefix = df['date'].astype(str).str.startswith('20')
df_clean = df.loc[has_date_prefix].copy()

print(f"After cleaning: {len(df_clean)} rows (removed {len(df) - len(df_clean)} corrupted rows)")

# Cleaning pass 2: parse the timestamps and promote them to the index
df_clean = (
    df_clean
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)

# Cleaning pass 3: coerce the sensor columns to numbers (unparseable cells become NaN) ...
numeric_columns = ['Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio', 'Occupancy']
coerced = {
    name: pd.to_numeric(df_clean[name], errors='coerce')
    for name in numeric_columns
}
# ... and discard every row that still contains a NaN afterwards
df_clean = df_clean.assign(**coerced).dropna()

print(f"Final cleaned dataset: {len(df_clean)} rows")
print("\nFirst 5 rows:")
print(df_clean.head())

# Downstream cells read the cleaned data through the name `df`
df = df_clean

## 3. Data Preprocessing - Hourly Resampling

In [None]:
# RESAMPLING: the raw readings are much finer than hourly (the original author
# describes them as minutely), so we aggregate to one row per hour.
#   - 'Light' is averaged  -> used as the power proxy
#   - 'Occupancy' uses max -> 1 if the room was occupied at any point in the hour
#
# The hour alias is written as lowercase 'h': the uppercase 'H' spelling is
# deprecated since pandas 2.2, and the resulting FutureWarning is hidden by the
# global warnings filter set in the imports cell.
#
# NOTE: .dropna() removes hours that contain no readings at all, so the
# resulting index is NOT guaranteed to be a gap-free hourly grid.
df_hourly = (
    df.resample('h')
    .agg({
        'Light': 'mean',      # Average light usage per hour
        'Occupancy': 'max',   # 1 if occupied at any point in the hour, else 0
    })
    .dropna()
    # Rename for clarity in our context: light level stands in for power draw
    .rename(columns={'Light': 'Power_Draw_Index'})
)

print(f"Resampled to {len(df_hourly)} hourly data points")
print("\nHourly data sample:")
print(df_hourly.head())

# Display some basic statistics
print("\nData Statistics:")
print(df_hourly.describe())

## 4. ARIMA Modeling

In [None]:
# Chronological hold-out split: the earliest 80% of the hourly points are used
# for fitting, the remaining 20% are kept back for evaluation.
power_series = df_hourly['Power_Draw_Index']
train_size = int(0.80 * len(df_hourly))

# Positional slicing: first `train_size` rows vs. everything after them
train_data = power_series.iloc[:train_size]
test_data = power_series.iloc[train_size:]

print(f"Training data size: {len(train_data)}")
print(f"Test data size: {len(test_data)}")
print(f"Training period: {train_data.index[0]} to {train_data.index[-1]}")
print(f"Test period: {test_data.index[0]} to {test_data.index[-1]}")

In [None]:
# Fit an ARIMA model on the training portion of the series.
# (p, d, q) = (2, 1, 1): per the original author, this order was chosen because
# it handles the real-world noise better than (2, 1, 2).
arima_order = (2, 1, 1)

print("Training ARIMA model...")
model = ARIMA(train_data, order=arima_order)
model_fit = model.fit()

# Print the fitted-model report
print("ARIMA Model Summary:")
print(model_fit.summary())

In [None]:
# Forecast the whole held-out test period plus 24 further hours beyond it
future_hours = 24
forecast_steps = len(test_data) + future_hours
forecast_result = model_fit.get_forecast(steps=forecast_steps)
forecast_values = forecast_result.predicted_mean
conf_int = forecast_result.conf_int()

# Clip negative predictions (Physics rule: Light cannot be negative).
# .clip(lower=0) is the vectorised equivalent of the per-element max(x, 0).
forecast_values = forecast_values.clip(lower=0)
conf_int = conf_int.clip(lower=0)

# Create time index for forecast.
# The hourly frame had empty hours removed with .dropna() during resampling,
# so its index may contain gaps. Forecast step k corresponds to the k-th test
# observation (that is how the forecast is scored against the test data), so
# the overlapping steps reuse the real test timestamps instead of assuming a
# contiguous hourly grid after the last training point. Only the extra
# `future_hours` steps, which have no observation, get synthetic hourly stamps.
last_train_time = df_hourly.index[train_size - 1]
# Fall back to the last training timestamp if there is no test data at all
last_observed_time = test_data.index[-1] if len(test_data) else last_train_time
forecast_time_index = list(test_data.index) + [
    last_observed_time + timedelta(hours=step + 1) for step in range(future_hours)
]

print(f"Forecast generated for {forecast_steps} time steps")
print(f"Forecast period: {forecast_time_index[0]} to {forecast_time_index[-1]}")

## 5. Visualization - Interactive Dashboard

In [None]:
# Inputs shared by several traces
hourly_times = df_hourly.index
power = df_hourly['Power_Draw_Index']

# The traces are collected first and added to the figure in list order.
traces = [
    # Trace 1: historical data the model was trained on
    go.Scatter(
        x=hourly_times[:train_size],
        y=power[:train_size],
        name="Historical Usage (Training)",
        line=dict(color='gray', width=1),
        hovertemplate='<b>Historical</b><br>Time: %{x}<br>Light: %{y:.1f} Lux<extra></extra>',
    ),
    # Trace 2: held-out observations (ground truth for the forecast)
    go.Scatter(
        x=hourly_times[train_size:],
        y=power[train_size:],
        name="Actual Observed (Test)",
        mode='lines',
        line=dict(color='orange', width=2),
        hovertemplate='<b>Actual</b><br>Time: %{x}<br>Light: %{y:.1f} Lux<extra></extra>',
    ),
    # Trace 3: the ARIMA point forecast
    go.Scatter(
        x=forecast_time_index,
        y=forecast_values,
        name="ARIMA Forecast",
        line=dict(color='blue', width=3),
        hovertemplate='<b>Forecast</b><br>Time: %{x}<br>Light: %{y:.1f} Lux<extra></extra>',
    ),
    # Trace 4a: upper confidence bound - invisible line, hidden from legend and hover
    go.Scatter(
        x=forecast_time_index,
        y=conf_int.iloc[:, 1],
        mode='lines',
        line=dict(width=0),
        showlegend=False,
        hoverinfo='skip',
    ),
    # Trace 4b: lower confidence bound. It must come directly after the upper
    # bound: fill='tonexty' shades the area between this trace and the one
    # added just before it, which produces the confidence band.
    go.Scatter(
        x=forecast_time_index,
        y=conf_int.iloc[:, 0],
        mode='lines',
        line=dict(width=0),
        fill='tonexty',
        fillcolor='rgba(0, 0, 255, 0.2)',
        name="95% Confidence Interval",
        hovertemplate='<b>Confidence Interval</b><br>Time: %{x}<br>Range: %{y:.1f} Lux<extra></extra>',
    ),
]

fig = go.Figure()
for trace in traces:
    fig.add_trace(trace)

# Titles, theme and hover behaviour for the whole figure
layout_options = dict(
    title="UCI Occupancy Detection - Light Sensor Forecast (ARIMA Model)",
    yaxis_title="Light Intensity (Lux)",
    xaxis_title="Time",
    template="plotly_white",
    hovermode="x unified",
    height=600,
    showlegend=True,
)
fig.update_layout(**layout_options)

fig.show()
print("Interactive forecast visualization completed!")

## 6. Model Performance Analysis

In [None]:
# Calculate forecast accuracy metrics for the test period.
# Only the first len(test_data) forecast steps have ground truth; the remaining
# steps extend past the observed data and cannot be scored.
# The values are taken positionally (.iloc) and rebuilt as a fresh Series on the
# test index, so the arithmetic below pairs forecast and actual by timestamp
# without mutating a slice of `forecast_values`.
test_forecast = pd.Series(
    forecast_values.iloc[:len(test_data)].to_numpy(),
    index=test_data.index,
    name=forecast_values.name,
)

# Calculate metrics
errors = test_data - test_forecast
mae = errors.abs().mean()
rmse = np.sqrt((errors ** 2).mean())

# MAPE divides by the actual value, so any hour whose actual light level is
# exactly 0 would turn the whole metric into inf (or an undefined 0/0 term).
# Those hours are excluded from the percentage error; MAE and RMSE above still
# use every test hour.
nonzero_actual = test_data != 0
if nonzero_actual.any():
    mape = (errors[nonzero_actual] / test_data[nonzero_actual]).abs().mean() * 100
else:
    # No non-zero actuals: a percentage error is not defined
    mape = float('nan')

print("=== Model Performance Metrics ===")
print(f"Mean Absolute Error (MAE): {mae:.2f} Lux")
print(f"Root Mean Square Error (RMSE): {rmse:.2f} Lux")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")
print(f"  (MAPE uses {int(nonzero_actual.sum())} of {len(test_data)} test hours with non-zero actual light)")

# Display some statistics about the data
print(f"\n=== Data Statistics ===")
print(f"Training data mean: {train_data.mean():.2f} Lux")
print(f"Training data std: {train_data.std():.2f} Lux")
print(f"Test data mean: {test_data.mean():.2f} Lux")
print(f"Forecast mean: {test_forecast.mean():.2f} Lux")

# Occupancy statistics: share of hours flagged as occupied in each period
occupancy_rate_train = df_hourly['Occupancy'][:train_size].mean() * 100
occupancy_rate_test = df_hourly['Occupancy'][train_size:].mean() * 100
print(f"\n=== Occupancy Statistics ===")
print(f"Training period occupancy rate: {occupancy_rate_train:.1f}%")
print(f"Test period occupancy rate: {occupancy_rate_test:.1f}%")

## 7. Summary and Insights

In [None]:
# Gather the whole report first, then emit it with a single print call.
# Strings without placeholders are plain literals; only lines that interpolate
# a value are f-strings.
summary_lines = [
    "=== ARIMA Forecasting Summary ===",
    "• Dataset: UCI Occupancy Detection (ID: 357)",
    f"• Total data points: {len(df):,} minutes → {len(df_hourly)} hours",
    f"• Training period: {len(train_data)} hours",
    f"• Test period: {len(test_data)} hours",
    "• ARIMA model: (2,1,1)",
    f"• Forecast horizon: {forecast_steps} hours",
    f"• Model accuracy (MAPE): {mape:.1f}%",
]

# The leading "\n" on the heading yields the blank line between the two sections
insight_lines = [
    "\n=== Key Insights ===",
    "• Light sensor readings serve as a proxy for energy consumption",
    "• ARIMA model captures daily patterns in office occupancy",
    "• Confidence intervals provide uncertainty quantification",
    "• Real-world sensor data shows natural variability and noise",
    "• Model can be used for energy management and occupancy prediction",
]

print("\n".join(summary_lines + insight_lines))