In [None]:
import pandas as pd

# Read the converted CSV export into a DataFrame
df = pd.read_csv("household_power_consumption.csv")

# Preview the leading rows, then the dtype pandas inferred for each column
for summary in (df.head(), df.dtypes):
    print(summary)

In [None]:
import pandas as pd
# Independent read of the same CSV, kept under its own name (`dataframe`)
dataframe = pd.read_csv(
    'household_power_consumption.csv'
)
# Bare final expression: the notebook renders the frame as the cell output
dataframe

In [None]:
import pandas as pd

# Read the raw export again so this cell starts from a fresh frame
df = pd.read_csv("household_power_consumption.csv")

# Build one timestamp per row from the textual Date and Time fields
timestamp_text = df['Date'] + ' ' + df['Time']
df['DateTime'] = pd.to_datetime(timestamp_text, format='%d/%m/%Y %H:%M:%S')

# The separate Date/Time columns are redundant once DateTime exists
df = df.drop(columns=['Date', 'Time'])

# Measurement columns that must end up numeric
numeric_columns = [
    'Global_active_power',
    'Global_reactive_power',
    'Voltage',
    'Global_intensity',
    'Sub_metering_1',
    'Sub_metering_2',
    'Sub_metering_3',
]

# Coerce every measurement column; entries that cannot be parsed become NaN
df = df.assign(**{
    name: pd.to_numeric(df[name], errors='coerce')
    for name in numeric_columns
})

print(df.dtypes)


In [None]:
# Re-inspect the typed frame: leading rows, then per-column dtypes
print(df.head(), df.dtypes, sep="\n")


In [None]:
# Report how many values are missing per column before any repair
print("Missing values:\n", df.isnull().sum())

# Fill gaps by linear interpolation, restricted to the numeric measurement
# columns so the DateTime column is never touched.
# limit_direction='both' also fills NaNs at the very start of a column; the
# default (forward only) would leave them in place and the check below would
# then still report missing values.
value_cols = df.select_dtypes(include='number').columns
df[value_cols] = df[value_cols].interpolate(method='linear', limit_direction='both')

# Confirm no missing values remain
print("Remaining missing values:\n", df.isnull().sum())


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Measurement columns that take part in the correlation analysis
numeric_columns = [
    'Global_active_power',
    'Global_reactive_power',
    'Voltage',
    'Global_intensity',
    'Sub_metering_1',
    'Sub_metering_2',
    'Sub_metering_3',
]

# Pairwise correlation between the measurement columns
measurements = df[numeric_columns]
corr_matrix = measurements.corr()

# Text version of the matrix
print("Correlation matrix:\n", corr_matrix)

# Annotated heatmap of the same matrix
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()


In [None]:
# Drop the redundant column.
# errors='ignore' plus a non-mutating drop keeps this cell idempotent: running
# it a second time (when the column is already gone) no longer raises KeyError.
df = df.drop(columns='Global_intensity', errors='ignore')

# Verify resulting columns
print(df.columns)


In [None]:
import pandas as pd

# Expects `df` to be loaded, typed, gap-filled and stripped of Global_intensity

# CO₂ emission factor for electricity in France
# (presumably kg CO₂ per kWh, given the `_CO2_kg` columns it feeds — confirm source)
emission_factor_france = 0.053

submeter_names = ['Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']

# Per submeter: add a kWh column (Wh / 1000) and the matching CO₂ column
for col in submeter_names:
    kwh_col, co2_col = f'{col}_kWh', f'{col}_CO2_kg'
    df[kwh_col] = df[col] / 1000
    df[co2_col] = df[kwh_col] * emission_factor_france

# Household-level columns shared by every per-submeter dataset
global_columns = ['DateTime', 'Global_active_power', 'Global_reactive_power', 'Voltage']

# One independent frame per submeter: the shared columns plus that submeter's
# own three columns, renamed to a common schema so downstream code is uniform
submeter_data = {}
for name in submeter_names:
    own_columns = {
        name: 'Consumption_Wh',
        f'{name}_kWh': 'Consumption_kWh',
        f'{name}_CO2_kg': 'CO2_kg',
    }
    subset = df[global_columns + list(own_columns)].copy()
    submeter_data[name] = subset.rename(columns=own_columns)

# Preview each resulting dataset
for submeter, data in submeter_data.items():
    print(f"\n--- Clearly structured data for {submeter} ---")
    print(data.head())


In [None]:
from sklearn.preprocessing import MinMaxScaler

# One fitted scaler per submeter, kept for later inverse transforms
scalers = {}

# Fraction of the oldest rows used to fit each scaler. The models are later
# evaluated on the most recent part of the series, so fitting min/max on every
# row would leak test-period statistics into training. Keep this value in sync
# with `train_fraction` in the chronological split cell.
scaler_fit_fraction = 0.8

# NOTE: this cell overwrites `submeter_data` with scaled frames. Re-running it
# without first re-running the cell that builds `submeter_data` would refit the
# scalers on already-scaled values.
for submeter, df_appliance in submeter_data.items():
    # Every column except the timestamp is a numeric feature
    feature_cols = df_appliance.columns.drop('DateTime')
    features = df_appliance[feature_cols]

    # Fit on the leading (training-period) rows only, then scale all rows.
    # Rows after the fit window may therefore fall outside [0, 1].
    n_fit_rows = max(1, int(len(features) * scaler_fit_fraction))
    scaler = MinMaxScaler()
    scaler.fit(features.iloc[:n_fit_rows])

    df_scaled = df_appliance.copy()
    df_scaled[feature_cols] = scaler.transform(features)

    # Replace original with scaled version
    submeter_data[submeter] = df_scaled

    # Save scaler for future inverse transform (visualization or deployment)
    scalers[submeter] = scaler

    print(f" Normalized data for {submeter}")


In [None]:
import numpy as np

# Window length in rows (one hour if the data is sampled once per minute)
timesteps = 60

# submeter name -> array of shape (n_sequences, timesteps, n_features)
appliance_sequences = {}

for submeter, df_scaled in submeter_data.items():
    feature_cols = df_scaled.columns.drop('DateTime')
    data_array = df_scaled[feature_cols].values

    # Window i covers rows i .. i + timesteps - 1, for i in range(len - timesteps).
    n_sequences = max(len(data_array) - timesteps, 0)
    if n_sequences:
        # Vectorised rolling windows (NumPy >= 1.20). Dropping the last row
        # first yields exactly `len - timesteps` windows. The window axis is
        # appended last, so swap it in front of the feature axis.
        windows = np.lib.stride_tricks.sliding_window_view(
            data_array[:-1], window_shape=timesteps, axis=0
        )
        # Read-only view onto `data_array`: no per-window copy is made here.
        # Use `.copy()` if a writable array is ever needed.
        sequences = windows.transpose(0, 2, 1)
    else:
        # Too few rows for a single window: keep a well-formed empty 3-D array
        sequences = np.empty((0, timesteps, data_array.shape[1]), dtype=data_array.dtype)

    appliance_sequences[submeter] = sequences
    print(f" Generated {len(sequences)} sequences for {submeter}")


In [None]:
# Share of sequences (oldest first) assigned to training
train_fraction = 0.8

# submeter name -> {'X_train': ..., 'X_test': ...}
train_test_split_data = {}

for submeter, sequences in appliance_sequences.items():
    # Cut point in sequence order; nothing is shuffled, so the test part is
    # strictly later in time than the training part
    cut = int(len(sequences) * train_fraction)
    X_train, X_test = sequences[:cut], sequences[cut:]

    train_test_split_data[submeter] = dict(X_train=X_train, X_test=X_test)

    print(f" {submeter} — Train: {X_train.shape}, Test: {X_test.shape}")


In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense

# Optimized model (fewer units, simple structure)
def create_autoencoder(timesteps, n_features):
    """Build and compile a small LSTM sequence autoencoder.

    The network encodes a window into one 32-unit LSTM state, repeats that
    state once per time step, decodes it with a second 32-unit LSTM and maps
    every decoded step back to `n_features` values through a shared Dense layer.

    Args:
        timesteps: Number of time steps in each input window.
        n_features: Number of features per time step.

    Returns:
        A Keras `Model` compiled with the Adam optimizer and MSE loss, whose
        input and output both have shape (timesteps, n_features) per sample.
    """
    inputs = Input(shape=(timesteps, n_features))

    # Encoder: collapse the whole window into a single vector
    bottleneck = LSTM(32, activation='relu', return_sequences=False)(inputs)

    # Feed the same encoded vector to the decoder at every time step
    tiled = RepeatVector(timesteps)(bottleneck)

    # Decoder: emit one hidden state per time step
    decoder_states = LSTM(32, activation='relu', return_sequences=True)(tiled)

    # Project each decoded step back to the original feature dimension
    reconstruction = TimeDistributed(Dense(n_features))(decoder_states)

    autoencoder = Model(inputs=inputs, outputs=reconstruction)
    autoencoder.compile(optimizer='adam', loss='mse')
    return autoencoder


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Halt a fit once val_loss has gone 3 epochs without improving, and restore
# the weights from the best epoch seen
early_stopping_options = dict(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
early_stopping = EarlyStopping(**early_stopping_options)


In [None]:
# submeter name -> trained model / Keras History object
autoencoders, history_logs = {}, {}

for submeter, split in train_test_split_data.items():
    X_train = split['X_train']

    print(f"\n Training Autoencoder for {submeter}...")

    # Size the network from the training windows themselves
    window_len, feature_count = X_train.shape[1:]
    model = create_autoencoder(timesteps=window_len, n_features=feature_count)

    # The autoencoder learns to reproduce its own input, so the windows serve
    # as both features and targets
    fit_options = dict(
        epochs=10,
        batch_size=128,
        validation_split=0.1,
        shuffle=True,
        callbacks=[early_stopping],
    )
    history = model.fit(X_train, X_train, **fit_options)

    # Keep the trained model and its training history
    autoencoders[submeter] = model
    history_logs[submeter] = history

    print(f" Finished training {submeter}")


In [None]:
import os

# Target folder for the trained models (created on demand)
save_dir = "saved_models"
os.makedirs(save_dir, exist_ok=True)

# Write one HDF5 file per submeter model
for submeter in autoencoders:
    model_path = os.path.join(save_dir, f"{submeter}_autoencoder.h5")
    autoencoders[submeter].save(model_path)
    print(f" Model for {submeter} saved to: {model_path}")


In [None]:
import joblib
import os

# Target folder for the fitted scalers (created on demand)
scaler_dir = "saved_scalers"
os.makedirs(scaler_dir, exist_ok=True)

# Persist one joblib file per submeter scaler
for submeter in scalers:
    scaler_path = os.path.join(scaler_dir, f"{submeter}_scaler.joblib")
    joblib.dump(scalers[submeter], scaler_path)
    print(f" Scaler for {submeter} saved to: {scaler_path}")


In [None]:
import matplotlib.pyplot as plt

# One figure per submeter: training vs. validation loss by epoch
for submeter, history in history_logs.items():
    losses = history.history
    ax = plt.gca()
    ax.plot(losses['loss'], label='Train Loss')
    ax.plot(losses['val_loss'], label='Val Loss')
    ax.set_title(f" Loss Curve — {submeter}")
    ax.set_xlabel("Epoch")
    ax.set_ylabel("MSE Loss")
    ax.legend()
    ax.grid(True)
    plt.show()


In [None]:
import numpy as np

# submeter name -> 1-D array holding one reconstruction error per test sequence
reconstruction_errors = {}

for submeter, split in train_test_split_data.items():
    print(f"\n Detecting anomalies for {submeter}...")

    # Reconstruct the held-out sequences with that submeter's model
    X_test = split['X_test']
    X_pred = autoencoders[submeter].predict(X_test)

    # Mean squared residual over the time and feature axes -> one value per sequence
    residual = X_test - X_pred
    mse = np.power(residual, 2).mean(axis=(1, 2))
    reconstruction_errors[submeter] = mse

    print(f" Reconstruction errors for {submeter}: mean={mse.mean():.5f}, std={mse.std():.5f}")


In [None]:
# submeter name -> scalar threshold / boolean flag array (one flag per test sequence)
anomaly_thresholds, anomalies = {}, {}

for submeter, mse in reconstruction_errors.items():
    # Tunable cut-off: the 95th percentile of this submeter's own test errors,
    # so roughly the top 5% of test sequences are flagged by construction
    threshold = np.percentile(mse, 95)

    # A sequence is anomalous when its error is strictly above the cut-off
    anomaly_flags = mse > threshold

    anomaly_thresholds[submeter] = threshold
    anomalies[submeter] = anomaly_flags

    print(f" {submeter} — Anomaly threshold: {threshold:.5f}, Total anomalies detected: {np.sum(anomaly_flags)}")


In [None]:
import matplotlib.pyplot as plt

# One figure per submeter: per-sequence reconstruction error against its threshold
for submeter, mse in reconstruction_errors.items():
    threshold = anomaly_thresholds[submeter]
    flags = anomalies[submeter]  # looked up here but not drawn in this figure

    fig, ax = plt.subplots(figsize=(14, 4))
    ax.plot(mse, label='Reconstruction Error')
    ax.axhline(y=threshold, color='red', linestyle='--', label='Anomaly Threshold')
    ax.set_title(f' Anomaly Detection — {submeter}')
    ax.set_xlabel('Test Sequence Index')
    ax.set_ylabel('MSE (Reconstruction Error)')
    ax.legend()
    ax.grid(True)
    plt.show()


In [None]:
import matplotlib.pyplot as plt

# One histogram per submeter: distribution of test reconstruction errors,
# with the 95th-percentile cut-off drawn as a dashed vertical line
for submeter, errors in reconstruction_errors.items():
    cutoff = np.percentile(errors, 95)

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.hist(errors, bins=50, color='steelblue')
    ax.axvline(x=cutoff, color='red', linestyle='--', label='Threshold (95th percentile)')
    ax.set_title(f" Error Distribution — {submeter}")
    ax.set_xlabel("Reconstruction Error (MSE)")
    ax.set_ylabel("Frequency")
    ax.legend()
    ax.grid()
    plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Select the submeter you want to visualize
sub = 'Sub_metering_3'

# Normalized frame for that submeter. A dedicated name is used on purpose:
# rebinding the notebook-wide `df` here would break re-running earlier cells.
sub_df = submeter_data[sub]

# Per-test-sequence reconstruction errors and anomaly flags
errors = reconstruction_errors[sub]
flags = anomalies[sub]
num_test_sequences = len(errors)

# Map every test sequence to the row of its LAST time step.
# The sequence cell builds `len(sub_df) - timesteps` windows, window i covering
# rows i .. i + timesteps - 1, and the split keeps the last
# `num_test_sequences` windows for testing. The split point is therefore
# derived from the sequence count, not from int(len(sub_df) * 0.8).
n_sequences = len(sub_df) - timesteps
first_test_sequence = n_sequences - num_test_sequences
if first_test_sequence < 0:
    raise ValueError(
        f"{num_test_sequences} test sequences cannot come from {len(sub_df)} rows "
        f"with timesteps={timesteps}; re-run the sequence/split cells."
    )
sequence_indices = np.arange(first_test_sequence, n_sequences) + timesteps - 1

# Rows backing the plot
test_rows = sub_df.iloc[sequence_indices]
timestamps = test_rows['DateTime'].values

# `sub_df` holds min-max scaled values; undo the scaling with the stored
# scaler so the y-axis really is in Wh, as the labels state.
feature_cols = sub_df.columns.drop('DateTime')
wh_position = feature_cols.get_loc('Consumption_Wh')
consumption = scalers[sub].inverse_transform(
    test_rows[feature_cols].to_numpy()
)[:, wh_position]

# Consumption over the test period
plt.figure(figsize=(14, 5))
plt.plot(timestamps, consumption, label='Consumption (Wh)', linewidth=1)

# Highlight anomalies in red
plt.scatter(timestamps[flags], consumption[flags], color='red', label='Anomalies', s=30)

plt.title(f"⏱ Anomaly Detection on Test Set — {sub}")
plt.xlabel("Time")
plt.ylabel("Consumption (Wh)")
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
import plotly.graph_objs as go
import pandas as pd

sub = 'Sub_metering_1'

# Normalized frame for that submeter. A dedicated name is used on purpose:
# rebinding the notebook-wide `df` here would break re-running earlier cells.
sub_df = submeter_data[sub]
errors = reconstruction_errors[sub]
flags = anomalies[sub]

# Map every test sequence to the row of its LAST time step.
# The sequence cell builds `len(sub_df) - timesteps` windows, window i covering
# rows i .. i + timesteps - 1, and the split keeps the last `len(errors)`
# windows for testing. The split point is therefore derived from the sequence
# count, not from int(len(sub_df) * 0.8).
n_sequences = len(sub_df) - timesteps
first_test_sequence = n_sequences - len(errors)
if first_test_sequence < 0:
    raise ValueError(
        f"{len(errors)} test sequences cannot come from {len(sub_df)} rows "
        f"with timesteps={timesteps}; re-run the sequence/split cells."
    )
sequence_indices = np.arange(first_test_sequence, n_sequences) + timesteps - 1

test_rows = sub_df.iloc[sequence_indices]
timestamps = test_rows['DateTime'].values

# `sub_df` holds min-max scaled values; undo the scaling with the stored
# scaler so the left axis really is in Wh, as its title states.
feature_cols = sub_df.columns.drop('DateTime')
wh_position = feature_cols.get_loc('Consumption_Wh')
consumption = scalers[sub].inverse_transform(
    test_rows[feature_cols].to_numpy()
)[:, wh_position]

# Build plotly traces: consumption line, anomaly markers, and the
# reconstruction error on a secondary y-axis
trace1 = go.Scatter(x=timestamps, y=consumption, name='Consumption (Wh)', line=dict(color='blue'))
trace2 = go.Scatter(x=timestamps[flags], y=consumption[flags], mode='markers', name='Anomalies', marker=dict(color='red', size=6))
trace3 = go.Scatter(x=timestamps, y=errors, name='Reconstruction Error', yaxis='y2', line=dict(color='orange'))

# Layout with a second y-axis overlaid on the right for the error trace
layout = go.Layout(
    title=f' Interactive Anomaly Detection — {sub}',
    xaxis=dict(title='Timestamp'),
    yaxis=dict(title='Consumption (Wh)', side='left'),
    yaxis2=dict(title='Reconstruction Error (MSE)', overlaying='y', side='right'),
    legend=dict(x=0, y=1.1, orientation='h'),
    height=500
)

fig = go.Figure(data=[trace1, trace2, trace3], layout=layout)
fig.show()
