In [2]:
# 04_model_energy_theft.ipynb
# AI Model for Energy Theft Detection

# ---------------------------------------------------
# 1️⃣ Import Libraries
# ---------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import joblib
import shutil

# ---------------------------------------------------
# 2️⃣ Set up directories
# ---------------------------------------------------
# Create all necessary directories
DATA_DIR = Path.cwd().parent / 'data'
DATA_DIR.mkdir(parents=True, exist_ok=True)

SRC_DATA_DIR = Path.cwd().parent / 'src' / 'data'
SRC_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Function to save to both data directories
def save_to_both_locations(filename, obj, is_dataframe=True):
    if is_dataframe:
        obj.to_csv(DATA_DIR / filename, index=False)
        obj.to_csv(SRC_DATA_DIR / filename, index=False)
    else:
        joblib.dump(obj, DATA_DIR / filename)
        joblib.dump(obj, SRC_DATA_DIR / filename)
    print(f"✅ Saved {filename} to both data directories")

# ---------------------------------------------------
# 3️⃣ Load and Prepare Data
# ---------------------------------------------------
energy_path = DATA_DIR / 'energy_data_transformed.csv'

print("Loading energy data...")
energy_df = pd.read_csv(energy_path)
energy_df['date'] = pd.to_datetime(energy_df['date'])
print("✅ Data loaded successfully!")

# ---------------------------------------------------
# 3️⃣ Create Features
# ---------------------------------------------------
print("\nCreating features for anomaly detection...")

# Sort by consumer and date to ensure correct feature calculation
energy_df = energy_df.sort_values(['CONS_NO', 'date']).reset_index(drop=True)

# Rolling statistics (with handling for edge cases)
energy_df['rolling_mean_3'] = energy_df.groupby('CONS_NO')['consumption'].transform(
    lambda x: x.rolling(window=3, min_periods=1).mean()
).fillna(method='bfill').fillna(method='ffill')

energy_df['rolling_std_3'] = energy_df.groupby('CONS_NO')['consumption'].transform(
    lambda x: x.rolling(window=3, min_periods=1).std()
).fillna(0)  # Fill NaN with 0 for std when there's not enough data

# Percentage change (capped at ±100% to handle extreme values)
energy_df['pct_change'] = energy_df.groupby('CONS_NO')['consumption'].transform('pct_change').fillna(0)
energy_df['pct_change'] = energy_df['pct_change'].clip(-1, 1)  # Cap at ±100%

# Lagged values
energy_df['lag_1'] = energy_df.groupby('CONS_NO')['consumption'].transform(
    lambda x: x.shift(1)
).fillna(method='bfill')

energy_df['lag_2'] = energy_df.groupby('CONS_NO')['consumption'].transform(
    lambda x: x.shift(2)
).fillna(method='bfill')

# Z-score within each consumer's data (with handling for edge cases)
def safe_zscore(x):
    std = x.std()
    if std == 0:
        return np.zeros_like(x)
    return (x - x.mean()) / std

energy_df['z_score'] = energy_df.groupby('CONS_NO')['consumption'].transform(safe_zscore)

# Replace any remaining infinities with large finite values
energy_df = energy_df.replace([np.inf, -np.inf], np.nan)
energy_df = energy_df.fillna(method='ffill').fillna(method='bfill')

# ---------------------------------------------------
# 4️⃣ Prepare Data for Modeling
# ---------------------------------------------------
# Select features for anomaly detection
features = ['consumption', 'rolling_mean_3', 'rolling_std_3', 'pct_change', 
           'lag_1', 'lag_2', 'z_score']
X = energy_df[features]

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ---------------------------------------------------
# 5️⃣ Train Isolation Forest
# ---------------------------------------------------
print("\nTraining Isolation Forest model...")
iso_forest = IsolationForest(contamination=0.02, random_state=42)
energy_df['anomaly_score'] = iso_forest.fit_predict(X_scaled)

# Label anomalies: -1 → anomaly, 1 → normal
energy_df['predicted_anomaly'] = energy_df['anomaly_score'].apply(lambda x: 1 if x == -1 else 0)

# ---------------------------------------------------
# 6️⃣ Analyze Results
# ---------------------------------------------------
# Count anomalies per consumer
anomaly_counts = energy_df.groupby('CONS_NO')['predicted_anomaly'].sum().reset_index()
anomaly_counts['anomaly_percentage'] = (anomaly_counts['predicted_anomaly'] / 
                                      energy_df.groupby('CONS_NO').size() * 100)

print("\nConsumers with highest anomaly percentages:")
print(anomaly_counts.sort_values('anomaly_percentage', ascending=False).head())

# ---------------------------------------------------
# 7️⃣ Save Results to Both Locations
# ---------------------------------------------------
print("\nSaving results to both data directories...")
# Save the engineered features
save_to_both_locations('energy_features.csv', energy_df, is_dataframe=True)

# Save the model and scaler
save_to_both_locations('energy_model.joblib', iso_forest, is_dataframe=False)
save_to_both_locations('energy_scaler.joblib', scaler, is_dataframe=False)

print("\n✅ All files saved successfully!")

Loading energy data...
✅ Data loaded successfully!

Creating features for anomaly detection...


  ).fillna(method='bfill').fillna(method='ffill')
  energy_df['pct_change'] = energy_df.groupby('CONS_NO')['consumption'].transform('pct_change').fillna(0)
  ).fillna(method='bfill')
  ).fillna(method='bfill')
  energy_df = energy_df.fillna(method='ffill').fillna(method='bfill')



Training Isolation Forest model...

Consumers with highest anomaly percentages:
                            CONS_NO  predicted_anomaly  anomaly_percentage
0  0000E78A22CB04533A0D9E1F2FBEEC5D                  0                 NaN
1  0002D8E9C198E4A2B03BFA6D1E2E1B6D                  0                 NaN
2  000395F84A94D4CB2E5D4D77372CFB4D                  0                 NaN
3  000E6116D092E1C94AF3EFA5998363B0                  0                 NaN
4  00127DCB5EB56E6D3C56E88CE5816CBE                  0                 NaN

Saving results to both data directories...
✅ Saved energy_features.csv to both data directories
✅ Saved energy_model.joblib to both data directories
✅ Saved energy_scaler.joblib to both data directories

✅ All files saved successfully!
