In [1]:
import numpy as np
import pandas as pd
from scipy.signal import find_peaks
from sklearn.preprocessing import StandardScaler

# Feature engineering on the ETTh1 series: rate-of-change of OT, rolling
# statistics of OT, cross-channel spread, a combined load magnitude and a
# rolling z-score of OT. All new columns are written onto `df`.
window = 24  # 24 hours for daily patterns (rows are hourly samples)

df = pd.read_csv("../data/ETTh1.csv")
print(df.head())

df['date'] = pd.to_datetime(df['date'])

# Snapshot the raw column names so the summary at the end of this cell can
# report exactly which columns were engineered here.
raw_columns = list(df.columns)

# Rate of change features (the first row is NaN: diff() has no predecessor)
df['OT_diff'] = df['OT'].diff()
df['OT_diff_abs'] = df['OT_diff'].abs()

# Rolling statistics.
# The window is centred, so each value depends on samples both before and
# after the row, and rows near either end of the series are NaN because a
# full window is required.
# NOTE(review): a centred window looks ahead in time; confirm this is
# acceptable for how these features are used downstream.
ot_rolling = df['OT'].rolling(window=window, center=True)
df['OT_rolling_mean'] = ot_rolling.mean()
df['OT_rolling_std'] = ot_rolling.std()
df['OT_rolling_max'] = ot_rolling.max()
df['OT_rolling_min'] = ot_rolling.min()

# Load imbalance indicators: row-wise standard deviation across the three
# high/middle/low channels of each family.
df['load_imbalance'] = df[['HUFL', 'MUFL', 'LUFL']].std(axis=1)
# NOTE(review): the *ULL columns are presumably the dataset's "useless load"
# channels rather than voltages; verify against the dataset description.
# The column name is kept because later cells reference it.
df['voltage_imbalance'] = df[['HULL', 'MULL', 'LULL']].std(axis=1)

# Per-level magnitude of the (*UFL, *ULL) pair.
P_high = np.sqrt(df['HUFL']**2 + df['HULL']**2)
P_mid  = np.sqrt(df['MUFL']**2 + df['MULL']**2)
P_low  = np.sqrt(df['LUFL']**2 + df['LULL']**2)

# Euclidean norm of the three per-level magnitudes.
df['apparent_power'] = np.sqrt(P_high**2 + P_mid**2 + P_low**2)

# Thermal stress indicator (deviation from normal): z-score of OT against its
# rolling mean/std. The 1e-6 term guards against division by zero when the
# rolling std is 0. NaN wherever the rolling statistics are NaN.
df['thermal_stress'] = (df['OT'] - df['OT_rolling_mean']) / (df['OT_rolling_std'] + 1e-6)

# Summary of what this cell produced. The recorded cell output contains this
# report, so it is emitted here to keep the output reproducible on a fresh run.
new_features = [col for col in df.columns if col not in raw_columns]
print("Feature Engineering Complete:")
print(f"Total features: {len(df.columns)}")
print("\nNew features:")
print(new_features)


                  date   HUFL   HULL   MUFL   MULL   LUFL   LULL         OT
0  2016-07-01 00:00:00  5.827  2.009  1.599  0.462  4.203  1.340  30.531000
1  2016-07-01 01:00:00  5.693  2.076  1.492  0.426  4.142  1.371  27.787001
2  2016-07-01 02:00:00  5.157  1.741  1.279  0.355  3.777  1.218  27.787001
3  2016-07-01 03:00:00  5.090  1.942  1.279  0.391  3.807  1.279  25.044001
4  2016-07-01 04:00:00  5.358  1.942  1.492  0.462  3.868  1.279  21.948000
Feature Engineering Complete:
Total features: 18

New features:
['OT_diff', 'OT_diff_abs', 'OT_rolling_mean', 'OT_rolling_std', 'OT_rolling_max', 'OT_rolling_min', 'load_imbalance', 'voltage_imbalance', 'apparent_power', 'thermal_stress']


In [3]:
from sklearn.ensemble import IsolationForest

# Isolation Forest anomaly detection on the engineered features.
# Results are written back onto `df` as `iso_anomaly` (1 = normal,
# -1 = anomaly) and `iso_score` (value returned by score_samples).

# Columns fed to the model
isolation_features = [
    'OT',
    'apparent_power',
    'load_imbalance',
    'voltage_imbalance',
    'thermal_stress',
]

# Keep only rows where every model input is present; work on an independent
# copy so `df` itself is not affected by anything done to `df_clean`.
df_clean = df.dropna(subset=isolation_features).copy()

# Bring all inputs to zero mean / unit variance before fitting
scaler_iso = StandardScaler()
X_iso = scaler_iso.fit_transform(df_clean[isolation_features])

# Model configuration: 100 trees, 1% of samples expected to be outliers,
# fixed seed so repeated runs give the same result.
iso_params = {
    'contamination': 0.01,
    'random_state': 42,
    'n_estimators': 100,
}
iso_forest = IsolationForest(**iso_params)

# Fit and label the cleaned rows, then fetch their scores
iso_predictions = iso_forest.fit_predict(X_iso)
iso_scores = iso_forest.score_samples(X_iso)

# Write results back by index label. Rows that were dropped above keep the
# defaults: label 1 (normal) and a NaN score.
scored_rows = df_clean.index
df['iso_anomaly'] = 1
df.loc[scored_rows, 'iso_anomaly'] = iso_predictions
df['iso_score'] = np.nan
df.loc[scored_rows, 'iso_score'] = iso_scores

# Count and share of flagged rows, measured over the full dataframe
is_flagged = df['iso_anomaly'] == -1
n_flagged = is_flagged.sum()
pct_flagged = is_flagged.mean() * 100
print(f"Isolation Forest Anomalies: {n_flagged} ({pct_flagged:.2f}%)")

Isolation Forest Anomalies: 174 (1.00%)
