In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import RobustScaler, LabelEncoder
import joblib, json, warnings
warnings.filterwarnings('ignore')

class WindTurbineInference:
    """
    Inference pipeline that mirrors the training pipeline:
    - Accepts one latest measurement (single point) + history
    - Reconstructs exact preprocessing/feature engineering
    - Builds last lookback window and forecasts
    - Returns predictions and threshold summary
    """
    def __init__(self,
                 model_path='improved_model.h5',
                 feature_scaler_path='feature_scaler.pkl',
                 target_scaler_path='target_scaler.pkl',
                 features_path='common_features.json',
                 label_encoder_path='label_encoder_alarm_system.pkl',
                 lookback_steps=384,   # 384 * 15min = 96 hours lookback
                 forecast_steps=192,   # 192 * 15min = 48 hours forecast
                 critical_temp=70.0):
        print("🔄 Loading model & scalers...")
        self.model = tf.keras.models.load_model(model_path)
        self.feature_scaler: RobustScaler = joblib.load(feature_scaler_path)
        self.target_scaler: RobustScaler = joblib.load(target_scaler_path)
        with open(features_path, 'r') as f:
            self.common_features = json.load(f)
        try:
            self.label_encoder: LabelEncoder = joblib.load(label_encoder_path)
        except Exception:
            self.label_encoder = None
            print("⚠️ Label encoder not found; will create on-the-fly if needed.")
        self.lookback_steps = lookback_steps
        self.forecast_steps = forecast_steps
        self.critical_temp = critical_temp
        print("✅ Ready.")

    # ------------ Public API ------------ #

    def predict_from_point(self, latest_point, history, history_freq='5min', actual_df=None):
        """
        Build a forecast from the latest point + recent history while mirroring the training pipeline.
        
        Parameters
        ----------
        latest_point : dict or one-row DataFrame
            Must contain 'date_time' and raw sensor fields.
        history : DataFrame or CSV path
            Recent raw data (ideally 5-min grid), used to reconstruct lookback features.
        history_freq : str
            Frequency of the history data ('5min' recommended).
        actual_df : DataFrame or None
            If provided, will be aligned (15-min) and merged to show 'actual_temperature' alongside
            predictions, and quick overlap metrics will be returned.
        
        Returns
        -------
        dict with:
            - predictions (DataFrame): timestamp | predicted_temperature | [actual_temperature] | [abs_error]
            - exceeded (bool)
            - first_exceed_time (Timestamp or None)
            - max_temperature (float)
            - max_temperature_time (Timestamp)
            - critical_temperature_threshold (float)
            - prediction_start (Timestamp)
            - prediction_end (Timestamp)
            - total_exceed_count (int)
            - overlap_metrics (dict)  # present only if actuals overlap
        """
        # --- Load & normalize inputs ---
        if isinstance(history, str):
            hist_df = pd.read_csv(history, parse_dates=['date_time'])
        else:
            hist_df = history.copy()
        hist_df['date_time'] = pd.to_datetime(hist_df['date_time'])

        latest_df = self._to_dataframe(latest_point)
        latest_df['date_time'] = pd.to_datetime(latest_df['date_time'])

        # Concatenate history + latest row
        raw = pd.concat([hist_df, latest_df], ignore_index=True).sort_values('date_time')

        # Keep just what we need (lookback + buffer) to save time/memory
        cutoff = raw['date_time'].max() - pd.Timedelta(days=15)
        raw = raw[raw['date_time'] >= cutoff].reset_index(drop=True)

        # --- Preprocess to 15-min grid (same as training) ---
        df_15 = self._preprocess_to_15min(raw, history_freq)

        # --- Feature engineering (exactly as training) ---
        df_feat = self._alarm_features(df_15)
        df_feat = self._enhanced_features(df_feat)

        # Ensure every expected feature exists (fill missing with 0.0)
        for f in self.common_features:
            if f not in df_feat.columns:
                df_feat[f] = 0.0

        # --- Build final sequence from the tail, pad if needed ---
        X, pred_anchor_time = self._sequence_from_tail(df_feat, self.common_features)

        # --- Scale with training feature scaler ---
        X_scaled = self._scale_features(X)

        # --- Predict ---
        y_scaled = self.model.predict(X_scaled, verbose=0)
        if y_scaled.ndim == 3:
            y_scaled = y_scaled.reshape(y_scaled.shape[0], -1)
        y = self.target_scaler.inverse_transform(y_scaled)[0]

        # --- Build forecast timestamps (15-min steps) ---
        prediction_times = pd.date_range(
            start=pd.Timestamp(pred_anchor_time) + pd.Timedelta(minutes=15),
            periods=len(y),
            freq='15min'
        )
        preds_df = pd.DataFrame({
            'timestamp': prediction_times,
            'predicted_temperature': y
        })

        # --- Threshold summary ---
        max_idx = int(np.argmax(y))
        max_temp = float(y[max_idx])
        max_time = prediction_times[max_idx]
        exceed_mask = y > self.critical_temp
        exceeded = bool(exceed_mask.any())
        first_exceed_time = prediction_times[np.argmax(exceed_mask)] if exceeded else None
        total_exceed_count = int(exceed_mask.sum())

        # --- Attach actuals (if provided) + quick metrics ---
        preds_with_actuals, overlap_metrics = self._attach_actuals_and_metrics(
            preds_df, actual_df, target_col='wtrm_avg_TrmTmp_Gbx'
        )

        # --- Package results ---
        summary = {
            'predictions': preds_with_actuals,
            'exceeded': exceeded,
            'first_exceed_time': first_exceed_time,
            'max_temperature': max_temp,
            'max_temperature_time': max_time,
            'critical_temperature_threshold': self.critical_temp,
            'prediction_start': pd.Timestamp(pred_anchor_time),
            'prediction_end': prediction_times[-1],
            'total_exceed_count': total_exceed_count,
            'overlap_metrics': overlap_metrics
        }

        # --- Pretty log ---
        print("\n📊 Inference Summary")
        print(f"Source point taken at: {latest_df['date_time'].iloc[0]}")
        print(f"Forecast window: {summary['prediction_start']} → {summary['prediction_end']}")
        print(f"Peak: {max_temp:.2f}°C at {max_time}")
        if exceeded:
            print(f"⚠️ Threshold {self.critical_temp}°C exceeded {total_exceed_count} times.")
            print(f"   First exceed at: {first_exceed_time}")
        else:
            print(f"✅ No exceedances over {self.critical_temp}°C")

        if overlap_metrics:
            print(f"🔁 Overlap with actuals: {overlap_metrics['overlap_points']} points")
            print(f"   MAE={overlap_metrics['mae']:.3f}°C, RMSE={overlap_metrics['rmse']:.3f}°C, Bias={overlap_metrics['bias']:.3f}°C")

        return summary


    # ------------ Helpers (mirror training) ------------ #

    def _to_dataframe(self, latest_point):
        if isinstance(latest_point, dict):
            return pd.DataFrame([latest_point])
        df = latest_point.copy()
        if len(df) != 1:
            raise ValueError("latest_point must be a single row.")
        return df

    def _preprocess_to_15min(self, df, history_freq='5min'):
        df = df.copy()
        df = df.sort_values('date_time')
        # Drop exact duplicates
        df = df.drop_duplicates(subset='date_time', keep='last')

        # Fill gaps on 5-min grid if history is 5-min (matches training)
        if history_freq == '5min':
            full5 = pd.date_range(df['date_time'].min(), df['date_time'].max(), freq='5min')
            df = df.set_index('date_time').reindex(full5).reset_index().rename(columns={'index': 'date_time'})
            df = df.fillna(method='ffill').fillna(method='bfill')

        # Encode alarm_system same as training
        if 'alarm_system' in df.columns:
            if self.label_encoder is None:
                self.label_encoder = LabelEncoder()
                df['alarm_system'] = self.label_encoder.fit_transform(df['alarm_system'].astype(str))
            else:
                try:
                    df['alarm_system'] = self.label_encoder.transform(df['alarm_system'].astype(str))
                except ValueError:
                    # map unknown labels to first known class
                    known = set(self.label_encoder.classes_)
                    df['alarm_system'] = df['alarm_system'].astype(str).apply(
                        lambda x: x if x in known else self.label_encoder.classes_[0]
                    )
                    df['alarm_system'] = self.label_encoder.transform(df['alarm_system'])

        # 15-min aggregation identical to training
        num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        df_num = df[['date_time'] + num_cols].set_index('date_time').resample('15min').mean()
        if 'alarm_system' in df.columns:
            alarm_15 = df[['date_time','alarm_system']].set_index('date_time').resample('15min').max()
            df_num['alarm_system'] = alarm_15['alarm_system']
        return df_num.reset_index()

    def _alarm_features(self, df, alarm_col='alarm_system', time_col='date_time'):
        df = df.copy()
        if alarm_col in df.columns:
            alarms = df[df[alarm_col] == 1][time_col].reset_index(drop=True)

            def hours_since(ts):
                past = alarms[alarms < ts]
                return (ts - past.iloc[-1]).total_seconds()/3600 if not past.empty else np.nan

            df['hours_since_last_alarm'] = df[time_col].apply(hours_since).fillna(48)
            df['recent_alarm_flag'] = (df['hours_since_last_alarm'] < 6).astype(int)
            df['alarm_frequency_24h'] = df[alarm_col].rolling(window=48).sum().fillna(0)
            df['alarm_system_lag_0.5h'] = df[alarm_col].shift(1)
            df['alarm_system_lag_2h']   = df[alarm_col].shift(4)
        else:
            df['hours_since_last_alarm'] = 48
            df['recent_alarm_flag'] = 0
            df['alarm_frequency_24h'] = 0
            df['alarm_system_lag_0.5h'] = 0
            df['alarm_system_lag_2h'] = 0
        return df
    
    # Add this helper inside WindTurbineInference class
    def _attach_actuals_and_metrics(self, preds_df, actual_df, target_col='wtrm_avg_TrmTmp_Gbx'):
        if actual_df is None or len(actual_df) == 0:
            return preds_df, {}

        df = actual_df.copy()
        df['date_time'] = pd.to_datetime(df['date_time'])
        # Resample to 15-min mean so it aligns with the forecast grid
        num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        if target_col not in num_cols:
            # No target column available -> nothing to merge
            return preds_df, {}

        df15 = (df[['date_time', target_col]]
                .set_index('date_time')
                .resample('15min').mean()
                .reset_index()
                .rename(columns={'date_time': 'timestamp',
                                target_col: 'actual_temperature'}))

        merged = preds_df.merge(df15, on='timestamp', how='left')

        # Compute metrics on overlapping timestamps (where actuals exist)
        mask = merged['actual_temperature'].notna()
        metrics = {}
        if mask.any():
            y_true = merged.loc[mask, 'actual_temperature'].values
            y_pred = merged.loc[mask, 'predicted_temperature'].values
            mae = float(np.mean(np.abs(y_true - y_pred)))
            rmse = float(np.sqrt(np.mean((y_true - y_pred)**2)))
            bias = float(np.mean(y_pred - y_true))
            metrics = {'overlap_points': int(mask.sum()),
                    'mae': mae, 'rmse': rmse, 'bias': bias}

            # Keep a quick column for visibility
            merged.loc[mask, 'abs_error'] = np.abs(y_true - y_pred)
        else:
            merged['abs_error'] = np.nan

        return merged, metrics


    def _enhanced_features(self, df, target_col='wtrm_avg_TrmTmp_Gbx', time_col='date_time'):
        df = df.copy()

        # Lags/deltas for critical temps
        critical = ['wtrm_avg_TrmTmp_GbxBrg452','wtrm_avg_TrmTmp_GbxBrg151','wtrm_avg_TrmTmp_Gbx']
        lag_steps = [1,2,4,8,16]  # 0.5h,1h,2h,4h,8h at 15-min resolution
        for s in critical:
            if s in df.columns:
                for lag in lag_steps:
                    df[f'{s}_lag_{lag*0.5}h'] = df[s].shift(lag)
                df[f'{s}_delta_1h'] = df[s] - df[s].shift(2)
                df[f'{s}_delta_4h'] = df[s] - df[s].shift(8)

        if target_col in df.columns:
            df[f'{target_col}_rolling_mean_3.0h'] = df[target_col].rolling(window=6).mean()

        # Ops features
        for col in ['wgen_avg_Spd','wgdc_avg_TriGri_PwrAt','wtrm_avg_Gbx_OilPres']:
            if col in df.columns:
                df[f'{col}_delta_1h'] = df[col] - df[col].shift(2)
                df[f'{col}_rolling_mean_6h'] = df[col].rolling(window=12).mean()

        # Time features
        df['hour'] = df[time_col].dt.hour + df[time_col].dt.minute/60
        df['hour_sin'] = np.sin(2*np.pi*df['hour']/24)
        df['hour_cos'] = np.cos(2*np.pi*df['hour']/24)
        df['day_of_week'] = df[time_col].dt.dayofweek

        df['week_of_year'] = df[time_col].dt.isocalendar().week.astype(int)
        df['week_sin'] = np.sin(2*np.pi*df['week_of_year']/52)
        df['week_cos'] = np.cos(2*np.pi*df['week_of_year']/52)

        df['month'] = df[time_col].dt.month
        df['month_sin'] = np.sin(2*np.pi*df['month']/12)
        df['month_cos'] = np.cos(2*np.pi*df['month']/12)

        # Final clean
        df = df.fillna(method='ffill').fillna(method='bfill')
        df = df.dropna().reset_index(drop=True)
        return df

    def _sequence_from_tail(self, df, feature_cols):
        if len(df) < self.lookback_steps:
            pad_needed = self.lookback_steps - len(df)
            last_row = df.iloc[-1:].copy()
            # synth pad going backwards on 15-min grid
            end_time = df['date_time'].iloc[-1]
            pad_times = pd.date_range(end=end_time - pd.Timedelta(minutes=15),
                                      periods=pad_needed, freq='-15min')[::-1]
            pad_df = pd.concat([last_row]*pad_needed, ignore_index=True)
            pad_df['date_time'] = pad_times
            df = pd.concat([pad_df, df], ignore_index=True)

        feat = df[feature_cols].values
        anchor_time = df['date_time'].iloc[-1]
        X = feat[-self.lookback_steps:].reshape(1, self.lookback_steps, len(feature_cols))
        return X, anchor_time

    def _scale_features(self, X):
        Xr = X.reshape(-1, X.shape[-1])
        Xs = self.feature_scaler.transform(Xr).reshape(X.shape)
        return Xs


In [7]:
import pandas as pd

# 1) Load a recent history slice (5-min raw)
history_df = pd.read_csv('wt84_with_alarms.csv', parse_dates=['date_time']).tail(60000)

# 2) Pick an anchor in the middle (so predictions have future actuals in file)
anchor_index = len(history_df) // 2
latest_point = history_df.iloc[[anchor_index]].to_dict(orient='records')[0]

# 3) Run inference with the full history_df as both history and actuals
pipe = WindTurbineInference()
out = pipe.predict_from_point(latest_point, history_df, actual_df=history_df)



🔄 Loading model & scalers...
✅ Ready.

📊 Inference Summary
Source point taken at: 2014-08-24 11:35:00
Forecast window: 2014-12-08 06:15:00 → 2014-12-10 06:15:00
Peak: 65.84°C at 2014-12-08 23:30:00
✅ No exceedances over 70.0°C


In [9]:

# 4) Show results
print(out['predictions'].head())  # will now have actual_temperature + abs_error
print(out['overlap_metrics'])     # MAE, RMSE, bias, etc.

            timestamp  predicted_temperature  actual_temperature  abs_error
0 2014-12-08 06:30:00              62.589314                 NaN        NaN
1 2014-12-08 06:45:00              62.591877                 NaN        NaN
2 2014-12-08 07:00:00              61.928051                 NaN        NaN
3 2014-12-08 07:15:00              61.889572                 NaN        NaN
4 2014-12-08 07:30:00              61.736004                 NaN        NaN
{}
