# In [None]:
import pandas as pd
import numpy as np
from scipy.signal import savgol_filter
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

def load_renewable_energy_data(file_path):
    """Load IoT sensor data from a wind turbine SCADA CSV export and impute gaps.

    Prints an overview of the dataset (shape, dtypes, head, missing values),
    fills missing values, warns about expected columns that are absent and
    prints the class distribution of the ``failure`` column.

    Imputation strategy:
      * numeric columns other than ``failure`` are linearly interpolated;
      * every column (including ``failure`` and non-numeric ones) is then
        forward-filled and back-filled, so the label is never turned into a
        fractional value by interpolation.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        The imputed ``DataFrame``, or ``None`` when the file does not exist,
        the ``failure`` column is missing, or any other error occurs while
        loading (the error is printed, not raised).
    """
    label_column = 'failure'
    try:
        df = pd.read_csv(file_path)
        print(f"Dataset loaded successfully: {df.shape}")
        print(" \n IoT Sensor Data Overview ")
        print(f"Total Features: {df.shape[1]}")
        print(f"Total Records: {df.shape[0]}")
        print(f"Data Types:\n{df.dtypes.value_counts()}")
        print(f"Sample Data:\n{df.head()}")
        print(f"\nMissing Values:\n{df.isnull().sum()}")

        # Linear interpolation is only meaningful for continuous readings:
        # restrict it to numeric columns and keep the label out of it so a gap
        # between a 0 and a 1 does not become 0.5.
        sensor_columns = [
            col for col in df.select_dtypes(include='number').columns
            if col != label_column
        ]
        if sensor_columns:
            df[sensor_columns] = df[sensor_columns].interpolate()
        # ffill()/bfill() replace the deprecated fillna(method=...) spelling
        # and cover leading gaps, the label and non-numeric columns.
        df = df.ffill().bfill()
        print(f"\nMissing Values after imputation: {df.isnull().sum().sum()}")

        required_columns = ['power_output', 'temperature', 'voltage', 'current',
                            'vibration_x', 'vibration_y', 'rotational_speed', 'failure']
        missing_cols = [col for col in required_columns if col not in df.columns]
        if missing_cols:
            print(f"Warning: Missing expected columns: {missing_cols}")
        if label_column not in df.columns:
            # Previously this surfaced as a bare KeyError message from the
            # generic handler below; the outcome (None) is unchanged.
            print(f"Error loading data: required target column '{label_column}' not found.")
            return None
        print(f"\nClass Distribution:\n{df[label_column].value_counts(normalize=True)}")
        return df
    except FileNotFoundError:
        print("Error: Dataset file not found. Please provide a valid CSV file path.")
        return None
    except Exception as e:
        # Top-level boundary of the loader: report and signal failure with None.
        print(f"Error loading data: {str(e)}")
        return None

def preprocess_iot_sensor_data(df):
    """Engineer, smooth, cap and standardise the sensor features in ``df``.

    Steps, in order:
      1. Every numeric column except ``failure`` and ``timestamp`` becomes a
         feature; a 1-step lag and a 5-sample rolling mean are added for each.
         Remaining NaNs (including those created by the lag / rolling window)
         are replaced with the column mean.
      2. Numeric columns are smoothed with a Savitzky-Golay filter
         (window 21, polynomial order 3). Columns the filter rejects, e.g.
         because there are fewer rows than the window, keep their values.
      3. Values are clipped to ``mean +/- 3 * std`` per column.
      4. Columns are standardised with ``StandardScaler``.

    NOTE: the fill means, clip bounds and scaler are all computed over every
    row of ``df``, and the smoothing window also spans later samples. If the
    data is split into train/test afterwards, test rows have influenced these
    statistics.

    Args:
        df: DataFrame with a ``failure`` column and sensor feature columns.

    Returns:
        Tuple ``(X_normalized, y, scaler)``: the standardised feature frame
        (same index as ``df``), a copy of ``df['failure']`` and the fitted
        ``StandardScaler``.

    Raises:
        KeyError: If ``df`` has no ``failure`` column.
    """
    print("\n= Data Preprocessing =")
    candidate_columns = [col for col in df.columns if col not in ('failure', 'timestamp')]
    # Non-numeric columns cannot be averaged, filtered or scaled; skip them
    # instead of failing later inside the mean / z-score arithmetic.
    feature_columns = [
        col for col in candidate_columns
        if pd.api.types.is_numeric_dtype(df[col])
    ]
    skipped_columns = [col for col in candidate_columns if col not in feature_columns]
    if skipped_columns:
        print(f"Warning: Skipping non-numeric columns: {skipped_columns}")

    X = df[feature_columns].copy()
    y = df['failure'].copy()
    # Iterate over the fixed list of base features, not X.columns, because the
    # loop itself adds columns to X.
    for col in feature_columns:
        X[f'{col}_lag1'] = X[col].shift(1)
        X[f'{col}_rolling_mean'] = X[col].rolling(window=5).mean()
    X = X.fillna(X.mean())
    print(f"Features selected: {len(X.columns)}")
    print(f"Target distribution: Normal={(y == 0).sum()}, Failure={(y == 1).sum()}")

    print("\n1. Applying Savitzky-Golay filter")
    X_filtered = X.copy()
    for col in X.columns:
        column_dtype = X[col].dtype
        # Smooth every numeric width (not only float64/int64); booleans and
        # object columns are left as they are.
        if (not pd.api.types.is_numeric_dtype(column_dtype)
                or pd.api.types.is_bool_dtype(column_dtype)):
            continue
        try:
            X_filtered[col] = savgol_filter(X[col], window_length=21, polyorder=3)
        except (ValueError, TypeError):
            # savgol_filter raises ValueError when the series is shorter than
            # the window; keep the unsmoothed values in that case.
            print(f"Warning: Could not apply Savitzky-Golay to {col}. Using original values.")

    print("2- Z-score outlier capping")
    column_mean = X_filtered.mean()
    column_std = X_filtered.std()
    z_scores = np.abs((X_filtered - column_mean) / column_std)
    outlier_mask = (z_scores > 3).any(axis=1)
    outlier_count = int(outlier_mask.sum())
    outlier_share = outlier_count / len(df) if len(df) else 0.0
    print(f"Outliers detected: {outlier_count} ({outlier_share:.2%})")
    lower_bounds = column_mean - 3 * column_std
    upper_bounds = column_mean + 3 * column_std
    for col in X_filtered.columns:
        # A value has |z| > 3 exactly when it lies outside mean +/- 3*std, so
        # clipping the whole column caps the same rows the mask selected.
        X_filtered[col] = X_filtered[col].clip(
            lower=lower_bounds[col], upper=upper_bounds[col])

    print("3. Applying StandardScaler normalization")
    scaler = StandardScaler()
    X_normalized = pd.DataFrame(
        scaler.fit_transform(X_filtered),
        columns=X_filtered.columns,
        index=X_filtered.index
    )
    print(f"Normalized data stats: Mean={X_normalized.mean().mean():.3f}, Std={X_normalized.std().mean():.3f}")
    return X_normalized, y, scaler

def create_time_series_features(X, y, sequence_length=5):
    """Create sliding-window sequences for an LSTM model.

    Window ``j`` holds rows ``j .. j + sequence_length - 1`` of ``X`` and is
    paired with the label of the row that follows it, ``y[j + sequence_length]``.

    Args:
        X: 2-D feature table (DataFrame or array-like), one row per time step.
        y: Labels aligned with the rows of ``X``.
        sequence_length: Number of consecutive rows per window; must be a
            positive integer smaller than ``len(X)``.

    Returns:
        Tuple ``(X_sequences, y_sequences, sequence_length)`` where
        ``X_sequences`` has shape
        ``(len(X) - sequence_length, sequence_length, n_features)`` and
        ``y_sequences`` has shape ``(len(X) - sequence_length,)``.
        If no window can be built, an error is printed and
        ``(None, None, sequence_length)`` is returned.
    """
    print(f"\n=== Creating Time Series Features (sequence_length={sequence_length}) ===")
    if sequence_length < 1:
        print(f"Error: sequence_length must be a positive integer, got {sequence_length}.")
        return None, None, sequence_length
    n_rows = len(X)
    # "<=" rather than "<": with exactly sequence_length rows there is no
    # following row left to act as the target, so zero windows would result.
    if n_rows <= sequence_length:
        print(f"Error: Dataset too small for sequence_length={sequence_length}.")
        return None, None, sequence_length

    feature_matrix = np.asarray(X)
    labels = np.asarray(y)
    n_windows = n_rows - sequence_length
    # Row indices of every window: shape (n_windows, sequence_length).
    window_rows = np.arange(n_windows)[:, None] + np.arange(sequence_length)[None, :]
    # Integer-array indexing gathers all windows at once and returns a copy.
    X_sequences = feature_matrix[window_rows]
    # Indexing by position also raises IndexError if y is shorter than X.
    y_sequences = labels[np.arange(sequence_length, n_rows)]

    print(f"Time series shape: {X_sequences.shape}")
    print(f"Target shape: {y_sequences.shape}")
    return X_sequences, y_sequences, sequence_length

def apply_smote_split(X_processed, y, X_lstm, y_lstm, sequence_length):
    """Split chronologically into train/test and oversample the training sets.

    The first 80% of ``X_processed`` / ``y`` is used for training and the
    rest for testing (no shuffling). The sequence data is cut at
    ``train_size - sequence_length``; this assumes ``X_lstm[j]`` is paired
    with the label of row ``j + sequence_length`` of ``X_processed``, so both
    splits end at the same point in time. SMOTE is then fitted separately on
    the tabular training set and on the flattened sequence training set; the
    test sets are returned untouched.

    Args:
        X_processed: Tabular feature table, one row per time step.
        y: Labels aligned with ``X_processed``.
        X_lstm: Array of shape ``(n_sequences, sequence_length, n_features)``.
        y_lstm: Labels aligned with ``X_lstm``.
        sequence_length: Window length used to build ``X_lstm``.

    Returns:
        ``(X_train, X_test, y_train, y_test,
        X_lstm_train, X_lstm_test, y_lstm_train, y_lstm_test)``.

    Raises:
        ValueError: If the sequence inputs are ``None`` or the dataset is too
            short to leave any training sequence. SMOTE may also raise
            ``ValueError`` itself, e.g. when a class has too few samples.
    """
    print("\nPerforming Temporal Train-Test Split")
    if X_lstm is None or y_lstm is None:
        raise ValueError(
            "X_lstm and y_lstm must be arrays, got None; "
            "the time series sequences were not created.")

    train_size = int(0.8 * len(X_processed))
    lstm_train_size = train_size - sequence_length
    if lstm_train_size <= 0:
        # A non-positive bound would be read as a slice from the end and move
        # test-period sequences into the training set.
        raise ValueError(
            f"Dataset too small for a temporal split: train_size={train_size} "
            f"must exceed sequence_length={sequence_length}.")

    X_train, X_test = X_processed[:train_size], X_processed[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]
    X_lstm_train, X_lstm_test = X_lstm[:lstm_train_size], X_lstm[lstm_train_size:]
    y_lstm_train, y_lstm_test = y_lstm[:lstm_train_size], y_lstm[lstm_train_size:]

    print("\nApplying SMOTE for Class Imbalance")
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)
    # SMOTE works on 2-D input: flatten each sequence, resample, then restore
    # the (sequence_length, n_features) layout.
    X_lstm_train_flat, y_lstm_train = smote.fit_resample(
        X_lstm_train.reshape(X_lstm_train.shape[0], -1), y_lstm_train)
    X_lstm_train = X_lstm_train_flat.reshape(-1, X_lstm.shape[1], X_lstm.shape[2])

    print(f"Training set: {X_train.shape}, LSTM training: {X_lstm_train.shape}")
    print(f"Test set: {X_test.shape}, LSTM test: {X_lstm_test.shape}")
    print(f"Post-SMOTE class distribution: Normal={sum(y_train==0)}, Failure={sum(y_train==1)}")
    return X_train, X_test, y_train, y_test, X_lstm_train, X_lstm_test, y_lstm_train, y_lstm_test