# In [None]:
import glob
import os

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

# Columns that receive rolling-window statistics (mean / max / min / std).
# Kept as a list because other code concatenates it with further lists.
columns_for_rolling_stats = [
    'Motor Speed (RPM)',
    'Engine Speed (RPM)',
    'Throttle (%)',
    'Intake Temperature (C)',
    'Engine Coolant Temperature 1 (C)',
    'Barometric Pressure (kpa)',
    'Fuel Trim',
    'Fuel Consumption (g/min)',
    'Expected BSFC (g/kW.hr)',
    'Bus Voltage (V)',
    'GCU Current (A)',
    'Battery Current (A)',
    'Power Generated (W)',
    'Inverter Temperature (C)',
    'Target Fuel Pressure (bar)',
    'Fuel Pressure (bar)',
    'Fuel Pump Speed (RPM)',
    'Cooling Pump Speed (RPM)',
    'Fans On (qty)',
    'PWM Uptime (s)',
]

# Function to load and preprocess datasets
def load_and_preprocess(file_path, window_size=5, rolling_columns=None):
    """Load one CSV file and add rolling-window feature columns.

    Steps, in order:
      1. Read the CSV and strip whitespace from the column names.
      2. Drop columns whose (stripped) name repeats, keeping the first.
      3. Rename ``'time (s)'`` to ``'Time (s)'`` when only the former exists,
         then sort by ``'Time (s)'`` if that column is present.
      4. For every requested column that exists, add
         ``<column>_rolling_mean`` / ``_rolling_max`` / ``_rolling_min`` /
         ``_rolling_std`` computed over ``window_size`` consecutive rows.
      5. Clip three columns at a lower bound of 0.
      6. Replace every remaining NaN (including the incomplete leading
         windows of the rolling columns) with 0.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts (path or file-like).
        window_size: Number of rows per rolling window. Defaults to 5.
        rolling_columns: Column names to compute rolling statistics for.
            ``None`` (the default) uses the module-level
            ``columns_for_rolling_stats``.

    Returns:
        The preprocessed ``pandas.DataFrame``. The original row index labels
        are preserved (the frame is sorted, not re-indexed).
    """
    if rolling_columns is None:
        rolling_columns = columns_for_rolling_stats

    data = pd.read_csv(file_path)
    data.columns = data.columns.str.strip()

    # Stripping can make two headers collide; keep the first of each name.
    # The explicit copy guarantees the column assignments below never target
    # a slice of the frame returned by read_csv.
    data = data.loc[:, ~data.columns.duplicated()].copy()

    # Rename timestamp column to 'Time (s)' if only 'time (s)' exists
    if 'time (s)' in data.columns and 'Time (s)' not in data.columns:
        data = data.rename(columns={'time (s)': 'Time (s)'})

    # Stable sort: rows sharing a timestamp keep their file order, so the
    # rolling features below are reproducible.
    if 'Time (s)' in data.columns:
        data = data.sort_values('Time (s)', kind='mergesort')

    # Create rolling statistics; the window object is built once per column.
    # NOTE(review): these are computed *before* the negative-value clip
    # below, so they still reflect negative readings — confirm that is
    # intended.
    for column in rolling_columns:
        if column not in data.columns:
            continue
        window = data[column].rolling(window=window_size)
        data[f'{column}_rolling_mean'] = window.mean()
        data[f'{column}_rolling_max'] = window.max()
        data[f'{column}_rolling_min'] = window.min()
        data[f'{column}_rolling_std'] = window.std()

    # Replace negative values
    negative_values_columns = ('Fuel Consumption (g/min)', 'GCU Current (A)', 'Power Generated (W)')
    for column in negative_values_columns:
        if column in data.columns:
            data[column] = data[column].clip(lower=0)

    return data.fillna(0)

# Function to process each dataset individually
_MISSING_VALUE = float('nan')


def _value(row, column):
    """Return ``row[column]``, or NaN when the row has no such column.

    The rules only apply ``<``, ``>`` and ``== constant`` to values read
    through this helper; all of those are False for NaN, so a rule that needs
    an absent column simply does not fire instead of raising ``KeyError``.
    """
    return row.get(column, _MISSING_VALUE)


def _related_change(row, prev_row, anchor, *others):
    """Return True when every ``others`` column exists and any listed column changed.

    ``anchor`` must already be known to exist in ``row`` (the caller's
    enclosing guard ensures this). A change means the value in ``row`` differs
    from the value in ``prev_row`` for ``anchor`` or for any of ``others``.
    """
    if any(column not in row for column in others):
        return False
    return any(row[column] != prev_row[column] for column in (anchor, *others))


def _validate_anomalies(data):
    """Apply the static rule set to every row and return one message per row.

    Rows are visited in the order they appear in ``data``; each row is
    compared with the row immediately before it (the first row is compared
    with itself), so ``data`` should be in time order for the "previous
    value" rules to be meaningful.

    NOTE(review): ``load_and_preprocess`` clips 'Power Generated (W)' and
    'GCU Current (A)' at 0, so on data coming from it the 'Engine started'
    rule (power < 0) and the ``< -20`` half of 'GCU current anomaly' cannot
    fire — confirm whether the clip or the rules should change.

    Args:
        data: DataFrame with unique column names.

    Returns:
        List of strings, one per row of ``data``: the rule messages joined by
        ``'; '``, or ``'No specific anomaly detected'`` when none fired.
    """
    # Plain dicts are much cheaper to index than a Series built by iloc.
    rows = data.to_dict('records')
    messages = []

    for position, row in enumerate(rows):
        prev_row = rows[position - 1] if position > 0 else row
        row_message = []

        # Rule checks for each column with related columns and previous values
        if 'Motor Speed (RPM)' in row:
            motor_speed = row['Motor Speed (RPM)']
            if motor_speed > 8800:
                row_message.append('Over power consumption')
            if motor_speed < 3000:
                row_message.append('Motor not started')
            if _related_change(row, prev_row, 'Motor Speed (RPM)', 'Inverter Temperature (C)'):
                row_message.append('Motor speed related to inverter temperature')
            if _related_change(row, prev_row, 'Motor Speed (RPM)', 'Power Generated (W)'):
                row_message.append('Motor speed related to power generated')

        if 'Engine Speed (RPM)' in row:
            engine_speed = row['Engine Speed (RPM)']
            if engine_speed > 8192:
                row_message.append('Crank shaft pickup fail')
            if engine_speed < 3000:
                row_message.append('Engine Speed too low')
            if _related_change(row, prev_row, 'Engine Speed (RPM)'):
                row_message.append('Engine speed anomaly')

        if 'Throttle (%)' in row:
            throttle = row['Throttle (%)']
            if throttle > 100:
                row_message.append('More power generated')
            if throttle < 5:
                row_message.append('Throttle too low')
            if _related_change(row, prev_row, 'Throttle (%)', 'Power Generated (W)'):
                row_message.append('Throttle related to power generated')
            if throttle == 100 and _value(prev_row, 'Power Generated (W)') < _value(row, 'Power Generated (W)'):
                row_message.append('Throttle stuck at 100 and power increasing')

        if 'Intake Temperature (C)' in row:
            intake_temp = row['Intake Temperature (C)']
            if intake_temp > 45:
                row_message.append('Less power generation')
            if intake_temp < -40:
                row_message.append('Intake temperature too low')
            if _related_change(row, prev_row, 'Intake Temperature (C)', 'Power Generated (W)'):
                row_message.append('Intake temperature related to power generated')

        if 'Engine Coolant Temperature 1 (C)' in row and 'Engine Coolant Temperature 2 (C)' in row:
            coolant_1 = row['Engine Coolant Temperature 1 (C)']
            coolant_2 = row['Engine Coolant Temperature 2 (C)']
            if coolant_1 > 98 or coolant_2 > 98:
                row_message.append('Cool the drone')
            if coolant_1 < 50 and coolant_2 < 50:
                row_message.append('Drone may restart')
            if _related_change(row, prev_row, 'Engine Coolant Temperature 1 (C)', 'Engine Speed (RPM)'):
                row_message.append('Engine coolant temperature related to engine speed')
            if _related_change(row, prev_row, 'Engine Coolant Temperature 1 (C)', 'PWM Uptime (s)'):
                row_message.append('Engine coolant temperature related to PWM uptime')

        if 'Barometric Pressure (kpa)' in row:
            pressure = row['Barometric Pressure (kpa)']
            if pressure < 70:
                row_message.append('Less dense air')
            if pressure > 110 and _value(prev_row, 'Power Generated (W)') < _value(row, 'Power Generated (W)'):
                row_message.append('Anomaly more power')
            if _related_change(row, prev_row, 'Barometric Pressure (kpa)', 'Power Generated (W)'):
                row_message.append('Barometric pressure related to power generated')

        if 'Fuel Trim' in row and 'Throttle (%)' in row:
            fuel_trim = row['Fuel Trim']
            if row['Throttle (%)'] == 100:
                if fuel_trim > 1:
                    row_message.append('Less fuel')
                if fuel_trim < 1:
                    row_message.append('More fuel')
            if _related_change(row, prev_row, 'Fuel Trim', 'ECP'):
                row_message.append('Fuel trim related to ECP')
            if fuel_trim > 1.2 or fuel_trim < 0.8:
                row_message.append('Fuel trim anomaly')

        if 'Fuel Consumption (g/min)' in row and 'Fuel Pressure (bar)' in row:
            if row['Fuel Consumption (g/min)'] > _value(row, 'Fuel Consumption (g/min)_rolling_mean'):
                row_message.append('Fuel consumption anomaly')
            if _related_change(row, prev_row, 'Fuel Consumption (g/min)', 'Fuel Consumed (g)'):
                row_message.append('Fuel consumption related to fuel consumed')

        if 'Expected BSFC (g/kW.hr)' in row and 'Actual BSFC (g/kW.hr)' in row:
            if row['Actual BSFC (g/kW.hr)'] > row['Expected BSFC (g/kW.hr)']:
                row_message.append('Bad efficiency BSFC')
            if _related_change(row, prev_row, 'Expected BSFC (g/kW.hr)', 'Engine Speed (RPM)'):
                row_message.append('Expected BSFC related to engine speed')
            if _related_change(row, prev_row, 'Expected BSFC (g/kW.hr)', 'Throttle (%)'):
                row_message.append('Expected BSFC related to throttle')
            if _related_change(row, prev_row, 'Expected BSFC (g/kW.hr)', 'Intake Temperature (C)'):
                row_message.append('Expected BSFC related to intake temperature')
            if _related_change(row, prev_row, 'Expected BSFC (g/kW.hr)', 'Barometric Pressure (kpa)'):
                row_message.append('Expected BSFC related to barometric pressure')

        if 'Expected Max Power (W)' in row and 'Power Generated (W)' in row:
            if _value(row, 'Throttle (%)') == 100 and row['Power Generated (W)'] < row['Expected Max Power (W)']:
                row_message.append('Anomaly in power generation')
            if _related_change(row, prev_row, 'Expected Max Power (W)', 'Engine Speed (RPM)'):
                row_message.append('Expected max power related to engine speed')
            if _related_change(row, prev_row, 'Expected Max Power (W)', 'Intake Temperature (C)'):
                row_message.append('Expected max power related to intake temperature')
            if _related_change(row, prev_row, 'Expected Max Power (W)', 'Barometric Pressure (kpa)'):
                row_message.append('Expected max power related to barometric pressure')

        if 'Battery Current (A)' in row and 'Throttle (%)' in row:
            if row['Throttle (%)'] == 100 and row['Battery Current (A)'] > 2:
                row_message.append('Battery not providing enough current')
            if _related_change(row, prev_row, 'Battery Current (A)', 'Power Generated (W)'):
                row_message.append('Battery current related to power generated')
            if _related_change(row, prev_row, 'Battery Current (A)', 'Bus Voltage (V)'):
                row_message.append('Battery current related to bus voltage')

        if 'Power Generated (W)' in row:
            if row['Power Generated (W)'] < 0:
                row_message.append('Engine started')
            if _related_change(row, prev_row, 'Power Generated (W)', 'Engine Speed (RPM)'):
                row_message.append('Power generated related to engine speed')
            if _related_change(row, prev_row, 'Power Generated (W)', 'Bus Voltage (V)', 'GCU Current (A)'):
                row_message.append('Power generated related to bus voltage and GCU current')

        if 'Inverter Temperature (C)' in row:
            if row['Inverter Temperature (C)'] > 100:
                row_message.append('Inverter temperature anomaly')
            if _related_change(row, prev_row, 'Inverter Temperature (C)', 'Throttle (%)'):
                row_message.append('Inverter temperature related to throttle')
            if _related_change(row, prev_row, 'Inverter Temperature (C)', 'Cooling Pump Speed (RPM)'):
                row_message.append('Inverter temperature related to cooling pump speed')

        if 'Fuel Pressure (bar)' in row and 'Target Fuel Pressure (bar)' in row:
            if row['Fuel Pressure (bar)'] < row['Target Fuel Pressure (bar)']:
                row_message.append('Problem in fuel feed or no fuel in system')
            if _related_change(row, prev_row, 'Fuel Pressure (bar)', 'Fuel Pump Speed (RPM)'):
                row_message.append('Fuel pressure related to fuel pump speed')

        if 'Fuel Pump Speed (RPM)' in row:
            fuel_pump_speed = row['Fuel Pump Speed (RPM)']
            below_target = _value(row, 'Fuel Pressure (bar)') < _value(row, 'Target Fuel Pressure (bar)')
            if fuel_pump_speed < 8000 and below_target:
                row_message.append('Fuel feed issue')
            if fuel_pump_speed > 9000 and below_target:
                row_message.append('Fuel pump issue')
            if _related_change(row, prev_row, 'Fuel Pump Speed (RPM)', 'Motor Speed (RPM)'):
                row_message.append('Fuel pump speed related to motor speed')

        if 'Cooling Pump Speed (RPM)' in row:
            cooling_pump_speed = row['Cooling Pump Speed (RPM)']
            if cooling_pump_speed > 9000:
                row_message.append('Coolant blocked')
            if cooling_pump_speed < 8000 and _value(row, 'Engine Coolant Temperature 1 (C)') > 98:
                row_message.append('Cooling pump speed issue')
            if _related_change(row, prev_row, 'Cooling Pump Speed (RPM)', 'Engine Speed (RPM)'):
                row_message.append('Cooling pump speed related to engine speed')
            if _related_change(row, prev_row, 'Cooling Pump Speed (RPM)', 'Power Generated (W)'):
                row_message.append('Cooling pump speed related to power generated')

        if 'PWM Uptime (s)' in row:
            pwm_uptime = row['PWM Uptime (s)']
            if pwm_uptime == 0:
                row_message.append('Off signal received')
            if pwm_uptime != 0 and _value(row, 'Engine Speed (RPM)') < 3000:
                row_message.append('Engine off, possibly due to fault')
            if _related_change(row, prev_row, 'PWM Uptime (s)', 'Engine Speed (RPM)'):
                row_message.append('PWM uptime related to engine speed')
            if _related_change(row, prev_row, 'PWM Uptime (s)', 'Power Generated (W)'):
                row_message.append('PWM uptime related to power generated')
            if _value(row, 'Power Generated (W)') > 800 and pwm_uptime == 0:
                row_message.append('System shutoff during flight mode')

        # Additional simple rules for other columns
        if 'Bus Voltage (V)' in row and (row['Bus Voltage (V)'] < 42 or row['Bus Voltage (V)'] > 50.4):
            row_message.append('Bus voltage anomaly')
        if 'GCU Current (A)' in row and (row['GCU Current (A)'] < -20 or row['GCU Current (A)'] > 80):
            row_message.append('GCU current anomaly')
        if 'Target Fuel Pressure (bar)' in row and (
            row['Target Fuel Pressure (bar)'] < 1 or row['Target Fuel Pressure (bar)'] > 4.5
        ):
            row_message.append('Target fuel pressure anomaly')

        # Special condition labels: engine speed just dropped below 3000
        if row.get('Engine Speed (RPM)', 0) < 3000 and prev_row.get('Engine Speed (RPM)', 0) >= 3000:
            if _value(row, 'PWM Uptime (s)') == 0:
                row_message.append('Shut down unexpectedly due to faulty off signal')
            if _value(row, 'Power Generated (W)') > 800 and _value(row, 'PWM Uptime (s)') == 0:
                row_message.append('Shut down unexpectedly due to high power during flight mode')
            if _value(row, 'Fuel Pressure (bar)') < _value(row, 'Target Fuel Pressure (bar)'):
                row_message.append('Shut down due to fuel issue')
            if _value(row, 'Fuel Trim') < 1:
                row_message.append('Shut down due to blocked fuel filter or injector')
            if _value(row, 'Fuel Trim') > 1:
                row_message.append('Shut down due to blocked air filter or throttle stuck')
            if _value(row, 'Engine Coolant Temperature 1 (C)') > 98 or _value(row, 'Inverter Temperature (C)') > 100:
                row_message.append('Shut down due to over temperature')

        # Special condition labels: generated power fell since the previous row
        if row.get('Power Generated (W)', 0) < prev_row.get('Power Generated (W)', 0):
            if _value(row, 'Battery Current (A)') > 2:
                row_message.append('Low power, battery not providing enough current')
            if _value(row, 'Bus Voltage (V)') < 42:
                row_message.append('Low power, bus voltage dropping')
            if _value(row, 'Throttle (%)') == 100 and (
                _value(row, 'Power Generated (W)') < _value(row, 'Expected Max Power (W)')
            ):
                row_message.append('Low power, expected max power not reached')
            if _value(row, 'Fuel Pressure (bar)') < _value(row, 'Target Fuel Pressure (bar)'):
                row_message.append('Low power, fuel pressure below target')
            if _value(row, 'Engine Coolant Temperature 1 (C)') > 91:
                row_message.append('Low power, engine temperature too high')

        if row_message:
            messages.append('; '.join(row_message))
        else:
            messages.append('No specific anomaly detected')

    return messages


def _plot_anomaly_scatter(test_data, flag_column, method_name):
    """Scatter Fuel Pump Speed vs Fuel Consumption, coloured by ``flag_column``."""
    plt.figure(figsize=(14, 7))
    plt.scatter(
        test_data['Fuel Pump Speed (RPM)'],
        test_data['Fuel Consumption (g/min)'],
        c=test_data[flag_column],
        cmap='viridis',
        marker='o',
        label=method_name,
    )
    plt.xlabel('Fuel Pump Speed (RPM)')
    plt.ylabel('Fuel Consumption (g/min)')
    plt.title(f'{method_name}: Fuel Pump Speed (RPM) vs Fuel Consumption (g/min)')
    plt.colorbar(label='Anomaly')
    plt.legend()
    plt.grid(True)
    plt.show()


def process_dataset(file):
    """Detect, label and export anomalies for one dataset file.

    The file is loaded with ``load_and_preprocess`` and split 50/50 into a
    training and a testing half. An Isolation Forest and a two-cluster KMeans
    are fitted on the training half and applied to the testing half, which
    receives these columns:

    * ``isolated_anomaly``  - 1 where the Isolation Forest predicts an outlier.
    * ``cluster_anomaly``   - 1 where the row falls in the KMeans cluster that
      is the smaller one on the training half. KMeans label ids are
      arbitrary, so the minority cluster is used rather than the literal id 1.
    * ``validated_anomaly`` - 1 where both of the above are 1.
    * ``anomaly_message``   - static-rule messages. The rules are evaluated on
      the full preprocessed frame, because the split shuffles rows and the
      rules compare each row with its predecessor; the messages are then
      matched to the testing rows by index.

    Testing rows flagged by either model are written to
    ``validated_anomalies_<basename of file>`` in the working directory, the
    first rows are printed, and two scatter plots are shown when the fuel pump
    speed and fuel consumption columns exist.

    Args:
        file: Path of the CSV file to process.

    Returns:
        None. Files with fewer than four rows or without any known feature
        column are skipped with a printed notice.
    """
    # Load and preprocess the dataset
    data = load_and_preprocess(file)

    # Raw sensor columns plus their rolling means, limited to what exists
    candidate_features = list(columns_for_rolling_stats) + [
        f'{column}_rolling_mean' for column in columns_for_rolling_stats
    ]
    features = [name for name in candidate_features if name in data.columns]

    # The 50/50 split must leave at least two training rows for the
    # two-cluster KMeans below.
    if len(data) < 4 or not features:
        print(f"Skipping {file}: not enough rows or no known feature columns")
        return

    # Split data into training and testing sets (50% each). The copy makes
    # test_data an independent frame so new columns can be assigned safely.
    train_data, test_data = train_test_split(data, test_size=0.5, random_state=42)
    test_data = test_data.copy()

    # Isolation Forest: predict() yields -1 for outliers and 1 for inliers
    isolation_forest = IsolationForest(contamination=0.1, random_state=42)
    isolation_forest.fit(train_data[features])
    test_data['isolated_anomaly'] = (isolation_forest.predict(test_data[features]) == -1).astype(int)

    # Validate with clustering: the smaller training cluster is "anomalous"
    kmeans = KMeans(n_clusters=2, random_state=42)
    train_clusters = pd.Series(kmeans.fit_predict(train_data[features]))
    cluster_sizes = train_clusters.value_counts().reindex(range(kmeans.n_clusters), fill_value=0)
    minority_cluster = cluster_sizes.idxmin()
    test_data['cluster_anomaly'] = (kmeans.predict(test_data[features]) == minority_cluster).astype(int)

    # Combine results of Isolation Forest and KMeans to validate anomalies
    test_data['validated_anomaly'] = (
        (test_data['isolated_anomaly'] == 1) & (test_data['cluster_anomaly'] == 1)
    ).astype(int)

    # Static rules run on the whole frame (in the order load_and_preprocess
    # returned it) so "previous row" is the real preceding sample.
    all_messages = pd.Series(_validate_anomalies(data), index=data.index)
    test_data['anomaly_message'] = all_messages.loc[test_data.index]

    # Only keep rows flagged by at least one of the two models
    flagged = (test_data['isolated_anomaly'] == 1) | (test_data['cluster_anomaly'] == 1)
    test_data_anomalies = test_data[flagged]

    # Keep all columns and sort by 'Time (s)' when the dataset has it
    if 'Time (s)' in test_data_anomalies.columns:
        test_data_anomalies = test_data_anomalies.sort_values('Time (s)')

    # Save the results for the current dataset (basename works on any OS)
    result_file = f"validated_anomalies_{os.path.basename(file)}"
    test_data_anomalies.to_csv(result_file, index=False)

    # Print the first few rows to verify
    print(f"Results for {file}:")
    print(test_data_anomalies.head())

    # Plot Fuel Pump Speed (RPM) vs Fuel Consumption (g/min) for both models
    if 'Fuel Pump Speed (RPM)' in test_data.columns and 'Fuel Consumption (g/min)' in test_data.columns:
        _plot_anomaly_scatter(test_data, 'isolated_anomaly', 'Isolation Forest')
        _plot_anomaly_scatter(test_data, 'cluster_anomaly', 'KMeans')

# Fetch all dataset files. glob returns matches in an OS-dependent order, so
# sort them to process (and log) the datasets in a reproducible sequence.
dataset_files = sorted(glob.glob('dataset_*.csv'))
for file in dataset_files:
    process_dataset(file)
