# In [None]:
import pandas as pd
import glob
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

# Columns that receive rolling-window statistics in load_and_preprocess().
# Each entry must match the CSV header exactly (after whitespace stripping);
# entries that a given CSV does not contain are skipped at use sites.
columns_for_rolling_stats = [
    'Motor Speed (RPM)',
    'Engine Speed (RPM)',
    'Throttle (%)',
    'Intake Temperature (C)',
    'Engine Coolant Temperature 1 (C)',
    'Barometric Pressure (kpa)',
    'Fuel Trim',
    'Fuel Consumption (g/min)',
    'Expected BSFC (g/kW.hr)',
    'Bus Voltage (V)',
    'GCU Current (A)',
    'Battery Current (A)',
    'Power Generated (W)',
    'Inverter Temperature (C)',
    'Target Fuel Pressure (bar)',
    'Fuel Pressure (bar)',
    'Fuel Pump Speed (RPM)',
    'Cooling Pump Speed (RPM)',
    'Fans On (qty)',
    'PWM Uptime (s)',
]

# Function to load and preprocess datasets
def load_and_preprocess(file_path, window_size=5):
    """Load one dataset CSV, clean it, and append rolling-window features.

    Processing order:

    1. Read the CSV and strip surrounding whitespace from column names.
    2. Drop columns whose (stripped) name repeats, keeping the first one.
    3. Rename 'time (s)' to 'Time (s)' when only the lowercase form exists,
       then sort rows by 'Time (s)' if that column is present.
    4. Clip 'Fuel Consumption (g/min)', 'GCU Current (A)' and
       'Power Generated (W)' at a lower bound of 0.
    5. For every name in ``columns_for_rolling_stats`` that exists in the
       data, add ``<name>_rolling_mean``, ``_rolling_max``, ``_rolling_min``
       and ``_rolling_std`` columns.
    6. Replace every remaining NaN in the frame with 0.

    Clipping happens *before* the rolling statistics so the rolling features
    are derived from the same cleaned values that end up in the base columns.

    Args:
        file_path: Path (or any object ``pandas.read_csv`` accepts) of the CSV.
        window_size: Number of consecutive rows per rolling window.

    Returns:
        pandas.DataFrame: the cleaned data with the rolling columns appended.
        The first ``window_size - 1`` rows have no complete window; their
        rolling values are NaN and therefore become 0 in step 6.
    """
    data = pd.read_csv(file_path)
    data.columns = data.columns.str.strip()

    # Stripping can make two headers collide; keep the first occurrence only.
    data = data.loc[:, ~data.columns.duplicated()]

    # Normalise the timestamp column name without clobbering an existing one.
    if 'time (s)' in data.columns and 'Time (s)' not in data.columns:
        data = data.rename(columns={'time (s)': 'Time (s)'})

    # Rolling windows are positional, so rows must be in time order first.
    if 'Time (s)' in data.columns:
        data = data.sort_values('Time (s)')

    # Replace negative values before deriving any statistics from them.
    negative_values_columns = ['Fuel Consumption (g/min)', 'GCU Current (A)', 'Power Generated (W)']
    for column in negative_values_columns:
        if column in data.columns:
            data[column] = data[column].clip(lower=0)

    # Create rolling statistics; the window object is built once per column.
    for column in columns_for_rolling_stats:
        if column not in data.columns:
            continue
        windowed = data[column].rolling(window=window_size)
        data[f'{column}_rolling_mean'] = windowed.mean()
        data[f'{column}_rolling_max'] = windowed.max()
        data[f'{column}_rolling_min'] = windowed.min()
        data[f'{column}_rolling_std'] = windowed.std()

    # Fills both missing source values and the incomplete leading windows.
    data = data.fillna(0)

    return data

# Helpers and entry point to process each dataset individually
def _score_with_isolation_forest(model, frame, features):
    """Return a copy of *frame* with 'anomaly_score' and 'isolated_anomaly' added.

    Working on a copy avoids assigning into a frame derived from another one
    (the source of pandas SettingWithCopy warnings).
    """
    scored = frame.copy()
    scored['anomaly_score'] = model.decision_function(scored[features])
    # IsolationForest.predict yields 1 for inliers and -1 for outliers;
    # store it as 0 (normal) / 1 (anomaly).
    scored['isolated_anomaly'] = (model.predict(scored[features]) == -1).astype(int)
    return scored


def _plot_fuel_scatter(frame, flag_column, method_name):
    """Scatter fuel pump speed against fuel consumption, coloured by *flag_column*."""
    plt.figure(figsize=(14, 7))
    plt.scatter(
        frame['Fuel Pump Speed (RPM)'],
        frame['Fuel Consumption (g/min)'],
        c=frame[flag_column],
        cmap='viridis',
        marker='o',
        label=method_name,
    )
    plt.xlabel('Fuel Pump Speed (RPM)')
    plt.ylabel('Fuel Consumption (g/min)')
    plt.title(f'{method_name}: Fuel Pump Speed (RPM) vs Fuel Consumption (g/min)')
    plt.colorbar(label='Anomaly')
    plt.legend()
    plt.grid(True)
    plt.show()


def process_dataset(file):
    """Run Isolation Forest and KMeans anomaly detection on one dataset file.

    The file is loaded through ``load_and_preprocess``, split 50/50 into
    train and test rows, scored with an Isolation Forest fitted on the train
    rows, and clustered (all rows together) with 2-cluster KMeans. The test
    rows are written to ``validated_anomalies_<basename of file>`` in the
    current working directory, their head is printed, and two scatter plots
    are shown when both fuel columns are available.

    Columns added to the result:
        anomaly_score: ``IsolationForest.decision_function`` output.
        isolated_anomaly: 1 if the Isolation Forest flags the row, else 0.
        cluster_anomaly: 1 if the row falls in the smaller KMeans cluster,
            else 0 (same "1 = anomaly" convention as ``isolated_anomaly``).
        label: 'abnormal' for the smaller cluster, 'normal' for the larger.

    Args:
        file: Path of the dataset CSV to process.

    Returns:
        pandas.DataFrame: the annotated test rows that were written to disk.

    Raises:
        ValueError: if the file contains none of the expected feature columns.
    """
    import os  # function-scope import: only needed for the output file name

    # Load and preprocess the dataset
    data = load_and_preprocess(file)

    # Model features: the raw sensor columns followed by their rolling means,
    # restricted to the columns this particular file actually provides.
    candidates = list(columns_for_rolling_stats) + [
        f'{column}_rolling_mean' for column in columns_for_rolling_stats
    ]
    features = [name for name in candidates if name in data.columns]
    if not features:
        raise ValueError(f"No known feature columns found in {file}")

    # Split data into training and testing sets (50% each)
    train_data, test_data = train_test_split(data, test_size=0.5, random_state=42)

    # Fit on the training half only, then score both halves the same way.
    isolation_forest = IsolationForest(contamination=0.1, random_state=42)
    isolation_forest.fit(train_data[features])
    train_data = _score_with_isolation_forest(isolation_forest, train_data, features)
    test_data = _score_with_isolation_forest(isolation_forest, test_data, features)

    # Combine train and test data for clustering
    combined_data = pd.concat([train_data, test_data])

    # NOTE(review): features are clustered unscaled, so large-magnitude
    # columns likely dominate the distance metric - confirm this is intended.
    kmeans = KMeans(n_clusters=2, random_state=42)
    cluster_ids = pd.Series(
        kmeans.fit_predict(combined_data[features]), index=combined_data.index
    )

    # The larger cluster is treated as normal, the smaller one as anomalous.
    normal_cluster_label = cluster_ids.value_counts().idxmax()
    combined_data['cluster_anomaly'] = (cluster_ids != normal_cluster_label).astype(int)
    combined_data['label'] = combined_data['cluster_anomaly'].map({0: 'normal', 1: 'abnormal'})

    # Only the held-out rows are reported.
    test_data = combined_data.loc[test_data.index].copy()

    # Save the results for the current dataset
    result_file = f"validated_anomalies_{os.path.basename(file)}"
    test_data.to_csv(result_file, index=False)

    # Print the first few rows to verify
    print(f"Results for {file}:")
    print(test_data.head())

    # Plot Fuel Pump Speed (RPM) vs Fuel Consumption (g/min) for both detectors
    if 'Fuel Pump Speed (RPM)' in test_data.columns and 'Fuel Consumption (g/min)' in test_data.columns:
        _plot_fuel_scatter(test_data, 'isolated_anomaly', 'Isolation Forest')
        _plot_fuel_scatter(test_data, 'cluster_anomaly', 'KMeans')

    return test_data

# Fetch all dataset files from the current working directory.
# glob returns matches in arbitrary order, so sort them to keep the
# processing order (and the printed output) reproducible between runs.
dataset_files = sorted(glob.glob('dataset_*.csv'))
for file in dataset_files:
    process_dataset(file)
