In [8]:
import numpy as np
import pandas as pd
import time
import os
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import f1_score,precision_score,recall_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from typing import List, Dict
from sklearn.metrics import make_scorer
from tqdm import tqdm
from sklearn.base import BaseEstimator, ClassifierMixin


# Fixed seed value intended for reproducibility of the notebook's stochastic steps.
# NOTE(review): confirm it is actually passed to every randomized component.
seed = 42

In [9]:
class LOFWrapper(ClassifierMixin, BaseEstimator):
    """
    Scikit-learn-compatible wrapper around LocalOutlierFactor for use with BayesSearchCV.

    The wrapper stores arbitrary LocalOutlierFactor keyword arguments in
    ``self.params`` and exposes them through ``get_params`` / ``set_params`` so a
    hyperparameter search can set them. Predictions are converted from LOF's
    ``-1`` (outlier) / ``1`` (inlier) convention to binary labels
    ``1`` (anomaly) / ``0`` (normal) so supervised metrics can be applied.

    Mixins are listed to the left of ``BaseEstimator`` as scikit-learn's
    developer guide requires for a correct method resolution order.

    Parameters
    ----------
    **params
        Keyword arguments forwarded to ``LocalOutlierFactor`` at fit time.

    Attributes
    ----------
    params : dict
        Current hyperparameters.
    model : LocalOutlierFactor or None
        The fitted underlying detector; ``None`` until ``fit`` is called.
    classes_ : np.ndarray
        ``[0, 1]``; set by ``fit``.
    """

    def __init__(self, **params):
        self.params = params
        self.model = None

    def fit(self, X, y=None):
        """
        Fit a fresh LocalOutlierFactor on ``X``.

        ``y`` is accepted for API compatibility and is not used for fitting.
        Returns ``self``.
        """
        # novelty=True allows for a separate .predict() after fit
        self.model = LocalOutlierFactor(novelty=True, **self.params)
        self.model.fit(X)
        self.classes_ = np.array([0, 1])  # Required for scoring API compatibility
        return self

    def predict(self, X):
        """
        Return binary labels for ``X``: 1 where LOF predicts -1 (outlier), else 0.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``. This exception type also subclasses
            ``AttributeError``, which is what the unguarded call raised before.
        """
        if self.model is None:
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "This LOFWrapper instance is not fitted yet. Call 'fit' before 'predict'."
            )
        return np.where(self.model.predict(X) == -1, 1, 0)

    def get_params(self, deep=True):
        """
        Return a copy of the hyperparameter dict.

        A shallow copy is returned so callers that edit the result in place
        cannot silently modify this estimator's own parameters.
        ``deep`` is accepted for API compatibility and is not used.
        """
        return dict(self.params)

    def set_params(self, **params):
        """Update the stored hyperparameters in place and return ``self``."""
        self.params.update(params)
        return self

In [None]:
def tune_lof_bayesian_search(X_train, y_train, search_iter=30, random_state=None):
    """
    Use Bayesian optimization to tune LOF hyperparameters.

    Parameters
    ----------
    X_train : np.ndarray
        Scaled input features.
    y_train : np.ndarray
        Ground truth binary labels (1 = anomaly, 0 = normal).
    search_iter : int
        Number of search iterations.
    random_state : int or None
        Seed passed to BayesSearchCV. When None, the notebook-level ``seed``
        constant is used so repeated runs explore the same candidates.

    Returns
    -------
    dict
        Best hyperparameters found via optimization.
    """
    if random_state is None:
        # Resolved at call time so the function definition itself does not
        # depend on the config cell having been executed first.
        random_state = seed

    search_space = {
        "n_neighbors": Integer(5, 100),
        "leaf_size": Integer(10, 100),
        "p": Integer(1, 3),
        "metric": Categorical(["minkowski", "manhattan", "euclidean"]),
        "contamination": Real(0.01, 0.1, prior='log-uniform')
    }

    optimizer = BayesSearchCV(
        estimator=LOFWrapper(),
        search_spaces=search_space,
        # zero_division=0 matches the final evaluation metrics and avoids a
        # warning for folds where no sample is predicted as an anomaly.
        scoring=make_scorer(f1_score, zero_division=0),
        n_iter=search_iter,
        cv=3,
        verbose=0,
        n_jobs=-1,
        random_state=random_state
    )
    optimizer.fit(X_train, y_train)
    return optimizer.best_params_

In [4]:
def evaluate_all_buildings_lof(
    all_files,
    path,
    anomaly_type,
    search_iter=30
):
    """
    Evaluate LOF across buildings using tuned hyperparameters.

    For every file, the ``modified`` column is standardized and used as the
    single feature, and the target is 1 where ``labels`` equals
    ``anomaly_type`` and 0 otherwise. Hyperparameters are tuned with
    ``tune_lof_bayesian_search`` and the tuned detector is then scored on the
    same data.

    A failure on one file does not stop the loop: the file gets a record with
    NaN scores and an ``error`` message instead.

    Parameters
    ----------
    all_files : list of str
        List of .csv files per building.
    path : str
        Folder where the building files are stored.
    anomaly_type : int
        Label to match for anomalies (e.g. 2, 3, 4).
    search_iter : int
        Number of BayesSearchCV iterations.

    Returns
    -------
    pd.DataFrame
        Summary of evaluation results per building: file name, F1, precision,
        recall, and either the tuned hyperparameters or an ``error`` column.
    """
    records = []

    for file in tqdm(all_files, desc=f"LOF Tuning for Type {anomaly_type}"):
        try:
            df = pd.read_csv(os.path.join(path, file))
            X = df['modified'].values.reshape(-1, 1)
            y = np.where(df['labels'] == anomaly_type, 1, 0)
            X_scaled = StandardScaler().fit_transform(X)

            best_params = tune_lof_bayesian_search(X_scaled, y, search_iter)

            # The samples being scored are the same ones the detector is fit on,
            # so use outlier-detection mode (novelty=False) with fit_predict.
            # scikit-learn documents that predict() of a novelty=True model must
            # only be applied to unseen data, not to its own training set.
            detector = LocalOutlierFactor(novelty=False, **best_params)
            y_pred = np.where(detector.fit_predict(X_scaled) == -1, 1, 0)

            record = {
                "building_file": file,
                "f1_score": f1_score(y, y_pred, zero_division=0),
                "precision": precision_score(y, y_pred, zero_division=0),
                "recall": recall_score(y, y_pred, zero_division=0),
                **best_params
            }

        except Exception as e:
            # Best-effort: keep processing the remaining buildings, but record
            # the exception type too (str() of e.g. a KeyError is only the key).
            record = {
                "building_file": file,
                "f1_score": np.nan,
                "precision": np.nan,
                "recall": np.nan,
                "error": f"{type(e).__name__}: {e}"
            }

        records.append(record)

    return pd.DataFrame(records)

In [5]:
def run_lof_evaluation_for_all_anomaly_types(
    base_path,
    anomaly_types=[2, 3, 4],
    output_dir="lof_scores",
    search_iter=30
):
    """
    Run LOF tuning + evaluation for all anomaly types (2,3,4).

    For each anomaly type ``N`` the .csv files in ``<base_path>/TYPE<N>`` are
    evaluated with ``evaluate_all_buildings_lof`` and the resulting summary is
    written to ``<output_dir>/lof_type<N>_scores.csv``.

    Parameters
    ----------
    base_path : str
        Path to folder containing TYPE2/, TYPE3/, etc.
    anomaly_types : list of int
        Anomaly types to evaluate. Only iterated over, never modified.
    output_dir : str
        Where to store CSV outputs. Created if it does not exist.
    search_iter : int
        Number of Bayesian search iterations.
    """
    os.makedirs(output_dir, exist_ok=True)

    for anom_type in anomaly_types:
        print(f"\n🚀 Running LOF evaluation for anomaly type {anom_type}")
        folder = os.path.join(base_path, f"TYPE{anom_type}")
        # os.listdir returns entries in arbitrary order; sort so the rows of
        # the output CSV are in the same order on every run and machine.
        files = sorted(f for f in os.listdir(folder) if f.endswith(".csv"))

        summary_df = evaluate_all_buildings_lof(
            all_files=files,
            path=folder,
            anomaly_type=anom_type,
            search_iter=search_iter
        )

        out_path = os.path.join(output_dir, f"lof_type{anom_type}_scores.csv")
        summary_df.to_csv(out_path, index=False)
        print(f"✅ Saved to {out_path}")


In [None]:
# Root folder holding the TYPE2/, TYPE3/, TYPE4/ subfolders of injected-anomaly CSVs.
# The BDG2_INJECTED_PATH environment variable overrides the machine-specific
# default below, so the notebook can run on another machine without editing this cell.
base_injected_path = os.environ.get(
    "BDG2_INJECTED_PATH",
    "/data1/home/nitinvetcha/Ashwin_KM_Code/tsfm_learning/Anomaly_Injection_IISc/injection_code/src/OUTPUT_BDG2/MODIFIED",
)

# Long-running cell: the progress bars recorded from a previous run show roughly
# one hour per anomaly type for 200 files each.
run_lof_evaluation_for_all_anomaly_types(
    base_path=base_injected_path,
    anomaly_types=[2, 3, 4],
    output_dir="bdg2_lof_scores",
    search_iter=30
)


🚀 Running LOF evaluation for anomaly type 2


LOF Tuning for Type 2: 100%|██████████| 200/200 [1:03:38<00:00, 19.09s/it]


✅ Saved to lof_scores/lof_type2_scores.csv

🚀 Running LOF evaluation for anomaly type 3


LOF Tuning for Type 3: 100%|██████████| 200/200 [1:02:00<00:00, 18.60s/it]


✅ Saved to lof_scores/lof_type3_scores.csv

🚀 Running LOF evaluation for anomaly type 4


LOF Tuning for Type 4: 100%|██████████| 200/200 [1:01:34<00:00, 18.47s/it]

✅ Saved to lof_scores/lof_type4_scores.csv





In [10]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_and_save_score_boxplots(
    score_dir: str,
    output_dir: str = "plots",
    anomaly_types: list = [2, 3, 4],
    prefix: str = "isoforest"
) -> None:
    """
    Generate and save boxplots of F1, Precision, and Recall for each anomaly type.

    For each anomaly type ``N`` the file ``<score_dir>/<prefix>_type<N>_scores.csv``
    is read and one figure is written to
    ``<output_dir>/<prefix>_boxplot_type<N>.png``. An anomaly type is skipped
    with a printed warning when its score file is missing, has no rows,
    lacks one of the score columns, or contains only missing scores.

    Parameters
    ----------
    score_dir : str
        Directory where the per-anomaly-type score CSVs are stored.
    output_dir : str, default "plots"
        Directory where plots will be saved. Created if it does not exist.
    anomaly_types : list of int, default [2, 3, 4]
        List of anomaly types to plot. Only iterated over, never modified.
    prefix : str, default "isoforest"
        Prefix used in the CSV filenames, e.g., "isoforest_type2_scores.csv".

    Returns
    -------
    None
    """
    # CSV column name -> label shown on the x axis.
    metric_labels = {
        "f1_score": "F1 Score",
        "precision": "Precision",
        "recall": "Recall",
    }
    score_cols = list(metric_labels)

    os.makedirs(output_dir, exist_ok=True)

    for anomaly_type in anomaly_types:
        print(f"\n📊 Plotting results for anomaly type {anomaly_type}...")

        score_path = os.path.join(score_dir, f"{prefix}_type{anomaly_type}_scores.csv")
        if not os.path.exists(score_path):
            print(f"⚠️ File not found: {score_path}")
            continue

        try:
            df = pd.read_csv(score_path)
        except pd.errors.EmptyDataError:
            # A file with no header and no rows; treat it like an empty table
            # instead of aborting the remaining anomaly types.
            df = pd.DataFrame()

        if df.empty:
            print(f"⚠️ No valid score data for anomaly type {anomaly_type}. Skipping...")
            continue

        missing_cols = [col for col in score_cols if col not in df.columns]
        if missing_cols:
            print(f"⚠️ Missing score columns {missing_cols} in {score_path}. Skipping...")
            continue

        if df[score_cols].isnull().all().all():
            print(f"⚠️ No valid score data for anomaly type {anomaly_type}. Skipping...")
            continue

        # Prepare data: one row per (metric, score) pair for seaborn.
        data_melted = (
            df[score_cols]
            .rename(columns=metric_labels)
            .melt(var_name="Metric", value_name="Score")
        )

        out_path = os.path.join(output_dir, f"{prefix}_boxplot_type{anomaly_type}.png")

        # Plot on an explicit figure and always close it, so a failure while
        # drawing or saving does not leave an open figure behind.
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            sns.boxplot(data=data_melted, x="Metric", y="Score", palette="Set2", ax=ax)
            ax.set_title(f"Distribution of F1, Precision, and Recall — Anomaly Type {anomaly_type}")
            ax.grid(True, linestyle='--', alpha=0.6)
            fig.tight_layout()
            fig.savefig(out_path)
        finally:
            plt.close(fig)

        print(f"✅ Saved boxplot to {out_path}")

In [None]:
# Render one boxplot per anomaly type from the LOF score CSVs.
plot_and_save_score_boxplots(
    score_dir="bdg2_lof_scores",   # same folder name the LOF evaluation cell passes as output_dir
    output_dir="bdg2_lof_plots",
    anomaly_types=[2, 3, 4],
    prefix="lof"              # score files are named lof_type{N}_scores.csv; the function default is "isoforest"
)


📊 Plotting results for anomaly type 2...
✅ Saved boxplot to plots/lof_boxplot_type2.png

📊 Plotting results for anomaly type 3...
✅ Saved boxplot to plots/lof_boxplot_type3.png

📊 Plotting results for anomaly type 4...
✅ Saved boxplot to plots/lof_boxplot_type4.png
