In [None]:
!pip install openpyxl mlflow pandas scikit-learn==1.5.2 oracledb python-dotenv tensorflow scikeras xgboost lightgbm seaborn --proxy http://172.25.156.90:3128

In [None]:
from db_utils import connect_to_oracle, get_data_for_anomaly_type
from pipeline_utils import create_pipelines, evaluate_pipeline, evaluate_pipeline_oodd, create_pipelines_for_probs, evaluate_pipeline_from_probs
from sklearn.model_selection import train_test_split
import sys
sys.path.append('./src/oodd_detectors')
import mlflow
from continuous_OODD import OODDContinuousModel
from categorical_OODD import OODDCategoricalModel, OODDSmoothedCategoricalModel
from counter_OODD import CountBasedClassifier, FallbackCountClassifier
from plot_utils import compare_models_from_components
from plot_utils import plot_target_distribution
from sklearn.preprocessing import LabelEncoder
import pandas as pd
# Needs scikit-learn 1.5.2: the default 1.6.1 raises "'super' object has no attribute '__sklearn_tags__'".
# from scikeras.wrappers import KerasClassifier
# Open the database session once for the whole notebook; `cursor` is what the
# data-loading cell hands to get_data_for_anomaly_type.
# NOTE(review): neither `connection` nor `cursor` is closed in the visible cells —
# confirm cleanup happens elsewhere.
connection, cursor = connect_to_oracle()

In [None]:
# Which anomaly data set to pull, and the value passed as `limit`
# (presumably a cap on the number of fetched rows — confirm in db_utils).
ANOMALY_TYPE = 'HEATING_TYPE_WITH_GOODS_WITH_CHASSIS'
ROW_LIMIT = 200000

# The loader returns the frame plus the feature column names and the target column name.
df, X_cols, y_col = get_data_for_anomaly_type(cursor, ANOMALY_TYPE, limit=ROW_LIMIT)
# plot_target_distribution(df, y_col)

In [None]:
# df, X_cols, y_col = get_data_for_anomaly_type(cursor, 'HEATING_TYPE', limit=200000)

# Fresh result store for this experiment: label -> (TP, FP, TN, FN).
model_dict = {}
# The target column stays inside the frames; evaluate_pipeline_oodd is told
# which column it is through `target_col`.
X_train, X_test = train_test_split(df, test_size=0.1, random_state=42)

# (report label, detector class) pairs, evaluated in this order.
# The three OODD* detectors are disabled; uncomment an entry to include it again.
oodd_detectors = (
    # ("OODDContinuousModel", OODDContinuousModel),
    # ("OODDCategoricalModel", OODDCategoricalModel),
    # ("OODDSmoothedCategoricalModel", OODDSmoothedCategoricalModel),
    ("FBOD", CountBasedClassifier),
    ("FBODBackOff", FallbackCountClassifier),
)
for oodd_label, oodd_cls in oodd_detectors:
    print("\n" + oodd_label + ":")
    TP, FP, TN, FN = evaluate_pipeline_oodd(
        oodd_cls(X_cols, y_col),
        X_train,
        X_test,
        target_col=y_col,
        type='categorical3',
    )
    model_dict[oodd_label] = (TP, FP, TN, FN)

compare_models_from_components(model_dict)

In [None]:
# model_dict = {}
# Results are appended to the existing model_dict so the comparison plot shows
# these pipelines next to the entries already collected.

# Encode the target as integer class ids; the Series reuses df's index so the
# labels stay aligned with their rows through the split.
le = LabelEncoder()
encoded_target = le.fit_transform(df[y_col])
y = pd.Series(encoded_target, index=df.index, name=y_col)

# Same test_size and random_state as the other splits in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.1,
    random_state=42,
)

for pipeline in create_pipelines(X_cols):
    # The result key is the name of the pipeline's last step, prefixed with 'MD:'.
    final_step_name = pipeline.steps[-1][0]
    name = 'MD:' + final_step_name
    TP, FP, TN, FN = evaluate_pipeline(
        pipeline, X_train, y_train, X_test, y_test, verbose=False
    )
    model_dict[name] = (TP, FP, TN, FN)

compare_models_from_components(model_dict)

In [None]:
# model_dict = {}
# Reuses X_train / y_train / X_test / y_test from the previous cell and keeps
# appending to model_dict, so earlier results stay in the comparison.
for pipeline in create_pipelines_for_probs(X_cols):
    final_step_name = pipeline.steps[-1][0]
    name = 'CBAD:' + final_step_name
    print('\n' + name)
    TP, FP, TN, FN = evaluate_pipeline_from_probs(
        pipeline,
        X_train,
        y_train,
        X_test,
        y_test,
        threshold=0.5,
        target_col=y_col,
    )
    model_dict[name] = (TP, FP, TN, FN)

compare_models_from_components(model_dict)

In [None]:
def tahat(results: dict, name: str) -> dict:
    """
    Summarise confusion-matrix counts per model and print them as a TeX table.

    Parameters
    ----------
    results : dict
        Maps a model name to a 4-item sequence ``(TP, FP, TN, FN)``.
    name : str
        Experiment title. It is appended to the table title and caption, and
        (with spaces removed) to the ``table...`` label.

    Returns
    -------
    dict
        ``{model: {'TP', 'FP', 'TN', 'FN', 'Accuracy', 'Recall', 'Precision', 'F1'}}``.
        TP/FP/TN/FN are stored as the raw counts that were passed in.

    Notes
    -----
    * In the printed table the TP/FP/TN/FN columns are *fractions of that
      model's total*, not raw counts; rows are sorted by F1, best first.
    * Every ratio falls back to 0 when its denominator is 0, including the
      per-row fractions, so an all-zero entry prints a row of zeros.
    """
    # 1) Compute the metrics for every model.
    metrics = {}
    for model, (tp, fp, tn, fn) in results.items():
        total = tp + fp + tn + fn
        accuracy = (tp + tn) / total if total else 0
        recall = tp / (tp + fn) if (tp + fn) else 0
        precision = tp / (tp + fp) if (tp + fp) else 0
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0

        metrics[model] = {
            'TP': tp, 'FP': fp, 'TN': tn, 'FN': fn,
            'Accuracy': accuracy,
            'Recall': recall,
            'Precision': precision,
            'F1': f1
        }

    # 2) Print the TeX table.
    print(r"\midinsert \clabel[table" + name.replace(' ', '') + "]{Model Comparison " + name + "}")
    print(r"\ctable{l|rrrrrrr|r}{")
    print(r"Model & TP & FP & TN & FN & Accuracy & Recall & Precision & F1 \crli \tskip4pt")
    # sort models by F1
    for model, vals in sorted(metrics.items(), key=lambda item: item[1]['F1'], reverse=True):
        counts = [vals[key] for key in ('TP', 'FP', 'TN', 'FN')]
        total = sum(counts)
        # Same zero guard as the metric computation: an empty confusion matrix
        # must not crash the table with a ZeroDivisionError.
        tp_share, fp_share, tn_share, fn_share = (
            (count / total if total else 0) for count in counts
        )
        print(f"{model} & {tp_share:.3f} & {fp_share:.3f} & {tn_share:.3f} & {fn_share:.3f} "
              f"& {vals['Accuracy']:.3f} & {vals['Recall']:.3f} & {vals['Precision']:.3f} & {vals['F1']:.3f} \\cr")
    print(r"}")
    print(r"\caption/t Performance metrics per model in " + name + " (sorted by F1)")
    print(r"\endinsert")

    return metrics
# Print the TeX table for every model collected so far. As the cell's last
# expression, the returned metrics dict is displayed as well.
tahat(results=model_dict, name="Heating Type Anomaly")

In [None]:
# Start a new comparison (raw vs. text-normalised features), so the result
# store is reset and the data is split again with the same seed.
model_dict = {}
X_train, X_test = train_test_split(df, test_size=0.1, random_state=42)

# (report label, detector class) pairs, evaluated in this order on the raw features.
# The three OODD* detectors are disabled; uncomment an entry to include it again.
oodd_detectors = (
    # ("OODDContinuousModel", OODDContinuousModel),      # was run with type='categorical3'
    # ("OODDCategoricalModel", OODDCategoricalModel),    # was run with type='categorical3'
    # ("OODDSmoothedCategoricalModel", OODDSmoothedCategoricalModel),  # was run with type='categorical3'
    ("FBOD", CountBasedClassifier),
    ("FBODBackOff", FallbackCountClassifier),
)
for oodd_label, oodd_cls in oodd_detectors:
    print("\n" + oodd_label + ":")
    TP, FP, TN, FN = evaluate_pipeline_oodd(
        oodd_cls(X_cols, y_col),
        X_train,
        X_test,
        target_col=y_col,
        type='categorical',
    )
    model_dict[oodd_label] = (TP, FP, TN, FN)

import unicodedata
import re
def preprocess_text(text):
    """
    Reduce a string to upper-case ASCII letters and digits separated by single spaces.

    Steps, in order:
      1. NFKD-normalise and drop every character that has no ASCII form
         (accents are stripped; characters without a decomposition vanish).
      2. Turn each whitespace character into a plain space.
      3. Upper-case, then turn every character outside ``A-Z0-9`` into a space.
      4. Collapse runs of spaces and trim both ends.

    ``text`` must be a ``str``; callers convert with ``astype(str)`` first.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')
    spaced = re.sub(r'\s', ' ', ascii_only)
    alnum_only = re.sub(r'[^A-Z0-9]', ' ', spaced.upper())
    return re.sub(r' +', ' ', alnum_only).strip()

# Normalise the feature columns of both splits with preprocess_text. Each split
# is copied first, and every feature is cast to str before normalising.
normalized_splits = []
for split_frame in (X_train, X_test):
    normalized = split_frame.copy()
    for feature in X_cols:
        normalized[feature] = normalized[feature].astype(str)
        normalized[feature] = normalized[feature].apply(preprocess_text)
    normalized_splits.append(normalized)
X_train, X_test = normalized_splits

# Re-run the same two detectors on the normalised features. The printed label
# is unchanged; the result is stored under a "-preprocess" key.
preprocessed_runs = (
    ("\nFBOD:", "FBOD-preprocess", CountBasedClassifier),
    ("\nFBODBackOff:", "FBODBackOff-preprocess", FallbackCountClassifier),
)
for banner, result_key, oodd_cls in preprocessed_runs:
    print(banner)
    TP, FP, TN, FN = evaluate_pipeline_oodd(
        oodd_cls(X_cols, y_col),
        X_train,
        X_test,
        target_col=y_col,
        type='categorical',
    )
    model_dict[result_key] = (TP, FP, TN, FN)

from plot_utils import compare_models_from_components
compare_models_from_components(model_dict)

In [None]:
# TeX table for the raw vs. text-normalised FBOD comparison collected above.
tahat(results=model_dict, name="Compare preprocessing FBOD")