In [None]:
!pip install openpyxl mlflow pandas scikit-learn==1.5.2 oracledb python-dotenv tensorflow scikeras xgboost lightgbm seaborn --proxy http://172.25.156.90:3128

In [None]:
# Helper modules for data access and pipeline evaluation (presumably
# project-local and importable from the working directory -- TODO confirm).
from db_utils import connect_to_oracle, get_data_for_anomaly_type
from pipeline_utils import create_pipelines, evaluate_pipeline, evaluate_pipeline_oodd, create_pipelines_for_probs, evaluate_pipeline_from_probs
from sklearn.model_selection import train_test_split
import sys
# Extend the module search path BEFORE the imports below; presumably the
# detector and preprocessor modules live in these folders -- TODO confirm.
# The paths are relative, so the notebook must be run from the project root.
sys.path.append('./src/oodd_detectors')
sys.path.append('./src/preprocessors')
import mlflow
from continuous_OODD import OODDContinuousModel
from categorical_OODD import OODDCategoricalModel, OODDSmoothedCategoricalModel
from counter_OODD import CountBasedClassifier, FallbackCountClassifier
from target_preprocessor import create_target_pipeline
from plot_utils import compare_models_from_components
from plot_utils import plot_target_distribution, print_latex_table
from sklearn.preprocessing import LabelEncoder
import pandas as pd
# Requires scikit-learn 1.5.2: the default 1.6.1 raises
# "'super' object has no attribute '__sklearn_tags__'".
# from scikeras.wrappers import KerasClassifier
# Open the database connection and cursor; the cursor is passed to
# get_data_for_anomaly_type in the data-loading cell.
# NOTE(review): no visible cell closes `connection` or `cursor` -- confirm
# that cleanup happens elsewhere.
connection, cursor = connect_to_oracle()

In [None]:
# Anomaly type analysed by this notebook; the same identifier selects both the
# rows to fetch and the target pipeline.
ANOMALY_TYPE = 'IM_SP_REF_WITHOUT_PR'

# Fetch the data together with the feature column names and the target column
# name (limit presumably caps the number of rows fetched -- TODO confirm).
df, X_cols, y_col = get_data_for_anomaly_type(cursor, ANOMALY_TYPE, limit=200000)
# plot_target_distribution(df, y_col)

# Replace the raw target column with the output of the target pipeline.
y_pipe = create_target_pipeline(y_col, ANOMALY_TYPE)
transformed_target = y_pipe.fit_transform(df[y_col])
df[y_col] = transformed_target

print(X_cols)

In [None]:
# df, X_cols, y_col = get_data_for_anomaly_type(cursor, 'HEATING_TYPE', limit=200000)
# Imported at the top of the cell rather than between evaluation runs; the
# module is found through the sys.path entries added in the imports cell.
from text_preprocessor import TextPreprocessor


def _evaluate_count_detectors(frame, results, suffix=""):
    """Split ``frame`` and evaluate both count-based detectors on it.

    Args:
        frame: Table containing the feature columns ``X_cols`` and the target
            column ``y_col`` (the notebook's ``df`` or a preprocessed copy).
        results: Dict updated in place; each detector's ``(TP, FP, TN, FN)``
            tuple is stored under ``label + suffix`` as soon as it is computed.
        suffix: Text appended to the detector label to build the result key.

    Returns:
        None. ``results`` is mutated.
    """
    # Identical split parameters for every variant keep the runs comparable.
    train_part, test_part = train_test_split(frame, test_size=0.1, random_state=42)
    detectors = (
        ("FBOD", CountBasedClassifier),
        ("FBODBackOff", FallbackCountClassifier),
    )
    for label, detector_cls in detectors:
        key = label + suffix
        # Print the full key so raw and preprocessed runs can be told apart
        # in the cell output (both used to print the bare detector label).
        print(f"\n{key}:")
        TP, FP, TN, FN = evaluate_pipeline_oodd(
            detector_cls(X_cols, y_col), train_part, test_part,
            target_col=y_col, type='categorical')
        results[key] = (TP, FP, TN, FN)


# Fresh result store; later cells add their own entries to it.
model_dict = {}

# Baseline: detectors on the data as loaded. This runs before the text
# preprocessing step below, exactly as in the original cell order.
_evaluate_count_detectors(df, model_dict)

# Same detectors after text preprocessing of the feature columns.
# NOTE(review): the preprocessor is fitted on the whole frame before the
# train/test split; if it learns anything from the data, test rows influence
# it -- confirm it is stateless or fit it on the training part only.
new_df = TextPreprocessor(X_cols).fit_transform(df)
_evaluate_count_detectors(new_df, model_dict, suffix="-preprocess")

compare_models_from_components(model_dict)

In [None]:
# model_dict is deliberately not reset here: the entries below are added to
# the results collected by the previous cell.

# Hold-out split reused by the probability-based cell that follows.
# NOTE(review): X_train / X_test are split from the full frame and therefore
# still contain the target column; presumably the pipelines select X_cols
# themselves -- confirm.
y = df[y_col]
X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.1, random_state=42
)

for md_pipeline in create_pipelines(X_cols):
    # The name of the final pipeline step identifies the model.
    final_step_name = md_pipeline.steps[-1][0]
    TP, FP, TN, FN = evaluate_pipeline(
        md_pipeline, X_train, y_train, X_test, y_test, verbose=False
    )
    model_dict['MD:' + final_step_name] = (TP, FP, TN, FN)

compare_models_from_components(model_dict)

In [None]:
# model_dict is deliberately not reset here: results keep accumulating across
# cells. The train/test split comes from the previous cell.
for prob_pipeline in create_pipelines_for_probs(X_cols):
    # The name of the final pipeline step identifies the model.
    final_step_name = prob_pipeline.steps[-1][0]
    model_key = 'CBAD:' + final_step_name
    print(f"\n{model_key}")
    TP, FP, TN, FN = evaluate_pipeline_from_probs(
        prob_pipeline, X_train, y_train, X_test, y_test,
        threshold=0.5, target_col=y_col,
    )
    model_dict[model_key] = (TP, FP, TN, FN)

compare_models_from_components(model_dict)

In [None]:
# Emit the accumulated results of all evaluated models; the second argument is
# presumably the table caption/title -- TODO confirm against plot_utils.
table_title = "Additional Reference Import"
print_latex_table(model_dict, table_title)