In [None]:
!pip install openpyxl mlflow pandas scikit-learn==1.5.2 oracledb python-dotenv tensorflow scikeras xgboost lightgbm seaborn --proxy http://172.25.156.90:3128

In [None]:
from db_utils import connect_to_oracle, get_data_for_anomaly_type
from pipeline_utils import create_pipelines, evaluate_pipeline, evaluate_pipeline_oodd, create_pipelines_for_probs, evaluate_pipeline_from_probs
from sklearn.model_selection import train_test_split
import sys
sys.path.append('./src/oodd_detectors')
import mlflow
from continuous_OODD import OODDContinuousModel
from categorical_OODD import OODDCategoricalModel, OODDSmoothedCategoricalModel
from counter_OODD import CountBasedClassifier, FallbackCountClassifier
# Needs scikit-learn 1.5.2: the default 1.6.1 raises "'super' object has no attribute '__sklearn_tags__'".
# from scikeras.wrappers import KerasClassifier
# Open the Oracle connection once for the notebook; `cursor` is what the
# data-loading cell passes to get_data_for_anomaly_type.
connection, cursor = connect_to_oracle()

In [None]:
# Which anomaly type to load and the `limit` handed to the loader
# (presumably a row cap — confirm in db_utils).
ANOMALY_TYPE = 'BHT_WITH_PLACE'
DATA_LIMIT = 50000

# The loader returns the frame plus X_cols and y_col; later cells index df[y_col]
# and pass X_cols to the pipeline factories.
df, X_cols, y_col = get_data_for_anomaly_type(cursor, ANOMALY_TYPE, limit=DATA_LIMIT)

In [None]:
from plot_utils import plot_target_distribution

# Visual check of the y_col column of df before modelling
# (the actual chart is defined in plot_utils).
plot_target_distribution(df, y_col)

In [None]:
# Target vector. y_col is intentionally NOT dropped from df: the OODD cell further
# down splits df again and passes target_col=y_col, so it presumably needs the column.
y = df[y_col].copy()

# NOTE(review): X_train / X_test therefore still contain y_col. This is only safe if
# the pipelines built from X_cols select their own feature columns — confirm in
# pipeline_utils.
X_train, X_test, y_train, y_test = train_test_split(
    df, y, random_state=42, test_size=0.1)

# Fit and score every candidate pipeline on the same 90/10 split.
candidate_pipelines = create_pipelines(X_cols)
for pipeline in candidate_pipelines:
    evaluate_pipeline(
        pipeline, X_train, y_train, X_test, y_test, verbose=False)

In [None]:
# To evaluate a different anomaly type, reload the data first, e.g.:
# df, X_cols, y_col = get_data_for_anomaly_type(cursor, 'HEATING_TYPE', limit=200000)

# The OODD models receive the full frame (target column included) and are told
# which column is the target through target_col.
X_train, X_test = train_test_split(df, test_size=0.1, random_state=42)

# Every detector is built and evaluated the same way, so iterate over the classes
# instead of repeating the call five times. The printed header is the class name,
# which keeps the label and the evaluated model from drifting apart.
OODD_MODEL_CLASSES = (
    OODDContinuousModel,
    OODDCategoricalModel,
    OODDSmoothedCategoricalModel,
    CountBasedClassifier,
    FallbackCountClassifier,
)

for model_cls in OODD_MODEL_CLASSES:
    print(f"\n{model_cls.__name__}:")
    evaluate_pipeline_oodd(model_cls(X_cols, y_col), X_train, X_test,
                           target_col=y_col, type='categorical2')

In [None]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Encode the target labels as integers; the Series reuses df's index and the
# original column name so it stays aligned with df in the split below.
le = LabelEncoder()
encoded_target = le.fit_transform(df[y_col])
y = pd.Series(encoded_target, index=df.index, name=y_col)

# Same 90/10 split parameters as the earlier cells (random_state=42).
X_train, X_test, y_train, y_test = train_test_split(
    df, y, random_state=42, test_size=0.1)

# Evaluate each probability-producing pipeline, labelled by the name of its last step.
prob_pipelines = create_pipelines_for_probs(X_cols)
for pipeline in prob_pipelines:
    final_step_name = pipeline.steps[-1][0]
    print('\n' + final_step_name)
    evaluate_pipeline_from_probs(
        pipeline, X_train, y_train, X_test, y_test,
        threshold=0.5, target_col=y_col,
    )