In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_path = "/home/ujx4ab/ondemand/WindTurbine_KL_Divergence/Data/combined_data.csv"
target_property = "gearbox_temp_bin"

filtered_df = pd.read_csv(data_path)
filtered_df.drop('timestamp', axis=1, inplace=True)

X = filtered_df.drop(target_property, axis=1)
y = filtered_df[target_property]

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [3]:
# Define helper functions for Optuna
def instantiate_pipeline(trial: optuna.Trial, model) -> Pipeline:
    # Numerical imputer
    imputer = SimpleImputer(strategy=trial.suggest_categorical('impute_strategy', ['mean', 'median', 'most_frequent']))
    
    # Standard scaler
    scaler = StandardScaler()
    
    # Preprocessing pipeline for all features
    preprocessor = Pipeline([
        ('imputer', imputer),
        ('scaler', scaler)
    ])
    
    # Combine preprocessor and model
    return Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

def instantiate_random_forest(trial: optuna.Trial) -> RandomForestClassifier:
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 5, 50)
    return RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)

def instantiate_logistic_regression(trial: optuna.Trial) -> LogisticRegression:
    C = trial.suggest_float('C', 1e-4, 10, log=True)
    solver = trial.suggest_categorical('solver', ['liblinear', 'lbfgs'])
    return LogisticRegression(C=C, solver=solver, random_state=42)

def instantiate_knn(trial: optuna.Trial) -> KNeighborsClassifier:
    n_neighbors = trial.suggest_int('n_neighbors', 3, 20)
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    return KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)

In [4]:
# Define the objective function for Optuna
def objective(trial, X, y, model_type) -> float:
    if model_type == 'random_forest':
        model = instantiate_random_forest(trial)
    elif model_type == 'logistic_regression':
        model = instantiate_logistic_regression(trial)
    elif model_type == 'knn':
        model = instantiate_knn(trial)
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

    pipeline = instantiate_pipeline(trial, model)
    
    # Cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scorer = make_scorer(accuracy_score)  # Use accuracy for multi-class classification
    scores = cross_val_score(pipeline, X, y, scoring=scorer, cv=kf)
    return np.mean(scores)

In [5]:
# Run Optuna optimization
def optimize_hyperparameters(X, y, model_type, n_trials=50):
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X, y, model_type), n_trials=n_trials)
    print(f"Best trial for {model_type}:")
    print(study.best_trial.params)
    return study

In [6]:
# Run optimization for each model type
study_rf = optimize_hyperparameters(X_train, y_train, 'random_forest', n_trials=15)

[I 2024-12-14 01:30:11,366] A new study created in memory with name: no-name-0180a877-fd17-48fd-9c51-c418a4811b34
[I 2024-12-14 01:35:38,013] Trial 0 finished with value: 0.7627612788059626 and parameters: {'n_estimators': 116, 'max_depth': 16, 'impute_strategy': 'median'}. Best is trial 0 with value: 0.7627612788059626.
[I 2024-12-14 01:38:55,917] Trial 1 finished with value: 0.7380804397511133 and parameters: {'n_estimators': 79, 'max_depth': 14, 'impute_strategy': 'mean'}. Best is trial 0 with value: 0.7627612788059626.
[I 2024-12-14 01:43:23,095] Trial 2 finished with value: 0.6945925017737421 and parameters: {'n_estimators': 209, 'max_depth': 6, 'impute_strategy': 'mean'}. Best is trial 0 with value: 0.7627612788059626.
[I 2024-12-14 02:00:48,273] Trial 3 finished with value: 0.9287171049023801 and parameters: {'n_estimators': 262, 'max_depth': 45, 'impute_strategy': 'mean'}. Best is trial 3 with value: 0.9287171049023801.
[I 2024-12-14 02:14:34,617] Trial 4 finished with value: 0

In [None]:
study_lr = optimize_hyperparameters(X_train, y_train, 'logistic_regression', n_trials=30)

[I 2024-12-13 22:36:44,753] A new study created in memory with name: no-name-5d56456b-6dc2-45cc-b644-e572e0322b59
[I 2024-12-13 22:37:03,951] Trial 0 finished with value: 0.6770581970209993 and parameters: {'C': 0.00043264085490664963, 'solver': 'lbfgs', 'impute_strategy': 'median'}. Best is trial 0 with value: 0.6770581970209993.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/

Best trial for logistic_regression:
{'C': 1.0974322879000786, 'solver': 'lbfgs', 'impute_strategy': 'median'}


In [None]:
study_knn = optimize_hyperparameters(X_train, y_train, 'knn', n_trials=15)

[I 2024-12-13 23:04:38,547] A new study created in memory with name: no-name-b4da92f1-cf68-4a3b-9b0a-eacf4283e266
[I 2024-12-13 23:05:15,881] Trial 0 finished with value: 0.7241785492870427 and parameters: {'n_neighbors': 6, 'weights': 'uniform', 'impute_strategy': 'median'}. Best is trial 0 with value: 0.7241785492870427.
[I 2024-12-13 23:05:57,770] Trial 1 finished with value: 0.7136534884829987 and parameters: {'n_neighbors': 13, 'weights': 'uniform', 'impute_strategy': 'most_frequent'}. Best is trial 0 with value: 0.7241785492870427.
[I 2024-12-13 23:06:28,182] Trial 2 finished with value: 0.9236428378399861 and parameters: {'n_neighbors': 18, 'weights': 'distance', 'impute_strategy': 'mean'}. Best is trial 2 with value: 0.9236428378399861.
[I 2024-12-13 23:06:59,068] Trial 3 finished with value: 0.9231452779499897 and parameters: {'n_neighbors': 17, 'weights': 'distance', 'impute_strategy': 'median'}. Best is trial 2 with value: 0.9236428378399861.
[I 2024-12-13 23:07:38,796] Tria

Best trial for knn:
{'n_neighbors': 18, 'weights': 'distance', 'impute_strategy': 'mean'}


In [None]:
# Evaluate the best models on the test set
def evaluate_model(X_train, X_test, y_train, y_test, study, model_type):
    best_params = study.best_params
    if model_type == 'random_forest':
        best_model = instantiate_random_forest(optuna.trial.FixedTrial(best_params))
    elif model_type == 'logistic_regression':
        best_model = instantiate_logistic_regression(optuna.trial.FixedTrial(best_params))
    elif model_type == 'knn':
        best_model = instantiate_knn(optuna.trial.FixedTrial(best_params))
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

    pipeline = instantiate_pipeline(optuna.trial.FixedTrial(best_params), best_model)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"Test accuracy for {model_type}: {test_accuracy}")
    return test_accuracy

In [None]:
evaluate_model(X_train, X_test, y_train, y_test, study_rf, 'random_forest')

In [None]:
evaluate_model(X_train, X_test, y_train, y_test, study_lr, 'logistic_regression')

In [None]:
evaluate_model(X_train, X_test, y_train, y_test, study_knn, 'knn')