In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, cross_val_score
import optuna
from model_utils import run_classifier
import joblib
import os


In [None]:
# Load the pre-split train/test features and targets (read in the order listed).
X_train, y_train, X_test, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../04_modelling/dataset/X_train.csv",
        "../04_modelling/dataset/y_train.csv",
        "../04_modelling/dataset/X_test.csv",
        "../04_modelling/dataset/y_test.csv",
    )
)
# squeeze() collapses a single-column target frame into a Series.
y_train, y_test = y_train.squeeze(), y_test.squeeze()

In [None]:
import pandas as pd

# Running log of evaluated models: one row per recorded model.
model_records = pd.DataFrame(
    columns=[
        "Model Name",
        "Hyperparameters",
        "Test Accuracy",
        "Test Precision",
        "Test Recall",
    ]
)

def record_trained_model(model_name, params, mean_cv_accuracy, test_metrics):
    """
    Append one model's test results to the module-level ``model_records`` table.

    Args:
        model_name (str): Label written to the "Model Name" column.
        params (dict): Hyperparameters, stored unchanged in the "Hyperparameters" column.
        mean_cv_accuracy (float): Accepted by the signature but not stored or
            otherwise used by this function.
        test_metrics (dict): Must contain "accuracy", "precision" and "recall";
            each value is stored as a percentage string with two decimals.

    Returns:
        None. ``model_records`` is rebound to a new DataFrame containing the
        extra row, and a confirmation line is printed.
    """
    global model_records  # the name is rebound below, hence the global declaration

    # Metrics are kept as display strings (e.g. "44.87%"), not as floats.
    accuracy_text = f"{test_metrics['accuracy']:.2%}"
    precision_text = f"{test_metrics['precision']:.2%}"
    recall_text = f"{test_metrics['recall']:.2%}"

    new_row = pd.DataFrame(
        [
            {
                "Model Name": model_name,
                "Hyperparameters": params,
                "Test Accuracy": accuracy_text,
                "Test Precision": precision_text,
                "Test Recall": recall_text,
            }
        ]
    )

    # ignore_index keeps the row labels a simple 0..n-1 range.
    model_records = pd.concat([model_records, new_row], ignore_index=True)

    print(f"Model '{model_name}' recorded successfully!")

In [None]:
# Baseline: DecisionTreeClassifier with default hyperparameters.
# random_state is pinned to the seed used by the other trees in this notebook:
# scikit-learn permutes the features at every split, so an unseeded tree can
# give different results from one run to the next.
dtree_default = DecisionTreeClassifier(random_state=123)

print("Decision Tree with Default Parameters")
# Empty search space: nothing to tune. The return value is what later cells call
# .predict() on (presumably the fitted estimator - confirm in model_utils).
best_model_dtree_default = run_classifier(dtree_default, {}, X_train, y_train, X_test, y_test, "Default Decision Tree")

In [None]:
# Record default parameter model performance
# Predict once and reuse the labels for every metric instead of re-running
# inference on X_test for each score.
default_test_pred = best_model_dtree_default.predict(X_test)
default_test_metrics = {
    "accuracy": accuracy_score(y_test, default_test_pred),
    "precision": precision_score(y_test, default_test_pred, average='weighted'),
    "recall": recall_score(y_test, default_test_pred, average='weighted')
}
# No CV score is available here, so np.nan is passed for mean_cv_accuracy.
record_trained_model("Decision Tree with Default Parameters", {}, np.nan, default_test_metrics)

In [None]:
# Train Decision Tree with Hyperparameter Tuning
# The base estimator is seeded (same seed as the Optuna and pruned trees) so the
# tree-building step itself is repeatable between runs.
dtree_tuned = DecisionTreeClassifier(random_state=123)

# Search space handed to run_classifier.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': np.arange(1, 20, 2),  # odd depths 1, 3, ..., 19
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 10]
}

# The return value is what later cells call .predict() on (presumably the best
# fitted estimator from the search - confirm in model_utils).
best_model_dtree_tuned = run_classifier(dtree_tuned, param_grid, X_train, y_train, X_test, y_test, "Tuned Decision Tree")

In [None]:
# Record tuned parameter model performance
# Predict once and reuse the labels for every metric instead of re-running
# inference on X_test for each score.
tuned_test_pred = best_model_dtree_tuned.predict(X_test)
tuned_test_metrics = {
    "accuracy": accuracy_score(y_test, tuned_test_pred),
    "precision": precision_score(y_test, tuned_test_pred, average='weighted'),
    "recall": recall_score(y_test, tuned_test_pred, average='weighted')
}
# get_params() records the full parameter set of the returned model.
record_trained_model("Decision Tree with Tuned Parameters", best_model_dtree_tuned.get_params(), np.nan, tuned_test_metrics)

In [None]:
# Optimize Decision Tree Parameters using Optuna
def objective(trial):
    """
    Optuna objective: mean 5-fold cross-validated accuracy of a decision tree.

    Args:
        trial: Optuna trial used to sample criterion, splitter, max_depth,
            min_samples_split and min_samples_leaf.

    Returns:
        Mean of the ``cross_val_score`` accuracies computed on the notebook-level
        ``X_train`` / ``y_train``.
    """
    # Sampling order is kept as criterion -> splitter -> depth -> split -> leaf.
    sampled_params = {
        "criterion": trial.suggest_categorical("criterion", ['gini', 'entropy']),
        "splitter": trial.suggest_categorical("splitter", ['best', 'random']),
        "max_depth": trial.suggest_int("max_depth", 1, 20),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
    }

    # Fixed seed so two trials with the same parameters score the same.
    candidate = DecisionTreeClassifier(random_state=123, **sampled_params)

    fold_accuracies = cross_val_score(candidate, X_train, y_train, cv=5, scoring="accuracy")
    return np.mean(fold_accuracies)

In [None]:
# Run Optuna optimization
# TPE is Optuna's default sampler; seeding it makes the sequence of trials (and
# therefore the reported best parameters) repeatable across kernel restarts.
study = optuna.create_study(
    direction="maximize",  # the objective returns accuracy, so higher is better
    sampler=optuna.samplers.TPESampler(seed=123),
)
study.optimize(objective, n_trials=20)

In [None]:
# Pull the parameter dict of the best-scoring trial out of the study
best_params = study.best_trial.params
print("\nBest Hyperparameters from Optuna:", best_params)

In [None]:
# Train Decision Tree with Optuna Tuned Parameters
# Fit the estimator built from the Optuna result directly. Sending it back
# through run_classifier with dtree_tuned/param_grid would replace it with a
# second random-search model and discard the Optuna parameters.
best_model_dtree_optuna = DecisionTreeClassifier(**best_params, random_state=123)
best_model_dtree_optuna.fit(X_train, y_train)

In [None]:
# Evaluate the Optuna-tuned model
# Predict once and reuse the labels for every metric instead of re-running
# inference on X_test for each score.
optuna_test_pred = best_model_dtree_optuna.predict(X_test)
optuna_test_metrics = {
    "accuracy": accuracy_score(y_test, optuna_test_pred),
    "precision": precision_score(y_test, optuna_test_pred, average='weighted'),
    "recall": recall_score(y_test, optuna_test_pred, average='weighted')
}
record_trained_model("Decision Tree with Optuna Parameters", best_params, np.nan, optuna_test_metrics)

In [None]:
# Phase 4: pruned decision tree
# Same Optuna hyperparameters and seed as the previous model, plus ccp_alpha=0.01.
dtree_pruned = DecisionTreeClassifier(
    ccp_alpha=0.01,
    random_state=123,
    **best_params,
)
print("Training Decision Tree with Advanced Techniques (e.g., Pruning)...")
dtree_pruned.fit(X_train, y_train)

In [None]:
# Evaluate the advanced pruned model
# Predict once and reuse the labels for every metric instead of re-running
# inference on X_test for each score.
pruned_test_pred = dtree_pruned.predict(X_test)
test_metrics = {
    "accuracy": accuracy_score(y_test, pruned_test_pred),
    "precision": precision_score(y_test, pruned_test_pred, average='weighted'),
    "recall": recall_score(y_test, pruned_test_pred, average='weighted')
}
record_trained_model("Pruned Decision Tree with Optuna Parameters", dtree_pruned.get_params(), np.nan, test_metrics)

### Evaluate and Compare Models

In [None]:
# Display Results in a Table
# "Test Accuracy" holds strings such as "44.87%". Sorting those directly is
# lexicographic (e.g. "9.00%" would rank above "44.87%"), so order by the parsed
# numeric value instead, most accurate model first.
results_df = model_records.sort_values(
    by="Test Accuracy",
    key=lambda column: pd.to_numeric(column.astype(str).str.rstrip("%")),
    ascending=False,
)
print("Model Performance Comparison:")
results_df

In [None]:
# Select the best model by test accuracy (highest wins).
# The column stores "xx.xx%" strings, so parse it to numbers and look the row up
# by label; the choice then does not depend on how results_df is sorted.
accuracy_values = pd.to_numeric(results_df["Test Accuracy"].astype(str).str.rstrip("%"))
best_model_info = results_df.loc[accuracy_values.idxmax()]
print("\nBest Model:")
best_model_info

In [None]:
print("\nEvaluate and Compare Models")
# Map display names to the estimators that are actually used for prediction.
# For the default and tuned trees that is the object returned by run_classifier
# (the same one the earlier metric cells call .predict() on), not the raw
# estimator that was handed to it, which may never have been fitted itself.
models = {
    "Default Decision Tree": best_model_dtree_default,
    "Tuned Decision Tree": best_model_dtree_tuned,
    "Optuna Tuned Desicion Tree": best_model_dtree_optuna,
    "Pruned Optuna Tuned Desicion Tree": dtree_pruned,
}

for name, model in models.items():
    print(f"\n--- Evaluating {name} ---")
    # One prediction pass per model, shared by all three metrics.
    y_test_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred, average='weighted')
    recall = recall_score(y_test, y_test_pred, average='weighted')

    print(f"Accuracy: {accuracy:.2%}")
    print(f"Precision: {precision:.2%}")
    print(f"Recall: {recall:.2%}")

### Select the Best Model

In [None]:
# Score every candidate on the test set, then keep the name with the top accuracy
# (on a tie, max() keeps the first entry in dict order).
test_accuracy_by_name = {
    candidate_name: accuracy_score(y_test, candidate.predict(X_test))
    for candidate_name, candidate in models.items()
}
best_model_name = max(test_accuracy_by_name, key=test_accuracy_by_name.get)
best_model = models[best_model_name]

print(f"\nThe Best Model is '{best_model_name}'")
print(best_model)


### Save the Best Model

In [None]:
def save_model(model, path, model_name="dtree_model.pkl"):
    """
    Persist a model with joblib inside ``path``, creating the directory if needed.

    Args:
        model: Object passed to ``joblib.dump``.
        path (str): Target directory; created when it does not exist yet.
        model_name (str): File name used inside ``path``.
    """
    file_path = os.path.join(path, model_name)

    # exist_ok=True: an already existing directory is not an error.
    os.makedirs(path, exist_ok=True)

    joblib.dump(model, file_path)
    print(f"Model saved successfully at {file_path}!")

# Write the model chosen in the selection cell to the models folder.
save_model(best_model, "../04_modelling/models/", "dtree_model.pkl")

### Archive

In [3]:
# # Step 1: Import Necessary Libraries
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
# from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix
# from matplotlib.colors import ListedColormap
# import seaborn as sns
# import warnings; warnings.filterwarnings('ignore')

# import matplotlib.pyplot as plt
# import lightgbm as lgb
# import xgboost as xgb
# # from catboost import CatBoostRegressor

In [4]:
# # Step 2: Load and Explore Data
# Load datasets
# X_train = pd.read_csv("../04_modelling/dataset/X_train.csv")
# y_train = pd.read_csv("../04_modelling/dataset/y_train.csv")
# # X_val = pd.read_csv("../04_modelling/dataset/X_val.csv")
# # y_val = pd.read_csv("../04_modelling/dataset/y_val.csv")
# X_test = pd.read_csv("../04_modelling/dataset/X_test.csv")
# y_test = pd.read_csv("../04_modelling/dataset/y_test.csv")

In [5]:
# X_train.shape

(5459, 50)

In [6]:
# target_names = y_train['yearly_compensation']

In [7]:
# target_names = target_names.to_numpy()

In [70]:
# def run_classifier(clf, param_grid, title):
#     # -----------------------------------------------------
#     cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)
#     # Randomized grid search
#     n_iter_search = 10
#     gs = RandomizedSearchCV(
#         clf,
#         param_distributions=param_grid,
#         n_iter=n_iter_search,
#         cv=cv,
#         scoring='accuracy'
#     )
#     # -----------------------------------------------------
#     # Train model
#     gs.fit(X_train, y_train)  
#     print("The best parameters are %s" % (gs.best_params_)) 
#     # Predict on test set
#     y_pred = gs.best_estimator_.predict(X_test)
#     # Get Probability estimates
#     y_prob = gs.best_estimator_.predict_proba(X_test)[:, 1]
#     # -----------------------------------------------------
#     print('Accuracy score: %.2f%%' % (accuracy_score(y_test, y_pred)*100))  
#     print('Precision score: %.2f%%' % (precision_score(y_test, y_pred, average='weighted')*100))
#     print('Recall score: %.2f%%' % (recall_score(y_test, y_pred, average='weighted')*100))
#     # -----------------------------------------------------
    # Plot confusion matrix
    # fig, [ax1, ax2] = plt.subplots(1, 2, figsize=(10, 5))
    # cm = confusion_matrix(y_test, y_pred)
    
    # # Ensure target names match unique classes
    # target_names = sorted(set(y_test))  # Unique classes
    
    # sns.heatmap(cm, annot=True, cbar=False, fmt="d", linewidths=.5, cmap="Blues", ax=ax1)
    # ax1.set_title("Confusion Matrix")
    # ax1.set_xlabel("Predicted class")
    # ax1.set_ylabel("Actual class")
    # ax1.set_xticklabels(target_names, rotation=45)
    # ax1.set_yticklabels(target_names)
    # fig.tight_layout()

In [71]:
# def run_classifier(clf, param_grid, title):
#     # -----------------------------------------------------
#     # Cross-Validation Setup
#     cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)  # 5-fold CV for better generalization
#     n_iter_search = 10  # Number of parameter combinations for RandomizedSearch
    
#     gs = RandomizedSearchCV(
#         clf,
#         param_distributions=param_grid,
#         n_iter=n_iter_search,
#         cv=cv,
#         scoring='accuracy',
#         return_train_score=True
#     )
#     # -----------------------------------------------------
#     # Perform Cross-Validation and Hyperparameter Tuning
#     gs.fit(X_train, y_train)
#     print(f"\n--- Cross-Validation Results ({title}) ---")
#     print("The best parameters are:", gs.best_params_)
#     print("Mean cross-validation accuracy: %.2f%%" % (gs.best_score_ * 100))
    
#     # -----------------------------------------------------
#     # Evaluate Model on Test and Validation Sets
#     print("\n--- Test and Validation Results ---")
    
#     # Predict on Test Set
#     y_test_pred = gs.best_estimator_.predict(X_test)
#     y_test_prob = gs.best_estimator_.predict_proba(X_test)[:, 1]
    
#     # Predict on Validation Set
#     # y_val_pred = gs.best_estimator_.predict(X_val)
#     # y_val_prob = gs.best_estimator_.predict_proba(X_val)[:, 1]
    
#     # Test Set Metrics
#     print("\n--- Test Metrics ---")
#     print('Accuracy: %.2f%%' % (accuracy_score(y_test, y_test_pred) * 100))
#     print('Precision: %.2f%%' % (precision_score(y_test, y_test_pred, average='weighted') * 100))
#     print('Recall: %.2f%%' % (recall_score(y_test, y_test_pred, average='weighted') * 100))
    
#     # Validation Set Metrics
#     # print("\n--- Validation Metrics ---")
#     # print('Accuracy: %.2f%%' % (accuracy_score(y_val, y_val_pred) * 100))
#     # print('Precision: %.2f%%' % (precision_score(y_val, y_val_pred, average='weighted') * 100))
#     # print('Recall: %.2f%%' % (recall_score(y_val, y_val_pred, average='weighted') * 100))
    
#     # -----------------------------------------------------
#     # Confusion Matrices (optional for analysis)
#     # print("\nConfusion Matrix (Test):")
#     # print(confusion_matrix(y_test, y_test_pred))
#     # print("\nConfusion Matrix (Validation):")
#     # print(confusion_matrix(y_val, y_val_pred))

In [12]:
# from sklearn.model_selection import RandomizedSearchCV, KFold
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# import numpy as np

# def run_regressor(regressor, param_grid, title):
#     # -----------------------------------------------------
#     # Cross-Validation Setup
#     cv = KFold(n_splits=5, shuffle=True, random_state=123)  # K-Fold CV for regression
#     n_iter_search = 10  # Number of parameter combinations for RandomizedSearch

#     gs = RandomizedSearchCV(
#         regressor,
#         param_distributions=param_grid,
#         n_iter=n_iter_search,
#         cv=cv,
#         scoring='neg_root_mean_squared_error',  # For regression, use RMSE
#         return_train_score=True
#     )
#     # -----------------------------------------------------
#     # Perform Cross-Validation and Hyperparameter Tuning
#     gs.fit(X_train, y_train)
#     print(f"\n--- Cross-Validation Results ({title}) ---")
#     print("The best parameters are:", gs.best_params_)
#     print("Mean cross-validation RMSE: %.4f" % (-gs.best_score_))
    
#     # -----------------------------------------------------
#     # Evaluate Model on Test Set
#     print("\n--- Test Results ---")
    
#     # Predict on Test Set
#     y_test_pred = gs.best_estimator_.predict(X_test)
#     rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
#     mae = mean_absolute_error(y_test, y_test_pred)
#     r2 = r2_score(y_test, y_test_pred)
#     # mape = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100  # Mean Absolute Percentage Error

#     print('Test RMSE: %.4f' % rmse)
#     print('Test MAE: %.4f' % mae)
#     print('Test R² (Accuracy): %.2f%%' % (r2 * 100))
#     # print('Test MAPE: %.2f%%' % mape)

In [13]:
# # Define LightGBM parameters for tuning
# param_grid_lgbm = {
#     'num_leaves': [31, 50, 70],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'n_estimators': [100, 500, 1000],
#     'bagging_fraction': [0.5, 0.6, 0.8],
#     'feature_fraction': [0.5, 0.7, 0.9]
# }

# # Initialize LightGBM regressor
# lgbm_regressor = lgb.LGBMRegressor(
#     objective="regression",
#     metric="rmse",
#     bagging_seed=42,
#     verbosity=-1,
#     random_state=42
# )

# # Call the helper function
# run_regressor(lgbm_regressor, param_grid_lgbm, "LightGBM Regressor")


--- Cross-Validation Results (LightGBM Regressor) ---
The best parameters are: {'num_leaves': 31, 'n_estimators': 1000, 'learning_rate': 0.1, 'feature_fraction': 0.7, 'bagging_fraction': 0.8}
Mean cross-validation RMSE: 2.0086

--- Test Results ---
Test RMSE: 1.9360
Test MAE: 1.4020
Test R² (Accuracy): 93.47%


In [72]:
# from sklearn.linear_model import LogisticRegression

# lr = LogisticRegression()

# param_grid_lr = {'penalty': ['l2'],
#               'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}

# run_classifier(lr, param_grid_lr, 'Logistic Regression')


--- Cross-Validation Results (Logistic Regression) ---
The best parameters are: {'solver': 'lbfgs', 'penalty': 'l2'}
Mean cross-validation accuracy: 70.88%

--- Test and Validation Results ---

--- Test Metrics ---
Accuracy: 73.97%
Precision: 72.85%
Recall: 73.97%


In [73]:
# from sklearn.tree import DecisionTreeClassifier
# import numpy as np

# dtree = DecisionTreeClassifier()

# param_grid_dtree = {'criterion': ['gini', 'entropy'],
#               'splitter': ['best', 'random'],
#               'max_depth': np.arange(1, 20, 2),
#               'min_samples_split': [2, 5, 10],
#               'min_samples_leaf': [1, 2, 4, 10],
#               'max_features': ['auto', 'sqrt', 'log2', None]}

# run_classifier(dtree, param_grid_dtree, "Decision Tree")


--- Cross-Validation Results (Decision Tree) ---
The best parameters are: {'splitter': 'random', 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': None, 'max_depth': 11, 'criterion': 'gini'}
Mean cross-validation accuracy: 42.09%

--- Test and Validation Results ---

--- Test Metrics ---
Accuracy: 41.86%
Precision: 40.10%
Recall: 41.86%


In [None]:
# dtree_2 = DecisionTreeClassifier()

# param_grid_dtree_2 = {
#     'criterion': ['gini', 'entropy', 'log_loss'],  # Include 'log_loss' for classification
#     'splitter': ['best', 'random'],
#     'max_depth': np.arange(1, 50, 5),  # Increase depth range
#     'min_samples_split': [2, 5, 10, 20],  # Larger values for more generalization
#     'min_samples_leaf': [1, 2, 4, 10, 20],  # Larger leaves for pruning
#     'max_features': ['auto', 'sqrt', 'log2', None],  # Adjust based on dataset size
#     'class_weight': [None, 'balanced']  # Try balancing class weights
# }

# run_classifier(dtree, param_grid_dtree_2, "Decision Tree")


--- Cross-Validation Results (Decision Tree) ---
The best parameters are: {'splitter': 'random', 'min_samples_split': 20, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 21, 'criterion': 'gini', 'class_weight': 'balanced'}
Mean cross-validation accuracy: 37.44%

--- Test and Validation Results ---

--- Test Metrics ---
Accuracy: 36.35%
Precision: 42.44%
Recall: 36.35%


In [75]:
# dtree_3 = DecisionTreeClassifier()

# param_grid_dtree_3 = {
#     'criterion': ['gini', 'entropy', 'log_loss'],
#     'splitter': ['best', 'random'],
#     'max_depth': np.arange(1, 30, 2),  # Extend range
#     'min_samples_split': [2, 5, 10, 20, 50],  # Include larger splits
#     'min_samples_leaf': [1, 2, 4, 10, 20],  # Include larger leaf sizes
#     'max_features': [None, 'sqrt', 'log2'],
#     'class_weight': [None, 'balanced'],  # Try balancing classes
# }

# run_classifier(dtree_3, param_grid_dtree_3, "Decision Tree")


--- Cross-Validation Results (Decision Tree) ---
The best parameters are: {'splitter': 'best', 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': 27, 'criterion': 'entropy', 'class_weight': 'balanced'}
Mean cross-validation accuracy: 42.25%

--- Test and Validation Results ---

--- Test Metrics ---
Accuracy: 44.87%
Precision: 46.58%
Recall: 44.87%


In [76]:
# dtree_3  # disabled: dtree_3 is only created in the commented-out cell above, so this line raises NameError on a fresh kernel

In [None]:
# from sklearn import tree
# target_names = sorted(set(y_test))
# fig = plt.figure(figsize=(25,20))
# _ = tree.plot_tree(dtree_3,
#                    feature_names=X_train.columns,
#                    class_names=target_names,
#                    filled=True)

In [None]:
# dtree_4 = DecisionTreeClassifier()

# param_grid_dtree_4 = {
#     'criterion': ['gini', 'entropy', 'log_loss'],  # Different impurity measures
#     'splitter': ['best', 'random'],  # Best or random split selection
#     'max_depth': [2, 3, 5, 10, 20],  # Fix range definition
#     'min_samples_split': [2, 5, 10, 20, 50, 100],  # Extended to larger values for robustness
#     'min_samples_leaf': [1, 5, 10, 20, 50, 100],  # Include smaller leaf sizes for granular splits
#     'max_features': [None, 'sqrt', 'log2'],  # Different feature selection strategies
#     'class_weight': [None, 'balanced'],  # Account for imbalanced classes
# }

# run_classifier(dtree_4, param_grid_dtree_4, "Decision Tree")

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# rf = RandomForestClassifier()

# param_grid_rf = {'n_estimators': [100, 200],
#               'max_depth': [10, 20, 100, None],
#               'max_features': ['auto', 'sqrt', None],
#               'min_samples_split': [2, 5, 10],
#               'min_samples_leaf': [1, 2, 4, 10],
#               'bootstrap': [True, False],
#               'criterion': ['gini', 'entropy']}

# run_classifier(rf, param_grid_rf, 'Random Forest')

In [None]:
# from sklearn.neural_network import MLPClassifier

# mlp = MLPClassifier()

# param_grid = {'hidden_layer_sizes': [(10,), (50,), (10, 10), (50, 50)],
#              'activation': ['identity', 'logistic', 'tanh', 'relu'],
#              'solver': ['lbfgs', 'sgd', 'adam'],
#              'alpha': np.logspace(-5, 3, 5),
#              'learning_rate': ['constant', 'invscaling','adaptive'],
#              'max_iter': [100, 500, 1000]}

# run_classifier(mlp, param_grid, 'Neural Net')

In [None]:
# # Import Necessary Libraries
# import pandas as pd
# import matplotlib.pyplot as plt  # For plotting
# import h2o
# from h2o.grid.grid_search import H2OGridSearch
# from sklearn.metrics import accuracy_score
# import optuna  # For hyperparameter optimization

# # Initialize H2O cluster
# h2o.init()

# # Step 1: Load and Prepare Data
# X_train = pd.read_csv("../04_modelling/dataset/X_train.csv")
# y_train = pd.read_csv("../04_modelling/dataset/y_train.csv")
# X_val = pd.read_csv("../04_modelling/dataset/X_val.csv")
# y_val = pd.read_csv("../04_modelling/dataset/y_val.csv")
# X_test = pd.read_csv("../04_modelling/dataset/X_test.csv")
# y_test = pd.read_csv("../04_modelling/dataset/y_test.csv")

# train_df = pd.concat([X_train, y_train], axis=1)
# val_df = pd.concat([X_val, y_val], axis=1)
# test_df = pd.concat([X_test, y_test], axis=1)

# train_h2o = h2o.H2OFrame(train_df)
# val_h2o = h2o.H2OFrame(val_df)
# test_h2o = h2o.H2OFrame(test_df)

# # Define target and features
# target = "yearly_compensation"
# features = train_h2o.columns
# if target in features:
#     features.remove(target)

# # Step 2: Train Default Deep Learning Model
# default_dl_model = h2o.estimators.H2ODeepLearningEstimator(seed=42)
# default_dl_model.train(x=features, y=target, training_frame=train_h2o, validation_frame=val_h2o)

# # Evaluate Default Model
# default_performance = default_dl_model.model_performance(test_h2o)
# print("Default Deep Learning Model Performance:")
# print(default_performance)

# # Step 3: Hyperparameter Optimization using Optuna
# def objective(trial):
#     # Define hyperparameter search space
#     params = {
#         "epochs": trial.suggest_int("epochs", 10, 100),
#         "hidden": trial.suggest_categorical("hidden", [[50, 50], [100, 100], [200, 200]]),
#         "input_dropout_ratio": trial.suggest_uniform("input_dropout_ratio", 0.0, 0.5),
#         "l1": trial.suggest_loguniform("l1", 1e-6, 1e-3),
#         "l2": trial.suggest_loguniform("l2", 1e-6, 1e-3),
#         "activation": trial.suggest_categorical("activation", ["Rectifier", "Tanh", "Maxout"]),
#     }
    
#     # Train Deep Learning Model with hyperparameters
#     model = h2o.estimators.H2ODeepLearningEstimator(**params, seed=42)
#     model.train(x=features, y=target, training_frame=train_h2o, validation_frame=val_h2o)
    
#     # Get validation performance (use RMSE as optimization target)
#     performance = model.model_performance(val_h2o)
#     return performance.rmse()

# # Perform hyperparameter optimization
# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=20)

# # Best Hyperparameters
# best_params = study.best_params
# print("Best Hyperparameters:")
# print(best_params)

# # Step 4: Train Deep Learning Model with Best Hyperparameters
# tuned_dl_model = h2o.estimators.H2ODeepLearningEstimator(**best_params, seed=42)
# tuned_dl_model.train(x=features, y=target, training_frame=train_h2o, validation_frame=val_h2o)

# # Evaluate Tuned Model
# tuned_performance = tuned_dl_model.model_performance(test_h2o)
# print("Tuned Deep Learning Model Performance:")
# print(tuned_performance)

# # Step 5: Compare Predictions with Actual
# predictions = tuned_dl_model.predict(test_h2o)
# actual_values = y_test.to_numpy().ravel()
# rounded_predictions = predictions.as_data_frame().to_numpy().ravel().round().astype(int)

# accuracy = accuracy_score(actual_values, rounded_predictions)
# print(f"Accuracy on Test Data: {accuracy:.2f}")

# # Step 6: Save the Tuned Model
# best_model_path = h2o.save_model(tuned_dl_model, path="../04_modelling/models/")
# print(f"Tuned model saved to: {best_model_path}")

# # Shutdown H2O Cluster
# h2o.cluster().shutdown()


In [None]:
# # Grid Search for Hyperparameter Tuning
# hyper_params = {
#     "epochs": [50, 100, 150],
#     "hidden": [[50, 50], [100, 100], [200, 200]],
#     "input_dropout_ratio": [0.0, 0.2, 0.4],
#     "l1": [1e-5, 1e-4, 1e-3],
#     "l2": [1e-5, 1e-4, 1e-3],
#     "activation": ["Rectifier", "Tanh", "Maxout"]
# }

# grid_search = H2OGridSearch(
#     H2ODeepLearningEstimator(seed=42),
#     hyper_params=hyper_params
# )

# # Train models with grid search
# grid_search.train(x=features, y=target, training_frame=train_h2o, validation_frame=val_h2o)

# # Get the best model from the grid search
# best_model = grid_search.get_grid(sort_by="rmse", decreasing=False).models[0]
# print("Best Grid Search Model:")
# print(best_model)

# # Evaluate the best grid search model
# performance = best_model.model_performance(test_h2o)
# print("Best Grid Search Model Performance:")
# print(performance)


In [None]:
# # Random Search for Hyperparameter Tuning
# from h2o.grid.grid_search import H2ORandomGridSearch

# hyper_params = {
#     "epochs": list(range(50, 201, 50)),
#     "hidden": [[50, 50], [100, 100], [200, 200]],
#     "input_dropout_ratio": [i / 10.0 for i in range(0, 6)],
#     "l1": [1e-6, 1e-5, 1e-4],
#     "l2": [1e-6, 1e-5, 1e-4],
#     "activation": ["Rectifier", "Tanh", "Maxout"]
# }

# random_search = H2ORandomGridSearch(
#     H2ODeepLearningEstimator(seed=42),
#     hyper_params=hyper_params,
#     max_models=20  # Limit the number of models
# )

# # Train models with random search
# random_search.train(x=features, y=target, training_frame=train_h2o, validation_frame=val_h2o)

# # Get the best model from the random search
# best_random_model = random_search.get_grid(sort_by="rmse", decreasing=False).models[0]
# print("Best Random Search Model:")
# print(best_random_model)

# # Evaluate the best random search model
# performance = best_random_model.model_performance(test_h2o)
# print("Best Random Search Model Performance:")
# print(performance)


In [None]:
# # Early Stopping with Default Parameters
# model_with_early_stopping = h2o.estimators.H2ODeepLearningEstimator(
#     stopping_metric="rmse",
#     stopping_rounds=5,   # Stop if no improvement in 5 rounds
#     stopping_tolerance=0.01,
#     epochs=200,
#     seed=42
# )

# # Train the model
# model_with_early_stopping.train(x=features, y=target, training_frame=train_h2o, validation_frame=val_h2o)

# # Evaluate the model
# performance = model_with_early_stopping.model_performance(test_h2o)
# print("Model Performance with Early Stopping:")
# print(performance)


In [None]:
# # Train Multiple Models for Ensembling
# models = []
# for hidden in [[50, 50], [100, 100], [200, 200]]:
#     model = h2o.estimators.H2ODeepLearningEstimator(
#         hidden=hidden,
#         epochs=100,
#         seed=42
#     )
#     model.train(x=features, y=target, training_frame=train_h2o, validation_frame=val_h2o)
#     models.append(model)

# # Ensemble Predictions (Averaging)
# predictions = [model.predict(test_h2o).as_data_frame().to_numpy().ravel() for model in models]
# ensemble_prediction = sum(predictions) / len(predictions)

# # Evaluate Ensemble
# from sklearn.metrics import root_mean_squared_error 
# rmse_ensemble = root_mean_squared_error(y_test.to_numpy().ravel(), ensemble_prediction, squared=False)
# print(f"RMSE for Ensemble: {rmse_ensemble:.2f}")
