In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, cross_val_score
import optuna
from model_utils import run_classifier
import joblib
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load datasets
X_train = pd.read_csv("../04_modelling/dataset/X_train.csv")
y_train = pd.read_csv("../04_modelling/dataset/y_train.csv").squeeze()  # Convert to Series if needed
X_test = pd.read_csv("../04_modelling/dataset/X_test.csv")
y_test = pd.read_csv("../04_modelling/dataset/y_test.csv").squeeze()    # Convert to Series if needed

In [3]:
import pandas as pd

# Initialize an empty DataFrame to store model results
model_records = pd.DataFrame(columns=["Model Name", "Hyperparameters", "Test Accuracy", "Test Precision", "Test Recall"])

def record_trained_model(model_name, params, mean_cv_accuracy, test_metrics):
    """
    Record a trained model's details in a pandas DataFrame.

    Args:
        model_name (str): Name of the model.
        params (dict): Hyperparameters used for training.
        mean_cv_accuracy (float): Mean cross-validation accuracy (optional).
        test_metrics (dict): Test metrics such as accuracy, precision, and recall.
    """
    global model_records  # Use the global DataFrame

    # Create a new record
    record = {
        "Model Name": model_name,
        "Hyperparameters": params,
        "Test Accuracy": f"{test_metrics['accuracy']:.2%}",
        "Test Precision": f"{test_metrics['precision']:.2%}",
        "Test Recall": f"{test_metrics['recall']:.2%}",
    }

    # Append the record to the DataFrame
    model_records = pd.concat([model_records, pd.DataFrame([record])], ignore_index=True)

    print(f"Model '{model_name}' recorded successfully!")

In [4]:
# Default MLPClassifier
dtree_default = DecisionTreeClassifier()

print("Decision Tree with Default Parameters")
best_model_dtree_default = run_classifier(dtree_default, {}, X_train, y_train, X_test, y_test, "Default Decision Tree")

Decision Tree with Default Parameters

--- RandomizedSearchCV (Default Decision Tree) ---
RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123, shuffle=True),
                   estimator=DecisionTreeClassifier(), param_distributions={},
                   random_state=123, return_train_score=True,
                   scoring='accuracy')

--- Cross-Validation Results (Default Decision Tree) ---
The best parameters are: {}
Mean cross-validation accuracy: 42.57%

--- Test Results ---
Accuracy: 43.59%
Precision: 44.33%
Recall: 43.59%


In [5]:
# Record default parameter model performance
default_test_metrics = {
    "accuracy": accuracy_score(y_test, best_model_dtree_default.predict(X_test)),
    "precision": precision_score(y_test, best_model_dtree_default.predict(X_test), average='weighted'),
    "recall": recall_score(y_test, best_model_dtree_default.predict(X_test), average='weighted')
}
record_trained_model("Decision Tree with Default Parameters", {}, np.nan, default_test_metrics)

Model 'Decision Tree with Default Parameters' recorded successfully!


In [6]:
model_records

Unnamed: 0,Model Name,Hyperparameters,Test Accuracy,Test Precision,Test Recall
0,Decision Tree with Default Parameters,{},43.59%,44.33%,43.59%


In [7]:
# Train Decision Tree with Hyperparameter Tuning
dtree_tuned = DecisionTreeClassifier()

param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': np.arange(1, 20, 2),
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 10]
}

best_model_dtree_tuned = run_classifier(dtree_tuned, param_grid, X_train, y_train, X_test, y_test, "Tuned Decision Tree")


--- RandomizedSearchCV (Tuned Decision Tree) ---
RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123, shuffle=True),
                   estimator=DecisionTreeClassifier(),
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19]),
                                        'min_samples_leaf': [1, 2, 4, 10],
                                        'min_samples_split': [2, 5, 10],
                                        'splitter': ['best', 'random']},
                   random_state=123, return_train_score=True,
                   scoring='accuracy')

--- Cross-Validation Results (Tuned Decision Tree) ---
The best parameters are: {'splitter': 'best', 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 15, 'criterion': 'entropy'}
Mean cross-validation accuracy: 43.84%

--- Test Results ---
Accuracy: 43.50%
Precision: 43.34%
Recall: 43.50%


In [8]:
# Record tuned parameter model performance
tuned_test_metrics = {
    "accuracy": accuracy_score(y_test, best_model_dtree_tuned.predict(X_test)),
    "precision": precision_score(y_test, best_model_dtree_tuned.predict(X_test), average='weighted'),
    "recall": recall_score(y_test, best_model_dtree_tuned.predict(X_test), average='weighted')
}
record_trained_model("Decision Tree with Tuned Parameters", best_model_dtree_tuned.get_params(), np.nan, tuned_test_metrics)

Model 'Decision Tree with Tuned Parameters' recorded successfully!


In [9]:
model_records

Unnamed: 0,Model Name,Hyperparameters,Test Accuracy,Test Precision,Test Recall
0,Decision Tree with Default Parameters,{},43.59%,44.33%,43.59%
1,Decision Tree with Tuned Parameters,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",43.50%,43.34%,43.50%


In [10]:
# Optimize Decision Tree Parameters using Optuna
def objective(trial):
    # Define the hyperparameter search space
    criterion = trial.suggest_categorical("criterion", ['gini', 'entropy'])
    splitter = trial.suggest_categorical("splitter", ['best', 'random'])
    max_depth = trial.suggest_int("max_depth", 1, 20)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)

    # Create the model with the sampled parameters
    model = DecisionTreeClassifier(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=123
    )

    # Evaluate the model using cross-validation
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy")
    mean_cv_score = np.mean(cv_scores)

    return mean_cv_score

In [11]:
# Run Optuna optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

[I 2024-12-29 19:22:30,920] A new study created in memory with name: no-name-e4c70dd1-5428-4999-b6ed-7a33d97d58c9
[I 2024-12-29 19:22:31,300] Trial 0 finished with value: 0.41894253012493154 and parameters: {'criterion': 'entropy', 'splitter': 'random', 'max_depth': 20, 'min_samples_split': 6, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.41894253012493154.
[I 2024-12-29 19:22:36,055] Trial 1 finished with value: 0.4279197429518236 and parameters: {'criterion': 'gini', 'splitter': 'best', 'max_depth': 14, 'min_samples_split': 3, 'min_samples_leaf': 5}. Best is trial 1 with value: 0.4279197429518236.
[I 2024-12-29 19:22:36,314] Trial 2 finished with value: 0.3978718653787398 and parameters: {'criterion': 'gini', 'splitter': 'random', 'max_depth': 12, 'min_samples_split': 9, 'min_samples_leaf': 7}. Best is trial 1 with value: 0.4279197429518236.
[I 2024-12-29 19:22:36,397] Trial 3 finished with value: 0.16816040665719859 and parameters: {'criterion': 'entropy', 'splitter': 'rando

In [12]:
optuna.visualization.plot_optimization_history(study)

In [13]:
optuna.visualization.plot_slice(study)

In [14]:
# Extract the best parameters
best_params = study.best_params
print("\nBest Hyperparameters from Optuna:", best_params)


Best Hyperparameters from Optuna: {'criterion': 'entropy', 'splitter': 'best', 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 6}


In [15]:
# Train Decision Tree with Optuna Tuned Parameters
best_model_dtree_optuna = DecisionTreeClassifier(**best_params, random_state=123)

best_model_dtree_optuna = run_classifier(dtree_tuned, param_grid, X_train, y_train, X_test, y_test, "Tuned Decision Tree")
best_model_dtree_optuna.fit(X_train, y_train)


--- RandomizedSearchCV (Tuned Decision Tree) ---
RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123, shuffle=True),
                   estimator=DecisionTreeClassifier(),
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19]),
                                        'min_samples_leaf': [1, 2, 4, 10],
                                        'min_samples_split': [2, 5, 10],
                                        'splitter': ['best', 'random']},
                   random_state=123, return_train_score=True,
                   scoring='accuracy')

--- Cross-Validation Results (Tuned Decision Tree) ---
The best parameters are: {'splitter': 'best', 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 15, 'criterion': 'entropy'}
Mean cross-validation accuracy: 44.09%

--- Test Results ---
Accuracy: 43.59%
Precision: 43.70%
Recall: 43.59%


In [16]:
# Evaluate the Optuna-tuned model
optuna_test_metrics = {
    "accuracy": accuracy_score(y_test, best_model_dtree_optuna.predict(X_test)),
    "precision": precision_score(y_test, best_model_dtree_optuna.predict(X_test), average='weighted'),
    "recall": recall_score(y_test, best_model_dtree_optuna.predict(X_test), average='weighted')
}
record_trained_model("Decision Tree with Optuna Parameters", best_params, np.nan, optuna_test_metrics)

Model 'Decision Tree with Optuna Parameters' recorded successfully!


In [17]:
model_records

Unnamed: 0,Model Name,Hyperparameters,Test Accuracy,Test Precision,Test Recall
0,Decision Tree with Default Parameters,{},43.59%,44.33%,43.59%
1,Decision Tree with Tuned Parameters,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",43.50%,43.34%,43.50%
2,Decision Tree with Optuna Parameters,"{'criterion': 'entropy', 'splitter': 'best', '...",43.93%,44.00%,43.93%


In [18]:
# Phase 4: pruned decision tree
dtree_pruned = DecisionTreeClassifier(**best_params, random_state=123, ccp_alpha=0.01)
print("Training Decision Tree with Advanced Techniques (e.g., Pruning)...")
dtree_pruned.fit(X_train, y_train)

Training Decision Tree with Advanced Techniques (e.g., Pruning)...


In [19]:
# Evaluate the advanced pruned model
test_metrics = {
    "accuracy": accuracy_score(y_test, dtree_pruned.predict(X_test)),
    "precision": precision_score(y_test, dtree_pruned.predict(X_test), average='weighted'),
    "recall": recall_score(y_test, dtree_pruned.predict(X_test), average='weighted')
}
record_trained_model("Pruned Decision Tree with Optuna Parameters", dtree_pruned.get_params(), np.nan, test_metrics)

Model 'Pruned Decision Tree with Optuna Parameters' recorded successfully!


In [20]:
model_records

Unnamed: 0,Model Name,Hyperparameters,Test Accuracy,Test Precision,Test Recall
0,Decision Tree with Default Parameters,{},43.59%,44.33%,43.59%
1,Decision Tree with Tuned Parameters,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",43.50%,43.34%,43.50%
2,Decision Tree with Optuna Parameters,"{'criterion': 'entropy', 'splitter': 'best', '...",43.93%,44.00%,43.93%
3,Pruned Decision Tree with Optuna Parameters,"{'ccp_alpha': 0.01, 'class_weight': None, 'cri...",41.45%,35.69%,41.45%


In [None]:
# Get effective alphas for pruning
path = best_model_dtree_optuna.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas

# Train and evaluate trees with different alphas
pruned_models = []
pruned_metrics = []

for alpha in ccp_alphas:
    pruned_model = DecisionTreeClassifier(**best_params, random_state=123, ccp_alpha=alpha)
    pruned_model.fit(X_train, y_train)
    accuracy = accuracy_score(y_test, pruned_model.predict(X_test))
    pruned_models.append(pruned_model)
    pruned_metrics.append((alpha, accuracy))

# Select the best pruned model based on accuracy
best_alpha_idx = np.argmax([metric[1] for metric in pruned_metrics])
best_pruned_model = pruned_models[best_alpha_idx]

print(f"Best Alpha: {ccp_alphas[best_alpha_idx]}")
print(f"Best Pruned Model Accuracy: {pruned_metrics[best_alpha_idx][1]:.2f}")

In [26]:
ccp_alphas.shape

(701,)

In [32]:
len(pruned_metrics)

398

In [23]:
# Select the best pruned model based on accuracy
best_alpha_idx = np.argmax([metric[1] for metric in pruned_metrics])
best_pruned_model = pruned_models[best_alpha_idx]

print(f"Best Alpha: {ccp_alphas[best_alpha_idx]}")
print(f"Best Pruned Model Accuracy: {pruned_metrics[best_alpha_idx][1]:.2f}")

Best Alpha: 0.0
Best Pruned Model Accuracy: 0.46


In [60]:
from sklearn.metrics import precision_score, recall_score

# Calculate precision and recall for the best pruned model
precision = precision_score(y_test, best_pruned_model.predict(X_test), average='weighted')
recall = recall_score(y_test, best_pruned_model.predict(X_test), average='weighted')

# Record the performance of the best pruned model
record_trained_model(
    "Pruned Decision Tree with Alpha Optimization",
    pruned_models[best_alpha_idx].get_params(),
    ccp_alphas[best_alpha_idx],
    {
        "accuracy": pruned_metrics[best_alpha_idx][1],  # Pass accuracy as a float
        "precision": precision,                        # Add precision
        "recall": recall                               # Add recall
    }
)


Model 'Pruned Decision Tree with Alpha Optimization' recorded successfully!


In [61]:
model_records

Unnamed: 0,Model Name,Hyperparameters,Test Accuracy,Test Precision,Test Recall
0,Decision Tree with Default Parameters,{},43.59%,44.33%,43.59%
1,Decision Tree with Tuned Parameters,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",43.50%,43.34%,43.50%
2,Decision Tree with Optuna Parameters,"{'criterion': 'entropy', 'splitter': 'best', '...",43.93%,44.00%,43.93%
3,Pruned Decision Tree with Optuna Parameters,"{'ccp_alpha': 0.01, 'class_weight': None, 'cri...",41.45%,35.69%,41.45%
4,Pruned Decision Tree with Alpha Optimization,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",46.24%,45.76%,46.24%


In [24]:
best_pruned_model

In [37]:
# Define a custom stopping criterion
early_stop_model = DecisionTreeClassifier(
    **best_params, random_state=123, min_impurity_decrease=0.001  # Adjust threshold
)
early_stop_model.fit(X_train, y_train)

# Evaluate the model
early_stop_metrics = {
    "accuracy": accuracy_score(y_test, early_stop_model.predict(X_test)),
    "precision": precision_score(y_test, early_stop_model.predict(X_test), average='weighted'),
    "recall": recall_score(y_test, early_stop_model.predict(X_test), average='weighted')
}

print("\nEarly Stopping Metrics:")
print(early_stop_metrics)



Early Stopping Metrics:
{'accuracy': 0.45897435897435895, 'precision': 0.4536332397196331, 'recall': 0.45897435897435895}


In [62]:
# Record the performance of the early stopping model
record_trained_model(
    "Decision Tree with Early Stopping",
    early_stop_model.get_params(),
    "min_impurity_decrease=0.001",
    {
        "accuracy": early_stop_metrics["accuracy"],
        "precision": early_stop_metrics["precision"],
        "recall": early_stop_metrics["recall"]
    }
)

Model 'Decision Tree with Early Stopping' recorded successfully!


In [63]:
model_records

Unnamed: 0,Model Name,Hyperparameters,Test Accuracy,Test Precision,Test Recall
0,Decision Tree with Default Parameters,{},43.59%,44.33%,43.59%
1,Decision Tree with Tuned Parameters,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",43.50%,43.34%,43.50%
2,Decision Tree with Optuna Parameters,"{'criterion': 'entropy', 'splitter': 'best', '...",43.93%,44.00%,43.93%
3,Pruned Decision Tree with Optuna Parameters,"{'ccp_alpha': 0.01, 'class_weight': None, 'cri...",41.45%,35.69%,41.45%
4,Pruned Decision Tree with Alpha Optimization,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",46.24%,45.76%,46.24%
5,Decision Tree with Early Stopping,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",45.90%,45.36%,45.90%


In [38]:
# Calculate feature importance
feature_importances = best_model_dtree_optuna.feature_importances_
important_features = np.where(feature_importances > np.mean(feature_importances))[0]

# Train model with only important features
# Use .iloc for positional indexing
X_train_imp = X_train.iloc[:, important_features]
X_test_imp = X_test.iloc[:, important_features]

feature_pruned_model = DecisionTreeClassifier(**best_params, random_state=123)
feature_pruned_model.fit(X_train_imp, y_train)

# Evaluate feature-pruned model
y_pred_imp = feature_pruned_model.predict(X_test_imp)

feature_pruned_metrics = {
    "accuracy": accuracy_score(y_test, y_pred_imp),
    "precision": precision_score(y_test, y_pred_imp, average='weighted'),
    "recall": recall_score(y_test, y_pred_imp, average='weighted')
}

print("\nFeature Pruning Metrics:")
print(feature_pruned_metrics)


Feature Pruning Metrics:
{'accuracy': 0.4547008547008547, 'precision': 0.4505461671572581, 'recall': 0.4547008547008547}


In [64]:
# Record the performance of the feature-pruned model
record_trained_model(
    "Decision Tree with Feature Pruning",
    feature_pruned_model.get_params(),
    f"Important Features={len(important_features)}",
    {
        "accuracy": feature_pruned_metrics["accuracy"],
        "precision": feature_pruned_metrics["precision"],
        "recall": feature_pruned_metrics["recall"]
    }
)

Model 'Decision Tree with Feature Pruning' recorded successfully!


In [65]:
model_records

Unnamed: 0,Model Name,Hyperparameters,Test Accuracy,Test Precision,Test Recall
0,Decision Tree with Default Parameters,{},43.59%,44.33%,43.59%
1,Decision Tree with Tuned Parameters,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",43.50%,43.34%,43.50%
2,Decision Tree with Optuna Parameters,"{'criterion': 'entropy', 'splitter': 'best', '...",43.93%,44.00%,43.93%
3,Pruned Decision Tree with Optuna Parameters,"{'ccp_alpha': 0.01, 'class_weight': None, 'cri...",41.45%,35.69%,41.45%
4,Pruned Decision Tree with Alpha Optimization,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",46.24%,45.76%,46.24%
5,Decision Tree with Early Stopping,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",45.90%,45.36%,45.90%
6,Decision Tree with Feature Pruning,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",45.47%,45.05%,45.47%


In [40]:
from sklearn.ensemble import BaggingClassifier

bagging_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(**best_params, random_state=123),
    n_estimators=50, random_state=123
)
bagging_model.fit(X_train, y_train)

bagging_metrics = {
    "accuracy": accuracy_score(y_test, bagging_model.predict(X_test)),
    "precision": precision_score(y_test, bagging_model.predict(X_test), average='weighted'),
    "recall": recall_score(y_test, bagging_model.predict(X_test), average='weighted')
}

print("\nBagging Metrics:")
print(bagging_metrics)



Bagging Metrics:
{'accuracy': 0.5316239316239316, 'precision': 0.5189225937975445, 'recall': 0.5316239316239316}


In [66]:
# Record the performance of the Bagging model
record_trained_model(
    "Bagging Classifier with Decision Tree",
    bagging_model.get_params(),
    f"n_estimators=50",
    {
        "accuracy": bagging_metrics["accuracy"],
        "precision": bagging_metrics["precision"],
        "recall": bagging_metrics["recall"]
    }
)

Model 'Bagging Classifier with Decision Tree' recorded successfully!


In [67]:
model_records

Unnamed: 0,Model Name,Hyperparameters,Test Accuracy,Test Precision,Test Recall
0,Decision Tree with Default Parameters,{},43.59%,44.33%,43.59%
1,Decision Tree with Tuned Parameters,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",43.50%,43.34%,43.50%
2,Decision Tree with Optuna Parameters,"{'criterion': 'entropy', 'splitter': 'best', '...",43.93%,44.00%,43.93%
3,Pruned Decision Tree with Optuna Parameters,"{'ccp_alpha': 0.01, 'class_weight': None, 'cri...",41.45%,35.69%,41.45%
4,Pruned Decision Tree with Alpha Optimization,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",46.24%,45.76%,46.24%
5,Decision Tree with Early Stopping,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",45.90%,45.36%,45.90%
6,Decision Tree with Feature Pruning,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",45.47%,45.05%,45.47%
7,Bagging Classifier with Decision Tree,"{'base_estimator': 'deprecated', 'bootstrap': ...",55.56%,54.35%,55.56%


In [49]:
# Optimize Decision Tree and Bagging Parameters using Optuna
def objective(trial):
    # # Decision Tree Parameters
    criterion = trial.suggest_categorical("criterion", ['gini', 'entropy'])
    splitter = trial.suggest_categorical("splitter", ['best', 'random'])
    max_depth = trial.suggest_int("max_depth", 1, 20)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)

    # Bagging Parameters
    n_estimators = trial.suggest_int("n_estimators", 10, 100)
    max_samples = trial.suggest_float("max_samples", 0.5, 1.0)
    max_features = trial.suggest_float("max_features", 0.5, 1.0)
    bootstrap = trial.suggest_categorical("bootstrap", [True, False])

    # Create the Decision Tree model
    base_model = DecisionTreeClassifier(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=123
    )

    # Create the Bagging model
    bagging_model = BaggingClassifier(
        estimator=base_model,
        n_estimators=n_estimators,
        max_samples=max_samples,
        max_features=max_features,
        bootstrap=bootstrap,
        random_state=123
    )

    # Evaluate the Bagging model using cross-validation
    cv_scores = cross_val_score(bagging_model, X_train, y_train, cv=5, scoring="accuracy")
    mean_cv_score = np.mean(cv_scores)

    return mean_cv_score

In [50]:
# Run Optuna optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

[I 2024-12-29 20:26:57,431] A new study created in memory with name: no-name-c33e4b14-84fb-4388-9b31-31732876b728
[I 2024-12-29 20:27:00,204] Trial 0 finished with value: 0.4837854171493035 and parameters: {'criterion': 'gini', 'splitter': 'random', 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 6, 'n_estimators': 63, 'max_samples': 0.6102814798953524, 'max_features': 0.5065311242652626, 'bootstrap': True}. Best is trial 0 with value: 0.4837854171493035.
[I 2024-12-29 20:27:01,055] Trial 1 finished with value: 0.3462164630358947 and parameters: {'criterion': 'entropy', 'splitter': 'random', 'max_depth': 3, 'min_samples_split': 7, 'min_samples_leaf': 3, 'n_estimators': 28, 'max_samples': 0.8488884749089203, 'max_features': 0.6795515552477596, 'bootstrap': True}. Best is trial 0 with value: 0.4837854171493035.
[I 2024-12-29 20:27:06,485] Trial 2 finished with value: 0.504852724421927 and parameters: {'criterion': 'gini', 'splitter': 'random', 'max_depth': 19, 'min_samples_sp

In [51]:
# Extract the best parameters
best_params = study.best_params
print("\nBest Hyperparameters for BaggingClassifier:", best_params)


Best Hyperparameters for BaggingClassifier: {'criterion': 'entropy', 'splitter': 'best', 'max_depth': 16, 'min_samples_split': 9, 'min_samples_leaf': 4, 'n_estimators': 74, 'max_samples': 0.7080360245310086, 'max_features': 0.6986074415599616, 'bootstrap': False}


In [52]:
# Train the best Bagging model
best_base_model = DecisionTreeClassifier(
    criterion=best_params['criterion'],
    splitter=best_params['splitter'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    random_state=123
)


In [53]:
best_bagging_model = BaggingClassifier(
    estimator=best_base_model,
    n_estimators=best_params['n_estimators'],
    max_samples=best_params['max_samples'],
    max_features=best_params['max_features'],
    bootstrap=best_params['bootstrap'],
    random_state=123
)
best_bagging_model.fit(X_train, y_train)

In [54]:
# Evaluate the best Bagging model
bagging_metrics = {
    "accuracy": accuracy_score(y_test, best_bagging_model.predict(X_test)),
    "precision": precision_score(y_test, best_bagging_model.predict(X_test), average='weighted'),
    "recall": recall_score(y_test, best_bagging_model.predict(X_test), average='weighted')
}

print("\nOptimized Bagging Metrics:")
print(bagging_metrics)


Optimized Bagging Metrics:
{'accuracy': 0.5555555555555556, 'precision': 0.5434761894347337, 'recall': 0.5555555555555556}


In [68]:
# Record the performance of the best Bagging model (if applicable)
record_trained_model(
    "Optimized Bagging Classifier",
    best_bagging_model.get_params(),
    "Optimized Parameters",
    {
        "accuracy": bagging_metrics["accuracy"],
        "precision": bagging_metrics["precision"],
        "recall": bagging_metrics["recall"]
    }
)

Model 'Optimized Bagging Classifier' recorded successfully!


In [69]:
model_records

Unnamed: 0,Model Name,Hyperparameters,Test Accuracy,Test Precision,Test Recall
0,Decision Tree with Default Parameters,{},43.59%,44.33%,43.59%
1,Decision Tree with Tuned Parameters,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",43.50%,43.34%,43.50%
2,Decision Tree with Optuna Parameters,"{'criterion': 'entropy', 'splitter': 'best', '...",43.93%,44.00%,43.93%
3,Pruned Decision Tree with Optuna Parameters,"{'ccp_alpha': 0.01, 'class_weight': None, 'cri...",41.45%,35.69%,41.45%
4,Pruned Decision Tree with Alpha Optimization,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",46.24%,45.76%,46.24%
5,Decision Tree with Early Stopping,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",45.90%,45.36%,45.90%
6,Decision Tree with Feature Pruning,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",45.47%,45.05%,45.47%
7,Bagging Classifier with Decision Tree,"{'base_estimator': 'deprecated', 'bootstrap': ...",55.56%,54.35%,55.56%
8,Optimized Bagging Classifier,"{'base_estimator': 'deprecated', 'bootstrap': ...",55.56%,54.35%,55.56%


In [None]:
# from sklearn.ensemble import StackingClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.linear_model import LogisticRegression

# stacking_model = StackingClassifier(
#     estimators=[
#         ('dtree', DecisionTreeClassifier(**best_params, random_state=123)),
#         ('rf', RandomForestClassifier(n_estimators=50, random_state=123)),
#         ('gb', GradientBoostingClassifier(n_estimators=50, random_state=123))
#     ],
#     final_estimator=LogisticRegression()
# )
# stacking_model.fit(X_train, y_train)

# stacking_metrics = {
#     "accuracy": accuracy_score(y_test, stacking_model.predict(X_test)),
#     "precision": precision_score(y_test, stacking_model.predict(X_test), average='weighted'),
#     "recall": recall_score(y_test, stacking_model.predict(X_test), average='weighted')
# }

# print("\nStacking Metrics:")
# print(stacking_metrics)

### Evaluate and Compare Models

In [71]:
# Display Results in a Table
results_df = pd.DataFrame(model_records)
results_df.sort_values(by="Test Accuracy", ascending=False, inplace=True)
print("Model Performance Comparison:")
results_df

Model Performance Comparison:


Unnamed: 0,Model Name,Hyperparameters,Test Accuracy,Test Precision,Test Recall
7,Bagging Classifier with Decision Tree,"{'base_estimator': 'deprecated', 'bootstrap': ...",55.56%,54.35%,55.56%
8,Optimized Bagging Classifier,"{'base_estimator': 'deprecated', 'bootstrap': ...",55.56%,54.35%,55.56%
4,Pruned Decision Tree with Alpha Optimization,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",46.24%,45.76%,46.24%
5,Decision Tree with Early Stopping,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",45.90%,45.36%,45.90%
6,Decision Tree with Feature Pruning,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",45.47%,45.05%,45.47%
2,Decision Tree with Optuna Parameters,"{'criterion': 'entropy', 'splitter': 'best', '...",43.93%,44.00%,43.93%
0,Decision Tree with Default Parameters,{},43.59%,44.33%,43.59%
1,Decision Tree with Tuned Parameters,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",43.50%,43.34%,43.50%
3,Pruned Decision Tree with Optuna Parameters,"{'ccp_alpha': 0.01, 'class_weight': None, 'cri...",41.45%,35.69%,41.45%


In [72]:
# Select the best model based on RMSE
best_model_info = results_df.iloc[0]
print("\nBest Model:")
best_model_info


Best Model:


Model Name                     Bagging Classifier with Decision Tree
Hyperparameters    {'base_estimator': 'deprecated', 'bootstrap': ...
Test Accuracy                                                 55.56%
Test Precision                                                54.35%
Test Recall                                                   55.56%
Name: 7, dtype: object

### Save the Best Model

In [74]:
def save_model(model, path, model_name="dtree_model.pkl"):
    """
    Save the trained model to a specified directory.

    Args:
        model: Trained model object.
        path (str): Directory path to save the model.
        model_name (str): File name for the saved model.
    """
    # Ensure the path exists
    os.makedirs(path, exist_ok=True)

    # Save the model
    file_path = os.path.join(path, model_name)
    joblib.dump(model, file_path)
    print(f"Model saved successfully at {file_path}!")

# Save the selected best model
save_model(best_bagging_model, path="../04_modelling/models/", model_name="dtree_model.pkl")

Model saved successfully at ../04_modelling/models/dtree_model.pkl!


### Desicion Tree with AutoML 

### Archive