# Gaussian Naive Bayes

## With Albedos

In [1]:
import time  # Added to measure processing time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
import math

from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (accuracy_score, balanced_accuracy_score, f1_score, 
                             matthews_corrcoef, confusion_matrix, classification_report)
from sklearn.feature_selection import f_classif

# ----------------------------
# Set global font settings (they apply to every figure created below)
plt.rcParams['font.family'] = 'DejaVu Serif'
plt.rcParams['font.size'] = 20
# ----------------------------

# Start timer; the elapsed wall-clock time is written to the last report page.
start_time = time.time()

# 1. Load the merged database and prepare features/target.
df = pd.read_csv('05-BaseNew1.csv')
df = df.drop(columns=['counts', 'class_bdm'])

# spectra_columns holds every remaining column except the albedo ('pV'), the
# asteroid 'name' and the target 'class_asteroid_sf'. In this "with albedos"
# variant 'pV' is appended back as the last feature column.
spectra_columns = [col for col in df.columns if col not in ['pV', 'name', 'class_asteroid_sf']]
X = df[spectra_columns + ['pV']]
y = df['class_asteroid_sf']
names = df['name']

# 2. Split data into training (80%) and test (20%) sets (stratified by target).
#    The names are split alongside so misclassified objects can be identified.
X_train, X_test, y_train, y_test, name_train, name_test = train_test_split(
    X, y, names, test_size=0.2, stratify=y, random_state=42
)

# 3. Standardize features (statistics are fitted on the training set only).
# NOTE(review): the scaler is fitted on the whole training set before the
# cross-validation steps below, so CV folds share scaling statistics; wrap
# scaler + model in a Pipeline if strictly leak-free CV scores are required.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Tune the variance smoothing parameter of GaussianNB.
#    np.logspace(-12, -6, num=2) yields exactly two candidates: 1e-12 and 1e-6.
#    NOTE(review): raise `num` (e.g. num=7, one value per decade) if a finer
#    search was intended.
param_grid = {'var_smoothing': np.logspace(-12, -6, num=2)}
gnb = GaussianNB()
# Use all CPUs by setting n_jobs=-1
grid_search = GridSearchCV(gnb, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

best_params = grid_search.best_params_
best_cv_score = grid_search.best_score_
# Retrieve the CV standard deviation of the winning candidate. best_index_ is
# the row of cv_results_ that produced best_params_/best_score_, so no
# floating-point equality search over the mean scores is needed.
idx = grid_search.best_index_
cv_std = grid_search.cv_results_['std_test_score'][idx]
print("Best parameters from GridSearchCV:", best_params)
print("Best cross-validation score: {:.4f} ± {:.4f}".format(best_cv_score, cv_std))

# 5. Train final GaussianNB model with the best parameter.
best_gnb = GaussianNB(**best_params)
best_gnb.fit(X_train_scaled, y_train)

# 6. Evaluate the model on the test set.
y_pred = best_gnb.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)
# 95% confidence interval for accuracy using the normal approximation to the
# binomial; the bounds are clipped because an accuracy cannot leave [0, 1].
n_test = len(y_test)
se = math.sqrt(test_accuracy * (1 - test_accuracy) / n_test)
ci_lower = max(0.0, test_accuracy - 1.96 * se)
ci_upper = min(1.0, test_accuracy + 1.96 * se)

conf_matrix = confusion_matrix(y_test, y_pred, labels=np.unique(y))
class_report = classification_report(y_test, y_pred)
print("Test Accuracy: {:.4f} (95% CI: {:.4f} - {:.4f})".format(test_accuracy, ci_lower, ci_upper))
print("Classification Report:\n", class_report)

# 7. Generate learning curves (with error bands).
# Use all CPUs by setting n_jobs=-1
train_sizes, train_scores, val_scores = learning_curve(
    best_gnb, X_train_scaled, y_train, cv=5, scoring='accuracy',
    train_sizes=np.linspace(0.1, 1.0, 10), n_jobs=-1
)
# Scores have one column per CV fold; aggregate across folds (axis=1).
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
val_scores_mean = np.mean(val_scores, axis=1)
val_scores_std = np.std(val_scores, axis=1)

# 8. Compute a proxy for feature importance using ANOVA F-statistic.
# NOTE(review): the recorded notebook output shows a warning at
# "f = msb / msw", presumably from a constant feature column — confirm.
f_values, p_values = f_classif(X_train_scaled, y_train)
feature_importances_df = pd.DataFrame({
    'feature': X.columns,
    'f_value': f_values,
    'p_value': p_values
}).sort_values(by='f_value', ascending=False)

# 9. Identify misclassified examples (including asteroid names).
# The boolean mask is computed once and shares y_test's index, so it can be
# used to select from name_test, y_test and the re-indexed predictions alike.
misclassified_mask = y_test != y_pred
y_pred_series = pd.Series(y_pred, index=y_test.index)
misclassified_df = pd.DataFrame({
    'name': name_test[misclassified_mask],
    'true_label': y_test[misclassified_mask],
    'predicted_label': y_pred_series[misclassified_mask]
})

# 10. Bootstrap function to compute confusion matrix percentages.
def bootstrap_confusion_matrix(y_true, y_pred, iterations=1000, random_state=None):
    """Estimate a row-normalised confusion matrix (in percent) by bootstrapping.

    The (true, predicted) pairs are resampled with replacement ``iterations``
    times. Each resample yields a confusion matrix whose rows are divided by
    their own sum (i.e. by the number of resampled examples of that true
    class) and scaled to percent; the per-row results are then averaged.

    A true class that happens to be absent from a resample is left out of
    that class's average instead of contributing an all-zero row, so rows of
    rare classes are not biased towards zero.

    Parameters
    ----------
    y_true : array-like (e.g. pandas Series)
        True labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_true``.
    iterations : int, default 1000
        Number of bootstrap resamples.
    random_state : None, int or numpy.random.Generator, default None
        Seed for ``numpy.random.default_rng``; pass an int for reproducible
        output. The global ``numpy.random`` state is not used.

    Returns
    -------
    unique_labels : numpy.ndarray
        Sorted unique labels of ``y_true`` (row/column order of the matrix).
    cm_mean : numpy.ndarray
        Square float array of mean percentages. Predictions whose label does
        not occur in ``y_true`` are ignored, as they have no column.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` differ in length.
    """
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    if true_arr.shape[0] != pred_arr.shape[0]:
        raise ValueError("y_true and y_pred must have the same length")

    unique_labels = np.unique(true_arr)
    n = true_arr.shape[0]
    n_labels = unique_labels.shape[0]
    if n == 0 or iterations <= 0:
        return unique_labels, np.zeros((n_labels, n_labels))

    # Encode labels as integer codes once, outside the resampling loop.
    true_codes = np.searchsorted(unique_labels, true_arr)
    pred_codes = np.minimum(np.searchsorted(unique_labels, pred_arr), n_labels - 1)
    # Predictions whose label never occurs in y_true cannot be placed in a column.
    in_labels = unique_labels[pred_codes] == pred_arr
    # One flat cell index per example: row = true class, column = predicted class.
    flat_codes = true_codes * n_labels + pred_codes

    rng = np.random.default_rng(random_state)
    pct_sum = np.zeros((n_labels, n_labels))
    row_hits = np.zeros((n_labels, 1))
    for _ in range(iterations):
        sample = rng.integers(0, n, size=n)
        kept = sample[in_labels[sample]]
        cm = np.bincount(flat_codes[kept], minlength=n_labels * n_labels)
        cm = cm.reshape(n_labels, n_labels)
        # Normalise each row by its own total (per-true-class percentages).
        row_sums = cm.sum(axis=1, keepdims=True)
        present = row_sums > 0
        pct_sum += np.divide(cm, row_sums, out=np.zeros_like(pct_sum), where=present) * 100
        row_hits += present

    # Average each row over the resamples in which its class was present.
    cm_mean = np.divide(pct_sum, row_hits, out=np.zeros_like(pct_sum), where=row_hits > 0)
    return unique_labels, cm_mean

unique_labels, cm_mean = bootstrap_confusion_matrix(y_test, y_pred, iterations=1000)

# 11. Perform 10-run cross validation on the training set to compute aggregated metrics.
# Each run reshuffles the 5 stratified folds with a different seed. A fresh
# estimator is fitted per fold so that best_gnb (the final model trained on
# the full training set) is not overwritten by a fold-level fit.
n_runs = 10
metrics_list = []
for run in range(n_runs):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=run)
    run_acc, run_bacc, run_f1, run_mcc = [], [], [], []
    for train_idx, val_idx in skf.split(X_train_scaled, y_train):
        X_cv_train, X_cv_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
        y_cv_train, y_cv_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        cv_model = GaussianNB(**best_params)
        cv_model.fit(X_cv_train, y_cv_train)
        y_cv_pred = cv_model.predict(X_cv_val)
        run_acc.append(accuracy_score(y_cv_val, y_cv_pred))
        run_bacc.append(balanced_accuracy_score(y_cv_val, y_cv_pred))
        run_f1.append(f1_score(y_cv_val, y_cv_pred, average='weighted'))
        run_mcc.append(matthews_corrcoef(y_cv_val, y_cv_pred))
    # One row per run: the fold-averaged value of every metric.
    metrics_list.append({
        'Accuracy': np.mean(run_acc),
        'Balanced Accuracy': np.mean(run_bacc),
        'F1': np.mean(run_f1),
        'MCC': np.mean(run_mcc)
    })

# Mean and standard deviation across the runs, one row per metric.
df_cv = pd.DataFrame(metrics_list)
agg_metrics = df_cv.agg(['mean', 'std']).T.round(4)

# 12. Create a multi-page PDF report.
# Every page is drawn on its own figure, saved to the PDF and closed again so
# that figures do not accumulate in memory.
pdf_filename = "03-GNB-A1.pdf"
with PdfPages(pdf_filename) as pdf:
    
    # Page 1: Summary of results and best parameters.
    fig1 = plt.figure(figsize=(16, 12))
    plt.axis('off')
    summary_text = f"""Model Evaluation Summary

Test Accuracy: {test_accuracy:.4f} (95% CI: {ci_lower:.4f} - {ci_upper:.4f})
Best Cross-Validation Score: {best_cv_score:.4f} ± {cv_std:.4f}

Best Parameters (variance smoothing):
{best_params}

Classification Report:
{class_report}
"""
    plt.text(0.05, 0.95, summary_text, verticalalignment='top', wrap=True)
    pdf.savefig(fig1)
    plt.close(fig1)
    
    # Page 2: Confusion Matrix in percentages.
    fig2, ax2 = plt.subplots(figsize=(20, 16))
    # Pre-formatted cell labels; fmt="" tells seaborn to print them verbatim.
    annot = np.array([[f"{value:.1f}%" for value in row] for row in cm_mean])
    sns.heatmap(cm_mean, annot=annot, fmt="", cmap="Blues", ax=ax2,
                xticklabels=unique_labels, yticklabels=unique_labels,
                annot_kws={"size": 17}, vmin=0, vmax=100,
                cbar_kws={'ticks': np.linspace(0, 100, 11), 'format': '%.0f%%'})
    ax2.set_title("Confusion Matrix (in %)", fontsize=28)
    ax2.set_xlabel("Predicted Label", fontsize=24)
    ax2.set_ylabel("True Label", fontsize=24)
    pdf.savefig(fig2)
    plt.close(fig2)
    
    # Page 3: Learning Curves with error bands (mean ± one std across folds).
    fig3 = plt.figure(figsize=(16, 12))
    plt.plot(train_sizes, train_scores_mean, 'o-', label="Training Score")
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2)
    plt.plot(train_sizes, val_scores_mean, 'o-', label="Validation Score")
    plt.fill_between(train_sizes, val_scores_mean - val_scores_std,
                     val_scores_mean + val_scores_std, alpha=0.2)
    plt.title("Learning Curves", fontsize=28)
    plt.xlabel("Number of Training Examples", fontsize=24)
    plt.ylabel("Accuracy", fontsize=24)
    plt.legend(loc="best", fontsize=20)
    pdf.savefig(fig3)
    plt.close(fig3)
    
    # Page 4: Feature Importances (F-statistic proxy).
    fig4 = plt.figure(figsize=(16, 12))
    plt.bar(feature_importances_df['feature'], feature_importances_df['f_value'])
    plt.xticks(rotation=90)
    plt.title("Feature Importances (F-statistic)", fontsize=28)
    plt.tight_layout()
    pdf.savefig(fig4)
    plt.close(fig4)
    
    # Page 5: Table of Misclassified Examples.
    # NOTE(review): all rows are placed on a single page; with many
    # misclassifications the table may not fit — consider paginating.
    fig5, ax5 = plt.subplots(figsize=(16, 12))
    ax5.axis('tight')
    ax5.axis('off')
    if misclassified_df.empty:
        # ax.table() cannot be built from zero rows, so state the result instead.
        ax5.text(0.5, 0.5, "No misclassified examples.",
                 horizontalalignment='center', verticalalignment='center')
    else:
        table = ax5.table(cellText=misclassified_df.values,
                          colLabels=misclassified_df.columns,
                          loc='center')
        table.auto_set_font_size(False)
        table.set_fontsize(14)
        table.scale(1.2, 1.2)
    ax5.set_title("Misclassified Examples", fontsize=28)
    pdf.savefig(fig5)
    plt.close(fig5)
    
    # Page 6: Aggregated Cross Validation Metrics.
    fig6, ax6 = plt.subplots(figsize=(18, 10))
    ax6.axis('tight')
    ax6.axis('off')
    # Create a table that shows only the Mean and Std for each metric.
    table = ax6.table(cellText=agg_metrics.values,
                      rowLabels=agg_metrics.index,
                      colLabels=agg_metrics.columns,
                      loc='center',
                      bbox=[0, 0, 0.8, 0.3])  # Adjust these values as needed
    table.auto_set_font_size(False)
    table.set_fontsize(14)
    table.scale(1.2, 1.2)
    ax6.set_title("Cross Validation Metrics (10 Runs)\nMean and Standard Deviation", fontsize=28)
    pdf.savefig(fig6)
    plt.close(fig6)
    
    # Page 7: Total Processing Time (measured from start_time up to this point).
    elapsed_time = time.time() - start_time
    fig7 = plt.figure(figsize=(16, 12))
    plt.axis('off')
    text = (f"Total Processing Time: {elapsed_time:.2f} seconds\n\n")
    plt.text(0.5, 0.5, text, horizontalalignment='center', verticalalignment='center', fontsize=28)
    pdf.savefig(fig7)
    plt.close(fig7)

print(f"PDF report has been saved as '{pdf_filename}'.")

Best parameters from GridSearchCV: {'var_smoothing': 1e-12}
Best cross-validation score: 0.6701 ± 0.0134
Test Accuracy: 0.6599 (95% CI: 0.6216 - 0.6982)
Classification Report:
               precision    recall  f1-score   support

           A       0.32      1.00      0.49        11
           B       0.45      0.83      0.59        12
           C       0.75      0.71      0.73        58
          Ch       0.50      0.36      0.42        33
           D       0.73      0.86      0.79        22
           E       0.41      0.92      0.57        13
           K       0.19      0.54      0.29        13
           L       0.38      0.71      0.50        14
           M       0.62      0.55      0.58        55
           P       0.69      0.31      0.42        36
           Q       0.56      0.90      0.69        30
           R       0.00      0.00      0.00         3
           S       0.97      0.66      0.78       240
           V       0.97      0.88      0.92        41
           Z

  f = msb / msw


PDF report has been saved as '03-GNB-A1.pdf'.


## Without Albedos

In [2]:
import time  # Added to measure processing time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
import math

from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (accuracy_score, balanced_accuracy_score, f1_score, 
                             matthews_corrcoef, confusion_matrix, classification_report)
from sklearn.feature_selection import f_classif

# ----------------------------
# Set global font settings (they apply to every figure created below)
plt.rcParams['font.family'] = 'DejaVu Serif'
plt.rcParams['font.size'] = 20
# ----------------------------

# Start timer; the elapsed wall-clock time is written to the last report page.
start_time = time.time()

# 1. Load the merged database and prepare features/target.
df = pd.read_csv('05-BaseNew1.csv')
df = df.drop(columns=['counts', 'class_bdm'])

# Features are every remaining column except the albedo ('pV'), the asteroid
# 'name' and the target 'class_asteroid_sf' ("without albedos" variant).
spectra_columns = [col for col in df.columns if col not in ['pV', 'name', 'class_asteroid_sf']]
X = df[spectra_columns]
y = df['class_asteroid_sf']
names = df['name']

# 2. Split data into training (80%) and test (20%) sets (stratified by target).
#    The names are split alongside so misclassified objects can be identified.
X_train, X_test, y_train, y_test, name_train, name_test = train_test_split(
    X, y, names, test_size=0.2, stratify=y, random_state=42
)

# 3. Standardize features (statistics are fitted on the training set only).
# NOTE(review): the scaler is fitted on the whole training set before the
# cross-validation steps below, so CV folds share scaling statistics; wrap
# scaler + model in a Pipeline if strictly leak-free CV scores are required.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Tune the variance smoothing parameter of GaussianNB.
#    np.logspace(-12, -6, num=2) yields exactly two candidates: 1e-12 and 1e-6.
#    NOTE(review): raise `num` (e.g. num=7, one value per decade) if a finer
#    search was intended.
param_grid = {'var_smoothing': np.logspace(-12, -6, num=2)}
gnb = GaussianNB()
# Use all CPUs by setting n_jobs=-1
grid_search = GridSearchCV(gnb, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

best_params = grid_search.best_params_
best_cv_score = grid_search.best_score_
# Retrieve the CV standard deviation of the winning candidate. best_index_ is
# the row of cv_results_ that produced best_params_/best_score_, so no
# floating-point equality search over the mean scores is needed.
idx = grid_search.best_index_
cv_std = grid_search.cv_results_['std_test_score'][idx]
print("Best parameters from GridSearchCV:", best_params)
print("Best cross-validation score: {:.4f} ± {:.4f}".format(best_cv_score, cv_std))

# 5. Train final GaussianNB model with the best parameter.
best_gnb = GaussianNB(**best_params)
best_gnb.fit(X_train_scaled, y_train)

# 6. Evaluate the model on the test set.
y_pred = best_gnb.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)
# 95% confidence interval for accuracy using the normal approximation to the
# binomial; the bounds are clipped because an accuracy cannot leave [0, 1].
n_test = len(y_test)
se = math.sqrt(test_accuracy * (1 - test_accuracy) / n_test)
ci_lower = max(0.0, test_accuracy - 1.96 * se)
ci_upper = min(1.0, test_accuracy + 1.96 * se)

conf_matrix = confusion_matrix(y_test, y_pred, labels=np.unique(y))
class_report = classification_report(y_test, y_pred)
print("Test Accuracy: {:.4f} (95% CI: {:.4f} - {:.4f})".format(test_accuracy, ci_lower, ci_upper))
print("Classification Report:\n", class_report)

# 7. Generate learning curves (with error bands).
# Use all CPUs by setting n_jobs=-1
train_sizes, train_scores, val_scores = learning_curve(
    best_gnb, X_train_scaled, y_train, cv=5, scoring='accuracy',
    train_sizes=np.linspace(0.1, 1.0, 10), n_jobs=-1
)
# Scores have one column per CV fold; aggregate across folds (axis=1).
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
val_scores_mean = np.mean(val_scores, axis=1)
val_scores_std = np.std(val_scores, axis=1)

# 8. Compute a proxy for feature importance using ANOVA F-statistic.
# NOTE(review): the recorded notebook output shows a warning at
# "f = msb / msw", presumably from a constant feature column — confirm.
f_values, p_values = f_classif(X_train_scaled, y_train)
feature_importances_df = pd.DataFrame({
    'feature': X.columns,
    'f_value': f_values,
    'p_value': p_values
}).sort_values(by='f_value', ascending=False)

# 9. Identify misclassified examples (including asteroid names).
# The boolean mask is computed once and shares y_test's index, so it can be
# used to select from name_test, y_test and the re-indexed predictions alike.
misclassified_mask = y_test != y_pred
y_pred_series = pd.Series(y_pred, index=y_test.index)
misclassified_df = pd.DataFrame({
    'name': name_test[misclassified_mask],
    'true_label': y_test[misclassified_mask],
    'predicted_label': y_pred_series[misclassified_mask]
})

# 10. Bootstrap function to compute confusion matrix percentages.
def bootstrap_confusion_matrix(y_true, y_pred, iterations=1000, random_state=None):
    """Estimate a row-normalised confusion matrix (in percent) by bootstrapping.

    The (true, predicted) pairs are resampled with replacement ``iterations``
    times. Each resample yields a confusion matrix whose rows are divided by
    their own sum (i.e. by the number of resampled examples of that true
    class) and scaled to percent; the per-row results are then averaged.

    A true class that happens to be absent from a resample is left out of
    that class's average instead of contributing an all-zero row, so rows of
    rare classes are not biased towards zero.

    Parameters
    ----------
    y_true : array-like (e.g. pandas Series)
        True labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_true``.
    iterations : int, default 1000
        Number of bootstrap resamples.
    random_state : None, int or numpy.random.Generator, default None
        Seed for ``numpy.random.default_rng``; pass an int for reproducible
        output. The global ``numpy.random`` state is not used.

    Returns
    -------
    unique_labels : numpy.ndarray
        Sorted unique labels of ``y_true`` (row/column order of the matrix).
    cm_mean : numpy.ndarray
        Square float array of mean percentages. Predictions whose label does
        not occur in ``y_true`` are ignored, as they have no column.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` differ in length.
    """
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    if true_arr.shape[0] != pred_arr.shape[0]:
        raise ValueError("y_true and y_pred must have the same length")

    unique_labels = np.unique(true_arr)
    n = true_arr.shape[0]
    n_labels = unique_labels.shape[0]
    if n == 0 or iterations <= 0:
        return unique_labels, np.zeros((n_labels, n_labels))

    # Encode labels as integer codes once, outside the resampling loop.
    true_codes = np.searchsorted(unique_labels, true_arr)
    pred_codes = np.minimum(np.searchsorted(unique_labels, pred_arr), n_labels - 1)
    # Predictions whose label never occurs in y_true cannot be placed in a column.
    in_labels = unique_labels[pred_codes] == pred_arr
    # One flat cell index per example: row = true class, column = predicted class.
    flat_codes = true_codes * n_labels + pred_codes

    rng = np.random.default_rng(random_state)
    pct_sum = np.zeros((n_labels, n_labels))
    row_hits = np.zeros((n_labels, 1))
    for _ in range(iterations):
        sample = rng.integers(0, n, size=n)
        kept = sample[in_labels[sample]]
        cm = np.bincount(flat_codes[kept], minlength=n_labels * n_labels)
        cm = cm.reshape(n_labels, n_labels)
        # Normalise each row by its own total (per-true-class percentages).
        row_sums = cm.sum(axis=1, keepdims=True)
        present = row_sums > 0
        pct_sum += np.divide(cm, row_sums, out=np.zeros_like(pct_sum), where=present) * 100
        row_hits += present

    # Average each row over the resamples in which its class was present.
    cm_mean = np.divide(pct_sum, row_hits, out=np.zeros_like(pct_sum), where=row_hits > 0)
    return unique_labels, cm_mean

unique_labels, cm_mean = bootstrap_confusion_matrix(y_test, y_pred, iterations=1000)

# 11. Perform 10-run cross validation on the training set to compute aggregated metrics.
# Each run reshuffles the 5 stratified folds with a different seed. A fresh
# estimator is fitted per fold so that best_gnb (the final model trained on
# the full training set) is not overwritten by a fold-level fit.
n_runs = 10
metrics_list = []
for run in range(n_runs):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=run)
    run_acc, run_bacc, run_f1, run_mcc = [], [], [], []
    for train_idx, val_idx in skf.split(X_train_scaled, y_train):
        X_cv_train, X_cv_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
        y_cv_train, y_cv_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        cv_model = GaussianNB(**best_params)
        cv_model.fit(X_cv_train, y_cv_train)
        y_cv_pred = cv_model.predict(X_cv_val)
        run_acc.append(accuracy_score(y_cv_val, y_cv_pred))
        run_bacc.append(balanced_accuracy_score(y_cv_val, y_cv_pred))
        run_f1.append(f1_score(y_cv_val, y_cv_pred, average='weighted'))
        run_mcc.append(matthews_corrcoef(y_cv_val, y_cv_pred))
    # One row per run: the fold-averaged value of every metric.
    metrics_list.append({
        'Accuracy': np.mean(run_acc),
        'Balanced Accuracy': np.mean(run_bacc),
        'F1': np.mean(run_f1),
        'MCC': np.mean(run_mcc)
    })

# Mean and standard deviation across the runs, one row per metric.
df_cv = pd.DataFrame(metrics_list)
agg_metrics = df_cv.agg(['mean', 'std']).T.round(4)

# 12. Create a multi-page PDF report.
# Every page is drawn on its own figure, saved to the PDF and closed again so
# that figures do not accumulate in memory.
pdf_filename = "03-GNB1.pdf"
with PdfPages(pdf_filename) as pdf:
    
    # Page 1: Summary of results and best parameters.
    fig1 = plt.figure(figsize=(16, 12))
    plt.axis('off')
    summary_text = f"""Model Evaluation Summary

Test Accuracy: {test_accuracy:.4f} (95% CI: {ci_lower:.4f} - {ci_upper:.4f})
Best Cross-Validation Score: {best_cv_score:.4f} ± {cv_std:.4f}

Best Parameters (variance smoothing):
{best_params}

Classification Report:
{class_report}
"""
    plt.text(0.05, 0.95, summary_text, verticalalignment='top', wrap=True)
    pdf.savefig(fig1)
    plt.close(fig1)
    
    # Page 2: Confusion Matrix in percentages.
    fig2, ax2 = plt.subplots(figsize=(20, 16))
    # Pre-formatted cell labels; fmt="" tells seaborn to print them verbatim.
    annot = np.array([[f"{value:.1f}%" for value in row] for row in cm_mean])
    sns.heatmap(cm_mean, annot=annot, fmt="", cmap="Blues", ax=ax2,
                xticklabels=unique_labels, yticklabels=unique_labels,
                annot_kws={"size": 17}, vmin=0, vmax=100,
                cbar_kws={'ticks': np.linspace(0, 100, 11), 'format': '%.0f%%'})
    ax2.set_title("Confusion Matrix (in %)", fontsize=28)
    ax2.set_xlabel("Predicted Label", fontsize=24)
    ax2.set_ylabel("True Label", fontsize=24)
    pdf.savefig(fig2)
    plt.close(fig2)
    
    # Page 3: Learning Curves with error bands (mean ± one std across folds).
    fig3 = plt.figure(figsize=(16, 12))
    plt.plot(train_sizes, train_scores_mean, 'o-', label="Training Score")
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2)
    plt.plot(train_sizes, val_scores_mean, 'o-', label="Validation Score")
    plt.fill_between(train_sizes, val_scores_mean - val_scores_std,
                     val_scores_mean + val_scores_std, alpha=0.2)
    plt.title("Learning Curves", fontsize=28)
    plt.xlabel("Number of Training Examples", fontsize=24)
    plt.ylabel("Accuracy", fontsize=24)
    plt.legend(loc="best", fontsize=20)
    pdf.savefig(fig3)
    plt.close(fig3)
    
    # Page 4: Feature Importances (F-statistic proxy).
    fig4 = plt.figure(figsize=(16, 12))
    plt.bar(feature_importances_df['feature'], feature_importances_df['f_value'])
    plt.xticks(rotation=90)
    plt.title("Feature Importances (F-statistic)", fontsize=28)
    plt.tight_layout()
    pdf.savefig(fig4)
    plt.close(fig4)
    
    # Page 5: Table of Misclassified Examples.
    # NOTE(review): all rows are placed on a single page; with many
    # misclassifications the table may not fit — consider paginating.
    fig5, ax5 = plt.subplots(figsize=(16, 12))
    ax5.axis('tight')
    ax5.axis('off')
    if misclassified_df.empty:
        # ax.table() cannot be built from zero rows, so state the result instead.
        ax5.text(0.5, 0.5, "No misclassified examples.",
                 horizontalalignment='center', verticalalignment='center')
    else:
        table = ax5.table(cellText=misclassified_df.values,
                          colLabels=misclassified_df.columns,
                          loc='center')
        table.auto_set_font_size(False)
        table.set_fontsize(14)
        table.scale(1.2, 1.2)
    ax5.set_title("Misclassified Examples", fontsize=28)
    pdf.savefig(fig5)
    plt.close(fig5)
    
    # Page 6: Aggregated Cross Validation Metrics.
    fig6, ax6 = plt.subplots(figsize=(18, 10))
    ax6.axis('tight')
    ax6.axis('off')
    # Create a table that shows only the Mean and Std for each metric.
    table = ax6.table(cellText=agg_metrics.values,
                      rowLabels=agg_metrics.index,
                      colLabels=agg_metrics.columns,
                      loc='center',
                      bbox=[0, 0, 0.8, 0.3])  # Adjust these values as needed
    table.auto_set_font_size(False)
    table.set_fontsize(14)
    table.scale(1.2, 1.2)
    ax6.set_title("Cross Validation Metrics (10 Runs)\nMean and Standard Deviation", fontsize=28)
    pdf.savefig(fig6)
    plt.close(fig6)
    
    # Page 7: Total Processing Time (measured from start_time up to this point).
    elapsed_time = time.time() - start_time
    fig7 = plt.figure(figsize=(16, 12))
    plt.axis('off')
    text = (f"Total Processing Time: {elapsed_time:.2f} seconds\n\n")
    plt.text(0.5, 0.5, text, horizontalalignment='center', verticalalignment='center', fontsize=28)
    pdf.savefig(fig7)
    plt.close(fig7)

print(f"PDF report has been saved as '{pdf_filename}'.")

Best parameters from GridSearchCV: {'var_smoothing': 1e-12}
Best cross-validation score: 0.6378 ± 0.0134
Test Accuracy: 0.6310 (95% CI: 0.5919 - 0.6700)
Classification Report:
               precision    recall  f1-score   support

           A       0.32      1.00      0.49        11
           B       0.45      0.83      0.59        12
           C       0.73      0.71      0.72        58
          Ch       0.52      0.36      0.43        33
           D       0.70      0.86      0.78        22
           E       0.21      0.54      0.30        13
           K       0.16      0.46      0.24        13
           L       0.37      0.71      0.49        14
           M       0.58      0.47      0.52        55
           P       0.36      0.14      0.20        36
           Q       0.56      0.90      0.69        30
           R       0.00      0.00      0.00         3
           S       0.97      0.65      0.78       240
           V       0.97      0.88      0.92        41
           Z

  f = msb / msw


PDF report has been saved as '03-GNB1.pdf'.


# MultiLayer Perceptron

## With Albedos

In [5]:
import time  # Added to measure processing time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
import math

from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (accuracy_score, balanced_accuracy_score, f1_score, 
                             matthews_corrcoef, confusion_matrix, classification_report)
from sklearn.feature_selection import f_classif

# ----------------------------
# Set global font settings (they apply to every figure created below)
plt.rcParams['font.family'] = 'DejaVu Serif'
plt.rcParams['font.size'] = 20
# ----------------------------

# Start timer; the elapsed wall-clock time is written to the last report page.
start_time = time.time()

# 1. Load the merged database and prepare features/target.
df = pd.read_csv('05-BaseNew1.csv')
df = df.drop(columns=['counts', 'class_bdm'])

# spectra_columns holds every remaining column except the albedo ('pV'), the
# asteroid 'name' and the target 'class_asteroid_sf'. In this "with albedos"
# variant 'pV' is appended back as the last feature column.
spectra_columns = [col for col in df.columns if col not in ['pV', 'name', 'class_asteroid_sf']]
X = df[spectra_columns + ['pV']]
y = df['class_asteroid_sf']
names = df['name']

# 2. Split data into training (80%) and test (20%) sets (stratified by target).
#    The names are split alongside so misclassified objects can be identified.
X_train, X_test, y_train, y_test, name_train, name_test = train_test_split(
    X, y, names, test_size=0.2, stratify=y, random_state=42
)

# 3. Standardize features (statistics are fitted on the training set only).
# NOTE(review): the scaler is fitted on the whole training set before the
# cross-validation steps below, so CV folds share scaling statistics; wrap
# scaler + model in a Pipeline if strictly leak-free CV scores are required.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Tune hyperparameters for Multilayer Perceptron.
# Hyperparameter grid: hidden_layer_sizes, learning_rate_init, solver, max_iter
# (4 * 3 * 2 * 3 = 72 candidates, each evaluated with 5-fold CV).
param_grid = {
    'hidden_layer_sizes': [(32, 32), (64, 64), (32, 32, 32), (64, 64, 64)],
    'learning_rate_init': [0.01, 0.05, 0.1],
    'solver': ['sgd', 'adam'],
    'max_iter': [1000, 2500, 5000]
}
mlp = MLPClassifier(random_state=42, activation='relu')
# Use all CPUs by setting n_jobs=-1 in GridSearchCV
grid_search = GridSearchCV(mlp, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

best_params = grid_search.best_params_
best_cv_score = grid_search.best_score_
# Retrieve the CV standard deviation of the winning candidate. best_index_ is
# the row of cv_results_ that produced best_params_/best_score_, so no
# floating-point equality search over the mean scores is needed.
idx = grid_search.best_index_
cv_std = grid_search.cv_results_['std_test_score'][idx]
print("Best parameters from GridSearchCV:", best_params)
print("Best cross-validation score: {:.4f} ± {:.4f}".format(best_cv_score, cv_std))

# 5. Train final Multilayer Perceptron model with the best parameters.
# NOTE(review): the final model sets tol=1e-3 while the estimator used in the
# grid search leaves tol at its default, so the tuned and the final model are
# not configured identically — confirm this is intended.
best_mlp = MLPClassifier(random_state=42, **best_params, activation='relu', tol=1e-3)
best_mlp.fit(X_train_scaled, y_train)

# 6. Evaluate the model on the test set.
y_pred = best_mlp.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)
# 95% confidence interval for accuracy using the normal approximation to the
# binomial; the bounds are clipped because an accuracy cannot leave [0, 1].
n_test = len(y_test)
se = math.sqrt(test_accuracy * (1 - test_accuracy) / n_test)
ci_lower = max(0.0, test_accuracy - 1.96 * se)
ci_upper = min(1.0, test_accuracy + 1.96 * se)

conf_matrix = confusion_matrix(y_test, y_pred, labels=np.unique(y))
class_report = classification_report(y_test, y_pred)
print("Test Accuracy: {:.4f} (95% CI: {:.4f} - {:.4f})".format(test_accuracy, ci_lower, ci_upper))
print("Classification Report:\n", class_report)

# 7. Generate learning curves (with error bands).
# Use all CPUs by setting n_jobs=-1
train_sizes, train_scores, val_scores = learning_curve(
    best_mlp, X_train_scaled, y_train, cv=5, scoring='accuracy',
    train_sizes=np.linspace(0.1, 1.0, 10), n_jobs=-1
)
# Scores have one column per CV fold; aggregate across folds (axis=1).
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
val_scores_mean = np.mean(val_scores, axis=1)
val_scores_std = np.std(val_scores, axis=1)

# 8. Compute a proxy for feature importance using ANOVA F-statistic.
# NOTE(review): the recorded notebook output shows a warning at
# "f = msb / msw", presumably from a constant feature column — confirm.
f_values, p_values = f_classif(X_train_scaled, y_train)
feature_importances_df = pd.DataFrame({
    'feature': X.columns,
    'f_value': f_values,
    'p_value': p_values
}).sort_values(by='f_value', ascending=False)

# 9. Identify misclassified examples (including asteroid names).
# The boolean mask is computed once and shares y_test's index, so it can be
# used to select from name_test, y_test and the re-indexed predictions alike.
misclassified_mask = y_test != y_pred
y_pred_series = pd.Series(y_pred, index=y_test.index)
misclassified_df = pd.DataFrame({
    'name': name_test[misclassified_mask],
    'true_label': y_test[misclassified_mask],
    'predicted_label': y_pred_series[misclassified_mask]
})

# 10. Bootstrap function to compute confusion matrix percentages.
def bootstrap_confusion_matrix(y_true, y_pred, iterations=1000, random_state=None):
    """Estimate a row-normalised confusion matrix (in percent) by bootstrapping.

    The (true, predicted) pairs are resampled with replacement ``iterations``
    times. Each resample yields a confusion matrix whose rows are divided by
    their own sum (i.e. by the number of resampled examples of that true
    class) and scaled to percent; the per-row results are then averaged.

    A true class that happens to be absent from a resample is left out of
    that class's average instead of contributing an all-zero row, so rows of
    rare classes are not biased towards zero.

    Parameters
    ----------
    y_true : array-like (e.g. pandas Series)
        True labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_true``.
    iterations : int, default 1000
        Number of bootstrap resamples.
    random_state : None, int or numpy.random.Generator, default None
        Seed for ``numpy.random.default_rng``; pass an int for reproducible
        output. The global ``numpy.random`` state is not used.

    Returns
    -------
    unique_labels : numpy.ndarray
        Sorted unique labels of ``y_true`` (row/column order of the matrix).
    cm_mean : numpy.ndarray
        Square float array of mean percentages. Predictions whose label does
        not occur in ``y_true`` are ignored, as they have no column.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` differ in length.
    """
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    if true_arr.shape[0] != pred_arr.shape[0]:
        raise ValueError("y_true and y_pred must have the same length")

    unique_labels = np.unique(true_arr)
    n = true_arr.shape[0]
    n_labels = unique_labels.shape[0]
    if n == 0 or iterations <= 0:
        return unique_labels, np.zeros((n_labels, n_labels))

    # Encode labels as integer codes once, outside the resampling loop.
    true_codes = np.searchsorted(unique_labels, true_arr)
    pred_codes = np.minimum(np.searchsorted(unique_labels, pred_arr), n_labels - 1)
    # Predictions whose label never occurs in y_true cannot be placed in a column.
    in_labels = unique_labels[pred_codes] == pred_arr
    # One flat cell index per example: row = true class, column = predicted class.
    flat_codes = true_codes * n_labels + pred_codes

    rng = np.random.default_rng(random_state)
    pct_sum = np.zeros((n_labels, n_labels))
    row_hits = np.zeros((n_labels, 1))
    for _ in range(iterations):
        sample = rng.integers(0, n, size=n)
        kept = sample[in_labels[sample]]
        cm = np.bincount(flat_codes[kept], minlength=n_labels * n_labels)
        cm = cm.reshape(n_labels, n_labels)
        # Normalise each row by its own total (per-true-class percentages).
        row_sums = cm.sum(axis=1, keepdims=True)
        present = row_sums > 0
        pct_sum += np.divide(cm, row_sums, out=np.zeros_like(pct_sum), where=present) * 100
        row_hits += present

    # Average each row over the resamples in which its class was present.
    cm_mean = np.divide(pct_sum, row_hits, out=np.zeros_like(pct_sum), where=row_hits > 0)
    return unique_labels, cm_mean

unique_labels, cm_mean = bootstrap_confusion_matrix(y_test, y_pred, iterations=1000)

# 11. Perform 10-run cross validation on the training set to compute aggregated metrics.
# Each run reshuffles the 5 stratified folds with a different seed. A fresh
# estimator (same configuration as best_mlp) is fitted per fold so that
# best_mlp, the final model trained on the full training set, is not
# overwritten by a fold-level fit.
n_runs = 10
metrics_list = []
for run in range(n_runs):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=run)
    run_acc, run_bacc, run_f1, run_mcc = [], [], [], []
    for train_idx, val_idx in skf.split(X_train_scaled, y_train):
        X_cv_train, X_cv_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
        y_cv_train, y_cv_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        cv_model = MLPClassifier(random_state=42, **best_params, activation='relu', tol=1e-3)
        cv_model.fit(X_cv_train, y_cv_train)
        y_cv_pred = cv_model.predict(X_cv_val)
        run_acc.append(accuracy_score(y_cv_val, y_cv_pred))
        run_bacc.append(balanced_accuracy_score(y_cv_val, y_cv_pred))
        run_f1.append(f1_score(y_cv_val, y_cv_pred, average='weighted'))
        run_mcc.append(matthews_corrcoef(y_cv_val, y_cv_pred))
    # One row per run: the fold-averaged value of every metric.
    metrics_list.append({
        'Accuracy': np.mean(run_acc),
        'Balanced Accuracy': np.mean(run_bacc),
        'F1': np.mean(run_f1),
        'MCC': np.mean(run_mcc)
    })

# Mean and standard deviation across the runs, one row per metric.
df_cv = pd.DataFrame(metrics_list)
agg_metrics = df_cv.agg(['mean', 'std']).T.round(4)

# 12. Create a multi-page PDF report.
# Every page is drawn on its own figure, saved to the PDF and closed again so
# that figures do not accumulate in memory.
pdf_filename = "03-MLP-A1.pdf"
with PdfPages(pdf_filename) as pdf:
    
    # Page 1: Summary of results and best parameters.
    fig1 = plt.figure(figsize=(16, 12))
    plt.axis('off')
    summary_text = f"""Model Evaluation Summary

Test Accuracy: {test_accuracy:.4f} (95% CI: {ci_lower:.4f} - {ci_upper:.4f})
Best Cross-Validation Score: {best_cv_score:.4f} ± {cv_std:.4f}

Best Parameters:
{best_params}

Classification Report:
{class_report}
"""
    plt.text(0.05, 0.95, summary_text, verticalalignment='top', wrap=True)
    pdf.savefig(fig1)
    plt.close(fig1)
    
    # Page 2: Confusion Matrix in percentages.
    fig2, ax2 = plt.subplots(figsize=(20, 16))
    # Pre-formatted cell labels; fmt="" tells seaborn to print them verbatim.
    annot = np.array([[f"{value:.1f}%" for value in row] for row in cm_mean])
    sns.heatmap(cm_mean, annot=annot, fmt="", cmap="Blues", ax=ax2,
                xticklabels=unique_labels, yticklabels=unique_labels,
                annot_kws={"size": 17}, vmin=0, vmax=100,
                cbar_kws={'ticks': np.linspace(0, 100, 11), 'format': '%.0f%%'})
    ax2.set_title("Confusion Matrix (in %)", fontsize=28)
    ax2.set_xlabel("Predicted Label", fontsize=24)
    ax2.set_ylabel("True Label", fontsize=24)
    pdf.savefig(fig2)
    plt.close(fig2)
    
    # Page 3: Learning Curves with error bands (mean ± one std across folds).
    fig3 = plt.figure(figsize=(16, 12))
    plt.plot(train_sizes, train_scores_mean, 'o-', label="Training Score")
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2)
    plt.plot(train_sizes, val_scores_mean, 'o-', label="Validation Score")
    plt.fill_between(train_sizes, val_scores_mean - val_scores_std,
                     val_scores_mean + val_scores_std, alpha=0.2)
    plt.title("Learning Curves", fontsize=28)
    plt.xlabel("Number of Training Examples", fontsize=24)
    plt.ylabel("Accuracy", fontsize=24)
    plt.legend(loc="best", fontsize=20)
    pdf.savefig(fig3)
    plt.close(fig3)
    
    # Page 4: Feature Importances (F-statistic proxy).
    fig4 = plt.figure(figsize=(16, 12))
    plt.bar(feature_importances_df['feature'], feature_importances_df['f_value'])
    plt.xticks(rotation=90)
    plt.title("Feature Importances (F-statistic)", fontsize=28)
    plt.tight_layout()
    pdf.savefig(fig4)
    plt.close(fig4)
    
    # Page 5: Table of Misclassified Examples.
    # NOTE(review): all rows are placed on a single page; with many
    # misclassifications the table may not fit — consider paginating.
    fig5, ax5 = plt.subplots(figsize=(16, 12))
    ax5.axis('tight')
    ax5.axis('off')
    if misclassified_df.empty:
        # ax.table() cannot be built from zero rows, so state the result instead.
        ax5.text(0.5, 0.5, "No misclassified examples.",
                 horizontalalignment='center', verticalalignment='center')
    else:
        table = ax5.table(cellText=misclassified_df.values,
                          colLabels=misclassified_df.columns,
                          loc='center')
        table.auto_set_font_size(False)
        table.set_fontsize(14)
        table.scale(1.2, 1.2)
    ax5.set_title("Misclassified Examples", fontsize=28)
    pdf.savefig(fig5)
    plt.close(fig5)
    
    # Page 6: Aggregated Cross Validation Metrics.
    fig6, ax6 = plt.subplots(figsize=(18, 10))
    ax6.axis('tight')
    ax6.axis('off')
    # Create a table that shows only the Mean and Std for each metric.
    table = ax6.table(cellText=agg_metrics.values,
                      rowLabels=agg_metrics.index,
                      colLabels=agg_metrics.columns,
                      loc='center',
                      bbox=[0, 0, 0.8, 0.3])  # Adjust these values as needed
    table.auto_set_font_size(False)
    table.set_fontsize(14)
    table.scale(1.2, 1.2)
    ax6.set_title("Cross Validation Metrics (10 Runs)\nMean and Standard Deviation", fontsize=28)
    pdf.savefig(fig6)
    plt.close(fig6)
    
    # Page 7: Total Processing Time (measured from start_time up to this point).
    elapsed_time = time.time() - start_time
    fig7 = plt.figure(figsize=(16, 12))
    plt.axis('off')
    text = (f"Total Processing Time: {elapsed_time:.2f} seconds\n\n")
    plt.text(0.5, 0.5, text, horizontalalignment='center', verticalalignment='center', fontsize=28)
    pdf.savefig(fig7)
    plt.close(fig7)

print(f"PDF report has been saved as '{pdf_filename}'.")

Best parameters from GridSearchCV: {'hidden_layer_sizes': (64, 64), 'learning_rate_init': 0.05, 'max_iter': 1000, 'solver': 'sgd'}
Best cross-validation score: 0.8861 ± 0.0097
Test Accuracy: 0.8690 (95% CI: 0.8418 - 0.8963)
Classification Report:
               precision    recall  f1-score   support

           A       1.00      0.91      0.95        11
           B       0.67      0.83      0.74        12
           C       0.85      0.88      0.86        58
          Ch       0.76      0.76      0.76        33
           D       0.79      0.86      0.83        22
           E       0.81      1.00      0.90        13
           K       0.91      0.77      0.83        13
           L       0.65      0.79      0.71        14
           M       0.90      0.64      0.74        55
           P       0.74      0.72      0.73        36
           Q       0.93      0.83      0.88        30
           R       0.00      0.00      0.00         3
           S       0.92      0.96      0.94      

  f = msb / msw


PDF report has been saved as '03-MLP-A1.pdf'.


## Without Albedos

In [None]:
import time  # Added to measure processing time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
import math

from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (accuracy_score, balanced_accuracy_score, f1_score, 
                             matthews_corrcoef, confusion_matrix, classification_report)
from sklearn.feature_selection import f_classif

# ----------------------------
# Set global font settings
plt.rcParams['font.family'] = 'DejaVu Serif'
plt.rcParams['font.size'] = 20
# ----------------------------

# Start timer
start_time = time.time()  # Record the start time of the process

# 1. Load the merged database and prepare features/target.
df = pd.read_csv('05-BaseNew1.csv')
df = df.drop(columns=['counts', 'class_bdm'])

# Exclude 'pV' (albedo) and 'name'; target column is 'class_asteroid_sf'
spectra_columns = [col for col in df.columns if col not in ['pV', 'name', 'class_asteroid_sf']]
X = df[spectra_columns]
y = df['class_asteroid_sf']
names = df['name']

# 2. Split data into training (80%) and test (20%) sets (stratified by target)
X_train, X_test, y_train, y_test, name_train, name_test = train_test_split(
    X, y, names, test_size=0.2, stratify=y, random_state=42
)

# 3. Standardize features (fit on training set)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Tune hyperparameters for the Multilayer Perceptron.
# Grid dimensions: hidden_layer_sizes, learning_rate_init, solver, max_iter.
param_grid = {
    'hidden_layer_sizes': [(32, 32), (64, 64), (32, 32, 32), (64, 64, 64)],
    'learning_rate_init': [0.01, 0.05, 0.1],
    'solver': ['sgd', 'adam'],
    'max_iter': [1000, 2500, 5000]
}
mlp = MLPClassifier(random_state=42, activation='relu')
# 5-fold CV scored on accuracy; n_jobs=-1 spreads the candidate fits over all CPUs.
grid_search = GridSearchCV(mlp, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

best_params = grid_search.best_params_
best_cv_score = grid_search.best_score_
# Fold-to-fold standard deviation of the winning candidate. best_index_ is the
# cv_results_ row GridSearchCV itself selected, so no search for a float that
# compares equal to best_score_ is needed.
idx = grid_search.best_index_
cv_std = grid_search.cv_results_['std_test_score'][idx]
print("Best parameters from GridSearchCV:", best_params)
print("Best cross-validation score: {:.4f} ± {:.4f}".format(best_cv_score, cv_std))

# 5. Train final Multilayer Perceptron model with the best parameters.
# NOTE(review): tol=1e-3 is set here but was not part of the grid search above
# (which ran with the estimator's default tol), so this model is not configured
# exactly like the candidate whose CV score is reported -- confirm this is intended.
best_mlp = MLPClassifier(random_state=42, **best_params, activation='relu', tol=1e-3)
best_mlp.fit(X_train_scaled, y_train)

# 6. Evaluate the model on the test set.
y_pred = best_mlp.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)
# 95% confidence interval for accuracy (normal approximation to the binomial),
# clipped to [0, 1] because the approximation can overshoot near the bounds.
n_test = len(y_test)
se = math.sqrt(test_accuracy * (1 - test_accuracy) / n_test)
ci_lower = max(0.0, test_accuracy - 1.96 * se)
ci_upper = min(1.0, test_accuracy + 1.96 * se)

conf_matrix = confusion_matrix(y_test, y_pred, labels=np.unique(y))
# zero_division=0 reports 0.00 for classes that are never predicted (the same
# value as before) without emitting an undefined-metric warning per class.
class_report = classification_report(y_test, y_pred, zero_division=0)
print("Test Accuracy: {:.4f} (95% CI: {:.4f} - {:.4f})".format(test_accuracy, ci_lower, ci_upper))
print("Classification Report:\n", class_report)

# 7. Generate learning curves (with error bands).
# Use all CPUs by setting n_jobs=-1
train_sizes, train_scores, val_scores = learning_curve(
    best_mlp, X_train_scaled, y_train, cv=5, scoring='accuracy',
    train_sizes=np.linspace(0.1, 1.0, 10), n_jobs=-1
)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
val_scores_mean = np.mean(val_scores, axis=1)
val_scores_std = np.std(val_scores, axis=1)

# 8. Compute a proxy for feature importance using ANOVA F-statistic.
f_values, p_values = f_classif(X_train_scaled, y_train)
feature_importances_df = pd.DataFrame({
    'feature': X.columns,
    'f_value': f_values,
    'p_value': p_values
}).sort_values(by='f_value', ascending=False)

# 9. Identify misclassified examples (including asteroid names).
misclassified_df = pd.DataFrame({
    'name': name_test[y_test != y_pred],
    'true_label': y_test[y_test != y_pred],
    'predicted_label': pd.Series(y_pred, index=y_test.index)[y_test != y_pred]
})

# 10. Bootstrap function to compute confusion matrix percentages.
def bootstrap_confusion_matrix(y_true, y_pred, iterations=1000, random_state=None):
    """Return the bootstrap-averaged, row-normalized confusion matrix.

    The (y_true, y_pred) pairs are resampled with replacement `iterations`
    times. Each replicate's confusion matrix is normalized per row, so that
    every true label sums to 100%, and the replicates are then averaged.
    A row is averaged only over the replicates in which it received at least
    one sample; otherwise replicates that happen to miss a rare class would
    add an all-zero row and drag that class's percentages below 100%.

    Parameters
    ----------
    y_true : pandas.Series
        True labels (indexed positionally through ``.iloc``).
    y_pred : array-like
        Predicted labels, in the same order as ``y_true``.
    iterations : int, default=1000
        Number of bootstrap replicates.
    random_state : int or None, default=None
        Seed for reproducible resampling. ``None`` keeps drawing from NumPy's
        global random state, as before.

    Returns
    -------
    unique_labels : numpy.ndarray
        Sorted unique values of ``y_true``; the row/column order of the matrix.
    cm_mean : numpy.ndarray
        Mean percentage matrix of shape (n_labels, n_labels).
    """
    unique_labels = np.unique(y_true)
    n = len(y_true)
    n_labels = len(unique_labels)
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    # Align the predictions with y_true once instead of on every iteration.
    y_pred_series = pd.Series(y_pred, index=y_true.index)

    pct_sum = np.zeros((n_labels, n_labels))
    row_hits = np.zeros((n_labels, 1))  # replicates in which each row was populated
    for _ in range(iterations):
        indices = rng.choice(n, size=n, replace=True)
        cm = confusion_matrix(y_true.iloc[indices],
                              y_pred_series.iloc[indices],
                              labels=unique_labels)
        # Normalize by row sums so that each true label sums to 100%.
        row_sums = cm.sum(axis=1, keepdims=True)
        populated = row_sums > 0
        pct_sum += np.divide(cm, row_sums,
                             out=np.zeros((n_labels, n_labels)),
                             where=populated) * 100
        row_hits += populated
    # Rows that were never populated stay at 0 instead of dividing by zero.
    cm_mean = np.divide(pct_sum, row_hits, out=np.zeros_like(pct_sum), where=row_hits > 0)
    return unique_labels, cm_mean

unique_labels, cm_mean = bootstrap_confusion_matrix(y_test, y_pred, iterations=1000)

# 11. Perform 10-run cross validation on the training set to compute aggregated metrics.
# Each fold trains a fresh clone, so the final model fitted on the full training
# set (best_mlp) is not overwritten by a model trained on a single CV fold.
# NOTE(review): X_train_scaled was standardized on the whole training set, so the
# validation folds have influenced the scaling statistics.
from sklearn.base import clone

n_runs = 10
metrics_list = []
for run in range(n_runs):
    # A different shuffle per run; random_state=run keeps the runs reproducible.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=run)
    run_acc, run_bacc, run_f1, run_mcc = [], [], [], []
    for train_idx, val_idx in skf.split(X_train_scaled, y_train):
        X_cv_train, X_cv_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
        y_cv_train, y_cv_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        fold_model = clone(best_mlp)
        fold_model.fit(X_cv_train, y_cv_train)
        y_cv_pred = fold_model.predict(X_cv_val)
        run_acc.append(accuracy_score(y_cv_val, y_cv_pred))
        run_bacc.append(balanced_accuracy_score(y_cv_val, y_cv_pred))
        # zero_division=0 scores never-predicted classes as 0 without a warning.
        run_f1.append(f1_score(y_cv_val, y_cv_pred, average='weighted', zero_division=0))
        run_mcc.append(matthews_corrcoef(y_cv_val, y_cv_pred))
    # One row per run: the mean of each metric over that run's 5 folds.
    metrics_list.append({
        'Accuracy': np.mean(run_acc),
        'Balanced Accuracy': np.mean(run_bacc),
        'F1': np.mean(run_f1),
        'MCC': np.mean(run_mcc)
    })

# Mean and standard deviation across the 10 runs (metrics as rows).
df_cv = pd.DataFrame(metrics_list)
agg_metrics = df_cv.agg(['mean', 'std']).T.round(4)

# 12. Create a multi-page PDF report.
pdf_filename = "03-MLP1.pdf"
with PdfPages(pdf_filename) as pdf:
    
    # Page 1: Summary of results and best parameters.
    fig1 = plt.figure(figsize=(16, 12))
    plt.axis('off')
    summary_text = f"""Model Evaluation Summary

Test Accuracy: {test_accuracy:.4f} (95% CI: {ci_lower:.4f} - {ci_upper:.4f})
Best Cross-Validation Score: {best_cv_score:.4f} ± {cv_std:.4f}

Best Parameters:
{best_params}

Classification Report:
{class_report}
"""
    plt.text(0.05, 0.95, summary_text, verticalalignment='top', wrap=True)
    pdf.savefig(fig1)
    plt.close(fig1)
    
    # Page 2: Confusion Matrix in percentages.
    fig2, ax2 = plt.subplots(figsize=(20, 16))
    annot = np.empty_like(cm_mean).astype(str)
    for i in range(cm_mean.shape[0]):
        for j in range(cm_mean.shape[1]):
            annot[i, j] = f"{cm_mean[i, j]:.1f}%"
    sns.heatmap(cm_mean, annot=annot, fmt="", cmap="Blues", ax=ax2,
                xticklabels=unique_labels, yticklabels=unique_labels,
                annot_kws={"size": 17}, vmin=0, vmax=100,
                cbar_kws={'ticks': np.linspace(0, 100, 11), 'format': '%.0f%%'})
    ax2.set_title("Confusion Matrix (in %)", fontsize=28)
    ax2.set_xlabel("Predicted Label", fontsize=24)
    ax2.set_ylabel("True Label", fontsize=24)
    pdf.savefig(fig2)
    plt.close(fig2)
    
    # Page 3: Learning Curves with error bands.
    fig3 = plt.figure(figsize=(16, 12))
    plt.plot(train_sizes, train_scores_mean, 'o-', label="Training Score")
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2)
    plt.plot(train_sizes, val_scores_mean, 'o-', label="Validation Score")
    plt.fill_between(train_sizes, val_scores_mean - val_scores_std,
                     val_scores_mean + val_scores_std, alpha=0.2)
    plt.title("Learning Curves", fontsize=28)
    plt.xlabel("Number of Training Examples", fontsize=24)
    plt.ylabel("Accuracy", fontsize=24)
    plt.legend(loc="best", fontsize=20)
    pdf.savefig(fig3)
    plt.close(fig3)
    
    # Page 4: Feature Importances (F-statistic proxy).
    fig4 = plt.figure(figsize=(16, 12))
    plt.bar(feature_importances_df['feature'], feature_importances_df['f_value'])
    plt.xticks(rotation=90)
    plt.title("Feature Importances (F-statistic)", fontsize=28)
    plt.tight_layout()
    pdf.savefig(fig4)
    plt.close(fig4)
    
    # Page 5: Table of Misclassified Examples.
    # The table holds one row per misclassified test sample, so it is split into
    # chunks that fit a page (later pages shift accordingly). When nothing was
    # misclassified a placeholder page is written, because ax.table cannot be
    # built from an empty cellText.
    rows_per_page = 30
    n_misclassified = len(misclassified_df)
    for first_row in range(0, max(n_misclassified, 1), rows_per_page):
        page_rows = misclassified_df.iloc[first_row:first_row + rows_per_page]
        fig5, ax5 = plt.subplots(figsize=(16, 12))
        ax5.axis('tight')
        ax5.axis('off')
        page_title = "Misclassified Examples"
        if page_rows.empty:
            ax5.text(0.5, 0.5, "No misclassified examples.",
                     horizontalalignment='center', verticalalignment='center',
                     transform=ax5.transAxes)
        else:
            table = ax5.table(cellText=page_rows.values,
                              colLabels=page_rows.columns,
                              loc='center')
            table.auto_set_font_size(False)
            table.set_fontsize(14)
            table.scale(1.2, 1.2)
            if n_misclassified > rows_per_page:
                # Show which rows this page covers when the table is split.
                last_row = first_row + len(page_rows)
                page_title += f" ({first_row + 1}-{last_row} of {n_misclassified})"
        ax5.set_title(page_title, fontsize=28)
        pdf.savefig(fig5)
        plt.close(fig5)
    
    # Page 6: Aggregated Cross Validation Metrics.
    fig6, ax6 = plt.subplots(figsize=(18, 10))
    ax6.axis('tight')
    ax6.axis('off')
    table = ax6.table(cellText=agg_metrics.values,
                      rowLabels=agg_metrics.index,
                      colLabels=agg_metrics.columns,
                      loc='center',
                      bbox=[0, 0, 0.8, 0.3])  # Adjust these values as needed
    table.auto_set_font_size(False)
    table.set_fontsize(14)
    table.scale(1.2, 1.2)
    ax6.set_title("Cross Validation Metrics (10 Runs)\nMean and Standard Deviation", fontsize=28)
    pdf.savefig(fig6)
    plt.close(fig6)
    
    # Page 7: Total Processing Time.
    elapsed_time = time.time() - start_time  # Compute the total elapsed time
      
    fig7 = plt.figure(figsize=(16, 12))
    plt.axis('off')
    text = (f"Total Processing Time: {elapsed_time:.2f} seconds\n\n")
    plt.text(0.5, 0.5, text, horizontalalignment='center', verticalalignment='center', fontsize=28)
    pdf.savefig(fig7)
    plt.close(fig7)

print(f"PDF report has been saved as '{pdf_filename}'.")

# Multinomial Logistic Regression

## With Albedo

In [1]:
import time  # Added to measure processing time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
import math

from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, balanced_accuracy_score, f1_score, 
                             matthews_corrcoef, confusion_matrix, classification_report)
from sklearn.feature_selection import f_classif

# ----------------------------
# Set global font settings
plt.rcParams['font.family'] = 'DejaVu Serif'
plt.rcParams['font.size'] = 20
# ----------------------------

# Start timer
start_time = time.time()  # Record the start time of the process

# 1. Load the merged database and prepare features/target.
df = pd.read_csv('05-BaseNew1.csv')
df = df.drop(columns=['counts', 'class_bdm'])

# Spectral columns are everything except 'pV' (albedo), 'name' and the target
# 'class_asteroid_sf'; the albedo is appended to the features on the next line.
spectra_columns = [col for col in df.columns if col not in ['pV', 'name', 'class_asteroid_sf']]
X = df[spectra_columns + ['pV']]
y = df['class_asteroid_sf']
names = df['name']

# 2. Split data into training (80%) and test (20%) sets (stratified by target)
X_train, X_test, y_train, y_test, name_train, name_test = train_test_split(
    X, y, names, test_size=0.2, stratify=y, random_state=42
)

# 3. Standardize features (fit on training set)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Tune hyperparameters for multinomial logistic regression.
# C candidates: 5, 10, ..., 60 plus 20 log-spaced values from 1e-4 to 1e4.
C_values = list(range(5, 61, 5)) + list(np.logspace(-4, 4, 20))
param_grid = {
    'C': C_values,
    'penalty': ['l1', 'l2'],
    'solver': ['saga'],
    'max_iter': [10000, 25000, 50000]
}

# max_iter is tuned through the grid above; no multi_class argument is passed.
logreg = LogisticRegression(random_state=42)
# 5-fold CV scored on accuracy; n_jobs=-1 spreads the candidate fits over all CPUs.
grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

best_params = grid_search.best_params_
best_cv_score = grid_search.best_score_
# Fold-to-fold standard deviation of the winning candidate. best_index_ is the
# cv_results_ row GridSearchCV itself selected, so no search for a float that
# compares equal to best_score_ is needed.
idx = grid_search.best_index_
cv_std = grid_search.cv_results_['std_test_score'][idx]
print("Best parameters from GridSearchCV:", best_params)
print("Best cross-validation score: {:.4f} ± {:.4f}".format(best_cv_score, cv_std))

# 5. Train final multinomial logistic regression model with the best parameters.
# max_iter comes from best_params (tuned in the grid search); no multi_class
# argument is passed.
best_logreg = LogisticRegression(random_state=42, **best_params)
best_logreg.fit(X_train_scaled, y_train)

# 6. Evaluate the model on the test set.
y_pred = best_logreg.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)
# 95% confidence interval for accuracy (normal approximation to the binomial),
# clipped to [0, 1] because the approximation can overshoot near the bounds.
n_test = len(y_test)
se = math.sqrt(test_accuracy * (1 - test_accuracy) / n_test)
ci_lower = max(0.0, test_accuracy - 1.96 * se)
ci_upper = min(1.0, test_accuracy + 1.96 * se)

conf_matrix = confusion_matrix(y_test, y_pred, labels=np.unique(y))
# zero_division=0 reports 0.00 for classes that are never predicted (the same
# value as before) without emitting an undefined-metric warning per class.
class_report = classification_report(y_test, y_pred, zero_division=0)
print("Test Accuracy: {:.4f} (95% CI: {:.4f} - {:.4f})".format(test_accuracy, ci_lower, ci_upper))
print("Classification Report:\n", class_report)

# 7. Generate learning curves (with error bands).
# Use all CPUs by setting n_jobs=-1
train_sizes, train_scores, val_scores = learning_curve(
    best_logreg, X_train_scaled, y_train, cv=5, scoring='accuracy',
    train_sizes=np.linspace(0.1, 1.0, 10), n_jobs=-1
)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
val_scores_mean = np.mean(val_scores, axis=1)
val_scores_std = np.std(val_scores, axis=1)

# 8. Compute a proxy for feature importance using ANOVA F-statistic.
f_values, p_values = f_classif(X_train_scaled, y_train)
feature_importances_df = pd.DataFrame({
    'feature': X.columns,
    'f_value': f_values,
    'p_value': p_values
}).sort_values(by='f_value', ascending=False)

# 9. Identify misclassified examples (including asteroid names).
misclassified_df = pd.DataFrame({
    'name': name_test[y_test != y_pred],
    'true_label': y_test[y_test != y_pred],
    'predicted_label': pd.Series(y_pred, index=y_test.index)[y_test != y_pred]
})

# 10. Bootstrap function to compute confusion matrix percentages.
def bootstrap_confusion_matrix(y_true, y_pred, iterations=1000, random_state=None):
    """Return the bootstrap-averaged, row-normalized confusion matrix.

    The (y_true, y_pred) pairs are resampled with replacement `iterations`
    times. Each replicate's confusion matrix is normalized per row, so that
    every true label sums to 100%, and the replicates are then averaged.
    A row is averaged only over the replicates in which it received at least
    one sample; otherwise replicates that happen to miss a rare class would
    add an all-zero row and drag that class's percentages below 100%.

    Parameters
    ----------
    y_true : pandas.Series
        True labels (indexed positionally through ``.iloc``).
    y_pred : array-like
        Predicted labels, in the same order as ``y_true``.
    iterations : int, default=1000
        Number of bootstrap replicates.
    random_state : int or None, default=None
        Seed for reproducible resampling. ``None`` keeps drawing from NumPy's
        global random state, as before.

    Returns
    -------
    unique_labels : numpy.ndarray
        Sorted unique values of ``y_true``; the row/column order of the matrix.
    cm_mean : numpy.ndarray
        Mean percentage matrix of shape (n_labels, n_labels).
    """
    unique_labels = np.unique(y_true)
    n = len(y_true)
    n_labels = len(unique_labels)
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    # Align the predictions with y_true once instead of on every iteration.
    y_pred_series = pd.Series(y_pred, index=y_true.index)

    pct_sum = np.zeros((n_labels, n_labels))
    row_hits = np.zeros((n_labels, 1))  # replicates in which each row was populated
    for _ in range(iterations):
        indices = rng.choice(n, size=n, replace=True)
        cm = confusion_matrix(y_true.iloc[indices],
                              y_pred_series.iloc[indices],
                              labels=unique_labels)
        # Normalize by row sums so that each true label sums to 100%.
        row_sums = cm.sum(axis=1, keepdims=True)
        populated = row_sums > 0
        pct_sum += np.divide(cm, row_sums,
                             out=np.zeros((n_labels, n_labels)),
                             where=populated) * 100
        row_hits += populated
    # Rows that were never populated stay at 0 instead of dividing by zero.
    cm_mean = np.divide(pct_sum, row_hits, out=np.zeros_like(pct_sum), where=row_hits > 0)
    return unique_labels, cm_mean

unique_labels, cm_mean = bootstrap_confusion_matrix(y_test, y_pred, iterations=1000)

# 11. Perform 10-run cross validation on the training set to compute aggregated metrics.
# Each fold trains a fresh clone, so the final model fitted on the full training
# set (best_logreg) is not overwritten by a model trained on a single CV fold.
# NOTE(review): X_train_scaled was standardized on the whole training set, so the
# validation folds have influenced the scaling statistics.
from sklearn.base import clone

n_runs = 10
metrics_list = []
for run in range(n_runs):
    # A different shuffle per run; random_state=run keeps the runs reproducible.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=run)
    run_acc, run_bacc, run_f1, run_mcc = [], [], [], []
    for train_idx, val_idx in skf.split(X_train_scaled, y_train):
        X_cv_train, X_cv_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
        y_cv_train, y_cv_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        fold_model = clone(best_logreg)
        fold_model.fit(X_cv_train, y_cv_train)
        y_cv_pred = fold_model.predict(X_cv_val)
        run_acc.append(accuracy_score(y_cv_val, y_cv_pred))
        run_bacc.append(balanced_accuracy_score(y_cv_val, y_cv_pred))
        # zero_division=0 scores never-predicted classes as 0 without a warning.
        run_f1.append(f1_score(y_cv_val, y_cv_pred, average='weighted', zero_division=0))
        run_mcc.append(matthews_corrcoef(y_cv_val, y_cv_pred))
    # One row per run: the mean of each metric over that run's 5 folds.
    metrics_list.append({
        'Accuracy': np.mean(run_acc),
        'Balanced Accuracy': np.mean(run_bacc),
        'F1': np.mean(run_f1),
        'MCC': np.mean(run_mcc)
    })

# Mean and standard deviation across the 10 runs (metrics as rows).
df_cv = pd.DataFrame(metrics_list)
agg_metrics = df_cv.agg(['mean', 'std']).T.round(4)

# 12. Create a multi-page PDF report.
pdf_filename = "03-MLR-A1.pdf"
with PdfPages(pdf_filename) as pdf:
    
    # Page 1: Summary of results and best parameters.
    fig1 = plt.figure(figsize=(16, 12))
    plt.axis('off')
    summary_text = f"""Model Evaluation Summary

Test Accuracy: {test_accuracy:.4f} (95% CI: {ci_lower:.4f} - {ci_upper:.4f})
Best Cross-Validation Score: {best_cv_score:.4f} ± {cv_std:.4f}

Best Parameters:
{best_params}

Classification Report:
{class_report}
"""
    plt.text(0.05, 0.95, summary_text, verticalalignment='top', wrap=True)
    pdf.savefig(fig1)
    plt.close(fig1)
    
    # Page 2: Confusion Matrix in percentages.
    fig2, ax2 = plt.subplots(figsize=(20, 16))
    annot = np.empty_like(cm_mean).astype(str)
    for i in range(cm_mean.shape[0]):
        for j in range(cm_mean.shape[1]):
            annot[i, j] = f"{cm_mean[i, j]:.1f}%"
    sns.heatmap(cm_mean, annot=annot, fmt="", cmap="Blues", ax=ax2,
                xticklabels=unique_labels, yticklabels=unique_labels,
                annot_kws={"size": 17}, vmin=0, vmax=100,
                cbar_kws={'ticks': np.linspace(0, 100, 11), 'format': '%.0f%%'})
    ax2.set_title("Confusion Matrix (in %)", fontsize=28)
    ax2.set_xlabel("Predicted Label", fontsize=24)
    ax2.set_ylabel("True Label", fontsize=24)
    pdf.savefig(fig2)
    plt.close(fig2)
    
    # Page 3: Learning Curves with error bands.
    fig3 = plt.figure(figsize=(16, 12))
    plt.plot(train_sizes, train_scores_mean, 'o-', label="Training Score")
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2)
    plt.plot(train_sizes, val_scores_mean, 'o-', label="Validation Score")
    plt.fill_between(train_sizes, val_scores_mean - val_scores_std,
                     val_scores_mean + val_scores_std, alpha=0.2)
    plt.title("Learning Curves", fontsize=28)
    plt.xlabel("Number of Training Examples", fontsize=24)
    plt.ylabel("Accuracy", fontsize=24)
    plt.legend(loc="best", fontsize=20)
    pdf.savefig(fig3)
    plt.close(fig3)
    
    # Page 4: Feature Importances (F-statistic proxy).
    fig4 = plt.figure(figsize=(16, 12))
    plt.bar(feature_importances_df['feature'], feature_importances_df['f_value'])
    plt.xticks(rotation=90)
    plt.title("Feature Importances (F-statistic)", fontsize=28)
    plt.tight_layout()
    pdf.savefig(fig4)
    plt.close(fig4)
    
    # Page 5: Table of Misclassified Examples.
    # The table holds one row per misclassified test sample, so it is split into
    # chunks that fit a page (later pages shift accordingly). When nothing was
    # misclassified a placeholder page is written, because ax.table cannot be
    # built from an empty cellText.
    rows_per_page = 30
    n_misclassified = len(misclassified_df)
    for first_row in range(0, max(n_misclassified, 1), rows_per_page):
        page_rows = misclassified_df.iloc[first_row:first_row + rows_per_page]
        fig5, ax5 = plt.subplots(figsize=(16, 12))
        ax5.axis('tight')
        ax5.axis('off')
        page_title = "Misclassified Examples"
        if page_rows.empty:
            ax5.text(0.5, 0.5, "No misclassified examples.",
                     horizontalalignment='center', verticalalignment='center',
                     transform=ax5.transAxes)
        else:
            table = ax5.table(cellText=page_rows.values,
                              colLabels=page_rows.columns,
                              loc='center')
            table.auto_set_font_size(False)
            table.set_fontsize(14)
            table.scale(1.2, 1.2)
            if n_misclassified > rows_per_page:
                # Show which rows this page covers when the table is split.
                last_row = first_row + len(page_rows)
                page_title += f" ({first_row + 1}-{last_row} of {n_misclassified})"
        ax5.set_title(page_title, fontsize=28)
        pdf.savefig(fig5)
        plt.close(fig5)
    
    # Page 6: Aggregated Cross Validation Metrics.
    fig6, ax6 = plt.subplots(figsize=(18, 10))
    ax6.axis('tight')
    ax6.axis('off')
    table = ax6.table(cellText=agg_metrics.values,
                      rowLabels=agg_metrics.index,
                      colLabels=agg_metrics.columns,
                      loc='center',
                      bbox=[0, 0, 0.8, 0.3])  # Adjust these values as needed
    table.auto_set_font_size(False)
    table.set_fontsize(14)
    table.scale(1.2, 1.2)
    ax6.set_title("Cross Validation Metrics (10 Runs)\nMean and Standard Deviation", fontsize=28)
    pdf.savefig(fig6)
    plt.close(fig6)
    
    # Page 7: Total Processing Time.
    elapsed_time = time.time() - start_time  # Compute the total elapsed time
      
    fig7 = plt.figure(figsize=(16, 12))
    plt.axis('off')
    text = (f"Total Processing Time: {elapsed_time:.2f} seconds\n\n")
    plt.text(0.5, 0.5, text, horizontalalignment='center', verticalalignment='center', fontsize=28)
    pdf.savefig(fig7)
    plt.close(fig7)

print(f"PDF report has been saved as '{pdf_filename}'.")


Best parameters from GridSearchCV: {'C': 5, 'max_iter': 10000, 'penalty': 'l1', 'solver': 'saga'}
Best cross-validation score: 0.8844 ± 0.0080
Test Accuracy: 0.8673 (95% CI: 0.8399 - 0.8948)
Classification Report:
               precision    recall  f1-score   support

           A       1.00      0.91      0.95        11
           B       1.00      0.75      0.86        12
           C       0.82      0.93      0.87        58
          Ch       0.84      0.79      0.81        33
           D       0.86      0.86      0.86        22
           E       0.92      0.92      0.92        13
           K       0.78      0.54      0.64        13
           L       0.50      0.57      0.53        14
           M       0.79      0.67      0.73        55
           P       0.74      0.81      0.77        36
           Q       0.75      0.80      0.77        30
           R       1.00      0.33      0.50         3
           S       0.93      0.95      0.94       240
           V       0.98     

  f = msb / msw


PDF report has been saved as '03-MLR-A1.pdf'.


## Without Albedo

In [3]:
import time  # Added to measure processing time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
import math

from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, balanced_accuracy_score, f1_score, 
                             matthews_corrcoef, confusion_matrix, classification_report)
from sklearn.feature_selection import f_classif

# ----------------------------
# Set global font settings
plt.rcParams['font.family'] = 'DejaVu Serif'
plt.rcParams['font.size'] = 20
# ----------------------------

# Start timer
start_time = time.time()  # Record the start time of the process

# 1. Load the merged database and prepare features/target.
df = pd.read_csv('05-BaseNew1.csv')
df = df.drop(columns=['counts', 'class_bdm'])

# Exclude 'pV' (albedo) and 'name'; target column is 'class_asteroid_sf'
spectra_columns = [col for col in df.columns if col not in ['pV', 'name', 'class_asteroid_sf']]
X = df[spectra_columns]
y = df['class_asteroid_sf']
names = df['name']

# 2. Split data into training (80%) and test (20%) sets (stratified by target)
X_train, X_test, y_train, y_test, name_train, name_test = train_test_split(
    X, y, names, test_size=0.2, stratify=y, random_state=42
)

# 3. Standardize features (fit on training set)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Tune hyperparameters for multinomial logistic regression.
# C candidates: 5, 10, ..., 60 plus 20 log-spaced values from 1e-4 to 1e4.
C_values = list(range(5, 61, 5)) + list(np.logspace(-4, 4, 20))
param_grid = {
    'C': C_values,
    'penalty': ['l1', 'l2'],
    'solver': ['saga'],
    'max_iter': [10000, 25000, 50000]
}

# max_iter is tuned through the grid above; no multi_class argument is passed.
logreg = LogisticRegression(random_state=42)
# 5-fold CV scored on accuracy; n_jobs=-1 spreads the candidate fits over all CPUs.
grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

best_params = grid_search.best_params_
best_cv_score = grid_search.best_score_
# Fold-to-fold standard deviation of the winning candidate. best_index_ is the
# cv_results_ row GridSearchCV itself selected, so no search for a float that
# compares equal to best_score_ is needed.
idx = grid_search.best_index_
cv_std = grid_search.cv_results_['std_test_score'][idx]
print("Best parameters from GridSearchCV:", best_params)
print("Best cross-validation score: {:.4f} ± {:.4f}".format(best_cv_score, cv_std))

# 5. Train final multinomial logistic regression model with the best parameters.
# max_iter comes from best_params (tuned in the grid search); no multi_class
# argument is passed.
best_logreg = LogisticRegression(random_state=42, **best_params)
best_logreg.fit(X_train_scaled, y_train)

# 6. Evaluate the model on the test set.
y_pred = best_logreg.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)
# 95% confidence interval for accuracy (normal approximation to the binomial),
# clipped to [0, 1] because the approximation can overshoot near the bounds.
n_test = len(y_test)
se = math.sqrt(test_accuracy * (1 - test_accuracy) / n_test)
ci_lower = max(0.0, test_accuracy - 1.96 * se)
ci_upper = min(1.0, test_accuracy + 1.96 * se)

conf_matrix = confusion_matrix(y_test, y_pred, labels=np.unique(y))
# zero_division=0 reports 0.00 for classes that are never predicted (the same
# value as before) without emitting an undefined-metric warning per class.
class_report = classification_report(y_test, y_pred, zero_division=0)
print("Test Accuracy: {:.4f} (95% CI: {:.4f} - {:.4f})".format(test_accuracy, ci_lower, ci_upper))
print("Classification Report:\n", class_report)

# 7. Generate learning curves (with error bands).
# Use all CPUs by setting n_jobs=-1
train_sizes, train_scores, val_scores = learning_curve(
    best_logreg, X_train_scaled, y_train, cv=5, scoring='accuracy',
    train_sizes=np.linspace(0.1, 1.0, 10), n_jobs=-1
)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
val_scores_mean = np.mean(val_scores, axis=1)
val_scores_std = np.std(val_scores, axis=1)

# 8. Compute a proxy for feature importance using ANOVA F-statistic.
f_values, p_values = f_classif(X_train_scaled, y_train)
feature_importances_df = pd.DataFrame({
    'feature': X.columns,
    'f_value': f_values,
    'p_value': p_values
}).sort_values(by='f_value', ascending=False)

# 9. Identify misclassified examples (including asteroid names).
misclassified_df = pd.DataFrame({
    'name': name_test[y_test != y_pred],
    'true_label': y_test[y_test != y_pred],
    'predicted_label': pd.Series(y_pred, index=y_test.index)[y_test != y_pred]
})

# 10. Bootstrap function to compute confusion matrix percentages.
def bootstrap_confusion_matrix(y_true, y_pred, iterations=1000, random_state=None):
    """Return the bootstrap-averaged, row-normalized confusion matrix.

    The (y_true, y_pred) pairs are resampled with replacement `iterations`
    times. Each replicate's confusion matrix is normalized per row, so that
    every true label sums to 100%, and the replicates are then averaged.
    A row is averaged only over the replicates in which it received at least
    one sample; otherwise replicates that happen to miss a rare class would
    add an all-zero row and drag that class's percentages below 100%.

    Parameters
    ----------
    y_true : pandas.Series
        True labels (indexed positionally through ``.iloc``).
    y_pred : array-like
        Predicted labels, in the same order as ``y_true``.
    iterations : int, default=1000
        Number of bootstrap replicates.
    random_state : int or None, default=None
        Seed for reproducible resampling. ``None`` keeps drawing from NumPy's
        global random state, as before.

    Returns
    -------
    unique_labels : numpy.ndarray
        Sorted unique values of ``y_true``; the row/column order of the matrix.
    cm_mean : numpy.ndarray
        Mean percentage matrix of shape (n_labels, n_labels).
    """
    unique_labels = np.unique(y_true)
    n = len(y_true)
    n_labels = len(unique_labels)
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    # Align the predictions with y_true once instead of on every iteration.
    y_pred_series = pd.Series(y_pred, index=y_true.index)

    pct_sum = np.zeros((n_labels, n_labels))
    row_hits = np.zeros((n_labels, 1))  # replicates in which each row was populated
    for _ in range(iterations):
        indices = rng.choice(n, size=n, replace=True)
        cm = confusion_matrix(y_true.iloc[indices],
                              y_pred_series.iloc[indices],
                              labels=unique_labels)
        # Normalize by row sums so that each true label sums to 100%.
        row_sums = cm.sum(axis=1, keepdims=True)
        populated = row_sums > 0
        pct_sum += np.divide(cm, row_sums,
                             out=np.zeros((n_labels, n_labels)),
                             where=populated) * 100
        row_hits += populated
    # Rows that were never populated stay at 0 instead of dividing by zero.
    cm_mean = np.divide(pct_sum, row_hits, out=np.zeros_like(pct_sum), where=row_hits > 0)
    return unique_labels, cm_mean

unique_labels, cm_mean = bootstrap_confusion_matrix(y_test, y_pred, iterations=1000)

# 11. Perform 10-run cross validation on the training set to compute aggregated metrics.
# Each fold trains a fresh clone, so the final model fitted on the full training
# set (best_logreg) is not overwritten by a model trained on a single CV fold.
# NOTE(review): X_train_scaled was standardized on the whole training set, so the
# validation folds have influenced the scaling statistics.
from sklearn.base import clone

n_runs = 10
metrics_list = []
for run in range(n_runs):
    # A different shuffle per run; random_state=run keeps the runs reproducible.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=run)
    run_acc, run_bacc, run_f1, run_mcc = [], [], [], []
    for train_idx, val_idx in skf.split(X_train_scaled, y_train):
        X_cv_train, X_cv_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
        y_cv_train, y_cv_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        fold_model = clone(best_logreg)
        fold_model.fit(X_cv_train, y_cv_train)
        y_cv_pred = fold_model.predict(X_cv_val)
        run_acc.append(accuracy_score(y_cv_val, y_cv_pred))
        run_bacc.append(balanced_accuracy_score(y_cv_val, y_cv_pred))
        # zero_division=0 scores never-predicted classes as 0 without a warning.
        run_f1.append(f1_score(y_cv_val, y_cv_pred, average='weighted', zero_division=0))
        run_mcc.append(matthews_corrcoef(y_cv_val, y_cv_pred))
    # One row per run: the mean of each metric over that run's 5 folds.
    metrics_list.append({
        'Accuracy': np.mean(run_acc),
        'Balanced Accuracy': np.mean(run_bacc),
        'F1': np.mean(run_f1),
        'MCC': np.mean(run_mcc)
    })

# Mean and standard deviation across the 10 runs (metrics as rows).
df_cv = pd.DataFrame(metrics_list)
agg_metrics = df_cv.agg(['mean', 'std']).T.round(4)

# 12. Create a multi-page PDF report.
pdf_filename = "03-MLR1.pdf"
with PdfPages(pdf_filename) as pdf:
    
    # Page 1: Summary of results and best parameters.
    fig1 = plt.figure(figsize=(16, 12))
    plt.axis('off')
    summary_text = f"""Model Evaluation Summary

Test Accuracy: {test_accuracy:.4f} (95% CI: {ci_lower:.4f} - {ci_upper:.4f})
Best Cross-Validation Score: {best_cv_score:.4f} ± {cv_std:.4f}

Best Parameters:
{best_params}

Classification Report:
{class_report}
"""
    plt.text(0.05, 0.95, summary_text, verticalalignment='top', wrap=True)
    pdf.savefig(fig1)
    plt.close(fig1)
    
    # Page 2: Confusion Matrix in percentages.
    fig2, ax2 = plt.subplots(figsize=(20, 16))
    annot = np.empty_like(cm_mean).astype(str)
    for i in range(cm_mean.shape[0]):
        for j in range(cm_mean.shape[1]):
            annot[i, j] = f"{cm_mean[i, j]:.1f}%"
    sns.heatmap(cm_mean, annot=annot, fmt="", cmap="Blues", ax=ax2,
                xticklabels=unique_labels, yticklabels=unique_labels,
                annot_kws={"size": 17}, vmin=0, vmax=100,
                cbar_kws={'ticks': np.linspace(0, 100, 11), 'format': '%.0f%%'})
    ax2.set_title("Confusion Matrix (in %)", fontsize=28)
    ax2.set_xlabel("Predicted Label", fontsize=24)
    ax2.set_ylabel("True Label", fontsize=24)
    pdf.savefig(fig2)
    plt.close(fig2)
    
    # Page 3: Learning Curves with error bands.
    fig3 = plt.figure(figsize=(16, 12))
    plt.plot(train_sizes, train_scores_mean, 'o-', label="Training Score")
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2)
    plt.plot(train_sizes, val_scores_mean, 'o-', label="Validation Score")
    plt.fill_between(train_sizes, val_scores_mean - val_scores_std,
                     val_scores_mean + val_scores_std, alpha=0.2)
    plt.title("Learning Curves", fontsize=28)
    plt.xlabel("Number of Training Examples", fontsize=24)
    plt.ylabel("Accuracy", fontsize=24)
    plt.legend(loc="best", fontsize=20)
    pdf.savefig(fig3)
    plt.close(fig3)
    
    # Page 4: Feature Importances (F-statistic proxy).
    fig4 = plt.figure(figsize=(16, 12))
    plt.bar(feature_importances_df['feature'], feature_importances_df['f_value'])
    plt.xticks(rotation=90)
    plt.title("Feature Importances (F-statistic)", fontsize=28)
    plt.tight_layout()
    pdf.savefig(fig4)
    plt.close(fig4)
    
    # Page 5: Table of Misclassified Examples.
    # The table holds one row per misclassified test sample, so it is split into
    # chunks that fit a page (later pages shift accordingly). When nothing was
    # misclassified a placeholder page is written, because ax.table cannot be
    # built from an empty cellText.
    rows_per_page = 30
    n_misclassified = len(misclassified_df)
    for first_row in range(0, max(n_misclassified, 1), rows_per_page):
        page_rows = misclassified_df.iloc[first_row:first_row + rows_per_page]
        fig5, ax5 = plt.subplots(figsize=(16, 12))
        ax5.axis('tight')
        ax5.axis('off')
        page_title = "Misclassified Examples"
        if page_rows.empty:
            ax5.text(0.5, 0.5, "No misclassified examples.",
                     horizontalalignment='center', verticalalignment='center',
                     transform=ax5.transAxes)
        else:
            table = ax5.table(cellText=page_rows.values,
                              colLabels=page_rows.columns,
                              loc='center')
            table.auto_set_font_size(False)
            table.set_fontsize(14)
            table.scale(1.2, 1.2)
            if n_misclassified > rows_per_page:
                # Show which rows this page covers when the table is split.
                last_row = first_row + len(page_rows)
                page_title += f" ({first_row + 1}-{last_row} of {n_misclassified})"
        ax5.set_title(page_title, fontsize=28)
        pdf.savefig(fig5)
        plt.close(fig5)
    
    # Page 6: Aggregated Cross Validation Metrics.
    fig6, ax6 = plt.subplots(figsize=(18, 10))
    ax6.axis('tight')
    ax6.axis('off')
    table = ax6.table(cellText=agg_metrics.values,
                      rowLabels=agg_metrics.index,
                      colLabels=agg_metrics.columns,
                      loc='center',
                      bbox=[0, 0, 0.8, 0.3])  # Adjust these values as needed
    table.auto_set_font_size(False)
    table.set_fontsize(14)
    table.scale(1.2, 1.2)
    ax6.set_title("Cross Validation Metrics (10 Runs)\nMean and Standard Deviation", fontsize=28)
    pdf.savefig(fig6)
    plt.close(fig6)
    
    # Page 7: Total Processing Time.
    elapsed_time = time.time() - start_time  # Compute the total elapsed time
      
    fig7 = plt.figure(figsize=(16, 12))
    plt.axis('off')
    text = (f"Total Processing Time: {elapsed_time:.2f} seconds\n\n")
    plt.text(0.5, 0.5, text, horizontalalignment='center', verticalalignment='center', fontsize=28)
    pdf.savefig(fig7)
    plt.close(fig7)

print(f"PDF report has been saved as '{pdf_filename}'.")

Best parameters from GridSearchCV: {'C': 4.281332398719396, 'max_iter': 10000, 'penalty': 'l1', 'solver': 'saga'}
Best cross-validation score: 0.8457 ± 0.0104


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Test Accuracy: 0.8452 (95% CI: 0.8160 - 0.8745)
Classification Report:
               precision    recall  f1-score   support

           A       1.00      0.91      0.95        11
           B       0.89      0.67      0.76        12
           C       0.78      0.91      0.84        58
          Ch       0.86      0.76      0.81        33
           D       0.86      0.86      0.86        22
           E       0.68      1.00      0.81        13
           K       0.67      0.46      0.55        13
           L       0.53      0.64      0.58        14
           M       0.67      0.60      0.63        55
           P       0.73      0.53      0.61        36
           Q       0.75      0.80      0.77        30
           R       0.00      0.00      0.00         3
           S       0.93      0.96      0.94       240
           V       0.95      0.98      0.96        41
           Z       1.00      1.00      1.00         7

    accuracy                           0.85       588
   macro

  f = msb / msw


PDF report has been saved as '03-MLR1.pdf'.


# Random Forest

## With Albedo

In [1]:
import time  # Added to measure processing time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
import math

from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, balanced_accuracy_score, f1_score, 
                             matthews_corrcoef, confusion_matrix, classification_report)
from sklearn.feature_selection import f_classif

# ----------------------------
# Global matplotlib font configuration (applies to every figure created below)
plt.rcParams.update({'font.family': 'DejaVu Serif', 'font.size': 20})
# ----------------------------

# Start the wall-clock timer; the elapsed time is computed when the report is written.
start_time = time.time()

# 1. Load the merged database and prepare features/target.
df = pd.read_csv('05-BaseNew1.csv').drop(columns=['counts', 'class_bdm'])

# spectra_columns holds every column except the albedo ('pV'), the asteroid
# 'name' and the target 'class_asteroid_sf'. In this "with albedo" variant
# 'pV' is then appended back as the last feature column.
spectra_columns = [
    col for col in df.columns
    if col not in ('pV', 'name', 'class_asteroid_sf')
]
X = df[[*spectra_columns, 'pV']]
names = df['name']
y = df['class_asteroid_sf']

# 2. Stratified 80/20 train/test split. The names are split alongside the
#    features and labels so that test rows can be reported by name later.
X_train, X_test, y_train, y_test, name_train, name_test = train_test_split(
    X,
    y,
    names,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 3. Standardize features: the scaler statistics come from the training set
#    only and are then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Tune hyperparameters for Random Forest (exhaustive grid search, 5-fold CV,
#    scored by accuracy).
param_grid = {
    'n_estimators': [10, 20, 50, 100],  # Number of trees
    'max_depth': [3, 5, 7, 9, 15],  # Tree depth
    'min_samples_split': [15, 20, 30, 40],  # Minimum samples to split
    'min_samples_leaf': [10, 15, 20, 30],  # Minimum samples per leaf
    'max_features': ['sqrt', 'log2', 0.5, None]  # Feature selection method
}

rf = RandomForestClassifier(random_state=42)
# Use all CPUs by setting n_jobs=-1 in GridSearchCV
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

best_params = grid_search.best_params_
best_cv_score = grid_search.best_score_
# Retrieve the CV standard deviation of the winning candidate. best_index_ is
# the row of cv_results_ that produced best_params_/best_score_, so no
# floating-point equality search over mean_test_score is needed.
idx = grid_search.best_index_
cv_std = grid_search.cv_results_['std_test_score'][idx]
print("Best parameters from GridSearchCV:", best_params)
print("Best cross-validation score: {:.4f} ± {:.4f}".format(best_cv_score, cv_std))

# 5. Train final Random Forest model with the best parameters.
best_rf = RandomForestClassifier(random_state=42, **best_params)
best_rf.fit(X_train_scaled, y_train)

# 6. Evaluate the model on the test set.
y_pred = best_rf.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)
# Compute 95% confidence interval for accuracy using the binomial (normal)
# approximation. The approximation can step outside [0, 1] when the accuracy
# is close to 0 or 1, so the bounds are clamped to the valid range.
n_test = len(y_test)
se = math.sqrt(test_accuracy * (1 - test_accuracy) / n_test)
ci_lower = max(0.0, test_accuracy - 1.96 * se)
ci_upper = min(1.0, test_accuracy + 1.96 * se)

# Raw-count confusion matrix over every class present in the full label column.
conf_matrix = confusion_matrix(y_test, y_pred, labels=np.unique(y))
# zero_division=0 reports 0 (without warning) for classes that were never predicted.
class_report = classification_report(y_test, y_pred, zero_division=0)
print("Test Accuracy: {:.4f} (95% CI: {:.4f} - {:.4f})".format(test_accuracy, ci_lower, ci_upper))
print("Classification Report:\n", class_report)

# 7. Learning curves (with error bands): 5-fold CV accuracy at ten evenly
#    spaced fractions (10% to 100%) of the training data, using all CPUs.
train_sizes, train_scores, val_scores = learning_curve(
    best_rf,
    X_train_scaled,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
# Mean and spread across the folds for each training-set size.
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
val_scores_mean = val_scores.mean(axis=1)
val_scores_std = val_scores.std(axis=1)

# 8. ANOVA F-statistic per feature, used as a proxy for feature importance
#    (it is computed from the data, not taken from the fitted model).
f_values, p_values = f_classif(X_train_scaled, y_train)
feature_importances_df = pd.DataFrame(
    {'feature': X.columns, 'f_value': f_values, 'p_value': p_values}
)
feature_importances_df = feature_importances_df.sort_values(by='f_value', ascending=False)

# 9. Collect the misclassified test rows together with their asteroid names.
#    The predictions are re-indexed like y_test so all three columns line up.
misclassified_mask = y_test != y_pred
misclassified_df = pd.DataFrame({
    'name': name_test[misclassified_mask],
    'true_label': y_test[misclassified_mask],
    'predicted_label': pd.Series(y_pred, index=y_test.index)[misclassified_mask],
})

# 10. Bootstrap function to compute confusion matrix percentages.
def bootstrap_confusion_matrix(y_true, y_pred, iterations=1000, random_state=None):
    """Estimate a row-normalized confusion matrix (in %) by bootstrapping.

    The (true, predicted) pairs are resampled with replacement ``iterations``
    times. Each replicate's confusion matrix is converted to row percentages,
    and the percentages are averaged over the replicates.

    A true label that is not drawn in a replicate has no defined row
    percentages for that replicate, so the replicate is left out of that row's
    average. Counting it as a row of zeros would bias rare classes downward
    and make their rows sum to less than 100%.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True labels.
    y_pred : array-like of shape (n_samples,)
        Predicted labels, in the same order as ``y_true``.
    iterations : int, default=1000
        Number of bootstrap replicates.
    random_state : None, int or numpy.random.Generator, default=None
        Seed (or generator) for the resampling. ``None`` draws a fresh,
        unseeded sequence on every call.

    Returns
    -------
    unique_labels : ndarray
        Sorted unique values of ``y_true``; defines row and column order.
    cm_mean : ndarray of shape (n_labels, n_labels)
        Mean row percentages (rows: true label, columns: predicted label).

    Raises
    ------
    ValueError
        If ``y_true`` is empty or ``iterations`` is smaller than 1.
    """
    unique_labels = np.unique(y_true)
    n = len(y_true)
    if n == 0:
        raise ValueError("y_true must contain at least one sample")
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    # Positional arrays are built once instead of on every iteration.
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    rng = np.random.default_rng(random_state)

    n_labels = len(unique_labels)
    percent_sum = np.zeros((n_labels, n_labels))
    # Number of replicates in which each true label was drawn at least once.
    row_hits = np.zeros((n_labels, 1))
    for _ in range(iterations):
        indices = rng.integers(0, n, size=n)
        cm = confusion_matrix(true_arr[indices], pred_arr[indices],
                              labels=unique_labels)
        # Normalize by row sums so that each actual label sums to 100%
        row_sums = cm.sum(axis=1, keepdims=True)
        present = row_sums > 0
        safe_sums = np.where(present, row_sums, 1)  # avoid division by zero
        percent_sum += np.where(present, cm / safe_sums * 100, 0.0)
        row_hits += present
    # Rows never drawn in any replicate stay at zero instead of dividing by zero.
    cm_mean = percent_sum / np.where(row_hits > 0, row_hits, 1)
    return unique_labels, cm_mean

unique_labels, cm_mean = bootstrap_confusion_matrix(y_test, y_pred, iterations=1000)

# 11. Perform 10-run cross validation on the training set to compute aggregated metrics.
#     Each run is a 5-fold stratified CV with a different shuffle seed; the
#     fold scores are averaged per run, then summarized across runs.
n_runs = 10
metrics_list = []
for run in range(n_runs):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=run)
    run_acc, run_bacc, run_f1, run_mcc = [], [], [], []
    for train_idx, val_idx in skf.split(X_train_scaled, y_train):
        X_cv_train, X_cv_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
        y_cv_train, y_cv_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        # Fit a fresh estimator per fold. Refitting best_rf here would
        # overwrite the final model trained on the full training set and
        # leave it fitted on the last CV fold only.
        fold_model = RandomForestClassifier(random_state=42, **best_params)
        fold_model.fit(X_cv_train, y_cv_train)
        y_cv_pred = fold_model.predict(X_cv_val)
        run_acc.append(accuracy_score(y_cv_val, y_cv_pred))
        run_bacc.append(balanced_accuracy_score(y_cv_val, y_cv_pred))
        # zero_division=0 keeps the score the same as the default but does not
        # warn when a class is never predicted in a fold.
        run_f1.append(f1_score(y_cv_val, y_cv_pred, average='weighted', zero_division=0))
        run_mcc.append(matthews_corrcoef(y_cv_val, y_cv_pred))
    metrics_list.append({
        'Accuracy': np.mean(run_acc),
        'Balanced Accuracy': np.mean(run_bacc),
        'F1': np.mean(run_f1),
        'MCC': np.mean(run_mcc)
    })

# One row per run; report mean and standard deviation of each metric across runs.
df_cv = pd.DataFrame(metrics_list)
agg_metrics = df_cv.agg(['mean', 'std']).T.round(4)

# 12. Create a multi-page PDF report.
pdf_filename = "03-RF-A1.pdf"
with PdfPages(pdf_filename) as pdf:

    # Page 1: Summary of results and best parameters.
    fig1 = plt.figure(figsize=(16, 12))
    plt.axis('off')
    summary_text = f"""Model Evaluation Summary

Test Accuracy: {test_accuracy:.4f} (95% CI: {ci_lower:.4f} - {ci_upper:.4f})
Best Cross-Validation Score: {best_cv_score:.4f} ± {cv_std:.4f}

Best Parameters:
{best_params}

Classification Report:
{class_report}
"""
    plt.text(0.05, 0.95, summary_text, verticalalignment='top', wrap=True)
    pdf.savefig(fig1)
    plt.close(fig1)

    # Page 2: Confusion Matrix in percentages.
    fig2, ax2 = plt.subplots(figsize=(20, 16))
    # Pre-formatted cell labels ("12.3%"); fmt="" makes seaborn print them verbatim.
    annot = np.array([[f"{value:.1f}%" for value in row] for row in cm_mean])
    sns.heatmap(cm_mean, annot=annot, fmt="", cmap="Blues", ax=ax2,
                xticklabels=unique_labels, yticklabels=unique_labels,
                annot_kws={"size": 17}, vmin=0, vmax=100,
                cbar_kws={'ticks': np.linspace(0, 100, 11), 'format': '%.0f%%'})
    ax2.set_title("Confusion Matrix (in %)", fontsize=28)
    ax2.set_xlabel("Predicted Label", fontsize=24)
    ax2.set_ylabel("True Label", fontsize=24)
    pdf.savefig(fig2)
    plt.close(fig2)

    # Page 3: Learning Curves with error bands (mean ± one standard deviation).
    fig3 = plt.figure(figsize=(16, 12))
    plt.plot(train_sizes, train_scores_mean, 'o-', label="Training Score")
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2)
    plt.plot(train_sizes, val_scores_mean, 'o-', label="Validation Score")
    plt.fill_between(train_sizes, val_scores_mean - val_scores_std,
                     val_scores_mean + val_scores_std, alpha=0.2)
    plt.title("Learning Curves", fontsize=28)
    plt.xlabel("Number of Training Examples", fontsize=24)
    plt.ylabel("Accuracy", fontsize=24)
    plt.legend(loc="best", fontsize=20)
    pdf.savefig(fig3)
    plt.close(fig3)

    # Page 4: Feature Importances (F-statistic proxy).
    fig4 = plt.figure(figsize=(16, 12))
    plt.bar(feature_importances_df['feature'], feature_importances_df['f_value'])
    plt.xticks(rotation=90)
    plt.title("Feature Importances (F-statistic)", fontsize=28)
    plt.tight_layout()
    pdf.savefig(fig4)
    plt.close(fig4)

    # Page 5: Table of Misclassified Examples.
    fig5, ax5 = plt.subplots(figsize=(16, 12))
    ax5.axis('tight')
    ax5.axis('off')
    if misclassified_df.empty:
        # ax.table() cannot be built from zero rows (it indexes the first
        # row), so a perfect test score gets a short note instead.
        ax5.text(0.5, 0.5, "No misclassified examples.",
                 horizontalalignment='center', verticalalignment='center',
                 transform=ax5.transAxes)
    else:
        table = ax5.table(cellText=misclassified_df.values,
                          colLabels=misclassified_df.columns,
                          loc='center')
        table.auto_set_font_size(False)
        table.set_fontsize(14)
        table.scale(1.2, 1.2)
    ax5.set_title("Misclassified Examples", fontsize=28)
    pdf.savefig(fig5)
    plt.close(fig5)

    # Page 6: Aggregated Cross Validation Metrics.
    fig6, ax6 = plt.subplots(figsize=(18, 10))
    ax6.axis('tight')
    ax6.axis('off')
    table = ax6.table(cellText=agg_metrics.values,
                      rowLabels=agg_metrics.index,
                      colLabels=agg_metrics.columns,
                      loc='center',
                      bbox=[0, 0, 0.8, 0.3])  # Adjust these values as needed
    table.auto_set_font_size(False)
    table.set_fontsize(14)
    table.scale(1.2, 1.2)
    ax6.set_title("Cross Validation Metrics (10 Runs)\nMean and Standard Deviation", fontsize=28)
    pdf.savefig(fig6)
    plt.close(fig6)

    # Page 7: Total Processing Time.
    elapsed_time = time.time() - start_time  # Seconds since start_time was recorded

    fig7 = plt.figure(figsize=(16, 12))
    plt.axis('off')
    text = (f"Total Processing Time: {elapsed_time:.2f} seconds\n\n")
    plt.text(0.5, 0.5, text, horizontalalignment='center', verticalalignment='center', fontsize=28)
    pdf.savefig(fig7)
    plt.close(fig7)

print(f"PDF report has been saved as '{pdf_filename}'.")

  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters from GridSearchCV: {'max_depth': 9, 'max_features': 0.5, 'min_samples_leaf': 10, 'min_samples_split': 15, 'n_estimators': 50}
Best cross-validation score: 0.8571 ± 0.0061
Test Accuracy: 0.8350 (95% CI: 0.8050 - 0.8650)
Classification Report:
               precision    recall  f1-score   support

           A       1.00      0.82      0.90        11
           B       0.69      0.75      0.72        12
           C       0.79      0.90      0.84        58
          Ch       0.83      0.61      0.70        33
           D       0.67      0.82      0.73        22
           E       1.00      0.92      0.96        13
           K       0.71      0.38      0.50        13
           L       0.57      0.29      0.38        14
           M       0.79      0.69      0.74        55
           P       0.74      0.78      0.76        36
           Q       0.77      0.90      0.83        30
           R       0.00      0.00      0.00         3
           S       0.88      0.97     

  f = msb / msw


PDF report has been saved as '03-RF-A1.pdf'.


## Without Albedo

In [3]:
import time  # Added to measure processing time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
import math

from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, balanced_accuracy_score, f1_score, 
                             matthews_corrcoef, confusion_matrix, classification_report)
from sklearn.feature_selection import f_classif

# ----------------------------
# Global matplotlib font configuration (applies to every figure created below)
plt.rcParams.update({'font.family': 'DejaVu Serif', 'font.size': 20})
# ----------------------------

# Start the wall-clock timer; the elapsed time is computed when the report is written.
start_time = time.time()

# 1. Load the merged database and prepare features/target.
df = pd.read_csv('05-BaseNew1.csv').drop(columns=['counts', 'class_bdm'])

# spectra_columns holds every column except the albedo ('pV'), the asteroid
# 'name' and the target 'class_asteroid_sf'. In this "without albedo" variant
# those columns alone form the feature matrix.
spectra_columns = [
    col for col in df.columns
    if col not in ('pV', 'name', 'class_asteroid_sf')
]
X = df[spectra_columns]
names = df['name']
y = df['class_asteroid_sf']

# 2. Stratified 80/20 train/test split. The names are split alongside the
#    features and labels so that test rows can be reported by name later.
X_train, X_test, y_train, y_test, name_train, name_test = train_test_split(
    X,
    y,
    names,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 3. Standardize features: the scaler statistics come from the training set
#    only and are then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Tune hyperparameters for Random Forest (exhaustive grid search, 5-fold CV,
#    scored by accuracy).
param_grid = {
    'n_estimators': [10, 20, 50, 100],  # Number of trees
    'max_depth': [3, 5, 7, 9, 15],  # Tree depth
    'min_samples_split': [15, 20, 30, 40],  # Minimum samples to split
    'min_samples_leaf': [10, 15, 20, 30],  # Minimum samples per leaf
    'max_features': ['sqrt', 'log2', 0.5, None]  # Feature selection method
}

rf = RandomForestClassifier(random_state=42)
# Use all CPUs by setting n_jobs=-1 in GridSearchCV
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

best_params = grid_search.best_params_
best_cv_score = grid_search.best_score_
# Retrieve the CV standard deviation of the winning candidate. best_index_ is
# the row of cv_results_ that produced best_params_/best_score_, so no
# floating-point equality search over mean_test_score is needed.
idx = grid_search.best_index_
cv_std = grid_search.cv_results_['std_test_score'][idx]
print("Best parameters from GridSearchCV:", best_params)
print("Best cross-validation score: {:.4f} ± {:.4f}".format(best_cv_score, cv_std))

# 5. Train final Random Forest model with the best parameters.
best_rf = RandomForestClassifier(random_state=42, **best_params)
best_rf.fit(X_train_scaled, y_train)

# 6. Evaluate the model on the test set.
y_pred = best_rf.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)
# Compute 95% confidence interval for accuracy using the binomial (normal)
# approximation. The approximation can step outside [0, 1] when the accuracy
# is close to 0 or 1, so the bounds are clamped to the valid range.
n_test = len(y_test)
se = math.sqrt(test_accuracy * (1 - test_accuracy) / n_test)
ci_lower = max(0.0, test_accuracy - 1.96 * se)
ci_upper = min(1.0, test_accuracy + 1.96 * se)

# Raw-count confusion matrix over every class present in the full label column.
conf_matrix = confusion_matrix(y_test, y_pred, labels=np.unique(y))
# zero_division=0 reports 0 (without warning) for classes that were never predicted.
class_report = classification_report(y_test, y_pred, zero_division=0)
print("Test Accuracy: {:.4f} (95% CI: {:.4f} - {:.4f})".format(test_accuracy, ci_lower, ci_upper))
print("Classification Report:\n", class_report)

# 7. Learning curves (with error bands): 5-fold CV accuracy at ten evenly
#    spaced fractions (10% to 100%) of the training data, using all CPUs.
train_sizes, train_scores, val_scores = learning_curve(
    best_rf,
    X_train_scaled,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
# Mean and spread across the folds for each training-set size.
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
val_scores_mean = val_scores.mean(axis=1)
val_scores_std = val_scores.std(axis=1)

# 8. ANOVA F-statistic per feature, used as a proxy for feature importance
#    (it is computed from the data, not taken from the fitted model).
f_values, p_values = f_classif(X_train_scaled, y_train)
feature_importances_df = pd.DataFrame(
    {'feature': X.columns, 'f_value': f_values, 'p_value': p_values}
)
feature_importances_df = feature_importances_df.sort_values(by='f_value', ascending=False)

# 9. Collect the misclassified test rows together with their asteroid names.
#    The predictions are re-indexed like y_test so all three columns line up.
misclassified_mask = y_test != y_pred
misclassified_df = pd.DataFrame({
    'name': name_test[misclassified_mask],
    'true_label': y_test[misclassified_mask],
    'predicted_label': pd.Series(y_pred, index=y_test.index)[misclassified_mask],
})

# 10. Bootstrap function to compute confusion matrix percentages.
def bootstrap_confusion_matrix(y_true, y_pred, iterations=1000, random_state=None):
    """Estimate a row-normalized confusion matrix (in %) by bootstrapping.

    The (true, predicted) pairs are resampled with replacement ``iterations``
    times. Each replicate's confusion matrix is converted to row percentages,
    and the percentages are averaged over the replicates.

    A true label that is not drawn in a replicate has no defined row
    percentages for that replicate, so the replicate is left out of that row's
    average. Counting it as a row of zeros would bias rare classes downward
    and make their rows sum to less than 100%.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True labels.
    y_pred : array-like of shape (n_samples,)
        Predicted labels, in the same order as ``y_true``.
    iterations : int, default=1000
        Number of bootstrap replicates.
    random_state : None, int or numpy.random.Generator, default=None
        Seed (or generator) for the resampling. ``None`` draws a fresh,
        unseeded sequence on every call.

    Returns
    -------
    unique_labels : ndarray
        Sorted unique values of ``y_true``; defines row and column order.
    cm_mean : ndarray of shape (n_labels, n_labels)
        Mean row percentages (rows: true label, columns: predicted label).

    Raises
    ------
    ValueError
        If ``y_true`` is empty or ``iterations`` is smaller than 1.
    """
    unique_labels = np.unique(y_true)
    n = len(y_true)
    if n == 0:
        raise ValueError("y_true must contain at least one sample")
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    # Positional arrays are built once instead of on every iteration.
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    rng = np.random.default_rng(random_state)

    n_labels = len(unique_labels)
    percent_sum = np.zeros((n_labels, n_labels))
    # Number of replicates in which each true label was drawn at least once.
    row_hits = np.zeros((n_labels, 1))
    for _ in range(iterations):
        indices = rng.integers(0, n, size=n)
        cm = confusion_matrix(true_arr[indices], pred_arr[indices],
                              labels=unique_labels)
        # Normalize by row sums so that each actual label sums to 100%
        row_sums = cm.sum(axis=1, keepdims=True)
        present = row_sums > 0
        safe_sums = np.where(present, row_sums, 1)  # avoid division by zero
        percent_sum += np.where(present, cm / safe_sums * 100, 0.0)
        row_hits += present
    # Rows never drawn in any replicate stay at zero instead of dividing by zero.
    cm_mean = percent_sum / np.where(row_hits > 0, row_hits, 1)
    return unique_labels, cm_mean

unique_labels, cm_mean = bootstrap_confusion_matrix(y_test, y_pred, iterations=1000)

# 11. Perform 10-run cross validation on the training set to compute aggregated metrics.
#     Each run is a 5-fold stratified CV with a different shuffle seed; the
#     fold scores are averaged per run, then summarized across runs.
n_runs = 10
metrics_list = []
for run in range(n_runs):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=run)
    run_acc, run_bacc, run_f1, run_mcc = [], [], [], []
    for train_idx, val_idx in skf.split(X_train_scaled, y_train):
        X_cv_train, X_cv_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
        y_cv_train, y_cv_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        # Fit a fresh estimator per fold. Refitting best_rf here would
        # overwrite the final model trained on the full training set and
        # leave it fitted on the last CV fold only.
        fold_model = RandomForestClassifier(random_state=42, **best_params)
        fold_model.fit(X_cv_train, y_cv_train)
        y_cv_pred = fold_model.predict(X_cv_val)
        run_acc.append(accuracy_score(y_cv_val, y_cv_pred))
        run_bacc.append(balanced_accuracy_score(y_cv_val, y_cv_pred))
        # zero_division=0 keeps the score the same as the default but does not
        # warn when a class is never predicted in a fold.
        run_f1.append(f1_score(y_cv_val, y_cv_pred, average='weighted', zero_division=0))
        run_mcc.append(matthews_corrcoef(y_cv_val, y_cv_pred))
    metrics_list.append({
        'Accuracy': np.mean(run_acc),
        'Balanced Accuracy': np.mean(run_bacc),
        'F1': np.mean(run_f1),
        'MCC': np.mean(run_mcc)
    })

# One row per run; report mean and standard deviation of each metric across runs.
df_cv = pd.DataFrame(metrics_list)
agg_metrics = df_cv.agg(['mean', 'std']).T.round(4)

# 12. Create a multi-page PDF report.
pdf_filename = "03-RF1.pdf"
with PdfPages(pdf_filename) as pdf:

    # Page 1: Summary of results and best parameters.
    fig1 = plt.figure(figsize=(16, 12))
    plt.axis('off')
    summary_text = f"""Model Evaluation Summary

Test Accuracy: {test_accuracy:.4f} (95% CI: {ci_lower:.4f} - {ci_upper:.4f})
Best Cross-Validation Score: {best_cv_score:.4f} ± {cv_std:.4f}

Best Parameters:
{best_params}

Classification Report:
{class_report}
"""
    plt.text(0.05, 0.95, summary_text, verticalalignment='top', wrap=True)
    pdf.savefig(fig1)
    plt.close(fig1)

    # Page 2: Confusion Matrix in percentages.
    fig2, ax2 = plt.subplots(figsize=(20, 16))
    # Pre-formatted cell labels ("12.3%"); fmt="" makes seaborn print them verbatim.
    annot = np.array([[f"{value:.1f}%" for value in row] for row in cm_mean])
    sns.heatmap(cm_mean, annot=annot, fmt="", cmap="Blues", ax=ax2,
                xticklabels=unique_labels, yticklabels=unique_labels,
                annot_kws={"size": 17}, vmin=0, vmax=100,
                cbar_kws={'ticks': np.linspace(0, 100, 11), 'format': '%.0f%%'})
    ax2.set_title("Confusion Matrix (in %)", fontsize=28)
    ax2.set_xlabel("Predicted Label", fontsize=24)
    ax2.set_ylabel("True Label", fontsize=24)
    pdf.savefig(fig2)
    plt.close(fig2)

    # Page 3: Learning Curves with error bands (mean ± one standard deviation).
    fig3 = plt.figure(figsize=(16, 12))
    plt.plot(train_sizes, train_scores_mean, 'o-', label="Training Score")
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2)
    plt.plot(train_sizes, val_scores_mean, 'o-', label="Validation Score")
    plt.fill_between(train_sizes, val_scores_mean - val_scores_std,
                     val_scores_mean + val_scores_std, alpha=0.2)
    plt.title("Learning Curves", fontsize=28)
    plt.xlabel("Number of Training Examples", fontsize=24)
    plt.ylabel("Accuracy", fontsize=24)
    plt.legend(loc="best", fontsize=20)
    pdf.savefig(fig3)
    plt.close(fig3)

    # Page 4: Feature Importances (F-statistic proxy).
    fig4 = plt.figure(figsize=(16, 12))
    plt.bar(feature_importances_df['feature'], feature_importances_df['f_value'])
    plt.xticks(rotation=90)
    plt.title("Feature Importances (F-statistic)", fontsize=28)
    plt.tight_layout()
    pdf.savefig(fig4)
    plt.close(fig4)

    # Page 5: Table of Misclassified Examples.
    fig5, ax5 = plt.subplots(figsize=(16, 12))
    ax5.axis('tight')
    ax5.axis('off')
    if misclassified_df.empty:
        # ax.table() cannot be built from zero rows (it indexes the first
        # row), so a perfect test score gets a short note instead.
        ax5.text(0.5, 0.5, "No misclassified examples.",
                 horizontalalignment='center', verticalalignment='center',
                 transform=ax5.transAxes)
    else:
        table = ax5.table(cellText=misclassified_df.values,
                          colLabels=misclassified_df.columns,
                          loc='center')
        table.auto_set_font_size(False)
        table.set_fontsize(14)
        table.scale(1.2, 1.2)
    ax5.set_title("Misclassified Examples", fontsize=28)
    pdf.savefig(fig5)
    plt.close(fig5)

    # Page 6: Aggregated Cross Validation Metrics.
    fig6, ax6 = plt.subplots(figsize=(18, 10))
    ax6.axis('tight')
    ax6.axis('off')
    table = ax6.table(cellText=agg_metrics.values,
                      rowLabels=agg_metrics.index,
                      colLabels=agg_metrics.columns,
                      loc='center',
                      bbox=[0, 0, 0.8, 0.3])  # Adjust these values as needed
    table.auto_set_font_size(False)
    table.set_fontsize(14)
    table.scale(1.2, 1.2)
    ax6.set_title("Cross Validation Metrics (10 Runs)\nMean and Standard Deviation", fontsize=28)
    pdf.savefig(fig6)
    plt.close(fig6)

    # Page 7: Total Processing Time.
    elapsed_time = time.time() - start_time  # Seconds since start_time was recorded

    fig7 = plt.figure(figsize=(16, 12))
    plt.axis('off')
    text = (f"Total Processing Time: {elapsed_time:.2f} seconds\n\n")
    plt.text(0.5, 0.5, text, horizontalalignment='center', verticalalignment='center', fontsize=28)
    pdf.savefig(fig7)
    plt.close(fig7)

print(f"PDF report has been saved as '{pdf_filename}'.")

Best parameters from GridSearchCV: {'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 15, 'n_estimators': 50}
Best cross-validation score: 0.8112 ± 0.0086
Test Accuracy: 0.8010 (95% CI: 0.7688 - 0.8333)
Classification Report:
               precision    recall  f1-score   support

           A       1.00      0.82      0.90        11
           B       0.69      0.75      0.72        12
           C       0.79      0.86      0.83        58
          Ch       0.84      0.64      0.72        33
           D       0.70      0.73      0.71        22
           E       0.75      0.46      0.57        13
           K       0.40      0.31      0.35        13
           L       0.50      0.29      0.36        14
           M       0.62      0.67      0.64        55
           P       0.62      0.42      0.50        36
           Q       0.76      0.83      0.79        30
           R       0.00      0.00      0.00         3
           S       0.87      0.97 

  f = msb / msw


PDF report has been saved as '03-RF1.pdf'.


# Support Vector Machines

## With Albedo

In [5]:
import time  # Added to measure processing time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
import math

from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, balanced_accuracy_score, f1_score, 
                             matthews_corrcoef, confusion_matrix, classification_report)
from sklearn.feature_selection import f_classif

# ----------------------------
# Global matplotlib font configuration (applies to every figure created below)
plt.rcParams.update({'font.family': 'DejaVu Serif', 'font.size': 20})
# ----------------------------

# Start the wall-clock timer so the total run time can be measured later.
start_time = time.time()

# 1. Load the merged database and prepare features/target.
df = pd.read_csv('05-BaseNew1.csv').drop(columns=['counts', 'class_bdm'])

# spectra_columns holds every column except the albedo ('pV'), the asteroid
# 'name' and the target 'class_asteroid_sf'. In this "with albedo" variant
# 'pV' is then appended back as the last feature column.
spectra_columns = [
    col for col in df.columns
    if col not in ('pV', 'name', 'class_asteroid_sf')
]
X = df[[*spectra_columns, 'pV']]
names = df['name']
y = df['class_asteroid_sf']

# 2. Stratified 80/20 train/test split. The names are split alongside the
#    features and labels so that test rows can be reported by name later.
X_train, X_test, y_train, y_test, name_train, name_test = train_test_split(
    X,
    y,
    names,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 3. Standardize features: the scaler statistics come from the training set
#    only and are then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Tune hyperparameters for SVM (exhaustive grid search, 5-fold CV, scored by accuracy).
# Hyperparameter grid: kernel (linear or rbf), regularization parameter 'C' (6 to 24),
# and for the RBF kernel, gamma as 'scale' or 'auto'.
param_grid = [
    {'kernel': ['linear'], 'C': list(range(6, 25))},
    {'kernel': ['rbf'], 'C': list(range(6, 25)), 'gamma': ['scale', 'auto']}
]
svc = SVC(random_state=42)
# Use all CPUs by setting n_jobs=-1 in GridSearchCV
grid_search = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

best_params = grid_search.best_params_
best_cv_score = grid_search.best_score_
# Retrieve the CV standard deviation of the winning candidate. best_index_ is
# the row of cv_results_ that produced best_params_/best_score_, so no
# floating-point equality search over mean_test_score is needed.
idx = grid_search.best_index_
cv_std = grid_search.cv_results_['std_test_score'][idx]
print("Best parameters from GridSearchCV:", best_params)
print("Best cross-validation score: {:.4f} ± {:.4f}".format(best_cv_score, cv_std))

# 5. Train final SVM model with the best parameters.
best_svm = SVC(random_state=42, **best_params)
best_svm.fit(X_train_scaled, y_train)

# 6. Evaluate the model on the test set.
y_pred = best_svm.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)
test_balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
# zero_division=0 gives the same score as the default but does not warn when
# a class is never predicted.
test_f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
test_mcc = matthews_corrcoef(y_test, y_pred)

# Compute 95% confidence interval for accuracy using the binomial (normal)
# approximation. The approximation can step outside [0, 1] when the accuracy
# is close to 0 or 1, so the bounds are clamped to the valid range.
n_test = len(y_test)
se = math.sqrt(test_accuracy * (1 - test_accuracy) / n_test)
ci_lower = max(0.0, test_accuracy - 1.96 * se)
ci_upper = min(1.0, test_accuracy + 1.96 * se)

# Raw-count confusion matrix over every class present in the full label column.
conf_matrix = confusion_matrix(y_test, y_pred, labels=np.unique(y))
# zero_division=0 matches the Random Forest cells: classes that were never
# predicted are reported as 0 without a warning.
class_report = classification_report(y_test, y_pred, zero_division=0)
print("Test Accuracy: {:.4f} (95% CI: {:.4f} - {:.4f})".format(test_accuracy, ci_lower, ci_upper))
print("Balanced Accuracy: {:.4f}".format(test_balanced_accuracy))
print("F1 Score (weighted): {:.4f}".format(test_f1))
print("MCC: {:.4f}".format(test_mcc))
print("Classification Report:\n", class_report)

# 7. Learning curves (with error bands): 5-fold CV accuracy at ten evenly
#    spaced fractions (10% to 100%) of the training data, using all CPUs.
train_sizes, train_scores, val_scores = learning_curve(
    best_svm,
    X_train_scaled,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
# Mean and spread across the folds for each training-set size.
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
val_scores_mean = val_scores.mean(axis=1)
val_scores_std = val_scores.std(axis=1)

# 8. ANOVA F-statistic per feature, used as a proxy for feature importance
#    (it is computed from the data, not taken from the fitted model).
f_values, p_values = f_classif(X_train_scaled, y_train)
feature_importances_df = pd.DataFrame(
    {'feature': X.columns, 'f_value': f_values, 'p_value': p_values}
)
feature_importances_df = feature_importances_df.sort_values(by='f_value', ascending=False)

# 9. Collect the misclassified test rows together with their asteroid names.
#    The predictions are re-indexed like y_test so all three columns line up.
misclassified_mask = y_test != y_pred
misclassified_df = pd.DataFrame({
    'name': name_test[misclassified_mask],
    'true_label': y_test[misclassified_mask],
    'predicted_label': pd.Series(y_pred, index=y_test.index)[misclassified_mask],
})

# 10. Bootstrap function to compute confusion matrix percentages.
def bootstrap_confusion_matrix(y_true, y_pred, iterations=1000, random_state=None):
    """Estimate a row-normalised confusion matrix (in %) by bootstrapping.

    The (true, predicted) pairs are resampled with replacement ``iterations``
    times. Each resample's confusion matrix is converted to row percentages
    (each true-label row sums to 100%) and the results are averaged.

    A true label that happens to be absent from a resample has no defined
    row percentages there, so that resample is left out of the average for
    that row. Averaging in an all-zero row instead would bias the rows of
    rare classes towards 0 and make them sum to less than 100%.

    Parameters
    ----------
    y_true : array-like
        True labels (a pandas Series or any 1-D array-like).
    y_pred : array-like
        Predicted labels, paired with ``y_true`` by position.
    iterations : int, default=1000
        Number of bootstrap resamples.
    random_state : None, int or numpy.random.Generator, default=None
        Passed to ``np.random.default_rng``. Give an int for a reproducible
        result; ``None`` draws fresh entropy on every call.

    Returns
    -------
    unique_labels : numpy.ndarray
        Sorted distinct labels of ``y_true``; the row/column order of the
        matrix. Predictions outside this set are not counted.
    cm_mean : numpy.ndarray of shape (n_labels, n_labels)
        Mean row-percentage confusion matrix.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` differ in length.
    """
    # Plain arrays: positional indexing works for Series and ndarrays alike,
    # and the conversion is done once instead of once per resample.
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    n = len(true_arr)
    if len(pred_arr) != n:
        raise ValueError(
            f"y_true and y_pred must have the same length ({n} != {len(pred_arr)})"
        )

    unique_labels = np.unique(true_arr)
    n_labels = len(unique_labels)
    rng = np.random.default_rng(random_state)

    # Running sums instead of a stacked (iterations, k, k) array.
    percent_sum = np.zeros((n_labels, n_labels))
    row_samples = np.zeros(n_labels)  # resamples in which each true label occurred
    for _ in range(iterations):
        indices = rng.integers(0, n, size=n)
        cm = confusion_matrix(true_arr[indices], pred_arr[indices],
                              labels=unique_labels)
        row_sums = cm.sum(axis=1)
        present = row_sums > 0
        # Normalise only the rows that exist in this resample.
        percent_sum[present] += cm[present] / row_sums[present, np.newaxis] * 100
        row_samples += present

    # A label never drawn in any resample keeps an all-zero row (0 / 1).
    cm_mean = percent_sum / np.maximum(row_samples, 1)[:, np.newaxis]
    return unique_labels, cm_mean


# Fixed seed so the confusion-matrix page is identical from run to run.
unique_labels, cm_mean = bootstrap_confusion_matrix(
    y_test, y_pred, iterations=1000, random_state=42
)

# 11. Perform 10-run cross validation on the training set to compute aggregated metrics.
#     Each run reshuffles the stratified 5-fold split (seeded with the run
#     number); the four metrics are averaged over the folds of a run, and the
#     run-level values are then summarised as mean and standard deviation.
from sklearn.base import clone  # cell-local import; only needed for the fold models

n_runs = 10
metrics_list = []
for run in range(n_runs):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=run)
    run_acc, run_bacc, run_f1, run_mcc = [], [], [], []
    for train_idx, val_idx in skf.split(X_train_scaled, y_train):
        X_cv_train, X_cv_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
        y_cv_train, y_cv_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        # Fit an unfitted copy with the same hyperparameters. Refitting
        # best_svm itself would overwrite the final model and leave it
        # trained on the last fold's subset once this loop finishes.
        fold_model = clone(best_svm)
        fold_model.fit(X_cv_train, y_cv_train)
        y_cv_pred = fold_model.predict(X_cv_val)
        run_acc.append(accuracy_score(y_cv_val, y_cv_pred))
        run_bacc.append(balanced_accuracy_score(y_cv_val, y_cv_pred))
        run_f1.append(f1_score(y_cv_val, y_cv_pred, average='weighted'))
        run_mcc.append(matthews_corrcoef(y_cv_val, y_cv_pred))
    metrics_list.append({
        'Accuracy': np.mean(run_acc),
        'Balanced Accuracy': np.mean(run_bacc),
        'F1': np.mean(run_f1),
        'MCC': np.mean(run_mcc)
    })

# One row per run -> table with one row per metric and 'mean'/'std' columns.
df_cv = pd.DataFrame(metrics_list)
agg_metrics = df_cv.agg(['mean', 'std']).T.round(4)

# 12. Create a multi-page PDF report.
# Every page is drawn as its own figure, appended with pdf.savefig() and then
# closed immediately so figures do not accumulate in memory. The PdfPages
# context manager finalises the file when the block exits.
pdf_filename = "03-SVM-A1.pdf"
with PdfPages(pdf_filename) as pdf:
    
    # Page 1: Summary of results and best parameters.
    # Axes are hidden so the page shows only the text block. The f-string body
    # is deliberately left unindented: any indentation inside the triple quotes
    # would become part of the rendered text.
    fig1 = plt.figure(figsize=(16, 12))
    plt.axis('off')
    summary_text = f"""Model Evaluation Summary

Test Accuracy: {test_accuracy:.4f} (95% CI: {ci_lower:.4f} - {ci_upper:.4f})
Balanced Accuracy: {test_balanced_accuracy:.4f}
F1 Score (weighted): {test_f1:.4f}
MCC: {test_mcc:.4f}

Best Cross-Validation Score: {best_cv_score:.4f} ± {cv_std:.4f}

Best Parameters:
{best_params}

Classification Report:
{class_report}
"""
    plt.text(0.05, 0.95, summary_text, verticalalignment='top', wrap=True)
    pdf.savefig(fig1)
    plt.close(fig1)
    
    # Page 2: Confusion Matrix in percentages.
    # cm_mean is the bootstrap-averaged, row-normalised matrix computed above.
    # The annotation array holds each cell pre-formatted as "xx.x%", which is
    # why fmt="" is passed to the heatmap (no further number formatting).
    fig2, ax2 = plt.subplots(figsize=(20, 16))
    annot = np.empty_like(cm_mean).astype(str)
    for i in range(cm_mean.shape[0]):
        for j in range(cm_mean.shape[1]):
            annot[i, j] = f"{cm_mean[i, j]:.1f}%"
    # vmin/vmax pin the colour scale to 0-100 with a colour-bar tick every 10%.
    sns.heatmap(cm_mean, annot=annot, fmt="", cmap="Blues", ax=ax2,
                xticklabels=unique_labels, yticklabels=unique_labels,
                annot_kws={"size": 17}, vmin=0, vmax=100,
                cbar_kws={'ticks': np.linspace(0, 100, 11), 'format': '%.0f%%'})
    ax2.set_title("Confusion Matrix (in %)", fontsize=28)
    ax2.set_xlabel("Predicted Label", fontsize=24)
    ax2.set_ylabel("True Label", fontsize=24)
    pdf.savefig(fig2)
    plt.close(fig2)
    
    # Page 3: Learning Curves with error bands.
    # Each shaded band spans mean +/- one standard deviation across the CV folds.
    fig3 = plt.figure(figsize=(16, 12))
    plt.plot(train_sizes, train_scores_mean, 'o-', label="Training Score")
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2)
    plt.plot(train_sizes, val_scores_mean, 'o-', label="Validation Score")
    plt.fill_between(train_sizes, val_scores_mean - val_scores_std,
                     val_scores_mean + val_scores_std, alpha=0.2)
    plt.title("Learning Curves", fontsize=28)
    plt.xlabel("Number of Training Examples", fontsize=24)
    plt.ylabel("Accuracy", fontsize=24)
    plt.legend(loc="best", fontsize=20)
    pdf.savefig(fig3)
    plt.close(fig3)
    
    # Page 4: Feature Importances (F-statistic proxy).
    # Bars follow the row order of feature_importances_df, which was sorted by
    # descending F-value. Tick labels are rotated 90 degrees and the layout is
    # tightened to make room for them.
    fig4 = plt.figure(figsize=(16, 12))
    plt.bar(feature_importances_df['feature'], feature_importances_df['f_value'])
    plt.xticks(rotation=90)
    plt.title("Feature Importances (F-statistic)", fontsize=28)
    plt.tight_layout()
    pdf.savefig(fig4)
    plt.close(fig4)
    
    # Page 5: Table of Misclassified Examples.
    # One table row per misclassified test example (name, true, predicted).
    # NOTE(review): if misclassified_df is empty, cellText has no rows --
    # confirm that ax.table() copes with that case.
    fig5, ax5 = plt.subplots(figsize=(16, 12))
    ax5.axis('tight')
    ax5.axis('off')
    table = ax5.table(cellText=misclassified_df.values,
                      colLabels=misclassified_df.columns,
                      loc='center')
    # Fix the font size explicitly instead of letting matplotlib auto-fit it.
    table.auto_set_font_size(False)
    table.set_fontsize(14)
    table.scale(1.2, 1.2)
    ax5.set_title("Misclassified Examples", fontsize=28)
    pdf.savefig(fig5)
    plt.close(fig5)
    
    # Page 6: Aggregated Cross Validation Metrics.
    # agg_metrics has one row per metric and the columns 'mean' and 'std'.
    # bbox is given as [xmin, ymin, width, height].
    fig6, ax6 = plt.subplots(figsize=(18, 10))
    ax6.axis('tight')
    ax6.axis('off')
    table = ax6.table(cellText=agg_metrics.values,
                      rowLabels=agg_metrics.index,
                      colLabels=agg_metrics.columns,
                      loc='center',
                      bbox=[0, 0, 0.8, 0.3])  # Adjust these values as needed
    table.auto_set_font_size(False)
    table.set_fontsize(14)
    table.scale(1.2, 1.2)
    ax6.set_title("Cross Validation Metrics (10 Runs)\nMean and Standard Deviation", fontsize=28)
    pdf.savefig(fig6)
    plt.close(fig6)
    
    # Page 7: Total Processing Time.
    # Measured up to this point, so it includes drawing pages 1-6 but not the
    # final save of this page. start_time is assumed to have been recorded
    # earlier in this cell -- TODO confirm.
    elapsed_time = time.time() - start_time  # Compute the total elapsed time
      
    fig7 = plt.figure(figsize=(16, 12))
    plt.axis('off')
    text = (f"Total Processing Time: {elapsed_time:.2f} seconds\n\n")
    plt.text(0.5, 0.5, text, horizontalalignment='center', verticalalignment='center', fontsize=28)
    pdf.savefig(fig7)
    plt.close(fig7)

print(f"PDF report has been saved as '{pdf_filename}'.")

Best parameters from GridSearchCV: {'C': 19, 'gamma': 'auto', 'kernel': 'rbf'}
Best cross-validation score: 0.8929 ± 0.0043
Test Accuracy: 0.8810 (95% CI: 0.8548 - 0.9071)
Balanced Accuracy: 0.8007
F1 Score (weighted): 0.8794
MCC: 0.8507
Classification Report:
               precision    recall  f1-score   support

           A       1.00      0.91      0.95        11
           B       1.00      0.67      0.80        12
           C       0.82      0.95      0.88        58
          Ch       0.83      0.76      0.79        33
           D       0.83      0.91      0.87        22
           E       0.87      1.00      0.93        13
           K       0.77      0.77      0.77        13
           L       0.62      0.71      0.67        14
           M       0.84      0.69      0.76        55
           P       0.81      0.81      0.81        36
           Q       0.74      0.87      0.80        30
           R       1.00      0.33      0.50         3
           S       0.94      0.95  

  f = msb / msw


PDF report has been saved as '03-SVM-A1.pdf'.


## Without Albedo

In [6]:
import time  # Added to measure processing time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
import math

from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, balanced_accuracy_score, f1_score, 
                             matthews_corrcoef, confusion_matrix, classification_report)
from sklearn.feature_selection import f_classif

# ----------------------------
# Global matplotlib font settings shared by every figure in the report.
plt.rcParams.update({'font.family': 'DejaVu Serif', 'font.size': 20})
# ----------------------------

# Reference point for the total processing time reported on the last PDF page.
start_time = time.time()

# 1. Read the merged database and discard the two columns that are not used.
df = pd.read_csv('05-BaseNew1.csv').drop(columns=['counts', 'class_bdm'])

# Features are the spectral columns only: the albedo ('pV'), the asteroid
# 'name' and the target 'class_asteroid_sf' are left out of X.
non_feature_columns = ('pV', 'name', 'class_asteroid_sf')
spectra_columns = [col for col in df.columns if col not in non_feature_columns]
X, y, names = df[spectra_columns], df['class_asteroid_sf'], df['name']

# 2. Stratified 80/20 train/test split; the names are split alongside so that
#    misclassified test examples can be reported by name later.
X_train, X_test, y_train, y_test, name_train, name_test = train_test_split(
    X,
    y,
    names,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 3. Standardise the features. The scaler learns its statistics from the
#    training split only and is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Tune hyperparameters for SVM.
#    Both kernels share the same C candidates: the integers 6..24 plus 20
#    log-spaced values from 1e-4 to 1e4. The RBF kernel additionally tries the
#    two built-in gamma heuristics.
c_candidates = list(range(6, 25)) + list(np.logspace(-4, 4, 20))
param_grid = [
    {'kernel': ['linear'], 'C': c_candidates},
    {'kernel': ['rbf'], 'C': c_candidates, 'gamma': ['scale', 'auto']}
]

svc = SVC(random_state=42)
# Use all CPUs by setting n_jobs=-1 in GridSearchCV
grid_search = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

best_params = grid_search.best_params_
best_cv_score = grid_search.best_score_
# Retrieve the CV standard deviation for the best parameter setting.
# best_index_ is the row of cv_results_ that produced best_params_/best_score_,
# so no floating-point equality search over mean_test_score is needed.
idx = grid_search.best_index_
cv_std = grid_search.cv_results_['std_test_score'][idx]
print("Best parameters from GridSearchCV:", best_params)
print("Best cross-validation score: {:.4f} ± {:.4f}".format(best_cv_score, cv_std))

# 5. Train final SVM model with the best parameters.
#    A separate estimator is built on purpose, so later refits of best_svm
#    cannot touch grid_search.best_estimator_.
best_svm = SVC(random_state=42, **best_params)
best_svm.fit(X_train_scaled, y_train)

# 6. Score the fitted model on the held-out test split.
y_pred = best_svm.predict(X_test_scaled)

# Headline metrics; the F1 score is averaged over classes weighted by support.
test_accuracy, test_balanced_accuracy = (
    accuracy_score(y_test, y_pred),
    balanced_accuracy_score(y_test, y_pred),
)
test_f1 = f1_score(y_test, y_pred, average='weighted')
test_mcc = matthews_corrcoef(y_test, y_pred)

# 95% interval for the accuracy from the normal approximation to the binomial:
# accuracy +/- 1.96 * sqrt(accuracy * (1 - accuracy) / n_test).
n_test = len(y_test)
se = math.sqrt(test_accuracy * (1 - test_accuracy) / n_test)
ci_lower, ci_upper = test_accuracy - 1.96 * se, test_accuracy + 1.96 * se

# Per-class text report, and the raw confusion matrix laid out over every
# label present in the full target column `y`.
class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred, labels=np.unique(y))

# Echo the metrics to the cell output, one line each.
print("\n".join([
    "Test Accuracy: {:.4f} (95% CI: {:.4f} - {:.4f})".format(test_accuracy, ci_lower, ci_upper),
    "Balanced Accuracy: {:.4f}".format(test_balanced_accuracy),
    "F1 Score (weighted): {:.4f}".format(test_f1),
    "MCC: {:.4f}".format(test_mcc),
]))
print("Classification Report:\n", class_report)

# 7. Learning curves: 5-fold CV accuracy at ten training-set fractions
#    (10% .. 100%), evaluated in parallel on all CPUs (n_jobs=-1).
train_sizes, train_scores, val_scores = learning_curve(
    best_svm,
    X_train_scaled,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
# Collapse the fold axis (axis=1) into a mean and a spread per training size;
# these feed the error bands drawn in the report.
train_scores_mean, train_scores_std = train_scores.mean(axis=1), train_scores.std(axis=1)
val_scores_mean, val_scores_std = val_scores.mean(axis=1), val_scores.std(axis=1)

# 8. ANOVA F-statistic per feature as a stand-in for feature importance,
#    ranked from the largest F-value down.
f_values, p_values = f_classif(X_train_scaled, y_train)
feature_importances_df = pd.DataFrame(
    {'feature': X.columns, 'f_value': f_values, 'p_value': p_values}
)
feature_importances_df = feature_importances_df.sort_values(by='f_value', ascending=False)

# 9. Collect the test examples the model got wrong, with their asteroid names.
#    The boolean mask is built once and shares y_test's index, so it can filter
#    the names, the true labels and the re-indexed predictions alike.
is_misclassified = y_test != y_pred
y_pred_indexed = pd.Series(y_pred, index=y_test.index)
misclassified_df = pd.DataFrame({
    'name': name_test[is_misclassified],
    'true_label': y_test[is_misclassified],
    'predicted_label': y_pred_indexed[is_misclassified],
})

# 10. Bootstrap function to compute confusion matrix percentages.
def bootstrap_confusion_matrix(y_true, y_pred, iterations=1000, random_state=None):
    """Estimate a row-normalised confusion matrix (in %) by bootstrapping.

    The (true, predicted) pairs are resampled with replacement ``iterations``
    times. Each resample's confusion matrix is converted to row percentages
    (each true-label row sums to 100%) and the results are averaged.

    A true label that happens to be absent from a resample has no defined
    row percentages there, so that resample is left out of the average for
    that row. Averaging in an all-zero row instead would bias the rows of
    rare classes towards 0 and make them sum to less than 100%.

    Parameters
    ----------
    y_true : array-like
        True labels (a pandas Series or any 1-D array-like).
    y_pred : array-like
        Predicted labels, paired with ``y_true`` by position.
    iterations : int, default=1000
        Number of bootstrap resamples.
    random_state : None, int or numpy.random.Generator, default=None
        Passed to ``np.random.default_rng``. Give an int for a reproducible
        result; ``None`` draws fresh entropy on every call.

    Returns
    -------
    unique_labels : numpy.ndarray
        Sorted distinct labels of ``y_true``; the row/column order of the
        matrix. Predictions outside this set are not counted.
    cm_mean : numpy.ndarray of shape (n_labels, n_labels)
        Mean row-percentage confusion matrix.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` differ in length.
    """
    # Plain arrays: positional indexing works for Series and ndarrays alike,
    # and the conversion is done once instead of once per resample.
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    n = len(true_arr)
    if len(pred_arr) != n:
        raise ValueError(
            f"y_true and y_pred must have the same length ({n} != {len(pred_arr)})"
        )

    unique_labels = np.unique(true_arr)
    n_labels = len(unique_labels)
    rng = np.random.default_rng(random_state)

    # Running sums instead of a stacked (iterations, k, k) array.
    percent_sum = np.zeros((n_labels, n_labels))
    row_samples = np.zeros(n_labels)  # resamples in which each true label occurred
    for _ in range(iterations):
        indices = rng.integers(0, n, size=n)
        cm = confusion_matrix(true_arr[indices], pred_arr[indices],
                              labels=unique_labels)
        row_sums = cm.sum(axis=1)
        present = row_sums > 0
        # Normalise only the rows that exist in this resample.
        percent_sum[present] += cm[present] / row_sums[present, np.newaxis] * 100
        row_samples += present

    # A label never drawn in any resample keeps an all-zero row (0 / 1).
    cm_mean = percent_sum / np.maximum(row_samples, 1)[:, np.newaxis]
    return unique_labels, cm_mean


# Fixed seed so the confusion-matrix page is identical from run to run.
unique_labels, cm_mean = bootstrap_confusion_matrix(
    y_test, y_pred, iterations=1000, random_state=42
)

# 11. Perform 10-run cross validation on the training set to compute aggregated metrics.
#     Each run reshuffles the stratified 5-fold split (seeded with the run
#     number); the four metrics are averaged over the folds of a run, and the
#     run-level values are then summarised as mean and standard deviation.
from sklearn.base import clone  # cell-local import; only needed for the fold models

n_runs = 10
metrics_list = []
for run in range(n_runs):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=run)
    run_acc, run_bacc, run_f1, run_mcc = [], [], [], []
    for train_idx, val_idx in skf.split(X_train_scaled, y_train):
        X_cv_train, X_cv_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
        y_cv_train, y_cv_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        # Fit an unfitted copy with the same hyperparameters. Refitting
        # best_svm itself would overwrite the final model and leave it
        # trained on the last fold's subset once this loop finishes.
        fold_model = clone(best_svm)
        fold_model.fit(X_cv_train, y_cv_train)
        y_cv_pred = fold_model.predict(X_cv_val)
        run_acc.append(accuracy_score(y_cv_val, y_cv_pred))
        run_bacc.append(balanced_accuracy_score(y_cv_val, y_cv_pred))
        run_f1.append(f1_score(y_cv_val, y_cv_pred, average='weighted'))
        run_mcc.append(matthews_corrcoef(y_cv_val, y_cv_pred))
    metrics_list.append({
        'Accuracy': np.mean(run_acc),
        'Balanced Accuracy': np.mean(run_bacc),
        'F1': np.mean(run_f1),
        'MCC': np.mean(run_mcc)
    })

# One row per run -> table with one row per metric and 'mean'/'std' columns.
df_cv = pd.DataFrame(metrics_list)
agg_metrics = df_cv.agg(['mean', 'std']).T.round(4)

# 12. Create a multi-page PDF report.
# Every page is drawn as its own figure, appended with pdf.savefig() and then
# closed immediately so figures do not accumulate in memory. The PdfPages
# context manager finalises the file when the block exits.
pdf_filename = "03-SVM1.pdf"
with PdfPages(pdf_filename) as pdf:
    
    # Page 1: Summary of results and best parameters.
    # Axes are hidden so the page shows only the text block. The f-string body
    # is deliberately left unindented: any indentation inside the triple quotes
    # would become part of the rendered text.
    fig1 = plt.figure(figsize=(16, 12))
    plt.axis('off')
    summary_text = f"""Model Evaluation Summary

Test Accuracy: {test_accuracy:.4f} (95% CI: {ci_lower:.4f} - {ci_upper:.4f})
Balanced Accuracy: {test_balanced_accuracy:.4f}
F1 Score (weighted): {test_f1:.4f}
MCC: {test_mcc:.4f}

Best Cross-Validation Score: {best_cv_score:.4f} ± {cv_std:.4f}

Best Parameters:
{best_params}

Classification Report:
{class_report}
"""
    plt.text(0.05, 0.95, summary_text, verticalalignment='top', wrap=True)
    pdf.savefig(fig1)
    plt.close(fig1)
    
    # Page 2: Confusion Matrix in percentages.
    # cm_mean is the bootstrap-averaged, row-normalised matrix computed above.
    # The annotation array holds each cell pre-formatted as "xx.x%", which is
    # why fmt="" is passed to the heatmap (no further number formatting).
    fig2, ax2 = plt.subplots(figsize=(20, 16))
    annot = np.empty_like(cm_mean).astype(str)
    for i in range(cm_mean.shape[0]):
        for j in range(cm_mean.shape[1]):
            annot[i, j] = f"{cm_mean[i, j]:.1f}%"
    # vmin/vmax pin the colour scale to 0-100 with a colour-bar tick every 10%.
    sns.heatmap(cm_mean, annot=annot, fmt="", cmap="Blues", ax=ax2,
                xticklabels=unique_labels, yticklabels=unique_labels,
                annot_kws={"size": 17}, vmin=0, vmax=100,
                cbar_kws={'ticks': np.linspace(0, 100, 11), 'format': '%.0f%%'})
    ax2.set_title("Confusion Matrix (in %)", fontsize=28)
    ax2.set_xlabel("Predicted Label", fontsize=24)
    ax2.set_ylabel("True Label", fontsize=24)
    pdf.savefig(fig2)
    plt.close(fig2)
    
    # Page 3: Learning Curves with error bands.
    # Each shaded band spans mean +/- one standard deviation across the CV folds.
    fig3 = plt.figure(figsize=(16, 12))
    plt.plot(train_sizes, train_scores_mean, 'o-', label="Training Score")
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2)
    plt.plot(train_sizes, val_scores_mean, 'o-', label="Validation Score")
    plt.fill_between(train_sizes, val_scores_mean - val_scores_std,
                     val_scores_mean + val_scores_std, alpha=0.2)
    plt.title("Learning Curves", fontsize=28)
    plt.xlabel("Number of Training Examples", fontsize=24)
    plt.ylabel("Accuracy", fontsize=24)
    plt.legend(loc="best", fontsize=20)
    pdf.savefig(fig3)
    plt.close(fig3)
    
    # Page 4: Feature Importances (F-statistic proxy).
    # Bars follow the row order of feature_importances_df, which was sorted by
    # descending F-value. Tick labels are rotated 90 degrees and the layout is
    # tightened to make room for them.
    fig4 = plt.figure(figsize=(16, 12))
    plt.bar(feature_importances_df['feature'], feature_importances_df['f_value'])
    plt.xticks(rotation=90)
    plt.title("Feature Importances (F-statistic)", fontsize=28)
    plt.tight_layout()
    pdf.savefig(fig4)
    plt.close(fig4)
    
    # Page 5: Table of Misclassified Examples.
    # One table row per misclassified test example (name, true, predicted).
    # NOTE(review): if misclassified_df is empty, cellText has no rows --
    # confirm that ax.table() copes with that case.
    fig5, ax5 = plt.subplots(figsize=(16, 12))
    ax5.axis('tight')
    ax5.axis('off')
    table = ax5.table(cellText=misclassified_df.values,
                      colLabels=misclassified_df.columns,
                      loc='center')
    # Fix the font size explicitly instead of letting matplotlib auto-fit it.
    table.auto_set_font_size(False)
    table.set_fontsize(14)
    table.scale(1.2, 1.2)
    ax5.set_title("Misclassified Examples", fontsize=28)
    pdf.savefig(fig5)
    plt.close(fig5)
    
    # Page 6: Aggregated Cross Validation Metrics.
    # agg_metrics has one row per metric and the columns 'mean' and 'std'.
    # bbox is given as [xmin, ymin, width, height].
    fig6, ax6 = plt.subplots(figsize=(18, 10))
    ax6.axis('tight')
    ax6.axis('off')
    table = ax6.table(cellText=agg_metrics.values,
                      rowLabels=agg_metrics.index,
                      colLabels=agg_metrics.columns,
                      loc='center',
                      bbox=[0, 0, 0.8, 0.3])  # Adjust these values as needed
    table.auto_set_font_size(False)
    table.set_fontsize(14)
    table.scale(1.2, 1.2)
    ax6.set_title("Cross Validation Metrics (10 Runs)\nMean and Standard Deviation", fontsize=28)
    pdf.savefig(fig6)
    plt.close(fig6)
    
    # Page 7: Total Processing Time.
    # Measured from start_time (recorded at the top of this cell) up to this
    # point, so it includes drawing pages 1-6 but not the final save of this page.
    elapsed_time = time.time() - start_time  # Compute the total elapsed time
      
    fig7 = plt.figure(figsize=(16, 12))
    plt.axis('off')
    text = (f"Total Processing Time: {elapsed_time:.2f} seconds\n\n")
    plt.text(0.5, 0.5, text, horizontalalignment='center', verticalalignment='center', fontsize=28)
    pdf.savefig(fig7)
    plt.close(fig7)

print(f"PDF report has been saved as '{pdf_filename}'.")

Best parameters from GridSearchCV: {'C': 29.763514416313132, 'gamma': 'auto', 'kernel': 'rbf'}
Best cross-validation score: 0.8674 ± 0.0109
Test Accuracy: 0.8537 (95% CI: 0.8252 - 0.8823)
Balanced Accuracy: 0.7648
F1 Score (weighted): 0.8517
MCC: 0.8162
Classification Report:
               precision    recall  f1-score   support

           A       1.00      0.91      0.95        11
           B       1.00      0.67      0.80        12
           C       0.78      0.88      0.83        58
          Ch       0.78      0.76      0.77        33
           D       0.86      0.86      0.86        22
           E       0.76      1.00      0.87        13
           K       0.88      0.54      0.67        13
           L       0.50      0.71      0.59        14
           M       0.74      0.64      0.69        55
           P       0.80      0.67      0.73        36
           Q       0.72      0.87      0.79        30
           R       1.00      0.33      0.50         3
           S       

  f = msb / msw


PDF report has been saved as '03-SVM1.pdf'.
