In [None]:
# Import the dependencies
import os
import sys
from dotenv import load_dotenv
from pymongo import MongoClient
import pymongoarrow as pma
from pymongoarrow.api import write
import pandas as pd
import numpy as np
import time
import numba
import random
from numba.typed import List
from functools import reduce
from itertools import combinations

from joblib import Parallel, delayed
from sklearn_extra.cluster import KMedoids
from sklearn.cluster import HDBSCAN
from sklearn.decomposition import PCA
from sklearn.metrics import davies_bouldin_score, silhouette_score, calinski_harabasz_score
from sklearn.preprocessing import MaxAbsScaler

import hvplot.pandas
import matplotlib.cm as cm
from matplotlib import pyplot as plt
import plotly.express as px
from ydata_profiling import ProfileReport

# Suppress YData profile report generation warnings - no actual problems to resolve.
from warnings import simplefilter 
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

### Load data from MongoDB

In [None]:
# Read MONGODB_URI from the local .env file into the environment, then fetch it
# (a missing variable raises KeyError right here)
load_dotenv()
MONGODB_URI = os.environ['MONGODB_URI']

# Open the client and select the project database
client = MongoClient(MONGODB_URI)
db = client['ExpectLifeRedux']

# Handles to the three source collections used by this notebook
data1, data2, data3 = (
    db[collection_name]
    for collection_name in ('ELR_Input_Data', 'Encoded_Gov_Data', 'Encoded_SSS_Data')
)

### Create DataFrames, Adjust columns, and set Index

In [None]:
# Materialise every document of each collection into its own DataFrame:
#   combined_df <- data1, gc_df <- data2, sc_df <- data3
combined_df, gc_df, sc_df = (
    pd.DataFrame(list(collection.find()))
    for collection in (data1, data2, data3)
)

In [None]:
# Keep an untouched copy of the raw frame before any columns are removed
ori_df = combined_df.copy()

# Numeric frame: drop the database id and descriptive columns, key the rows by
# Country_Year, drop the categorical type columns, then order by the key
combined_df = (
    combined_df
    .drop(columns=['_id', 'Country', 'Year'])
    .reset_index(drop=True)
    .set_index('Country_Year')
    .drop(columns=['Gov Type', 'SSS Type'])
    .sort_index()
)

# Encoded government data: drop the database id, key by Country_Year, order by the key
gc_df = (
    gc_df
    .drop(columns=['_id'])
    .reset_index(drop=True)
    .set_index('Country_Year')
    .sort_index()
)

# Encoded SSS data: drop the database id and renumber the rows.
# Country_Year stays a regular column here (it is not set as the index).
sc_df = (
    sc_df
    .drop(columns=['_id'])
    .reset_index(drop=True)
    .sort_index()
)

### Scale the numeric data before combining with binary encoded data

In [None]:
# Remember the column labels; the scaler returns a bare NumPy array
numeric_col_names = combined_df.columns.tolist()

# Scale the numeric data with MaxAbsScaler()
scaler = MaxAbsScaler()
scaled_nda = scaler.fit_transform(combined_df)  # nda = NumPy data array

# Rebuild a labelled DataFrame (same row index, same column labels), ordered by index
scaled_df = pd.DataFrame(
    scaled_nda,
    index=combined_df.index,
    columns=numeric_col_names,
).sort_index()

In [None]:
# Display the scaled numeric frame for a visual check
scaled_df

### Assemble the complete dataset by merging the frames

In [None]:
# Left-join the encoded government and SSS frames onto the scaled numeric data,
# matching rows on Country_Year
frames = [scaled_df, gc_df, sc_df]
merge_frames_df = reduce(
    lambda merged, addition: merged.merge(addition, how='left', on='Country_Year'),
    frames,
)

# Work on a copy: renumber the rows, key them by Country_Year, order by the key
complete_df = (
    merge_frames_df
    .copy()
    .reset_index(drop=True)
    .set_index('Country_Year')
    .sort_index()
)

In [None]:
# Display the merged (scaled + encoded) frame for a visual check
complete_df

In [None]:
# Keep the column labels of the complete frame so they can be reapplied after PFA
complete_col_names = list(complete_df.columns)

### Assemble the complete visualization dataset by merging the unscaled numeric and binary encoded frames

In [None]:
# Visualization data: same left joins on Country_Year as the complete frame,
# but starting from the unscaled numeric data
frames = [combined_df, gc_df, sc_df]
merge_df = reduce(
    lambda merged, addition: merged.merge(addition, how='left', on='Country_Year'),
    frames,
)

In [None]:
# Work on a copy: renumber the rows, key them by Country_Year, order by the key
viz_df = (
    merge_df
    .copy()
    .reset_index(drop=True)
    .set_index('Country_Year')
    .sort_index()
)

In [None]:
# Display the unscaled visualization frame for a visual check
viz_df

In [None]:
# Helper that builds a profile report for a DataFrame and saves it to a file
def generate_report(df, config_file, output_file):
    """Create a ProfileReport for ``df`` and write it to ``output_file``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to profile.
    config_file : str
        Path handed to ``ProfileReport`` as its ``config_file`` argument.
    output_file : str
        Destination path passed to ``ProfileReport.to_file``.

    Returns
    -------
    None
        A confirmation line is printed once the file has been written.
    """
    ProfileReport(df, config_file=config_file).to_file(output_file)
    print(f"Report {output_file} generated.")

### Determine number of components for PCA ###

In [None]:
# Plot how much variance is retained as PCA components are added
def compute_pca(input_data):
    """Fit a full PCA on ``input_data`` and plot cumulative explained variance.

    A red horizontal line is drawn at 0.95 as a visual reference. The function
    only shows the plot; it returns nothing.

    Parameters
    ----------
    input_data : array-like
        Data passed unchanged to ``PCA(svd_solver='full').fit``.
    """
    fitted_pca = PCA(svd_solver='full').fit(input_data)
    running_total = fitted_pca.explained_variance_ratio_.cumsum()

    plt.plot(running_total)
    plt.axhline(y=0.95, color='r', linestyle='-')
    plt.xlabel('Number of components')
    plt.ylabel('Cumulative explained variance')
    plt.show()

In [None]:
# Determine the number of appropriate components for PCA (scaled (combined) data)
# The call only draws the cumulative-variance plot; nothing is returned.
compute_pca(complete_df)

In [None]:
# Function to perform PCA for the provided data
def perform_pca(input_data, name, n_comp):
    """Project ``input_data`` onto ``n_comp`` principal components.

    Parameters
    ----------
    input_data : pandas.DataFrame or array-like
        Data to transform, one row per observation.
    name : str
        Prefix for the output columns, which are named ``<name>_pc1`` ...
        ``<name>_pc<n_comp>``.
    n_comp : int
        Number of principal components to keep.

    Returns
    -------
    pandas.DataFrame
        One row per input row. When ``input_data`` is a DataFrame the result
        reuses its index, so every component row keeps the label of the row it
        was computed from. For other inputs the rows are labelled from the
        notebook-level ``ori_df['Country_Year']`` by position.
    """
    pca = PCA(n_components=n_comp, random_state=42)
    pca_out = pca.fit_transform(input_data)

    # Column labels for the principal components
    columnz = [f'{name}_pc{i}' for i in range(1, n_comp + 1)]

    if isinstance(input_data, pd.DataFrame):
        # Take the labels from the data that was actually transformed. Reading
        # them from ori_df instead pairs rows by position with the raw database
        # order, which need not match a sorted/merged input frame.
        return pd.DataFrame(data=pca_out, columns=columnz, index=input_data.index)

    # Fallback for unlabelled input: positional labels from the raw frame
    out_df = pd.DataFrame(data=pca_out, columns=columnz)
    out_df['Country_Year'] = ori_df['Country_Year']
    out_df = out_df.reset_index(drop=True)
    return out_df.set_index('Country_Year')

#### Use the PCA data to guide component number selection

In [None]:
# Perform PCA on the complete_df.
# NOTE(review): 45 components is hard-coded; presumably read off the
# cumulative-variance plot produced by compute_pca - confirm.
complete_pca_df = perform_pca(complete_df, 'Complete', 45)

### Determine the number of (KMedoids) clusters for this complete dataset

In [None]:
# Fit KMedoids for one cluster count and collect the Davies-Bouldin score, inertia
# (for an elbow curve) and average silhouette score used to choose the number of clusters.
def compute_kmedoids_cluster_metrics(data_in, n_clusters):
    """Cluster ``data_in`` into ``n_clusters`` groups and score the result.

    Intended to be dispatched in parallel, one call per candidate cluster count.

    Parameters
    ----------
    data_in : array-like
        Observations to cluster.
    n_clusters : int
        Number of medoids to fit.

    Returns
    -------
    tuple
        ``(n_clusters, davies_bouldin, inertia, silhouette_avg)``.
    """
    # Fixed random_state so repeated runs give the same medoids
    model = KMedoids(n_clusters=n_clusters, init='k-medoids++', random_state=42)
    labels = model.fit_predict(data_in)

    return (
        n_clusters,
        davies_bouldin_score(data_in, labels),
        model.inertia_,
        silhouette_score(data_in, labels),
    )
    
def compute_clusters_parallel(data_in, max_clusters, n_jobs=-1):
    """Score KMedoids clusterings for every cluster count from 2 to ``max_clusters``.

    Parameters
    ----------
    data_in : array-like
        Observations to cluster.
    max_clusters : int
        Largest cluster count to evaluate (inclusive).
    n_jobs : int, optional
        Passed to ``joblib.Parallel``; defaults to -1.

    Returns
    -------
    pandas.DataFrame
        One row per cluster count with the columns ``Num_Clusters``,
        ``Davies_Bouldin``, ``Inertia`` and ``Silhouette_Avg``.
    """
    # One delayed task per candidate cluster count
    tasks = (
        delayed(compute_kmedoids_cluster_metrics)(data_in, cluster_count)
        for cluster_count in range(2, max_clusters + 1)
    )
    metric_rows = Parallel(n_jobs=n_jobs)(tasks)

    return pd.DataFrame(
        metric_rows,
        columns=['Num_Clusters', 'Davies_Bouldin', 'Inertia', 'Silhouette_Avg'],
    )

### Compute KMedoids clusters and metrics for the complete dataset

In [None]:
# Start timing
start = time.perf_counter()

# Create a DataFrame of the results for further analysis downstream - Compute clusters for the complete dataframe
# Evaluates every cluster count from 2 to 60 on the non-PCA data, using all available workers (n_jobs=-1)
non_pca_kmedoid_cluster_scores_df = compute_clusters_parallel(data_in=complete_df, max_clusters=60, n_jobs=-1)

# Stop timing
stop = time.perf_counter()

# Report the elapsed wall-clock time
print(f"KMedoids Clustering Execution in {stop - start:0.4f} seconds")

### Compute KMedoids clusters and metrics for the PCA dataset

In [None]:
# Start timing
pca_kmedoid_cluster_start = time.perf_counter()

# Create a DataFrame of the results for further analysis downstream - Compute clusters for the complete dataframe with PCA
# Evaluates every cluster count from 2 to 60 on the PCA-transformed data, using all available workers (n_jobs=-1)
pca_kmedoid_cluster_scores_df= compute_clusters_parallel(data_in=complete_pca_df, max_clusters=60, n_jobs=-1)

# Stop timing
pca_kmedoid_cluster_stop = time.perf_counter()

# Report the elapsed wall-clock time
print(f"PCA KMedoids Clustering Execution in {pca_kmedoid_cluster_stop - pca_kmedoid_cluster_start:0.4f} seconds")

In [None]:
# Davies-Bouldin score per cluster count for the PCA data
# Lower scores are better
pca_kmedoid_cluster_scores_df.hvplot.scatter(x='Num_Clusters', y='Davies_Bouldin')

In [None]:
# Average silhouette score per cluster count for the PCA data
# Higher scores are better
pca_kmedoid_cluster_scores_df.hvplot.scatter(x='Num_Clusters', y='Silhouette_Avg')

In [None]:
# define the model
# NOTE(review): n_clusters=55 is hard-coded; presumably chosen from the
# Davies-Bouldin / silhouette plots - confirm.
cluster_model = KMedoids(n_clusters=55, init='k-medoids++', random_state=42)

# Fit the model and predict labels
# (the returned labels are discarded here; they are read back from cluster_model.labels_ below)
cluster_model.fit_predict(complete_pca_df)

# Add the predicted class columns to the visualization dataset
# Labels are attached by row position, so this assumes viz_df and complete_pca_df
# hold the same rows in the same order - TODO confirm.
complete_pca_viz_df = viz_df.copy()
complete_pca_viz_df['KMedoids Clusters'] = cluster_model.labels_

## Principal Feature Analysis ##

#### Define functions to select dataset features that provide relevant information for clustering. 
##### Only important features are used to compute clusters from the complete (non-pca) dataset.

In [None]:
# =============================================================================
# Custom processing function to override limitations of Numba compatibility with Numpy features
@numba.jit(nopython=True)
def custom_mean(arr, axis=0):
    """Arithmetic mean of a 1-D array, or of a 2-D array along ``axis``.

    Written with ``sum`` and an explicit division so it compiles in Numba's
    nopython mode.

    Parameters
    ----------
    arr : numpy.ndarray
        1-D or 2-D numeric array.
    axis : int, optional
        For 2-D input: 0 averages over rows (one value per column), 1 averages
        over columns (one value per row). Ignored for 1-D input.

    Returns
    -------
    float or numpy.ndarray
        Scalar mean for 1-D input, 1-D array of means for 2-D input.

    Raises
    ------
    ValueError
        If ``arr`` is not 1-D/2-D, or ``axis`` is not 0 or 1 for 2-D input.
    """
    if arr.ndim == 1:
        return arr.sum() / arr.shape[0]
    elif arr.ndim == 2:
        if axis == 0:
            return arr.sum(axis=0) / arr.shape[0]
        elif axis == 1:
            return arr.sum(axis=1) / arr.shape[1]
    # The message is a constant on purpose: the previous text ended in an
    # unfilled "{}" placeholder that was never formatted.
    raise ValueError("custom_mean supports only 1-D arrays, or 2-D arrays with axis 0 or 1")

In [None]:
# =============================================================================
# Function: Calinski-Harabasz Score Calculation
def calculate_calinski_harbasz(np_array, labels):
    """Calinski-Harabasz score of a labelling, or 0 when it has a single label.

    Parameters
    ----------
    np_array : array-like
        Observations that were clustered.
    labels : array-like
        Cluster label of each observation.

    Returns
    -------
    float or int
        ``calinski_harabasz_score(np_array, labels)`` when at least two distinct
        labels are present, otherwise 0.
    """
    # Guard: with fewer than two distinct labels the scorer is not called
    if len(np.unique(labels)) <= 1:
        return 0
    return calinski_harabasz_score(np_array, labels)

In [None]:
# =============================================================================
# Function: Davies-Bouldin Score Calculation
def calculate_davies_bouldin(np_array, labels):
    """Davies-Bouldin score of a labelling, or 0 when it has a single label.

    Parameters
    ----------
    np_array : array-like
        Observations that were clustered.
    labels : array-like
        Cluster label of each observation.

    Returns
    -------
    float or int
        ``davies_bouldin_score(np_array, labels)`` when at least two distinct
        labels are present, otherwise 0.
    """
    # Guard: with fewer than two distinct labels the scorer is not called
    if len(np.unique(labels)) <= 1:
        return 0
    return davies_bouldin_score(np_array, labels)

In [None]:
# =============================================================================
# Function: Silhouette Coefficient Calculation
def calculate_silhouette(np_array, labels):
    """Mean silhouette coefficient of a labelling, or 0 when it has a single label.

    Parameters
    ----------
    np_array : array-like
        Observations that were clustered.
    labels : array-like
        Cluster label of each observation.

    Returns
    -------
    float or int
        ``silhouette_score(np_array, labels)`` when at least two distinct labels
        are present, otherwise 0.
    """
    # Guard: with fewer than two distinct labels the scorer is not called
    if len(np.unique(labels)) <= 1:
        return 0
    return silhouette_score(np_array, labels)

In [None]:
# =============================================================================
# Function: Scatter Separability Calculation
def calculate_scatter_separability(np_array, labels):
    """Scatter separability criterion ``trace(S_w^-1 . S_b)`` of a labelling.

    ``S_w`` accumulates the within-cluster scatter and ``S_b`` the size-weighted
    between-cluster scatter over every distinct label.

    Parameters
    ----------
    np_array : numpy.ndarray
        2-D array, one row per observation and one column per feature.
    labels : numpy.ndarray
        Cluster label of each row.

    Returns
    -------
    float or int
        The trace value, or 0 when ``S_w`` is too ill-conditioned to invert.
    """
    feature_count = np_array.shape[1]
    grand_mean = custom_mean(np_array, axis=0)

    within_scatter = np.zeros((feature_count, feature_count))
    between_scatter = np.zeros((feature_count, feature_count))

    for cluster_label in np.unique(labels):
        members = np_array[labels == cluster_label]
        cluster_mean = custom_mean(members, axis=0).reshape(feature_count, 1)

        # Within-cluster scatter: deviations of the members from their own mean
        centered = members - cluster_mean.T
        within_scatter += np.dot(centered.T, centered)

        # Between-cluster scatter: cluster mean vs. overall mean, weighted by cluster size
        mean_offset = cluster_mean - grand_mean.reshape(feature_count, 1)
        between_scatter += members.shape[0] * np.dot(mean_offset, mean_offset.T)

    # Only invert S_w when its condition number says it is numerically invertible
    if not (np.linalg.cond(within_scatter) < 1/sys.float_info.epsilon):
        return 0
    return np.trace(np.linalg.inv(within_scatter).dot(between_scatter))

In [None]:
# =============================================================================
# Function: Normalization of criterion values to remove bias due to number of clusters - Numba acceleration
@numba.jit(nopython=True)
def cross_projection_normalization(clustering_medoids, scatter_criteria_score, silhouette_criteria_score, davies_bouldin_score, calinski_harbasz_index):
    """Combine the clustering criteria into one score normalized by medoid spread.

    The score is ``(scatter + silhouette + calinski_harabasz) /
    (1 + mean_projection + davies_bouldin)``, where ``mean_projection`` is the
    mean of the non-zero pairwise Euclidean distances between medoids.

    Parameters
    ----------
    clustering_medoids : numpy.ndarray
        2-D array with one medoid per row.
    scatter_criteria_score, silhouette_criteria_score, calinski_harbasz_index : float
        Criteria where higher is better; they form the numerator.
    davies_bouldin_score : float
        Criterion where lower is better; it is added to the denominator.

    Returns
    -------
    float
        The normalized score.
    """
    n_clusters = len(clustering_medoids)
    projections = np.zeros((n_clusters, n_clusters))

    # Symmetric matrix of pairwise medoid distances (diagonal stays 0)
    for j in range(n_clusters):
        for k in range(j + 1, n_clusters):
            medoid_j = clustering_medoids[j]
            medoid_k = clustering_medoids[k]
            distance = np.linalg.norm(medoid_j - medoid_k)
            projections[j][k] = distance
            projections[k][j] = distance

    # Flatten the array and filter non-zero distances then calculate the mean
    flat_projections = projections.ravel()
    non_zero_projections = flat_projections[flat_projections > 0]

    # With fewer than two distinct medoids there is no distance to average; the
    # mean of an empty selection is undefined, so fall back to 0.0 and let the
    # "+ 1" below keep the denominator positive.
    mean_projection = 0.0
    if non_zero_projections.size > 0:
        mean_projection = np.mean(non_zero_projections)

    # Normalizing the criteria scores with the mean of projections
    # Adjusting the formula to consider Davies-Bouldin Score. Recall: For Davies-Bouldin, lower is better.
    # We add 1 to the normalization_factor to ensure it doesn't lead to division by zero or negative values.

    # Combined normalization factor incorporates all metrics.
    normalization_factor = (1 + mean_projection + davies_bouldin_score)

    normalized_score = (scatter_criteria_score + silhouette_criteria_score + calinski_harbasz_index) / normalization_factor

    return normalized_score

In [None]:
# =============================================================================
# Helper function for Sequential Forward Search
def evaluate_feature_subset(subset_array, np_array, cluster_labels, clustering_medoids):
    """Score one clustered feature subset with the combined, normalized criterion.

    Parameters
    ----------
    subset_array : numpy.ndarray
        The feature columns that were clustered.
    np_array : numpy.ndarray
        Full data array; accepted for call compatibility but not used here.
    cluster_labels : array-like
        Cluster label of each row of ``subset_array``.
    clustering_medoids : numpy.ndarray
        Medoids of the fitted clustering, one per row.

    Returns
    -------
    float
        Result of ``cross_projection_normalization`` applied to the scatter
        separability, silhouette, Davies-Bouldin and Calinski-Harabasz values.
    """
    return cross_projection_normalization(
        clustering_medoids,
        calculate_scatter_separability(subset_array, cluster_labels),
        calculate_silhouette(subset_array, cluster_labels),
        calculate_davies_bouldin(subset_array, cluster_labels),
        calculate_calinski_harbasz(subset_array, cluster_labels),
    )


In [None]:
# =============================================================================
# Main Function - Function that orchestrates sequential forward search for important features and
# evaluates different numbers of clusters to locate the optimal value
def optimal_feature_clusters(np_array, clustering_algorithm, random_state=None):
    """Search for informative feature columns and a matching cluster count.

    Phase 1 repeatedly draws a random "starter set" of columns, clusters the
    starter set plus one candidate column at a time, and promotes candidates
    whose normalized score beats the running best (reset to 10 for every starter
    set). It stops once roughly 80% of the columns have been drawn.

    Phase 2 walks through a fixed list of k values. For each k it clusters every
    remaining column together with the features selected so far and promotes at
    most one column, provided it improves the best combination score.

    Parameters
    ----------
    np_array : numpy.ndarray
        2-D array, one row per observation; columns are addressed by position.
    clustering_algorithm : str
        ``'kmedoids'`` or ``'hdbscan'``.
    random_state : int or None, optional
        Seed for the starter-set sampling. ``None`` (default) samples from the
        global NumPy generator, so repeated runs may differ.

    Returns
    -------
    tuple
        ``(best_k, interim_features)``: the k in effect when the best
        combination score was last improved (2 if it never was), and the set of
        selected column positions.

    Raises
    ------
    ValueError
        If ``clustering_algorithm`` is not one of the supported names.
    """
    n_features = np_array.shape[1]
    available_indices = set(range(n_features))  # column positions not yet selected
    interim_features = set()                    # column positions selected so far
    initial_k = [3, 4, 6, 7, 8, 9, 10, 11, 12]  # k values tried in phase 2

    # This seeds only the stdlib generator; it is kept so the function's effect on
    # the `random` module is unchanged. The starter sets come from NumPy (below).
    random.seed(42)
    sampler = np.random if random_state is None else np.random.RandomState(random_state)

    evaluate = True
    init_k = 2
    best_k = init_k
    best_score = 0
    processed_features = 0
    starter_set_size = max(1, int(0.1 * n_features))

    while evaluate:     # Simple test to enable and continue evaluation (a single pass).
        print(f' Start processing with k = {init_k} and {len(available_indices)} available_indices... ')

        if clustering_algorithm == 'kmedoids':
            clustering_instance = KMedoids(n_clusters=init_k, init='k-medoids++', metric='manhattan', random_state=42)
        elif clustering_algorithm == 'hdbscan':
            clustering_instance = HDBSCAN(min_cluster_size=10, min_samples=20, cluster_selection_method='eom', store_centers="medoid", allow_single_cluster=np.bool_(True), n_jobs=-1)
        else:
            raise ValueError("Unsupported clustering algorithm")

        # ---- Phase 1: starter sets plus one candidate column at a time ----
        while processed_features < 0.8 * n_features:
            if len(interim_features) == 0:
                print(' interim_features is currently empty - creating starter_set from scratch ')
                # Sample without replacement so the starter set has no duplicate columns
                starter_set = sampler.choice(n_features, starter_set_size, replace=False)
            else:
                print(' Interim_features exist - determining features available for use in starter_set ... ')
                remaining_indices = sorted(available_indices - interim_features)

                if not remaining_indices:
                    # An empty starter set would never advance processed_features,
                    # so this loop could not terminate.
                    print(' No features left to sample - ending the starter set search ... ')
                    break

                if len(remaining_indices) < starter_set_size:
                    print(' Almost at the end - using remaining features as the starter set... ')
                    starter_set = sampler.choice(remaining_indices, len(remaining_indices), replace=False)
                else:
                    print(' Creating a starter set of fresh features ... ')
                    starter_set = sampler.choice(remaining_indices, starter_set_size, replace=False)

            best_feature = None
            best_score = 10

            for feature in range(n_features):
                if feature not in starter_set and feature not in interim_features:  # Check for both conditions
                    combined_features = np.concatenate([starter_set, [feature]])
                    subset_array = np_array[:, combined_features]
                    current_labels = clustering_instance.fit_predict(subset_array)
                    if clustering_algorithm == 'kmedoids':
                        clustering_medoids = clustering_instance.cluster_centers_
                    else:
                        clustering_medoids = clustering_instance.medoids_
                    # Score the feature
                    normalized_score = evaluate_feature_subset(subset_array, np_array, current_labels, clustering_medoids)

                    # Update best feature if necessary
                    if normalized_score > best_score:
                        best_score = normalized_score
                        best_feature = feature

                # NOTE(review): this promotion runs once per candidate (inside the
                # for loop), so every candidate that raises the running best is
                # selected, not only the overall winner. Kept as-is because moving
                # it after the loop would change the selected features - confirm intent.
                if best_feature is not None and best_feature not in interim_features:
                    interim_features.add(best_feature)
                    print(f' Found a new interim feature - {len(interim_features)} found so far... ')
                    available_indices.remove(best_feature)

            processed_features += len(starter_set)  # Account for multiple features in starter set
            print(f' Processed {processed_features} % of Features')

        best_combination_score = best_score
        print(f'^^^^^^^^^^^^^^^^^^^^^^^^^^{len(interim_features)} Initial features identified - continuing with combination evaluation ^^^^^^^^^^^^^^^^^')

        # ---- Phase 2: try each k and promote at most one extra column per k ----
        for init_k in initial_k:
            if clustering_algorithm == 'kmedoids':
                print(f'****** k-value set to {init_k} ***************************')
                clustering_instance = KMedoids(n_clusters=init_k, init='k-medoids++', metric='manhattan', random_state=42)
            # For 'hdbscan' the instance created above is reused for every k.

            best_add = None
            # NumPy cannot index with a set, so use a sorted list of positions. The
            # selection is fixed while the candidates for this k are scored.
            selected_columns = sorted(interim_features)

            for i in sorted(available_indices):
                # Candidate column followed by the features selected so far
                combination_array = np.hstack([np_array[:, [i]], np_array[:, selected_columns]])
                current_labels = clustering_instance.fit_predict(combination_array)
                if clustering_algorithm == 'kmedoids':
                    clustering_medoids = clustering_instance.cluster_centers_
                else:
                    clustering_medoids = clustering_instance.medoids_

                # Score the combination
                normalized_score = evaluate_feature_subset(combination_array, np_array, current_labels, clustering_medoids)

                # Update best combination if necessary
                if normalized_score > best_combination_score:
                    best_combination_score = normalized_score
                    print(f'  ++++++++ Best Combination Score Updated = {best_combination_score} ++++++++')
                    best_add = i
                    best_k = init_k

            # If a better combination was found, add its feature to selected features
            if best_add is not None and best_add not in interim_features:
                interim_features.add(best_add)
                available_indices.remove(best_add)
                print(f' number of interim features = {len(interim_features)} and number of available_indices = {len(available_indices)}... ')
            else:
                print('xxxxx  No further progress >>>> Moving to next K >>>>>>>')

        evaluate = False
        print(f' Processing completed - Total number of identified features = {len(interim_features)}')

    return best_k, interim_features


### Perform PFA

#### KMedoids

In [None]:
# Start timing
start = time.perf_counter()

# Placeholder values; both names are overwritten by the call below
best_k = -99
best_kmedoid_features = []

# Hand the search a NumPy array built from a copy of the complete frame
complete1_df = complete_df.copy()
complete1_np = complete1_df.to_numpy()
# Run the experiment using the complete (non-pca) dataframe and identify the clustering algorithm by name.
best_k, best_kmedoid_features = optimal_feature_clusters(complete1_np, 'kmedoids')

# Stop timing
stop = time.perf_counter()

# Report elapsed time and the selected k / feature positions
print(f' ^^^ RUN #1 --- PFA KMedoids Clustering Execution in {stop - start:0.4f} seconds ^^^ ')
print(f' Best k = {best_k}')
print(f' best features = {best_kmedoid_features}')

In [None]:
# Start timing
start = time.perf_counter()

# Placeholder values; both names are overwritten by the call below
best_k2 = -99
best_kmedoid_features2 = []

# Second run on the same input data (fresh copy of the complete frame)
complete2_df = complete_df.copy()
complete2_np = complete2_df.to_numpy()
# Run the experiment using the complete (non-pca) dataframe and identify the clustering algorithm by name.
best_k2, best_kmedoid_features2 = optimal_feature_clusters(complete2_np, 'kmedoids')

# Stop timing
stop = time.perf_counter()

# Report elapsed time and the selected k / feature positions
print(f' ^^^ RUN #2 --- PFA KMedoids Clustering Execution in {stop - start:0.4f} seconds ^^^ ')
print(f' Best k = {best_k2}')
print(f' best features = {best_kmedoid_features2}')

In [None]:
# Start timing
start = time.perf_counter()

# Placeholder values; both names are overwritten by the call below
best_k3 = -99
best_kmedoid_features3 = []

# Third run on the same input data (fresh copy of the complete frame)
complete3_df = complete_df.copy()
complete3_np = complete3_df.to_numpy()
# Run the experiment using the complete (non-pca) dataframe and identify the clustering algorithm by name.
best_k3, best_kmedoid_features3 = optimal_feature_clusters(complete3_np, 'kmedoids')

# Stop timing
stop = time.perf_counter()

# Report elapsed time and the selected k / feature positions
print(f' ^^^ RUN #3 --- PFA KMedoids Clustering Execution in {stop - start:0.4f} seconds ^^^ ')
print(f' Best k = {best_k3}')
print(f' best features = {best_kmedoid_features3}')

### Perform clustering with the reduced feature set

In [None]:
# Use the selected features for the final KMedoids clustering
# best_kmedoid_features holds integer column positions. pandas needs an ordered
# list-like (not a set) for positional selection, so sort the positions once.
kmedoids_selected_positions = sorted(best_kmedoid_features)

kmedoids_final_df = complete_df.copy()
kmedoids_reduced_features_df = kmedoids_final_df.iloc[:, kmedoids_selected_positions]

# Perform clustering on the final set of features
kmedoids_final_model = KMedoids(n_clusters=best_k, init='k-medoids++', metric='manhattan', random_state=42)
kmedoids_final_labels = kmedoids_final_model.fit_predict(kmedoids_reduced_features_df)
kmedoids_final_cluster_centers = kmedoids_final_model.cluster_centers_

# Create the dataframes for visualization
# viz_df is assembled from frames with the same columns, in the same order, as
# complete_df, so the same positions pick the matching unscaled columns. Selecting
# by position avoids looking the integers up as column labels.
kmedoids_final_reduced_features_df = viz_df.iloc[:, kmedoids_selected_positions].copy()
kmedoids_final_reduced_features_df['KMedoids Clusters'] = kmedoids_final_labels

kmedoids_final_complete_features_df = viz_df.copy()
kmedoids_final_complete_features_df['KMedoids Clusters'] = kmedoids_final_labels

In [None]:
# Start timing
start = time.perf_counter()

# YData reports exploring the KMedoids feature relationships: one for the reduced
# feature set and one for the complete feature set, sharing the same config file
reports_info = [
    {
        'df': report_df,
        'config_file': 'config_ELR.yml',
        'output_file': f'KMedoids_Final_{scope}-Features_Report.html',
    }
    for scope, report_df in (
        ('Reduced', kmedoids_final_reduced_features_df),
        ('Complete', kmedoids_final_complete_features_df),
    )
]

# Generate the reports in parallel with joblib (n_jobs=-1 uses all available CPUs).
# The dictionary keys match generate_report's parameter names.
Parallel(n_jobs=-1)(
    delayed(generate_report)(**info) for info in reports_info
)

# Stop timing
stop = time.perf_counter()

print(f' ^^^ Final KMedoids Clustering Report building in {stop - start:0.4f} seconds ^^^ ')

#### HDBSCAN

In [None]:
# Start timing
start = time.perf_counter()

# Hand the search a NumPy array built from a copy of the complete frame
complete4_df = complete_df.copy()
complete4_np = complete4_df.to_numpy()

# Placeholder; overwritten by the call below
best_hdbscan_features = []
# Run the experiment using the complete (non-pca) dataframe
# (the first return value, best_k, is not used for HDBSCAN)
not_used, best_hdbscan_features = optimal_feature_clusters(complete4_np, 'hdbscan')

# Stop timing
stop = time.perf_counter()

# Report elapsed time and the selected feature positions
print(f' ^^^RUN #1 --- PFA HDBSCAN Clustering Execution in {stop - start:0.4f} seconds ^^^ ')
print(f' best features 1 = {best_hdbscan_features}')

In [None]:
# Start timing
start = time.perf_counter()

# Second run on the same input data (fresh copy of the complete frame)
complete5_df = complete_df.copy()
complete5_np = complete5_df.to_numpy()

# Placeholder; overwritten by the call below
best_hdbscan_features_2 = []
# Run the experiment using the complete (non-pca) dataframe
# (the first return value, best_k, is not used for HDBSCAN)
not_used, best_hdbscan_features_2 = optimal_feature_clusters(complete5_np, 'hdbscan')

# Stop timing
stop = time.perf_counter()

# Report elapsed time and the selected feature positions
print(f' ^^^RUN #2 --- PFA HDBSCAN Clustering Execution in {stop - start:0.4f} seconds ^^^ ')
print(f' best features 2 = {best_hdbscan_features_2}')

In [None]:
# Start timing
start = time.perf_counter()

# Third run on the same input data (fresh copy of the complete frame)
complete6_df = complete_df.copy()
complete6_np = complete6_df.to_numpy()

# Placeholder; overwritten by the call below
best_hdbscan_features_3 = []
# Run the experiment using the complete (non-pca) dataframe
# (the first return value, best_k, is not used for HDBSCAN)
not_used, best_hdbscan_features_3 = optimal_feature_clusters(complete6_np, 'hdbscan')

# Stop timing
stop = time.perf_counter()

# Report elapsed time and the selected feature positions
print(f' ^^^RUN #3 --- PFA HDBSCAN Clustering Execution in {stop - start:0.4f} seconds ^^^ ')
print(f' best features 3  = {best_hdbscan_features_3}')

### Perform clustering with the reduced feature set

In [None]:
# Use the selected features for the final HDBSCAN clustering
# best_hdbscan_features holds integer column positions. NumPy and pandas need an
# ordered list-like (not a set) for positional selection, so sort the positions once.
hdbscan_selected_positions = sorted(best_hdbscan_features)

# Build the array from complete_df directly; no `complete_np` variable is defined
# anywhere in this notebook.
hdbscan_reduced_features_np = complete_df.to_numpy()[:, hdbscan_selected_positions]
hdbscan_reduced_features_df = pd.DataFrame(hdbscan_reduced_features_np)

# Perform clustering on the final set of features
hdbscan_final_model = HDBSCAN(min_cluster_size=25, store_centers='medoid', n_jobs=-1)
hdbscan_final_labels = hdbscan_final_model.fit_predict(hdbscan_reduced_features_df)
hdbscan_final_cluster_centers = hdbscan_final_model.medoids_

# Create the dataframes for visualization
# viz_df is assembled from frames with the same columns, in the same order, as
# complete_df, so the same positions pick the matching unscaled columns. Selecting
# by position avoids looking the integers up as column labels.
hdbscan_final_reduced_features_df = viz_df.iloc[:, hdbscan_selected_positions].copy()
hdbscan_final_reduced_features_df['HDBSCAN Clusters'] = hdbscan_final_labels

hdbscan_final_complete_features_df = viz_df.copy()
hdbscan_final_complete_features_df['HDBSCAN Clusters'] = hdbscan_final_labels

### Generate reports to explore the clustering results (reduced feature set & complete feature set)

In [None]:
# Start timing
start = time.perf_counter()

# YData reports exploring the HDBSCAN feature relationships: one for the reduced
# feature set and one for the complete feature set, sharing the same config file
reports_info = [
    {
        'df': report_df,
        'config_file': 'config_ELR.yml',
        'output_file': f'HDBSCAN_Final_{scope}-Features_Report.html',
    }
    for scope, report_df in (
        ('Reduced', hdbscan_final_reduced_features_df),
        ('Complete', hdbscan_final_complete_features_df),
    )
]

# Generate the reports in parallel with joblib (n_jobs=-1 uses all available CPUs).
# The dictionary keys match generate_report's parameter names.
Parallel(n_jobs=-1)(
    delayed(generate_report)(**info) for info in reports_info
)

# Stop timing
stop = time.perf_counter()

print(f" ^^^ Final HDBSCAN Clustering Report building in {stop - start:0.4f} seconds ^^^ ")

### Write Results to Project Database ###

In [None]:
# load the config from the .env file
# (this repeats the connection setup from the start of the notebook and rebinds
# MONGODB_URI, client and db to a new client)
load_dotenv()
MONGODB_URI = os.environ['MONGODB_URI']

# Connect to the database engine
client = MongoClient(MONGODB_URI)

# connect to the project db
db = client['ExpectLifeRedux']

# get a reference to the data collection
# (disabled: no collection handle is created here; the write cell addresses the
# target collections through attributes of db)
#gov_data = db['Encoded_Gov_Data']

In [None]:
# preferred method - use PyMongoArrow - write the dataframes to the database
# Each call writes one DataFrame to the collection of the same name in the project db.
# NOTE(review): these frames carry Country_Year as their index rather than as a
# column - confirm that write() persists the index, otherwise the key is lost.
# NOTE(review): presumably every call inserts new documents, so re-running this
# cell may duplicate data in the target collections - confirm.
write(db.Cluster_Unscaled_Complete, viz_df)
write(db.Cluster_Scaled_Complete, complete_df)
write(db.Cluster_PCA_Complete, complete_pca_df)
write(db.Cluster_KMedoids_Reduced_Features, kmedoids_final_reduced_features_df)
write(db.Cluster_KMedoids_Complete_Features, kmedoids_final_complete_features_df)
write(db.Cluster_HDBSCAN_Reduced_Features, hdbscan_final_reduced_features_df)
write(db.Cluster_HDBSCAN_Complete_Features, hdbscan_final_complete_features_df)

In [None]:
#kmedoids_cluster_centers_df = pd.DataFrame(kmedoids_final_cluster_centers)
#write(db.Cluster_KMedoids_Centers, kmedoids_cluster_centers_df)

# Create the dataframe
#kmedoids_labels_df = pd.DataFrame(kmedoids_final_labels)
#write(db.Cluster_KMedoids_Labels, kmedoids_labels_df)

# Create the dataframe
#hdbscan_centers_df = pd.DataFrame(hdbscan_final_cluster_centers)
#write(db.Cluster_HDBSCAN_Centers, hdbscan_centers_df)

# Create the dataframe
#hdbscan_labels_df = pd.DataFrame(hdbscan_final_labels)
#write(db.Cluster_HDBSCAN_Labels, hdbscan_labels_df)





In [None]:
# One-column table of the column positions selected by the KMedoids feature search.
# The search returns a set, which pandas cannot store as a column because it has
# no order, so the positions are sorted into a list first.
kmedoids_best_features_df = pd.DataFrame({'Features': sorted(best_kmedoid_features)})
kmedoids_best_features_df

In [None]:
# One-column table of the column positions selected by the HDBSCAN feature search.
# The search returns a set, which pandas cannot store as a column because it has
# no order, so the positions are sorted into a list first.
hbdbscan_best_features_df = pd.DataFrame({'Features': sorted(best_hdbscan_features)})
hbdbscan_best_features_df