In [None]:
import hdbscan
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the merged dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer="final_merged_df.csv")
df.head(5)

In [None]:
# Per-column count of missing entries; displayed as the cell output.
missing_values = df.isna().sum()
missing_values

## **HDBSCAN**

### **Identify regions with sparse development where extending the grid is inefficient but wind microgrids could be feasible.**

* Since HDBSCAN is a density-based algorithm, it groups areas according to how densely they are populated in feature space, alongside the other characteristics supplied as features.
* These clusters help indicate areas where extending the main grid might be inefficient, making them candidates for decentralized energy solutions, like wind or solar microgrids.
* This algorithm is valuable for identifying potential zones for alternative energy solutions based on current density and infrastructure access patterns.

In [None]:
# One-hot encode the categorical 'Income_Distribution' column into indicator
# columns prefixed with 'Income_'.
#
# The guard keeps this cell idempotent: pd.get_dummies drops the source column,
# so re-running the cell unguarded would raise a KeyError on the second run.
if 'Income_Distribution' in df.columns:
    df = pd.get_dummies(df, columns=['Income_Distribution'], prefix='Income')
elif not any(str(col).startswith('Income_') for col in df.columns):
    # Neither the raw column nor its encoded form exists: fail loudly, as the
    # unguarded call would have done, instead of continuing without the feature.
    raise KeyError(
        "'Income_Distribution' column not found and no 'Income_' columns present"
    )

In [None]:
# Sizes and positional bounds of the subsets carved out of `df`.
n = 250000  # number of rows to draw at random
m = 2500    # number of rows taken from the end
s = 5012    # start position (inclusive) of the middle slice
e = 178934  # end position (exclusive) of the middle slice
j = 2500    # number of rows taken from the start

df_1 = df.head(j)
df_2 = df.tail(m)
df_3 = df.iloc[s:e]

# Sampling without replacement cannot return more rows than the frame holds,
# so cap the request at len(df) instead of raising a ValueError on smaller
# inputs. A fixed seed makes the random subset identical across re-runs.
df_4 = df.sample(n=min(n, len(df)), random_state=42)

In [None]:
# Write the subsets to CSV. The head/tail exports are currently disabled:
# df_1.to_csv("../first_2500_rows.csv")
# df_2.to_csv("../last_2500_rows.csv")
for subset, csv_path in ((df_3, "middle_rows.csv"), (df_4, "random_rows.csv")):
    subset.to_csv(csv_path)

In [None]:
# Dump the whole working DataFrame, in its current state, to the parent directory.
df.to_csv(path_or_buf="../test.csv")

In [None]:
# Assemble the clustering feature matrix: the fixed numeric columns plus every
# column whose name starts with "Income_" (the one-hot encoded income columns).
base_features = ['Pop_Density_2020', 'Wind_Speed', 'Latitude', 'Longitude', 'Grid_Value']
income_columns = list(filter(lambda name: name.startswith('Income_'), df.columns))
clustering_data = df[base_features + income_columns]

In [None]:
# Fit a standard scaler on the selected features, then apply it, so that all
# features end up on a comparable scale before dimensionality reduction.
scaler = StandardScaler()
scaler.fit(clustering_data)
clustering_data_scaled = scaler.transform(clustering_data)

In [None]:
# Project the standardized features onto their first two principal components
# so that clustering runs on a compact 2-D representation.
#
# random_state is pinned because PCA's default solver selection may choose a
# randomized SVD, which would otherwise give slightly different components
# (and therefore different clusters) on every run.
pca = PCA(n_components=2, random_state=42)
clustering_data_reduced = pca.fit_transform(clustering_data_scaled)

In [None]:
# Store each PCA component as its own column on the working DataFrame.
for component_idx, column_name in enumerate(('PCA_Component_1', 'PCA_Component_2')):
    df[column_name] = clustering_data_reduced[:, component_idx]

In [None]:
# Fit HDBSCAN on the 2-D PCA projection using Manhattan distance.
hdbscan_clusterer = hdbscan.HDBSCAN(
    min_cluster_size=50,
    min_samples=10,
    metric='manhattan',
)
hdbscan_clusterer.fit(clustering_data_reduced)

# One label per row; -1 marks points that HDBSCAN treats as noise.
clusters = hdbscan_clusterer.labels_
# NOTE(review): `probabilities_` is documented by hdbscan as a per-point cluster
# membership strength rather than a per-cluster stability; the variable name is
# kept because later cells use it - confirm the intended meaning.
stability_scores = hdbscan_clusterer.probabilities_

In [None]:
# Attach the per-row cluster label and HDBSCAN score to the working DataFrame.
for column_name, values in (('Cluster', clusters), ('Stability_Score', stability_scores)):
    df[column_name] = values

In [None]:
# Apply HDBSCAN with optimized parameters
# hdbscan_clusterer = hdbscan.HDBSCAN(min_samples=10, min_cluster_size=50)  # Adjust min_samples and min_cluster_size as needed
# clusters = hdbscan_clusterer.fit_predict(clustering_data_reduced)

In [None]:
# Keep only the points assigned to a real cluster (label -1 is HDBSCAN noise)
# so that the evaluation metrics are computed on clustered points alone.
is_clustered = clusters != -1
clustered_data = clustering_data_reduced[is_clustered]
valid_clusters = clusters[is_clustered]

In [None]:
# Access cluster stability scores from HDBSCAN
# stability_scores = hdbscan_clusterer.probabilities_

In [None]:
# Both indices need at least two distinct clusters, so bail out with a message
# when the non-noise points fall into a single cluster (or none at all).
n_distinct_clusters = np.unique(valid_clusters).size
if n_distinct_clusters <= 1:
    print("Insufficient clusters for evaluation metrics")
else:
    db_index = davies_bouldin_score(clustered_data, valid_clusters)
    ch_index = calinski_harabasz_score(clustered_data, valid_clusters)
    print("Davies-Bouldin Index:", db_index)
    print("Calinski-Harabasz Index:", ch_index)

# Show the raw per-point labels and scores (noise points included).
print("Cluster Labels:", clusters)
print("Cluster Stability Scores:", stability_scores)

In [None]:
# Scatter the points in PCA space, coloured by cluster label.
fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(
    clustering_data_reduced[:, 0],
    clustering_data_reduced[:, 1],
    c=clusters,
    cmap='viridis',
    s=5,
)
fig.colorbar(points, ax=ax, label='Cluster')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.set_title('HDBSCAN Clustering Results on Full Dataset')
plt.show()

In [None]:
# Persist the fitted scaler, PCA projection and HDBSCAN model to the working
# directory, in the order in which they are applied. `joblib` is imported in
# the notebook's top import cell along with the other dependencies.
for artifact, filename in (
    (scaler, 'scaler.pkl'),
    (pca, 'pca.pkl'),
    (hdbscan_clusterer, 'hdbscan_model.pkl'),
):
    joblib.dump(artifact, filename)