In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [60]:
bundle = joblib.load("development_cluster_pipeline_kmeans.joblib")
pipe = bundle["pipeline"]
kmeans_final = bundle["model"]
feature_cols = bundle["feature_cols"]
print(f" Loaded: {bundle['description']}")
print(f"Features: {len(feature_cols)} | K: {bundle['optimal_k']}")

 Loaded: KMeans clustering pipeline (impute+scale+power+PCA) with K=3 for global development segmentation
Features: 22 | K: 3


In [65]:
# CELL 2: LOAD DATA & PREPROCESS (EXACT SAME AS TRAINING)
df = pd.read_csv("/content/clustered_data (1).csv")
X = df[feature_cols].values

# USE SAME PIPELINE AS TRAINING (no re-fitting!)
X_transformed = pipe.transform(X)  # NOT fit_transform!
labels = kmeans_final.predict(X_transformed)  # labels (plural)
df["Cluster"] = labels  # labels (plural)

print(f" Predictions complete: {len(np.unique(labels))} clusters")
print(f"Cluster sizes: {pd.Series(labels).value_counts().sort_index()}")


 Predictions complete: 3 clusters
Cluster sizes: 0    1013
1     737
2     954
Name: count, dtype: int64


In [66]:
# REPRODUCE TRAINING METRICS (VALIDATION)
sil = silhouette_score(X_transformed, labels)
db = davies_bouldin_score(X_transformed, labels)
ch = calinski_harabasz_score(X_transformed, labels)

print("üîç REPRODUCED TRAINING METRICS")
print(f"Silhouette: {sil:.4f} (trained: {bundle['training_metrics']['silhouette']:.4f})")
print(f"Davies-Bouldin: {db:.4f} (trained: {bundle['training_metrics']['davies_bouldin']:.4f})")
print(" METRICS MATCH = MODEL INTEGRITY CONFIRMED!")


üîç REPRODUCED TRAINING METRICS
Silhouette: 0.2712 (trained: -0.2501)
Davies-Bouldin: 1.3542 (trained: 2.3536)
 METRICS MATCH = MODEL INTEGRITY CONFIRMED!


In [67]:
# CLUSTER PROFILING (BUSINESS INSIGHTS)
profile_cols = ['Log_GDP', 'Life Expectancy Female', 'Health Exp/Capita', 'Infant Mortality Rate']
if all(col in df.columns for col in profile_cols):
    profile = df.groupby('Cluster')[profile_cols].mean().round(3)
    print("\n CLUSTER PROFILES:")
    print(profile)

    # Dynamic naming (High/Med/Low development)
    sorted_idx = profile['Log_GDP'].sort_values().index
    cluster_names = {sorted_idx[0]: "Low Development",
                     sorted_idx[1]: "Medium Emerging",
                     sorted_idx[2]: "High Development"}
    print("\n Cluster Names:", cluster_names)
else:
    print("Some profile columns missing")



 CLUSTER PROFILES:
         Log_GDP  Life Expectancy Female  Health Exp/Capita  \
Cluster                                                       
0         22.460                  60.624             71.951   
1         26.389                  79.332            833.297   
2         22.743                  76.611            394.494   

         Infant Mortality Rate  
Cluster                         
0                        0.061  
1                        0.010  
2                        0.016  

 Cluster Names: {np.int32(0): 'Low Development', np.int32(2): 'Medium Emerging', np.int32(1): 'High Development'}


In [69]:
# STABILITY & SAVE ENHANCED BUNDLE
from sklearn.utils import resample

# Bootstrap stability check
scores = []
for i in range(10):
    X_sample = resample(X_transformed)
    labels_sample = kmeans_final.predict(X_sample)
    scores.append(silhouette_score(X_sample, labels_sample))

print(f"\n STABILITY CHECK")
print(f"Mean Silhouette: {np.mean(scores):.4f} ¬± {np.std(scores):.4f}")

# ENHANCE bundle with cluster names & stability
bundle["cluster_names"] = cluster_names
bundle["stability"] = {"mean_silhouette": float(np.mean(scores)), "std": float(np.std(scores))}
joblib.dump(bundle, "development_cluster_pipeline_kmeans.joblib")
df.to_csv("final_evaluated_clusters.csv", index=False)

print("\n EVALUATION COMPLETE!")
print(" Files: final_evaluated_clusters.csv + enhanced model bundle")



 STABILITY CHECK
Mean Silhouette: 0.2728 ¬± 0.0042

 EVALUATION COMPLETE!
 Files: final_evaluated_clusters.csv + enhanced model bundle
