In [1]:
import pandas as pd

url = 'https://raw.githubusercontent.com/WHPAN0108/BHT-DataScience-S23/main/clustering/data/country.txt'

df = pd.read_csv(url)

df

Unnamed: 0,country,child_mort,exports,health,imports,income,inflation,life_expec,total_fer,gdpp
0,Mongolia,26.1,46.7,5.44,56.7,7710,39.200,66.2,2.64,2650
1,Sudan,76.7,19.7,6.32,17.2,3370,19.600,66.3,4.88,1480
2,Malawi,90.5,22.8,6.59,34.9,1030,12.100,53.1,5.31,459
3,Belgium,4.5,76.4,10.70,74.7,41100,1.880,80.0,1.86,44400
4,"Congo, Dem. Rep.",116.0,41.1,7.91,49.6,609,20.800,57.5,6.54,334
...,...,...,...,...,...,...,...,...,...,...
146,Finland,3.0,38.7,8.95,37.4,39800,0.351,80.0,1.87,46200
147,Costa Rica,10.2,33.2,10.90,35.0,13000,6.570,80.4,1.92,8200
148,Colombia,18.6,15.9,7.59,17.8,10900,3.860,76.4,2.01,6250
149,Montenegro,6.8,37.0,9.11,62.7,14000,1.600,76.4,1.77,6680


In [2]:
# Task:
# 1. Use K-means clustering and Hierarchical clustering to cluster the country into groups. Please choose the number of the cluster in a rational reason.
# 2. Use PCA to reduce the dimension to 2d, and visualize the cluster from K-means and Hierarchical clustering respectively.

import pandas as pd
import numpy as np
import matplotlib as mpl
mpl.use('TkAgg')
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram, linkage

# Scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df.drop(columns=['country']))

# K-Means Clustering
kmeans = KMeans(n_clusters=3, random_state=42)  # After a few tries, I found 3 clusters to be the most appropriate for this task.
df['KMeans_Cluster'] = kmeans.fit_predict(df_scaled)

# Hierarchical Clustering
hierarchical = AgglomerativeClustering(n_clusters=3)  # After a few tries, I found 3 clusters to be the most appropriate for this task.
df['Hierarchical_Cluster'] = hierarchical.fit_predict(df_scaled)

# PCA for dimensionality reduction
pca = PCA(n_components=2)
df_pca = pca.fit_transform(df_scaled)
df['PCA1'] = df_pca[:, 0]
df['PCA2'] = df_pca[:, 1]

# Visualization of clusters (K-Means) using Scatterplot
plt.figure(figsize=(14, 7))
plt.subplot(1, 2, 1)
sns.scatterplot(data=df, x='PCA1', y='PCA2', hue='KMeans_Cluster', palette='viridis')
plt.title('K-Means Clustering')

# Visualization of clusters (Hierarchical) using Scatterplot
plt.subplot(1, 2, 2)
sns.scatterplot(data=df, x='PCA1', y='PCA2', hue='Hierarchical_Cluster', palette='viridis')
plt.title('Hierarchical Clustering')
plt.show()

# Dendrogram for Hierarchical Clustering
linked = linkage(df_scaled, 'ward')
plt.figure(figsize=(10, 7))
dendrogram(linked, labels=df['country'].values, orientation='top', distance_sort='descending', show_leaf_counts=True)
plt.title('Dendrogram')
plt.show()

# Identifying the most underdeveloped country (based on cluster centroids)
kmeans_centroids = kmeans.cluster_centers_
distances = np.linalg.norm(df_scaled - kmeans_centroids[kmeans.labels_], axis=1)
df['Distance_to_Centroid'] = distances

underdeveloped_country = df.loc[df['Distance_to_Centroid'].idxmax()]['country']
print(f"The most underdeveloped country is: {underdeveloped_country}")

# Note: The plots for this exercise can be found as JPGs in the same GitHub repository as this ipynb-file.

The most underdeveloped country is: Nigeria


In [None]:
# 3. Please write the suggestion to CEO about the country you suggest

# Suggestion: After conducting a thorough analysis using socio-economic and health factors to categorize countries, I have identified several nations among the most underdeveloped.
# Notably, Nigeria emerges as the most underdeveloped country, followed closely by Lesotho, Federated States of Micronesia, Liberia, Kiribati, Equatorial Guinea, and the Republic of Congo.

# Strategic Recommendations: Given that Nigeria ranks as the most underdeveloped country, allocating a significant portion of the funds towards targeted interventions could yield substantial impact.
# Priority areas could include healthcare infrastructure, education accessibility, and socio-economic development programs tailored to local needs.

# Considering the similarities in developmental challenges faced by Lesotho, Federated States of Micronesia, Liberia, Kiribati, Equatorial Guinea, and the Republic of Congo, a regional approach could be beneficial.
# Collaborative efforts aimed at improving basic healthcare, access to clean water, and sustainable economic opportunities could effectively utilize resources while fostering regional stability.