# K-Means++

1. Column - Name	Description
2. country	- Name of the country
3. child_mort -	Death of children under 5 years of age per 1000 live births
4. exports -	Exports of goods and services per capita. Given as %age of the GDP per capita
5. health -	Total health spending per capita. Given as %age of GDP per capita
6. imports - Imports of goods and services per capita. Given as %age of the GDP per capita
7. income -	Net income per person
8. inflation - The measurement of the annual growth rate of the Total GDP
9. life_expec - The average number of years a new born child would live if the current mortality patterns are to remain the same
10. total_fer - The number of children that would be born to each woman if the current age-fertility rates remain the same.
11. gdpp - The GDP per capita. Calculated as the Total GDP divided by the total population.

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns

Importing the basic libraries to get started



Importing Plotly Express, as it lets us visualize the data in a more advanced fashion and gain better insights.

In [None]:
###Load the country indicators from the CSV file and preview the first rows
country_df = pd.read_csv(
    filepath_or_buffer=r"./data/Country-data.csv",
)
country_df.head(n=5)

Unnamed: 0,country,child_mort,exports,health,imports,income,inflation,life_expec,total_fer,gdpp
0,Afghanistan,90.2,10.0,7.58,44.9,1610,9.44,56.2,5.82,553
1,Albania,16.6,28.0,6.55,48.6,9930,4.49,76.3,1.65,4090
2,Algeria,27.3,38.4,4.17,31.4,12900,16.1,76.5,2.89,4460
3,Angola,119.0,62.3,2.85,42.9,5900,22.4,60.1,6.16,3530
4,Antigua and Barbuda,10.3,45.5,6.03,58.9,19100,1.44,76.8,2.13,12200


In [None]:
###Dimensions of the dataset as a (rows, columns) tuple
country_df.shape

(167, 10)

In [None]:
###Column names, non-null counts and dtypes of the dataset
country_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167 entries, 0 to 166
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   country     167 non-null    object 
 1   child_mort  167 non-null    float64
 2   exports     167 non-null    float64
 3   health      167 non-null    float64
 4   imports     167 non-null    float64
 5   income      167 non-null    int64  
 6   inflation   167 non-null    float64
 7   life_expec  167 non-null    float64
 8   total_fer   167 non-null    float64
 9   gdpp        167 non-null    int64  
dtypes: float64(7), int64(2), object(1)
memory usage: 13.2+ KB


In [None]:
###Count the missing values in every column
country_df.isna().sum()

country       0
child_mort    0
exports       0
health        0
imports       0
income        0
inflation     0
life_expec    0
total_fer     0
gdpp          0
dtype: int64

In [None]:
###Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
country_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,child_mort,exports,health,imports,income,inflation,life_expec,total_fer,gdpp
count,167.0,167.0,167.0,167.0,167.0,167.0,167.0,167.0,167.0
mean,38.27006,41.108976,6.815689,46.890215,17144.688623,7.781832,70.555689,2.947964,12964.155689
std,40.328931,27.41201,2.746837,24.209589,19278.067698,10.570704,8.893172,1.513848,18328.704809
min,2.6,0.109,1.81,0.0659,609.0,-4.21,32.1,1.15,231.0
25%,8.25,23.8,4.92,30.2,3355.0,1.81,65.3,1.795,1330.0
50%,19.3,35.0,6.32,43.3,9960.0,5.39,73.1,2.41,4660.0
75%,62.1,51.35,8.6,58.75,22800.0,10.75,76.8,3.88,14050.0
max,208.0,200.0,17.9,174.0,125000.0,104.0,82.8,7.49,105000.0


As we can see, the dataset has negative inflation for some countries, which is an outlier, and there is also an extremely high child mortality rate for a few countries. Let's graph these to get better insight.

In [None]:
###Bar chart of the health column for every country
px.bar(
    country_df,
    x="country",
    y="health",
    labels={'country':'Country','health':'Healthcare'},
    template='simple_white',
)

In [None]:
###Bar chart of the inflation column for every country
px.bar(
    country_df,
    x="country",
    y="inflation",
    template='simple_white',
    labels={'country':'Country','inflation':'Inflation'},
)

In [None]:
###Bar chart of the income column for every country
px.bar(
    country_df,
    x="country",
    y="income",
    template='simple_white',
    labels={'income':'Income','country':'Country'},
)

In [None]:
###Bar chart of the gdpp column for every country
px.bar(
    country_df,
    x="country",
    y="gdpp",
    template='simple_white',
    labels={'country':'Country','gdpp':'GDP Per Capita'},
)


1.   We will use income, health spending, inflation and GDP per capita to create the clusters. Health spending per capita reflects country-level factors such as malnourishment, wasting, stunting and death rate.

2. As the graphs above show, there are clear outliers in the dataset, but we choose to keep them: the clustering is meant to decide which countries need aid, and removing outliers might result in neglecting a country that needs help.


In [None]:
###Keep only the country name and the features used for clustering
country_df2 = country_df[['country', 'income', 'health', 'inflation', 'gdpp']].copy()

In [None]:
###Spread of each numeric feature at selected quantiles
# numeric_only=True skips the string 'country' column; newer pandas versions
# no longer drop non-numeric columns implicitly and raise on them instead.
country_df2.quantile([.05, .1, .25, .5, .75, .95], axis=0, numeric_only=True)

Unnamed: 0,income,health,inflation,gdpp
0.05,1213.0,2.791,0.1834,465.9
0.1,1524.0,3.726,0.5878,593.8
0.25,3355.0,4.92,1.81,1330.0
0.5,9960.0,6.32,5.39,4660.0
0.75,22800.0,8.6,10.75,14050.0
0.95,48290.0,11.57,20.87,48610.0


In [None]:
###Preview the first rows of the reduced feature frame
country_df2.head()

Unnamed: 0,country,income,health,inflation,gdpp
0,Afghanistan,1610,7.58,9.44,553
1,Albania,9930,6.55,4.49,4090
2,Algeria,12900,4.17,16.1,4460
3,Angola,5900,2.85,22.4,3530
4,Antigua and Barbuda,19100,6.03,1.44,12200


In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
###Standardise the four clustering features with StandardScaler
scaler = StandardScaler()
country_df3 = pd.DataFrame(
    scaler.fit_transform(country_df2[['income','health','inflation','gdpp']]),
    columns=['income','health','inflation','gdpp'],
)
country_df3.head()

Unnamed: 0,income,health,inflation,gdpp
0,-0.808245,0.279088,0.157336,-0.67918
1,-0.375369,-0.097016,-0.312347,-0.485623
2,-0.220844,-0.966073,0.789274,-0.465376
3,-0.585043,-1.448071,1.387054,-0.516268
4,0.101732,-0.286894,-0.601749,-0.041817


In [None]:
from sklearn.cluster import KMeans

In [None]:
###Applying Elbow Method to find optimal K
# For each candidate K, fit K-Means on the scaled features and record the
# inertia (sum of squared distances of samples to their closest centroid).
# random_state is fixed so the SSD curve is reproducible between runs, and
# n_init is set explicitly so the number of restarts does not depend on the
# installed scikit-learn version.
K = range(1,10)
ssd = []
for k in K:
    Km = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42)
    Km = Km.fit(country_df3)
    ssd.append(Km.inertia_)


In [None]:
###Tabulate the SSD recorded for every candidate number of clusters
Elbow_data = pd.DataFrame(data={'Clusters': K, 'SSD': ssd})
Elbow_data.head(n=5)

Unnamed: 0,Clusters,SSD
0,1,668.0
1,2,409.990515
2,3,328.539196
3,4,258.51464
4,5,203.97986


In [None]:
###Plotting the Elbow curve
# SSD against the number of clusters; the bend ("elbow") suggests a suitable K.
fig = px.line(Elbow_data,x = 'Clusters',y = 'SSD', title='Elbow Curve',template='ggplot2')
fig.show()

In [None]:
###Validating the Elbow method using sillhoutte Score
from sklearn.metrics import silhouette_score

In [None]:
# Fit one K-Means model per candidate K and print its silhouette score.
# random_state is fixed so the printed scores are reproducible between runs.
for i in range(2,10):
    labels = KMeans(n_clusters=i, init="k-means++", n_init=10, random_state=42).fit(country_df3).labels_
    score = silhouette_score(country_df3, labels, metric="euclidean")
    print(f"Silhouette score for k(clusters) = {i} is {score}")


Silhouette score for k(clusters) = 2 is 0.5040063072971782
Silhouette score for k(clusters) = 3 is 0.5142205155442857
Silhouette score for k(clusters) = 4 is 0.29700280936223816
Silhouette score for k(clusters) = 5 is 0.3169074111181409
Silhouette score for k(clusters) = 6 is 0.32653247373089633
Silhouette score for k(clusters) = 7 is 0.3477327527676654
Silhouette score for k(clusters) = 8 is 0.3458818164012831
Silhouette score for k(clusters) = 9 is 0.3412321402415599


In [None]:
###Building the model on the number of K as 4
# random_state is fixed so the same countries receive the same cluster IDs on
# every run; the IDs are later translated to aid levels by a hard-coded mapping.
# NOTE(review): the numeric IDs produced with this seed may differ from the
# ones that mapping was written for - confirm the mapping against this model.
Km = KMeans(n_clusters=4, init='k-means++', n_init=10, random_state=42)
Km = Km.fit(country_df3)

NameError: ignored

In [None]:
###Getting the cluster labels for each country
# labels_ holds one cluster index per row of the data the model was fitted on.
Km.labels_

array([2, 0, 0, 0, 0, 2, 0, 1, 1, 0, 2, 0, 0, 2, 0, 1, 0, 0, 0, 0, 2, 2,
       2, 1, 2, 0, 2, 0, 0, 1, 0, 0, 0, 2, 0, 2, 0, 0, 0, 2, 0, 2, 1, 2,
       1, 0, 2, 0, 2, 0, 0, 2, 0, 1, 1, 0, 0, 2, 1, 0, 2, 0, 0, 0, 2, 0,
       0, 2, 1, 0, 0, 0, 2, 1, 1, 1, 0, 1, 2, 0, 0, 2, 1, 0, 0, 2, 2, 2,
       2, 0, 2, 1, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 2, 0, 0, 0, 2, 0,
       1, 1, 0, 3, 1, 0, 0, 2, 0, 0, 0, 2, 2, 1, 0, 0, 2, 0, 0, 0, 2, 0,
       2, 1, 2, 2, 2, 2, 2, 1, 0, 0, 0, 2, 1, 1, 0, 0, 0, 2, 2, 0, 0, 2,
       0, 2, 2, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0], dtype=int32)

In [None]:
###Appending the Cluster-Id for K-means as a seprate column
# The labels are assigned positionally, so row i of country_df2 gets label i.
country_df2['Cluster-ID'] = Km.labels_
country_df2.head()

Unnamed: 0,country,income,health,inflation,gdpp,Cluster-ID
0,Afghanistan,1610,7.58,9.44,553,2
1,Albania,9930,6.55,4.49,4090,0
2,Algeria,12900,4.17,16.1,4460,0
3,Angola,5900,2.85,22.4,3530,0
4,Antigua and Barbuda,19100,6.03,1.44,12200,0


In [None]:
###Plotting a 3D graph and vizualizing the clusters
# Cluster IDs are categories, not magnitudes: cast them to strings so Plotly
# draws one discrete Pastel colour per cluster (with a legend) instead of
# interpolating a continuous colour bar from a qualitative palette.
px.scatter_3d(
    country_df2.assign(**{'Cluster-ID': country_df2['Cluster-ID'].astype(str)}),
    x='inflation', y='income', z='gdpp',
    color='Cluster-ID', hover_name='country', opacity=0.7,
    labels={'gdpp':'GDP Per Capita','inflation':'Inflation','income':'Income','Cluster-ID':'K-Means(K-> 4)'},
    template='ggplot2',
    category_orders={'Cluster-ID': ['0', '1', '2', '3']},
    color_discrete_sequence=px.colors.qualitative.Pastel,
)

In [None]:
###Plotting a 3D graph and vizualizing the clusters
# Cluster IDs are categories, not magnitudes: cast them to strings so Plotly
# draws one discrete Pastel colour per cluster (with a legend) instead of
# interpolating a continuous colour bar from a qualitative palette.
px.scatter_3d(
    country_df2.assign(**{'Cluster-ID': country_df2['Cluster-ID'].astype(str)}),
    x='inflation', y='income', z='health',
    color='Cluster-ID', hover_name='country', opacity=0.7,
    labels={'health':'Healthcare','inflation':'Inflation','income':'Income','Cluster-ID':'K-Means(K-> 4)'},
    template='ggplot2',
    category_orders={'Cluster-ID': ['0', '1', '2', '3']},
    color_discrete_sequence=px.colors.qualitative.Pastel,
)

From the above graphs (especially the 3D graph) we can see that all countries fall into shared clusters except Nigeria, which forms a cluster of its own when using K-Means with K = 4.

In [None]:
###Comparing K-MEANS with hierarchial clustering
from scipy.cluster.hierarchy import dendrogram, linkage, cut_tree
import plotly.figure_factory as ff

In [None]:
###Using Complete Linkage
# x is the linkage matrix; it is reused further down by cut_tree.
x = linkage(country_df3,method='complete',metric='euclidean')
# create_dendrogram expects the raw observations, not a linkage matrix: it
# computes the pairwise distances itself (Euclidean by default) and passes
# them to linkagefun. Feeding it x would cluster the rows of the linkage
# matrix instead of the countries.
fig = ff.create_dendrogram(
    country_df3.to_numpy(),
    linkagefun=lambda dists: linkage(dists, method='complete'),
)
fig.show()

In [None]:
###Cut the tree into 3 clusters and inspect the shape of the label array
np.shape(cut_tree(x, n_clusters=3))

(167, 1)

In [None]:
###Flatten the (n, 1) label array from cut_tree into a 1-D array that fits a DataFrame column
cluster_label = cut_tree(x, n_clusters=3).ravel()
cluster_label

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
###Store the hierarchical cluster labels next to the K-Means Cluster-ID
country_df2['Hier_C_labels'] = cluster_label
country_df2.head()

Unnamed: 0,country,income,health,inflation,gdpp,Cluster-ID,Hier_C_labels
0,Afghanistan,1610,7.58,9.44,553,2,0
1,Albania,9930,6.55,4.49,4090,0,0
2,Algeria,12900,4.17,16.1,4460,0,0
3,Angola,5900,2.85,22.4,3530,0,0
4,Antigua and Barbuda,19100,6.03,1.44,12200,0,0


In [None]:
###Number of countries in each K-Means cluster
country_df2['Cluster-ID'].value_counts()

0    85
2    53
1    28
3     1
Name: Cluster-ID, dtype: int64

In [None]:
###Number of countries in each hierarchical cluster
country_df2['Hier_C_labels'].value_counts()

0    158
1      8
2      1
Name: Hier_C_labels, dtype: int64

In [None]:
###Plotting a 3-D Graph for the Hierarchial clustering method
# z is 'gdpp' here, matching the 'GDP Per Capita' axis label below; the
# original used 'health', which made this plot a duplicate of the next one.
# Cluster labels are categories, so they are cast to strings to get one
# discrete Pastel colour per cluster instead of a continuous colour bar.
px.scatter_3d(
    country_df2.assign(Hier_C_labels=country_df2['Hier_C_labels'].astype(str)),
    x='inflation', y='income', z='gdpp',
    color='Hier_C_labels', opacity=0.7, hover_name='country',
    labels={'gdpp':'GDP Per Capita','inflation':'Inflation','income':'Income','Hier_C_labels':'Hierarchial clusters(C-> 3)'},
    template='ggplot2',
    category_orders={'Hier_C_labels': ['0', '1', '2']},
    color_discrete_sequence=px.colors.qualitative.Pastel,
)

In [None]:
###Plotting a 3-D Graph for the Hierarchial clustering method
# Cluster labels are categories, so they are cast to strings to get one
# discrete Pastel colour per cluster instead of a continuous colour bar
# interpolated from a qualitative palette.
px.scatter_3d(
    country_df2.assign(Hier_C_labels=country_df2['Hier_C_labels'].astype(str)),
    x='inflation', y='income', z='health',
    color='Hier_C_labels', opacity=0.7, hover_name='country',
    labels={'health':'Healthcare','inflation':'Inflation','income':'Income','Hier_C_labels':'Hierarchial clusters(C-> 3)'},
    template='ggplot2',
    category_orders={'Hier_C_labels': ['0', '1', '2']},
    color_discrete_sequence=px.colors.qualitative.Pastel,
)

From the above graphs (especially the 3D graph) we can see that all countries fall into shared clusters except Nigeria, which forms a cluster of its own when using the hierarchical method of clustering.

CONCLUSIONS:

1.  From both clusterings and their plots we can state that Nigeria needs urgent aid, or at least needs it the most.
2.   K-Means with K = 4 gives us better and more diverse clusters and makes much more domain sense, even though the elbow method and the silhouette score tell us differently.
3. Given below is the DataFrame with the Cluster ID changed to represent the level of aid needed.
4. We will use the K-Means Cluster ID and clustering, as it gives us better insight.
The levels are {Very High, High, Moderate, Low}.



In [None]:
###Keep only the country name and its K-Means cluster for the aid summary
country_Aid_urgency = country_df2[['country', 'Cluster-ID']].copy()
country_Aid_urgency.head()

Unnamed: 0,country,Cluster-ID
0,Afghanistan,2
1,Albania,0
2,Algeria,0
3,Angola,0
4,Antigua and Barbuda,0


In [None]:
###Translate the numeric cluster IDs into aid-level labels
# The result is assigned back to the column: calling replace(inplace=True) on
# a column selection acts on a temporary under pandas Copy-on-Write and would
# leave the DataFrame unchanged. The stray leading spaces in the "High Aid" /
# "Very High Aid" labels are removed so all categories are formatted alike.
# NOTE(review): K-Means cluster IDs are arbitrary - confirm that this
# ID -> level mapping matches the fitted model.
country_Aid_urgency["Cluster-ID"] = country_Aid_urgency["Cluster-ID"].replace(
    {0: "Moderate Aid", 1: "Low Aid", 2: "High Aid", 3: "Very High Aid"}
)

In [None]:
###Give the label column a descriptive name
country_Aid_urgency = country_Aid_urgency.rename(columns={'Cluster-ID': 'Aid Req LVL'})

In [None]:
###Preview the country / aid-level table
country_Aid_urgency.head()

Unnamed: 0,country,Aid Req LVL
0,Afghanistan,High Aid
1,Albania,Moderate Aid
2,Algeria,Moderate Aid
3,Angola,Moderate Aid
4,Antigua and Barbuda,Moderate Aid


In the graph below you can hover over the points to get the country name associated with each aid level.

In [None]:
###Scatter of every country against its aid level; hovering shows the country name
fig = px.scatter(
    country_Aid_urgency,
    x='country',
    y='Aid Req LVL',
    hover_name='country',
    labels={'Aid Req LVL':'Aid Requirement','country':'Country'},
    template='simple_white',
)
fig.show()

# K-Means with PCA

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [None]:
###Numeric feature matrix for PCA: everything except the country name
countries = country_df.drop(columns='country')

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise all numeric features before PCA: learn the scaling parameters
# first, then apply them to the same data.
sc = StandardScaler()
sc.fit(countries)
pca_countries = sc.transform(countries)

In [None]:
###Fit a full PCA on the scaled data and show the component loadings
pca = PCA(random_state=42).fit(pca_countries)
pca.components_

array([[-0.41951945,  0.28389698,  0.15083782,  0.16148244,  0.39844111,
        -0.19317293,  0.42583938, -0.40372896,  0.39264482],
       [ 0.19288394,  0.61316349, -0.24308678,  0.67182064,  0.02253553,
        -0.00840447, -0.22270674,  0.15523311, -0.0460224 ],
       [-0.02954353,  0.14476069, -0.59663237, -0.29992674,  0.3015475 ,
         0.64251951,  0.11391854,  0.01954925,  0.12297749],
       [ 0.37065326,  0.00309102,  0.4618975 , -0.07190746,  0.39215904,
         0.15044176, -0.20379723,  0.37830365,  0.53199457],
       [-0.16896968,  0.05761584,  0.51800037,  0.25537642, -0.2471496 ,
         0.7148691 ,  0.1082198 , -0.13526221, -0.18016662],
       [ 0.20062815, -0.05933283,  0.00727646, -0.03003154,  0.16034699,
         0.06628537, -0.60112652, -0.75068875,  0.01677876],
       [-0.07948854, -0.70730269, -0.24983051,  0.59218953,  0.09556237,
         0.10463252,  0.01848639,  0.02882643,  0.24299776],
       [-0.68274306, -0.01419742,  0.07249683, -0.02894642,  0

In [None]:
###Share of the total variance explained by each principal component
print("the percentage of the variance: ",pca.explained_variance_ratio_)

the percentage of the variance:  [0.4595174  0.17181626 0.13004259 0.11053162 0.07340211 0.02484235
 0.0126043  0.00981282 0.00743056]


In [None]:
###Running total of the explained-variance ratios, component by component
cumulative_variance = pca.explained_variance_ratio_.cumsum()

In [None]:
import plotly.express as px

###Line plot of the cumulative explained variance against the number of components
px.line(
    x=range(1, len(cumulative_variance) + 1),
    y=cumulative_variance,
    labels={'x': "Principal_Components", 'y': "Cumulative Variance"},
    title="Scree Plot",
)

In [None]:
###Project the scaled data onto the first 5 principal components
pc2 = PCA(n_components=5,random_state=42)
transformed = pc2.fit_transform(pca_countries)
# Expect one row per country and one column per retained component.
transformed.shape

(167, 5)

In [None]:
###Wrap the projected data in a DataFrame with PC1..PC5 column names
transformed_countries = pd.DataFrame(
    transformed,
    columns=[f'PC{idx}' for idx in range(1, 6)],
)
transformed_countries.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
0,-2.913025,0.095621,-0.718118,1.005255,-0.15831
1,0.429911,-0.588156,-0.333486,-1.161059,0.174677
2,-0.285225,-0.455174,1.221505,-0.868115,0.156475
3,-2.932423,1.695555,1.525044,0.839625,-0.273209
4,1.033576,0.136659,-0.225721,-0.847063,-0.193007


In [None]:
###Attach the country names; both frames share the default 0..n-1 index, so rows align
transformed_countries['countries'] = country_df['country']

In [None]:
###Preview the principal components together with the country names
transformed_countries.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,countries
0,-2.913025,0.095621,-0.718118,1.005255,-0.15831,Afghanistan
1,0.429911,-0.588156,-0.333486,-1.161059,0.174677,Albania
2,-0.285225,-0.455174,1.221505,-0.868115,0.156475,Algeria
3,-2.932423,1.695555,1.525044,0.839625,-0.273209,Angola
4,1.033576,0.136659,-0.225721,-0.847063,-0.193007,Antigua and Barbuda


In [None]:
###Names of the principal-component columns used as clustering features
x = [f'PC{idx}' for idx in range(1, 6)]

In [None]:
###Applying Elbow Method to find optimal K on PCA transformed data
# For each candidate K, fit K-Means on the principal components and record
# the inertia. random_state is fixed so the SSD curve is reproducible, and
# n_init is set explicitly so the number of restarts does not depend on the
# installed scikit-learn version.
K_PCA = range(1,10)
SSD_PCA  = []
for k in K_PCA:
    Km = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42)
    Km = Km.fit(transformed_countries[x])
    SSD_PCA.append(Km.inertia_)

In [None]:
###Tabulate the SSD recorded for every candidate number of clusters (PCA data)
Elbow_PCA = pd.DataFrame(data={'Clusters': K_PCA, 'SSD': SSD_PCA})
Elbow_PCA.head(n=5)

Unnamed: 0,Clusters,SSD
0,1,1420.800893
1,2,968.466353
2,3,749.618939
3,4,619.055945
4,5,539.418535


In [None]:
###Elbow curve for K-Means on the PCA-transformed data
fig_pca = px.line(
    Elbow_PCA,
    x='Clusters',
    y='SSD',
    template='ggplot2',
)
fig_pca.show()

In [None]:
###Validating the Elbow method using sillhoutte Score
from sklearn.metrics import silhouette_score
# Fit one K-Means model per candidate K on the principal components and print
# its silhouette score. random_state is fixed so the scores are reproducible.
for i in range(2,10):
    labels = KMeans(n_clusters=i, init="k-means++", n_init=10, random_state=42).fit(transformed_countries[x]).labels_
    score = silhouette_score(transformed_countries[x], labels, metric="euclidean")
    print(f"Silhouette score for k(clusters) = {i} is {score}")

Silhouette score for k(clusters) = 2 is 0.30441994992318194
Silhouette score for k(clusters) = 3 is 0.3079769786519017
Silhouette score for k(clusters) = 4 is 0.32718347402877235
Silhouette score for k(clusters) = 5 is 0.32558063247176566
Silhouette score for k(clusters) = 6 is 0.25963412018322013
Silhouette score for k(clusters) = 7 is 0.2380457930189642
Silhouette score for k(clusters) = 8 is 0.260378570451813
Silhouette score for k(clusters) = 9 is 0.24037433243660572


In [None]:
###Building the model on the number of K as 4
# random_state is fixed so the same countries receive the same cluster IDs on
# every run; the IDs are later translated to aid levels by a hard-coded mapping.
# NOTE(review): the numeric IDs produced with this seed may differ from the
# ones that mapping was written for - confirm the mapping against this model.
Km = KMeans(n_clusters=4, init='k-means++', n_init=10, random_state=42)
Km = Km.fit(transformed_countries[x])

In [None]:
###Store the K-Means label of every row as a new column
transformed_countries['Cluster_ID']=Km.labels_

In [None]:
###Preview the components together with the assigned cluster
transformed_countries.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,countries,Cluster_ID
0,-2.913025,0.095621,-0.718118,1.005255,-0.15831,Afghanistan,0
1,0.429911,-0.588156,-0.333486,-1.161059,0.174677,Albania,1
2,-0.285225,-0.455174,1.221505,-0.868115,0.156475,Algeria,1
3,-2.932423,1.695555,1.525044,0.839625,-0.273209,Angola,0
4,1.033576,0.136659,-0.225721,-0.847063,-0.193007,Antigua and Barbuda,1


In [None]:
###3-D view of the clusters along the first three principal components
px.scatter_3d(
    transformed_countries,
    x='PC1',
    y='PC2',
    z='PC3',
    color='Cluster_ID',
    hover_name='countries',
    opacity=0.7,
    template='ggplot2',
)

In [None]:
###3-D view of the clusters along principal components 3 to 5
# Cluster_ID is numeric, so Plotly colours it on a continuous scale; the
# viridis palette therefore has to be passed as color_continuous_scale.
# color_discrete_sequence expects a list of colours and is not used for
# numeric colour columns.
px.scatter_3d(
    transformed_countries,
    x='PC3', y='PC4', z='PC5',
    color='Cluster_ID', template='ggplot2',
    hover_name='countries', opacity=0.7,
    color_continuous_scale='viridis',
)

In [None]:
###Translate each numeric cluster ID into an aid-level label
transformed_countries['AID'] = transformed_countries['Cluster_ID'].map(
    {
        0: 'Very High',
        1: 'High Aid',
        2: 'Low Aid',
        3: 'Moderate Aid',
    }
)

In [None]:
###Check dtypes and non-null counts after adding the cluster and aid columns
transformed_countries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167 entries, 0 to 166
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   PC1         167 non-null    float64
 1   PC2         167 non-null    float64
 2   PC3         167 non-null    float64
 3   PC4         167 non-null    float64
 4   PC5         167 non-null    float64
 5   countries   167 non-null    object 
 6   Cluster_ID  167 non-null    int32  
 7   AID         0 non-null      object 
dtypes: float64(5), int32(1), object(2)
memory usage: 9.9+ KB


In [None]:
###Preview the final table including the aid level
transformed_countries.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,countries,Cluster_ID,AID
0,-2.913025,0.095621,-0.718118,1.005255,-0.15831,Afghanistan,0,Very High
1,0.429911,-0.588156,-0.333486,-1.161059,0.174677,Albania,1,High Aid
2,-0.285225,-0.455174,1.221505,-0.868115,0.156475,Algeria,1,High Aid
3,-2.932423,1.695555,1.525044,0.839625,-0.273209,Angola,0,Very High
4,1.033576,0.136659,-0.225721,-0.847063,-0.193007,Antigua and Barbuda,1,High Aid


In [None]:
###Scatter of every country against its aid level; hovering shows the country name
# The labels keys must name the columns actually plotted ('countries' and
# 'AID'); keys for columns absent from this frame leave the axes unlabelled.
fig = px.scatter(
    transformed_countries,
    x='countries',
    y='AID',
    hover_name='countries',
    template='simple_white',
    labels={'countries':'Country','AID':'Aid Requirement'},
)

fig.show()