In [157]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns

Importing the basic libraries needed to get started.



Importing Plotly Express because it lets us visualize the data with interactive charts and gain better insights.

In [158]:
### Load the country-level indicators from the local CSV export
# NOTE(review): absolute, machine-specific path — the notebook will not run on another machine as-is.
country_csv_path = r"D:\Ash\Datasets\Clustering Assignment\Country-data.csv"
country_df = pd.read_csv(filepath_or_buffer=country_csv_path)
# Preview the first five rows
country_df.head(5)

Unnamed: 0,country,child_mort,exports,health,imports,income,inflation,life_expec,total_fer,gdpp
0,Afghanistan,90.2,10.0,7.58,44.9,1610,9.44,56.2,5.82,553
1,Albania,16.6,28.0,6.55,48.6,9930,4.49,76.3,1.65,4090
2,Algeria,27.3,38.4,4.17,31.4,12900,16.1,76.5,2.89,4460
3,Angola,119.0,62.3,2.85,42.9,5900,22.4,60.1,6.16,3530
4,Antigua and Barbuda,10.3,45.5,6.03,58.9,19100,1.44,76.8,2.13,12200


In [159]:
### Load the data dictionary that describes each column of the dataset
# NOTE(review): absolute, machine-specific path — the notebook will not run on another machine as-is.
dictionary_csv_path = r"D:\Ash\Datasets\Clustering Assignment\data-dictionary+.csv"
country_dict = pd.read_csv(filepath_or_buffer=dictionary_csv_path)
# Display the full dictionary (one row per column of the dataset)
country_dict

Unnamed: 0,Column Name,Description
0,country,Name of the country
1,child_mort,Death of children under 5 years of age per 100...
2,exports,Exports of goods and services per capita. Give...
3,health,Total health spending per capita. Given as %ag...
4,imports,Imports of goods and services per capita. Give...
5,Income,Net income per person
6,Inflation,The measurement of the annual growth rate of t...
7,life_expec,The average number of years a new born child w...
8,total_fer,The number of children that would be born to e...
9,gdpp,The GDP per capita. Calculated as the Total GD...


In [160]:
### Dimensions of the dataset as (rows, columns)
country_df.shape

(167, 10)

In [161]:
### Column names, non-null counts and dtypes of the dataset
country_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167 entries, 0 to 166
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   country     167 non-null    object 
 1   child_mort  167 non-null    float64
 2   exports     167 non-null    float64
 3   health      167 non-null    float64
 4   imports     167 non-null    float64
 5   income      167 non-null    int64  
 6   inflation   167 non-null    float64
 7   life_expec  167 non-null    float64
 8   total_fer   167 non-null    float64
 9   gdpp        167 non-null    int64  
dtypes: float64(7), int64(2), object(1)
memory usage: 13.2+ KB


In [162]:
### Count the missing values in each column
missing_per_column = country_df.isna().sum()
missing_per_column

country       0
child_mort    0
exports       0
health        0
imports       0
income        0
inflation     0
life_expec    0
total_fer     0
gdpp          0
dtype: int64

In [163]:
### Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
country_df.describe()

Unnamed: 0,child_mort,exports,health,imports,income,inflation,life_expec,total_fer,gdpp
count,167.0,167.0,167.0,167.0,167.0,167.0,167.0,167.0,167.0
mean,38.27006,41.108976,6.815689,46.890215,17144.688623,7.781832,70.555689,2.947964,12964.155689
std,40.328931,27.41201,2.746837,24.209589,19278.067698,10.570704,8.893172,1.513848,18328.704809
min,2.6,0.109,1.81,0.0659,609.0,-4.21,32.1,1.15,231.0
25%,8.25,23.8,4.92,30.2,3355.0,1.81,65.3,1.795,1330.0
50%,19.3,35.0,6.32,43.3,9960.0,5.39,73.1,2.41,4660.0
75%,62.1,51.35,8.6,58.75,22800.0,10.75,76.8,3.88,14050.0
max,208.0,200.0,17.9,174.0,125000.0,104.0,82.8,7.49,105000.0


As we can see, some countries have negative inflation, which looks like an outlier, and a few countries have an extremely high child mortality rate. Let's plot these variables to get better insight.

In [164]:
# Child mortality for every country, one bar per country
px.bar(data_frame=country_df, y="child_mort", x="country")

In [165]:
# Inflation for every country, one bar per country
px.bar(data_frame=country_df, y="inflation", x="country")

In [166]:
# Income for every country, one bar per country
px.bar(data_frame=country_df, y="income", x="country")

1. We will use child mortality, income and inflation to create the clusters. Child mortality is chosen because it also reflects health spending and life expectancy.
2. As the graphs above show, there are clear outliers in the dataset, but we choose to keep them: the clustering is meant to decide which countries need aid, and removing outliers might lead us to overlook a country that needs help.

In [167]:
# Keep only the country name and the three indicators used for clustering
clustering_columns = ['country', 'income', 'child_mort', 'inflation']
country_df2 = country_df[clustering_columns].copy()


In [168]:
# Spread of the three clustering features at selected quantiles.
# The frame also holds the string column 'country'; DataFrame.quantile raises a
# TypeError on non-numeric columns in pandas >= 2.0 (numeric_only defaults to False),
# so the numeric features are selected explicitly. This also keeps the result
# stable if the cell is re-run after label columns have been added to the frame.
country_df2[['income', 'child_mort', 'inflation']].quantile([.05, .1, .25, .5, .75, .95], axis=0)

Unnamed: 0,income,child_mort,inflation
0.05,1213.0,3.46,0.1834
0.1,1524.0,4.2,0.5878
0.25,3355.0,8.25,1.81
0.5,9960.0,19.3,5.39
0.75,22800.0,62.1,10.75
0.95,48290.0,116.0,20.87


In [169]:
# Preview the reduced frame (country plus the three clustering features)
country_df2.head()

Unnamed: 0,country,income,child_mort,inflation
0,Afghanistan,1610,90.2,9.44
1,Albania,9930,16.6,4.49
2,Algeria,12900,27.3,16.1
3,Angola,5900,119.0,22.4
4,Antigua and Barbuda,19100,10.3,1.44


In [170]:
from sklearn.preprocessing import StandardScaler

In [171]:
### Standardise the clustering features
# StandardScaler centres each feature and scales it to unit variance, so that no
# single indicator dominates the distances used by the clustering algorithms.
feature_names = ['income', 'child_mort', 'inflation']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(country_df2[feature_names])
# Wrap the scaled array back into a DataFrame with the original feature names
country_df3 = pd.DataFrame(data=scaled_values, columns=feature_names)
country_df3.head(5)

Unnamed: 0,income,child_mort,inflation
0,-0.808245,1.291532,0.157336
1,-0.375369,-0.538949,-0.312347
2,-0.220844,-0.272833,0.789274
3,-0.585043,2.007808,1.387054
4,0.101732,-0.695634,-0.601749


In [172]:
from sklearn.cluster import KMeans 

In [173]:
### Elbow method: record the K-means inertia (sum of squared distances) for k = 1..9
K = range(1, 10)
ssd = []
for k in K:
    # random_state pins the centroid initialisation so the curve is identical on every
    # run; n_init is stated explicitly because its default differs between
    # scikit-learn releases.
    Km = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42)
    Km = Km.fit(country_df3)
    ssd.append(Km.inertia_)


In [174]:
# Tabulate the SSD obtained for each candidate number of clusters
elbow_rows = list(zip(K, ssd))
Elbow_data = pd.DataFrame(elbow_rows, columns=['Clusters', 'SSD'])
Elbow_data.head(5)

Unnamed: 0,Clusters,SSD
0,1,501.0
1,2,327.074518
2,3,241.61064
3,4,165.155636
4,5,132.061633


In [175]:
### Plot the elbow curve: SSD against the number of clusters
# Title typo fixed ('Elbow Cuve' -> 'Elbow Curve'); it is shown on the rendered figure.
fig = px.line(Elbow_data, x='Clusters', y='SSD', title='Elbow Curve')
fig.show()

In [176]:
###Validating the Elbow method using sillhoutte Score
from sklearn.metrics import silhouette_score

In [177]:
# Silhouette score for every candidate k from 2 to 9.
for i in range(2, 10):
    # Fixed random_state / explicit n_init so the scores are reproducible and
    # comparable between runs (unseeded K-means gives different labels each time).
    labels = KMeans(n_clusters=i, init="k-means++", n_init=10, random_state=42).fit(country_df3).labels_
    score = silhouette_score(country_df3, labels, metric="euclidean")
    print(f"Silhouette score for k(clusters) = {i} is {score}")


Silhouette score for k(clusters) = 2 is 0.4024506019491648
Silhouette score for k(clusters) = 3 is 0.40017447428586883
Silhouette score for k(clusters) = 4 is 0.4009327028163799
Silhouette score for k(clusters) = 5 is 0.38895476191578776
Silhouette score for k(clusters) = 6 is 0.39751101058495303
Silhouette score for k(clusters) = 7 is 0.34702133488745734
Silhouette score for k(clusters) = 8 is 0.37110635569103184
Silhouette score for k(clusters) = 9 is 0.387513652228373


In [178]:
### Build the final K-means model with K = 4
# K-means numbers its clusters arbitrarily, and the notebook later maps Cluster-ID
# values to aid levels by hand. Seeding the model makes that numbering stable across
# re-runs.
# NOTE(review): after seeding, confirm that the Cluster-ID -> aid-level mapping used
# further down still matches the cluster profiles.
Km = KMeans(n_clusters=4, init='k-means++', n_init=10, random_state=42)
Km = Km.fit(country_df3)

In [179]:
### Cluster label assigned to each row of country_df3 (one entry per country)
Km.labels_

array([1, 0, 0, 1, 0, 0, 0, 2, 2, 0, 0, 2, 0, 0, 0, 2, 0, 1, 0, 0, 0, 0,
       0, 2, 0, 1, 1, 0, 1, 2, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 2, 2,
       2, 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 1, 1, 0, 2, 1, 2, 0, 0, 1, 1, 0,
       1, 0, 2, 1, 0, 0, 0, 2, 2, 2, 0, 2, 0, 0, 1, 1, 2, 0, 1, 0, 0, 1,
       1, 0, 0, 2, 0, 1, 1, 0, 0, 1, 2, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       2, 2, 1, 3, 2, 2, 1, 0, 0, 0, 0, 0, 2, 2, 0, 0, 1, 0, 2, 1, 0, 0,
       1, 2, 0, 2, 0, 0, 2, 2, 0, 0, 1, 0, 2, 2, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 2, 2, 2, 0, 0, 0, 0, 0, 1, 1])

In [180]:
### Attach the K-means cluster label of each country to the unscaled feature table
kmeans_labels = Km.labels_
country_df2['Cluster-ID'] = kmeans_labels
country_df2.head(5)

Unnamed: 0,country,income,child_mort,inflation,Cluster-ID
0,Afghanistan,1610,90.2,9.44,1
1,Albania,9930,16.6,4.49,0
2,Algeria,12900,27.3,16.1,0
3,Angola,5900,119.0,22.4,1
4,Antigua and Barbuda,19100,10.3,1.44,0


In [191]:
# Income distribution within each K-means cluster
px.box(data_frame=country_df2, x='Cluster-ID', y='income')

In [192]:
# Child-mortality distribution within each K-means cluster
px.box(data_frame=country_df2, x='Cluster-ID', y='child_mort')

In [193]:
# Inflation distribution within each K-means cluster
px.box(data_frame=country_df2, x='Cluster-ID', y='inflation')

In [181]:
### 3D scatter of the K-means clusters over the three clustering features
# Cluster ids are categories, not quantities. Left as integers, Plotly draws them on a
# continuous colour scale; casting the column to str (on a temporary copy) gives each
# cluster its own discrete colour and legend entry.
px.scatter_3d(country_df2.astype({'Cluster-ID': str}),
              x='inflation', y='income', z='child_mort',
              color='Cluster-ID', opacity=0.7, hover_name='country')

From the graphs above (especially the 3D graph), we can see that with K-means and k = 4 all countries fall into shared clusters, except Nigeria, which forms a cluster of its own.

In [182]:
###Comparing K-MEANS with hierarchial clustering
from scipy.cluster.hierarchy import dendrogram, linkage, cut_tree
import plotly.figure_factory as ff

In [183]:
### Hierarchical clustering with complete linkage on the scaled features
# Linkage matrix; it is reused later by cut_tree.
x = linkage(country_df3, method='complete', metric='euclidean')
# ff.create_dendrogram expects the matrix of observations, not a precomputed linkage
# matrix: it computes the pairwise distances and the linkage itself. Passing `x`
# would draw a clustering of the linkage rows instead of the countries. Feed it the
# scaled data and request complete linkage so the figure matches `x` (its default
# distance function is Euclidean pdist).
fig = ff.create_dendrogram(
    country_df3.to_numpy(),
    labels=country_df2['country'].tolist(),
    linkagefun=lambda dist: linkage(dist, method='complete'),
)
fig.show()

In [184]:
### Cut the tree into 3 clusters and check the shape of the returned label array
cut_tree(x,n_clusters=3).shape

(167, 1)

In [185]:
### Flatten the (n_samples, 1) output of cut_tree into a 1-D label array
# A 1-D array can be assigned directly as a DataFrame column.
tree_cut = cut_tree(x, n_clusters=3)
cluster_label = tree_cut.ravel()
cluster_label

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [186]:
# Store the hierarchical-clustering label of each country next to its K-means label
hierarchical_labels = cluster_label
country_df2['Hier_C_labels'] = hierarchical_labels
country_df2.head(5)

Unnamed: 0,country,income,child_mort,inflation,Cluster-ID,Hier_C_labels
0,Afghanistan,1610,90.2,9.44,1,0
1,Albania,9930,16.6,4.49,0,0
2,Algeria,12900,27.3,16.1,0,0
3,Angola,5900,119.0,22.4,1,0
4,Antigua and Barbuda,19100,10.3,1.44,0,0


Plotting a box plot of each variable, grouped by hierarchical cluster

In [187]:

# Income distribution within each hierarchical cluster
px.box(data_frame=country_df2, x='Hier_C_labels', y='income')

In [188]:
# Child-mortality distribution within each hierarchical cluster
px.box(data_frame=country_df2, x='Hier_C_labels', y='child_mort')

In [189]:
# Inflation distribution within each hierarchical cluster
px.box(data_frame=country_df2, x='Hier_C_labels', y='inflation')

In [190]:
### 3D scatter of the hierarchical clusters over the three clustering features
# Cluster labels are categories, not quantities. Left as integers, Plotly draws them
# on a continuous colour scale; casting the column to str (on a temporary copy) gives
# each cluster its own discrete colour and legend entry.
px.scatter_3d(country_df2.astype({'Hier_C_labels': str}),
              x='inflation', y='income', z='child_mort',
              color='Hier_C_labels', opacity=0.7, hover_name='country')

From the graphs above (especially the 3D graph), we can see that with hierarchical clustering all countries fall into shared clusters, except Nigeria, which again forms a cluster of its own.

CONCLUSIONS:
Looking at the clusters produced by both methods, we can state that Nigeria needs aid urgently, or at least needs it the most.


K-means with K = 4 gives us better and more diverse clusters.

Below is the DataFrame with the Cluster ID replaced by the level of aid needed.


We will use the K-means Cluster ID and clustering, as it gives us better insight.
The levels are {Very High, High, Moderate, Low}.

In [202]:
# Build a two-column frame: country name and its K-means cluster id
aid_columns = ['country', 'Cluster-ID']
country_Aid_urgency = country_df2[aid_columns].copy()
country_Aid_urgency.head(5)

Unnamed: 0,country,Cluster-ID
0,Afghanistan,1
1,Albania,0
2,Algeria,0
3,Angola,1
4,Antigua and Barbuda,0


In [203]:
# Translate the numeric K-means cluster ids into aid-requirement levels.
# Calling Series.replace(..., inplace=True) on a column selection is chained
# assignment: it acts on a temporary object, pandas warns about it, and under
# copy-on-write the frame is left unchanged. Assign the result back instead.
# NOTE(review): K-means numbers its clusters arbitrarily, so this mapping is only
# valid for the fitted model shown above — re-check it whenever the model is refitted.
aid_level_by_cluster = {0: "Moderate Aid Req", 1: "High Aid Req", 2: "low Aid Req", 3: "V High Aid Req"}
country_Aid_urgency["Cluster-ID"] = country_Aid_urgency["Cluster-ID"].replace(aid_level_by_cluster)

In [204]:
# Rename the label column now that it holds aid levels rather than cluster ids
column_renames = {'Cluster-ID': 'Aid Req LVL'}
country_Aid_urgency = country_Aid_urgency.rename(columns=column_renames)

In [205]:
# Preview the final table of countries and their aid-requirement level
country_Aid_urgency.head()

Unnamed: 0,country,Aid Req LVL
0,Afghanistan,High Aid Req
1,Albania,Moderate Aid Req
2,Algeria,Moderate Aid Req
3,Angola,High Aid Req
4,Antigua and Barbuda,Moderate Aid Req


In the graph below, you can hover over the points to see the country name associated with each aid level.

In [206]:
# One point per country, placed on the row of its aid-requirement level; hover shows the country name
px.scatter(data_frame=country_Aid_urgency, y='Aid Req LVL', x='country', hover_name='country')