<a href="https://www.kaggle.com/code/ahmedanwar89/unsupervised-learning-on-country-data-ml?scriptVersionId=152677246" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.preprocessing import StandardScaler

from sklearn.cluster import KMeans

# Import Dataset

In [2]:
# Load the country-level indicators from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/unsupervised-learning-on-country-data/Country-data.csv'
)

In [3]:
# Peek at three random rows; no random_state is passed, so the rows differ between runs.
df.sample(n=3)

Unnamed: 0,country,child_mort,exports,health,imports,income,inflation,life_expec,total_fer,gdpp
9,Azerbaijan,39.2,54.3,5.88,20.7,16000,13.8,69.1,1.92,5840
130,Serbia,7.6,32.9,10.4,47.9,12700,5.88,74.7,1.4,5410
90,Lithuania,6.1,65.3,7.04,67.2,21100,2.38,73.2,1.5,12000


In [4]:
# (number of rows, number of columns) of the loaded frame
df.shape

(167, 10)

***About Dataset***  
Clustering countries with unsupervised learning for HELP International.  
**Objective:**  
To categorise the countries using the socio-economic and health factors that determine the overall development of a country.

***About organization:***  
HELP International is an international humanitarian NGO that is committed to fighting poverty and providing the people of backward countries with basic amenities and relief during the time of disasters and natural calamities.

***Problem Statement:***  
HELP International has been able to raise around $10 million. Now the CEO of the NGO needs to decide how to use this money strategically and effectively. So, the CEO has to decide which countries are in the direst need of aid. Hence, your job as a data scientist is to categorise the countries using some socio-economic and health factors that determine the overall development of the country. Then you need to suggest the countries that the CEO needs to focus on the most.

# Data Cleaning

In [5]:
# check the data type of every column, to confirm which columns are numeric
# and which are text before any further processing
df.dtypes

country        object
child_mort    float64
exports       float64
health        float64
imports       float64
income          int64
inflation     float64
life_expec    float64
total_fer     float64
gdpp            int64
dtype: object

In [6]:
# check duplicated values: build the "row repeats an earlier row" mask once,
# then report whether any duplicate exists and how many there are
dup_mask = df.duplicated()
dup_mask.any(), dup_mask.sum()

(False, 0)

In [7]:
# check null values: build the null mask once, then show per-column
# "has any null" flags followed by per-column null counts
null_mask = df.isnull()
null_mask.any(), null_mask.sum()

(country       False
 child_mort    False
 exports       False
 health        False
 imports       False
 income        False
 inflation     False
 life_expec    False
 total_fer     False
 gdpp          False
 dtype: bool,
 country       0
 child_mort    0
 exports       0
 health        0
 imports       0
 income        0
 inflation     0
 life_expec    0
 total_fer     0
 gdpp          0
 dtype: int64)

In [8]:
# check data validity for object columns
# For every text column print its name and number of distinct values; the
# distinct values themselves are listed only for low-cardinality columns
# (at most 15 distinct values), where reading them all is practical.
for col in df.select_dtypes(include='object').columns:
    n_unique = df[col].nunique()
    print('column name: ' + col)
    print('number of unique values: ' + str(n_unique))
    if n_unique <= 15:
        # Join the values into a single string first: adding the prefix
        # directly to the array returned by unique() would apply it to every
        # element instead of producing one readable line.
        print('the unique values are: ' + ', '.join(map(str, df[col].unique())))

column name: country
number of unique values: 167


In [9]:
# check data validity for numerical columns: summary statistics of every
# non-object column, rounded to two decimals for readability
numeric_df = df.select_dtypes(exclude='object')
numeric_df.describe().round(2)

Unnamed: 0,child_mort,exports,health,imports,income,inflation,life_expec,total_fer,gdpp
count,167.0,167.0,167.0,167.0,167.0,167.0,167.0,167.0,167.0
mean,38.27,41.11,6.82,46.89,17144.69,7.78,70.56,2.95,12964.16
std,40.33,27.41,2.75,24.21,19278.07,10.57,8.89,1.51,18328.7
min,2.6,0.11,1.81,0.07,609.0,-4.21,32.1,1.15,231.0
25%,8.25,23.8,4.92,30.2,3355.0,1.81,65.3,1.8,1330.0
50%,19.3,35.0,6.32,43.3,9960.0,5.39,73.1,2.41,4660.0
75%,62.1,51.35,8.6,58.75,22800.0,10.75,76.8,3.88,14050.0
max,208.0,200.0,17.9,174.0,125000.0,104.0,82.8,7.49,105000.0


In [10]:
# Build the feature table 'train_df' by leaving out the text column 'country';
# df itself is not modified.
train_df = df.drop(columns=['country'])

In [11]:
# Standardise the features of 'train_df' with StandardScaler so that columns
# measured on very different scales contribute comparably to the model.
# Note: despite its name, 'scaler_df' holds whatever fit_transform returns
# (a NumPy array by default), not a DataFrame.
scaler = StandardScaler()
scaler_df = scaler.fit_transform(train_df)

# Build Model

In [12]:
# Fit one KMeans model per candidate number of clusters (1 to 14) and collect
# each model's inertia in 'inertia_list' for the elbow plot.
inertia_list = []
for i in range(1, 15, 1):
    # fixed random_state keeps the runs repeatable
    model = KMeans(n_clusters=i, random_state=1, n_init='auto')
    # Only the fitted model is needed here, so fit() is used: the distance
    # matrix that fit_transform() additionally returns was being discarded.
    model.fit(scaler_df)
    inertia = model.inertia_
    inertia_list.append(inertia)
    # %d truncates the inertia to an integer for display only; the list keeps
    # the full-precision value
    print('number of clusters: %d \t\t inertia is: %d'%(i, inertia))


number of clusters: 1 		 inertia is: 1503
number of clusters: 2 		 inertia is: 1050
number of clusters: 3 		 inertia is: 831
number of clusters: 4 		 inertia is: 758
number of clusters: 5 		 inertia is: 681
number of clusters: 6 		 inertia is: 592
number of clusters: 7 		 inertia is: 562
number of clusters: 8 		 inertia is: 539
number of clusters: 9 		 inertia is: 456
number of clusters: 10 		 inertia is: 401
number of clusters: 11 		 inertia is: 380
number of clusters: 12 		 inertia is: 370
number of clusters: 13 		 inertia is: 349
number of clusters: 14 		 inertia is: 334


In [13]:
# Visualisation of the elbow method: inertia against the number of clusters,
# used to determine the optimal number of clusters by eye.
fig = px.line(
    x=range(1, 15, 1),
    y=inertia_list,
    markers=True,
    title='Elbow Method for Optimal K',
)

# label both axes in a single layout update
fig.update_layout(xaxis_title='number of clusters', yaxis_title='inertia')

fig.show()

In [14]:
# Final model: k = 9 is the value chosen from the elbow plot above.
# NOTE(review): the printed inertia values decrease fairly smoothly, so k = 9
# is a judgement call — consider confirming it with a second criterion
# (e.g. silhouette score) before relying on the grouping.
model = KMeans(n_clusters=9, n_init='auto', random_state=1)
# fit() is sufficient: the distance matrix returned by fit_transform() was
# never used
model.fit(scaler_df)
# cluster index assigned to each row of 'scaler_df'
labels = model.labels_
# print output
print(labels)

[0 6 3 7 6 3 3 2 2 3 6 1 3 6 1 2 1 7 1 3 6 5 3 2 6 7 0 1 7 2 1 7 7 6 3 3 5
 7 7 6 7 6 6 6 2 3 6 3 6 7 5 1 1 2 2 5 5 6 2 5 2 6 3 7 0 1 7 1 2 3 3 3 5 2
 2 2 3 2 1 3 5 0 2 1 5 6 6 0 0 3 1 4 6 5 7 1 1 7 4 7 1 0 6 3 6 3 7 3 5 3 2
 2 7 8 2 3 5 1 1 3 3 6 6 2 6 3 0 5 3 5 6 1 0 4 1 6 1 5 6 2 3 3 5 6 2 2 5 5
 1 0 0 5 1 3 1 0 6 2 2 2 6 3 5 3 1 5 7]


In [15]:
# Store each row's cluster id on df as a new column named 'group'.
cluster_ids = np.asarray(labels)
df['group'] = cluster_ids

# display the new column
df['group']

0      0
1      6
2      3
3      7
4      6
      ..
162    5
163    3
164    1
165    5
166    7
Name: group, Length: 167, dtype: int32

In [16]:
# Check the change: three random rows, which should now include the 'group' column.
df.sample(n=3)

Unnamed: 0,country,child_mort,exports,health,imports,income,inflation,life_expec,total_fer,gdpp,group
120,Philippines,31.9,34.8,3.61,36.6,5600,4.22,69.0,3.16,2130,3
131,Seychelles,14.4,93.8,3.4,108.0,20400,-4.21,73.4,2.17,10800,1
23,Brunei,10.5,67.4,2.84,28.0,80600,16.7,77.1,1.84,35300,2


In [17]:
# Select the record whose 'country' equals 'Egypt'.
is_egypt = df['country'].eq('Egypt')
df.loc[is_egypt]

Unnamed: 0,country,child_mort,exports,health,imports,income,inflation,life_expec,total_fer,gdpp,group
47,Egypt,29.1,21.3,4.66,26.6,9860,10.1,70.5,3.19,2600,3


In [18]:
# List the countries assigned to group number 3, selecting rows and column
# in a single .loc call.
df.loc[df['group'] == 3, 'country']

2                             Algeria
5                           Argentina
6                             Armenia
9                          Azerbaijan
12                         Bangladesh
19                            Bolivia
22                             Brazil
34                              China
35                           Colombia
45                 Dominican Republic
47                              Egypt
62                          Guatemala
69                              India
70                          Indonesia
71                               Iran
76                            Jamaica
79                         Kazakhstan
89                              Libya
103                          Mongolia
105                           Morocco
107                           Myanmar
109                             Nepal
115                              Oman
119                              Peru
120                       Philippines
125                            Russia
128         