In [None]:
- HELP International is an international humanitarian NGO that is committed to fighting poverty and providing the people of backward countries with basic amenities and relief during the time of disasters and natural calamities. It runs a lot of operational projects from time to time along with advocacy drives to raise awareness as well as for funding purposes.
- The significant issues that come while making this decision are mostly related to choosing the countries that are in the direst need of aid. 
- Objective is to cluster the countries by the factors mentioned above and then present your solution and recommendations to the CEO using a PPT

# importing required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from warnings import filterwarnings
filterwarnings('ignore')

### Importing Data

#loading the given data set
Countries_df=pd.read_csv('Country-data.csv')

#previewing the data
Countries_df.head()

###  Understanding the data

#Size of the data
Countries_df.shape

#check for data types of variables
Countries_df.info()

##checking for missing values
Countries_df.isnull().sum()

- There are no missing values in the given data set

# sns.pairplot builds its own figure, so a preceding plt.figure() call would
# only leave an empty, unused figure behind; it is therefore not created here.
sns.pairplot(x_vars=['child_mort','exports','health','imports','inflation','life_expec','total_fer','gdpp'],y_vars=['income','child_mort','gdpp'],data=Countries_df)

### Outlier detection

- As mentioned in the session, outlier analysis is performed only after PCA

#check the spread of data of variables
Countries_df.describe(percentiles=[0.01,0.05,0.25,0.50,.75,.90,.99])

#create generic function which is used to plot box plot and kde plots
def plot_graph(dataframe,plot_type,title):
    '''
    Draw one plot per column of dataframe on a 3x3 grid of subplots.

    inputs-- dataframe,plot_type{'box' or 'kde'}, title of the plot
    - 'box' skips the first column of dataframe and draws a vertical box plot
      for each remaining column.
    - 'kde' draws a KDE plot for every column.
    At most nine columns are drawn (the grid has nine panels); panels left
    without a column are hidden.

    Raises ValueError when plot_type is neither 'box' nor 'kde'.
    '''
    if plot_type=='box':
        # first column is left out (presumably the non-numeric country name -- confirm against the data)
        columns=list(dataframe.columns)[1:]
    elif plot_type=='kde':
        columns=list(dataframe.columns)
    else:
        raise ValueError("plot_type must be 'box' or 'kde', got {!r}".format(plot_type))

    fig,axs=plt.subplots(3,3,figsize=(18,10))
    fig.subplots_adjust(top=0.95)
    fig.suptitle(title, fontsize=16)

    # ravel() walks the grid row by row: (0,0),(0,1),(0,2),(1,0),...
    panels=axs.ravel()
    for ax,column in zip(panels,columns):
        if plot_type=='box':
            # y= gives a vertical box on every seaborn release, unlike a positional Series
            sns.boxplot(y=dataframe[column],ax=ax)
        else:
            sns.kdeplot(dataframe[column],ax=ax)

    # hide panels that did not receive a column instead of failing on a missing index
    for ax in panels[len(columns):]:
        ax.set_visible(False)

plot_graph(Countries_df,'box','Box plots')

#finding relation between the columns
# sns.pairplot builds its own figure, so a preceding plt.figure() call would
# only leave an empty, unused figure behind; it is therefore not created here.
sns.pairplot(x_vars=['income','child_mort','exports','health','imports','inflation','life_expec','total_fer','gdpp'],
             y_vars=['exports','imports','gdpp','income'],data=Countries_df)



## Scaling Data

#scaling numerical data of countries df

#separating numerical columns to a new dataframe X
X=Countries_df.iloc[:,1:]

#previewing the data
X.head()

#shape of the data
X.shape

#Standardizing the data

from sklearn.preprocessing import StandardScaler

#Instantiation of StandardScaler
scaler=StandardScaler()

# scaling and transforming the original data to standard data having a mean 0 and standard deviation of 1
scaled_data=scaler.fit_transform(X)

#creating a dataframe with scaled data
scaled_df=pd.DataFrame(scaled_data,columns=X.columns)

#previewing the dataframe
scaled_df.head()

plot_graph(scaled_df,'kde','Scaled data having mean centered at 0 and standard deviation as 1')

## Principal Component Analysis

#Importing PCA and IncrementalPCA modules
from sklearn.decomposition import PCA,IncrementalPCA

#instantiate pca with a random state
pca=PCA(random_state=42)

#fit scaled data to pca
pca.fit(scaled_df)

#Variance explained by different variables in descending order
pca.explained_variance_ratio_

### Determining optimum components

### a.Scree plot

#plotting scree plot 
fig = plt.figure(figsize=[10,6])
plt.title("Scree plot",fontdict={'fontsize':20})
plt.plot(range(1,len(pca.explained_variance_ratio_)+1),np.cumsum(pca.explained_variance_ratio_))
plt.vlines(x=5, ymax=1, ymin=0.5, colors="r", linestyles="--")
plt.hlines(y=0.945, xmax=9, xmin=1, colors="g", linestyles="--")
plt.ylabel("Cumulative variance explained")
plt.show()

### b.Using percentage of variance

#unsupervised way of determining number of components for scaled data
pca_USV=PCA(0.945)

scaled_df_USV=pca_USV.fit_transform(scaled_df)

scaled_df_USV.shape

- Both methods gave 5 components as the optimum needed to explain 94.5% of the variance of the scaled data

### Applying PCA to reduce dimensionality

#using incremental PCA to decrease memory and improve calculation performance
pca_Inc=IncrementalPCA(n_components=5)

#Fitting and transforming data to new basis in which explained variance is high
transformed_data=pca_Inc.fit_transform(scaled_df)

#preview shape of transformed data
transformed_data.shape

# PCA has reduced dimensions from 9 columns to 5 columns

- Verifying the correlation after transformation using a heatmap

correlation_mat=np.corrcoef(transformed_data.transpose())

sns.heatmap(correlation_mat,annot=True)

- No correlation exists between the components, as all off-diagonal elements are close to zero

##### Viewing linear relation between original variables with transformed variables

pca_Inc.components_

#linear relation of PC components with variables
pd.DataFrame(pca_Inc.components_,columns=scaled_df.columns,index=['PC1','PC2','PC3','PC4','PC5'])

- PC1 = - 0.42 x child_mort + 0.28 x exports + 0.15 x health + 0.16 x imports + 0.40 x income - 0.19 x inflation + 0.425 x life_expec - 0.4 x total_fer + 0.39 x gdpp
- PC2 = - 0.19 x child_mort + 0.61 x exports - 0.24 x health + 0.67 x imports + 0.02 x income - 0.003 x inflation - 0.224 x life_expec + 0.15 x total_fer - 0.046 x gdpp
- PC3 = - 0.034 x child_mort + 0.15 x exports - 0.6 x health - 0.31 x imports + 0.30 x income + 0.63 x inflation + 0.12 x life_expec + 0.028 x total_fer + 0.12 x gdpp
- PC4 = 0.37 x child_mort + 0.005 x exports + 0.45 x health - 0.08 x imports + 0.40 x income + 0.15 x inflation - 0.2 x life_expec + 0.4 x total_fer + 0.53 x gdpp
- PC5 = - 0.18 x child_mort + 0.07 x exports + 0.515 x health + 0.24 x imports - 0.25 x income + 0.717 x inflation + 0.139 x life_expec - 0.085 x total_fer - 0.178 x gdpp

#creating a data frame with columns as new PCA components
transformed_df=pd.DataFrame(transformed_data,columns=['PC'+str(i) for i in range(1,pca_Inc.n_components_+1)])

#previewing the data frame
transformed_df.head()

#Components of scaled data without outliers

### Outlier Analysis After PCA

transformed_df.head()

transformed_df.describe(percentiles=[0.01,0.05,0.10,0.25,0.50,0.75,0.9,0.95,0.99]).round(5)

def box_plot(dataframe):
    '''
    Draw a vertical box plot for each column of dataframe on a 2x3 grid.

    inputs-- dataframe whose columns are plotted in order
    At most six columns are drawn (the grid has six panels); panels left
    without a column are hidden.
    '''
    columns=list(dataframe.columns)
    fig,axs=plt.subplots(2,3,figsize=(18,10))

    # ravel() walks the grid row by row: (0,0),(0,1),(0,2),(1,0),...
    panels=axs.ravel()
    for ax,column in zip(panels,columns):
        # y= gives a vertical box on every seaborn release, unlike a positional Series
        sns.boxplot(y=dataframe[column],ax=ax)

    # hide panels that did not receive a column instead of failing on a missing index
    for ax in panels[len(columns):]:
        ax.set_visible(False)

box_plot(transformed_df)

#removing outliers if any after PCA transformation

transformed_df_without_outliers=transformed_df.copy()

def _outlier_limits(values,lower_q=0.05,upper_q=0.95,whisker=1.5):
    '''
    Return the (lower_limit, upper_limit) outlier fences for values.

    The fences are  quantile(lower_q) - whisker*spread  and
    quantile(upper_q) + whisker*spread, where spread is the distance between
    the two quantiles. The upper fence is anchored at the UPPER quantile; the
    earlier version anchored it at the lower one, which made the fences
    asymmetric.
    '''
    low=np.quantile(values,lower_q)
    high=np.quantile(values,upper_q)
    spread=high-low
    return low-whisker*spread,high+whisker*spread

def _is_outlier(values,lower_limit,upper_limit):
    '''Return a boolean mask that is True where values fall outside the fences.'''
    return (values>upper_limit) | (values<lower_limit)

PC1_lower_limit,PC1_upper_limit=_outlier_limits(transformed_df_without_outliers['PC1'])
print(PC1_lower_limit)
print(PC1_upper_limit)

transformed_df_without_outliers[_is_outlier(transformed_df_without_outliers['PC1'],PC1_lower_limit,PC1_upper_limit)]

PC2_lower_limit,PC2_upper_limit=_outlier_limits(transformed_df_without_outliers['PC2'])
print(PC2_lower_limit)
print(PC2_upper_limit)

transformed_df_without_outliers[_is_outlier(transformed_df_without_outliers['PC2'],PC2_lower_limit,PC2_upper_limit)]

PC3_lower_limit,PC3_upper_limit=_outlier_limits(transformed_df_without_outliers['PC3'])
print(PC3_lower_limit)
print(PC3_upper_limit)

transformed_df_without_outliers[_is_outlier(transformed_df_without_outliers['PC3'],PC3_lower_limit,PC3_upper_limit)]

PC4_lower_limit,PC4_upper_limit=_outlier_limits(transformed_df_without_outliers['PC4'])
print(PC4_lower_limit)
print(PC4_upper_limit)

transformed_df_without_outliers[_is_outlier(transformed_df_without_outliers['PC4'],PC4_lower_limit,PC4_upper_limit)]

PC5_lower_limit,PC5_upper_limit=_outlier_limits(transformed_df_without_outliers['PC5'])
print(PC5_lower_limit)
print(PC5_upper_limit)

transformed_df_without_outliers[_is_outlier(transformed_df_without_outliers['PC5'],PC5_lower_limit,PC5_upper_limit)]

# a row is an outlier when ANY of its five components falls outside its fences
outlier_mask=(
_is_outlier(transformed_df_without_outliers['PC1'],PC1_lower_limit,PC1_upper_limit)|
_is_outlier(transformed_df_without_outliers['PC2'],PC2_lower_limit,PC2_upper_limit)|
_is_outlier(transformed_df_without_outliers['PC3'],PC3_lower_limit,PC3_upper_limit)|
_is_outlier(transformed_df_without_outliers['PC4'],PC4_lower_limit,PC4_upper_limit)|
_is_outlier(transformed_df_without_outliers['PC5'],PC5_lower_limit,PC5_upper_limit))

transformed_df_without_outliers[outlier_mask]

# country names are looked up from the mask (by row position) instead of hard-coded
# row numbers, so the list always matches the rows flagged above
# NOTE(review): the flagged rows can differ from the previously hard-coded
# [91,98,113,123,133]; re-check the cluster counts chosen further down.
outlier_countries=list(Countries_df.loc[outlier_mask.values,'country'].values)
outlier_countries

# .copy() so that columns added to this frame later do not write into a slice
transformed_df_without_outliers_after_pca=transformed_df_without_outliers[~outlier_mask].copy()


transformed_df_without_outliers_after_pca.shape

Countries_df_without_outliers_after_pca=Countries_df.copy()

Countries_df_without_outliers_after_pca=Countries_df_without_outliers_after_pca.set_index('country').drop(outlier_countries).reset_index()

box_plot(transformed_df_without_outliers_after_pca)

### Hopkins test

from sklearn.neighbors import NearestNeighbors
from random import sample
from numpy.random import uniform
import numpy as np
from math import isnan
 
def hopkins(X):
    '''
    Estimate the Hopkins statistic of the rows of X.

    inputs-- X: DataFrame of numeric columns (it is accessed through .values)
    returns-- H = sum(u) / (sum(u) + sum(w)), where
        u = distance from a point drawn uniformly inside the per-column
            min/max box of X to its nearest row of X
        w = distance from a randomly sampled row of X to its nearest OTHER row
    H moves towards 1 when the uniform points lie much farther from the data
    than the data rows lie from each other. 0 is returned when H is NaN.

    About 10% of the rows (at least one) are sampled. The result varies
    between calls because the sampling is not seeded.

    Raises ValueError when X has fewer than two rows.
    '''
    d = X.shape[1]
    n = len(X)
    if n < 2:
        raise ValueError("hopkins needs at least two rows, got {}".format(n))
    # int(0.1 * n) is 0 for n < 10, which would leave nothing to sample
    m = max(1, int(0.1 * n))

    points = X.values
    nbrs = NearestNeighbors(n_neighbors=1).fit(points)

    # bounds of the sampling box; they do not change inside the loop
    lower = points.min(axis=0)
    upper = points.max(axis=0)

    rand_X = sample(range(0, n, 1), m)  # m distinct row positions out of n

    ujd = []
    wjd = []
    for j in range(0, m):
        random_point = uniform(lower, upper, d).reshape(1, -1)
        # the random point is not a row of X, so its nearest neighbour is the first hit
        u_dist, _ = nbrs.kneighbors(random_point, 1, return_distance=True)
        ujd.append(u_dist[0][0])
        # a sampled row finds itself first (distance 0), so take the second hit
        w_dist, _ = nbrs.kneighbors(points[rand_X[j]].reshape(1, -1), 2, return_distance=True)
        wjd.append(w_dist[0][1])

    H = sum(ujd) / (sum(ujd) + sum(wjd))
    if isnan(H):
        print(ujd, wjd)
        H = 0

    return H

hopkins(transformed_df)

hopkins(transformed_df_without_outliers_after_pca)

### Clustering

## KMeans clustering

#importing required libraries
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

#creating a kmeans instance with number of cluster as 4   
kmeans=KMeans(n_clusters=4,max_iter=100,random_state=42)

#fitting the data
kmeans.fit(transformed_df)

#preview the output label
kmeans.labels_

#### Determining optimum K value

#### a.Elbow Curve

def elbow_curve(dataframe,title):
    '''
    Plot the K-Means inertia (sum of squared distances) for 2 to 10 clusters.

    inputs-- dataframe to cluster, title of the plot
    A separate KMeans model (max_iter=100, random_state=42) is fitted for
    every cluster count and its inertia_ is plotted against that count.
    '''
    candidate_k=list(range(2,11))
    inertias=[
        KMeans(k,max_iter=100,random_state=42).fit(dataframe).inertia_
        for k in candidate_k
    ]
    plt.title(title)
    plt.plot(candidate_k,inertias)
    plt.show()

elbow_curve(transformed_df, "Elbow curve of transformed data")

#### b. Silhouette score

def SilhoutteScore(dataframe):
    '''
    Print and plot the silhouette score of K-Means for 2 to 9 clusters.

    inputs-- dataframe to cluster
    A separate KMeans model (max_iter=100, random_state=42) is fitted for
    every cluster count; its score is printed and then plotted against the
    cluster count.
    '''
    cluster_counts=[2,3,4,5,6,7,8,9]
    score=[]
    for num_clusters in cluster_counts:
        model=KMeans(num_clusters,max_iter=100,random_state=42).fit(dataframe)
        current_score=silhouette_score(dataframe,model.labels_)
        score.append(current_score)
        print("Silhoutte score for {0} clusters is {1}".format(num_clusters,current_score))
    plt.plot(cluster_counts,score)

SilhoutteScore(transformed_df)

#### Based on silhouette score and elbow curve, going with 4 clusters for scaled data

#creating a kmeans instance with number of cluster as 4   
kmeans_without_outliers_after_pca=KMeans(n_clusters=4,max_iter=100,random_state=42)

#fitting the scaled data without outliers
kmeans_without_outliers_after_pca.fit(transformed_df_without_outliers_after_pca)

#preview the output label
kmeans_without_outliers_after_pca.labels_

elbow_curve(transformed_df_without_outliers_after_pca,"Elbow curve of transformed data without outliers")

SilhoutteScore(transformed_df_without_outliers_after_pca)

#### Based on silhouette score and elbow curve, going with 3 clusters for data without outliers after PCA

###  Model Building 

#model for scaled data
kmeans=KMeans(n_clusters=4,max_iter=500,random_state=42)
kmeans.fit(transformed_df)

Cluster_id=kmeans.labels_

transformed_df['ClusterId']=Cluster_id

#model for scaled data without outliers after pca
kmeans_without_outliers_after_pca=KMeans(n_clusters=3,max_iter=500,random_state=42)
kmeans_without_outliers_after_pca.fit(transformed_df_without_outliers_after_pca)

ClusterId_without_outliers_after_pca=kmeans_without_outliers_after_pca.labels_

transformed_df_without_outliers_after_pca['ClusterId']=ClusterId_without_outliers_after_pca

transformed_df_without_outliers_after_pca['ClusterId'].shape

Countries_df_without_outliers_after_pca.shape

#merging the labels with original dataframe

Countries_df['ClusterId']=Cluster_id

Countries_df_without_outliers_after_pca['ClusterId']=ClusterId_without_outliers_after_pca

Countries_df.head()

Countries_df_without_outliers_after_pca.head()

### Data insights

#created lists color maps to use in palettes
colors_map2=['red','green']
colors_map3=['green','blue','red']
colors_map4=['orange','red','indigo','green']
colors_map5=['red','green','blue','orange','purple']
colors_map6=['violet','indigo','blue','green','orange','red']
colors_map7=['violet','indigo','blue','green','black','orange','red']
colors_map8=['violet','indigo','blue','green','yellow','orange','red','black']

def clusterplot(dataframe,hue,colormap):
    '''
    Scatter plot of PC1 against PC2, coloured by a label column.

    inputs-- dataframe holding 'PC1', 'PC2' and the hue column,
             hue: name of the column used for colouring,
             colormap: palette passed to seaborn
    '''
    plt.figure(figsize=(12,8))
    # x and y are passed by keyword: seaborn releases that made them keyword-only reject positional use
    sns.scatterplot(x='PC1',y='PC2',hue=hue,data=dataframe,palette=colormap)

clusterplot(transformed_df,'ClusterId',colors_map4)

def box_scatterplot(dataframe,colormap,box_x,box_y,scatter_x):
    '''
    Draw a box plot and a scatter plot side by side for the same y column.

    inputs-- dataframe holding the columns named below,
             colormap: palette passed to seaborn for both plots,
             box_x: column grouping the box plot and colouring the scatter plot,
             box_y: column on the y axis of both plots,
             scatter_x: column on the x axis of the scatter plot
    '''
    fig,axs=plt.subplots(nrows=1,ncols=2,figsize=(16,8))
    # x and y are passed by keyword: seaborn releases that made them keyword-only reject positional use
    sns.boxplot(x=box_x,y=box_y,data=dataframe,ax=axs[0],palette=colormap)
    sns.scatterplot(x=scatter_x,y=box_y,hue=box_x,palette=colormap,data=dataframe,ax=axs[1])

##gdpp

box_scatterplot(Countries_df,colors_map4,'ClusterId','gdpp','child_mort')

#child_mort

box_scatterplot(Countries_df,colors_map4,'ClusterId','child_mort','income')

#income

box_scatterplot(Countries_df,colors_map4,'ClusterId','income','gdpp')

- high child mortality, low income and low gdpp

#### Countries with 1 as cluster Id need financial aid

# Data insights using data without outliers after pca

clusterplot(transformed_df_without_outliers_after_pca,'ClusterId',colors_map3)

## Box plot for the data without outliers after pca

box_scatterplot(Countries_df_without_outliers_after_pca,colors_map3,'ClusterId','gdpp','child_mort')

box_scatterplot(Countries_df_without_outliers_after_pca,colors_map3,'ClusterId','child_mort','income')

box_scatterplot(Countries_df_without_outliers_after_pca,colors_map3,'ClusterId','income','gdpp')

#from above plots
#Countries with 0 as Cluster Id needs financial aid
#Cluster Id 0 signifies low income, low gdpp and high child mortality

- List of countries that need financial aid as per data without removal of outliers

- List of countries that need financial aid as per transformed data without outliers after pca analysis

Countries_df_without_outliers_after_pca.groupby('ClusterId').count()['country']

Countries_df_without_outliers_after_pca[Countries_df_without_outliers_after_pca['ClusterId']==1]

len(Countries_df_without_outliers_after_pca[Countries_df_without_outliers_after_pca['ClusterId']==1]) ##Iraq,Equatorial Guinea 

## Hierarchical Clustering

transformed_df.head()

from scipy.cluster.hierarchy import linkage,cut_tree,dendrogram

single_linkage=linkage(transformed_df.iloc[:,:5],method='single')

### Single linkage

plt.figure(figsize=(20,10))
dendrogram(single_linkage)
plt.show()

single_cluster_labels=cut_tree(single_linkage,n_clusters=4).reshape(-1,)

transformed_df['ClusterId(Single)']=single_cluster_labels

Countries_df['ClusterId(Single)']=single_cluster_labels

Countries_df.groupby('ClusterId(Single)').count()

clusterplot(transformed_df,'ClusterId(Single)',colors_map4)

box_scatterplot(Countries_df,colors_map4,'ClusterId(Single)','gdpp','child_mort')

box_scatterplot(Countries_df,colors_map4,'ClusterId(Single)','child_mort','income')

box_scatterplot(Countries_df,colors_map4,'ClusterId(Single)','income','gdpp')

# countries with ClusterId(Single) as 2 are in need of financial aid

Countries_df[Countries_df['ClusterId(Single)']==2]['country']

len(Countries_df[Countries_df['ClusterId(Single)']==2]['country'])

# single linkage using transformed data without outliers after pca

transformed_df_without_outliers_after_pca.head()

single_linkage_without_outliers_after_pca=linkage(transformed_df_without_outliers_after_pca.iloc[:,:5],method='single')

plt.figure(figsize=(20,10))
dendrogram(single_linkage_without_outliers_after_pca)
plt.show()

single_cluster_labels_without_outlier_after_pca=cut_tree(single_linkage_without_outliers_after_pca,n_clusters=5).reshape(-1,)

Countries_df_without_outliers_after_pca['ClusterId(Single)']=single_cluster_labels_without_outlier_after_pca

transformed_df_without_outliers_after_pca['ClusterId(Single)']=single_cluster_labels_without_outlier_after_pca

clusterplot(transformed_df_without_outliers_after_pca,'ClusterId(Single)',colors_map5)

box_scatterplot(Countries_df_without_outliers_after_pca,colors_map5,'ClusterId(Single)','gdpp','child_mort')

box_scatterplot(Countries_df_without_outliers_after_pca,colors_map5,'ClusterId(Single)','child_mort','income')

box_scatterplot(Countries_df_without_outliers_after_pca,colors_map5,'ClusterId(Single)','income','gdpp')

Countries_df_without_outliers_after_pca.groupby('ClusterId(Single)').count()

Countries_df_without_outliers_after_pca[Countries_df_without_outliers_after_pca['ClusterId(Single)']==1]

# Single linkage clusters are not tight and formed clusters are not as desired

### Complete linkage

transformed_df.head()

complete_linkage=linkage(transformed_df.iloc[:,:5],method='complete')

plt.figure(figsize=(20,10))
dendrogram(complete_linkage)
plt.show()

complete_cluster_labels=cut_tree(complete_linkage,n_clusters=5).reshape(-1,)

Countries_df['ClusterId(complete)']=complete_cluster_labels

transformed_df['ClusterId(complete)']=complete_cluster_labels

Countries_df.head()

clusterplot(transformed_df,'ClusterId(complete)',colors_map5)

box_scatterplot(Countries_df,colors_map5,'ClusterId(complete)','gdpp','child_mort')

box_scatterplot(Countries_df,colors_map5,'ClusterId(complete)','child_mort','income')

box_scatterplot(Countries_df,colors_map5,'ClusterId(complete)','income','gdpp')

# countries with ClusterId(complete) as 0 are in need of financial aid

Countries_df.groupby('ClusterId(complete)').count()['country']

Countries_df[Countries_df['ClusterId(complete)']==0]['country']

Countries_df[Countries_df['ClusterId(complete)']==4]['country']

# complete linkage using transformed data without outliers after pca

transformed_df_without_outliers_after_pca.head()

complete_linkage_without_outliers_after_pca=linkage(transformed_df_without_outliers_after_pca.iloc[:,:5],method='complete')

plt.figure(figsize=(20,10))
dendrogram(complete_linkage_without_outliers_after_pca)
plt.show()

complete_cluster_labels_without_outliers_after_pca=cut_tree(complete_linkage_without_outliers_after_pca,
                                                            n_clusters=3).reshape(-1,)

Countries_df_without_outliers_after_pca['ClusterId(complete)']=complete_cluster_labels_without_outliers_after_pca

transformed_df_without_outliers_after_pca['ClusterId(complete)']=complete_cluster_labels_without_outliers_after_pca

clusterplot(transformed_df_without_outliers_after_pca,'ClusterId(complete)',colors_map3)

box_scatterplot(Countries_df_without_outliers_after_pca,colors_map3,'ClusterId(complete)','gdpp','child_mort')

box_scatterplot(Countries_df_without_outliers_after_pca,colors_map3,'ClusterId(complete)','child_mort','income')

box_scatterplot(Countries_df_without_outliers_after_pca,colors_map3,'ClusterId(complete)','income','gdpp')

Countries_df_without_outliers_after_pca.groupby('ClusterId(complete)').count()['country']#angola,

Countries_df_without_outliers_after_pca[Countries_df_without_outliers_after_pca['ClusterId(complete)']==0]

