## This notebook provides clustering analysis for the Covid-19 Demographics project

In [3]:
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn.cluster import DBSCAN, AgglomerativeClustering, SpectralClustering
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import plotly.express as px
%matplotlib inline

#%matplotlib notebook
# Notebook-wide display settings: let pandas render wide/long frames without
# truncation and give matplotlib figures a larger default canvas.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 250)
plt.rcParams.update({"figure.figsize": [15, 10]})

In [4]:
# Load the project dataset from the Excel workbook; the path is relative to
# the notebook's working directory.
df=pd.read_excel('Final_cleaned_scaled_5_21.xlsx')

In [5]:
# Preview the first rows to sanity-check the load (identifier columns first,
# followed by the numeric features).
df.head()

Unnamed: 0,Country Name,region,sub-region,Country Code,Income group,Population aged 0 to 14 years old (percentage),Population aged 60+ years old (percentage),Population density,Population mid-year estimates (millions),Population mid-year estimates for females (millions),Population mid-year estimates for males (millions),Sex ratio (males per 100 females),GDP(USD - billion),Current health expenditure per capita (current US$),Current health expenditure (% of GDP),Domestic general government health expenditure (% of current health expenditure),Domestic general government health expenditure (% of GDP),Domestic general government health expenditure (% of general government expenditure),Domestic general government health expenditure per capita (current US$),Domestic private health expenditure (% of current health expenditure),Domestic private health expenditure per capita (current US$),External health expenditure (% of current health expenditure),External health expenditure per capita (current US$),SI_mean1,SI_mean2,min_SI,max_SI,Days_diff,growth_1,growth_2
0,Afghanistan,Asia,Southern Asia,AFG,Low income,0.803525,0.043767,0.007011,0.026509,0.026449,0.026482,0.087618,0.000916,0.004667,0.544648,0.0,0.007256,0.008444,5.2e-05,0.970848,0.011661,0.324983,0.03888,0.550713,0.349535,0.333392,0.675709,0.237624,0.001316,0.009175
1,Albania,Europe,Southern Europe,ALB,Upper middle income,0.135111,0.56635,0.012664,0.001986,0.001957,0.001926,0.079523,0.000708,0.025532,0.312771,0.398797,0.1731,0.206896,0.013595,0.720274,0.042577,0.012294,0.00697,0.71183,0.525025,0.0,0.837798,0.138614,0.000865,0.003027
2,Algeria,Africa,Northern Africa,DZA,Upper middle income,0.485699,0.220484,0.002163,0.030005,0.030448,0.0295,0.07209,0.008431,0.028194,0.309498,0.688632,0.300792,0.239367,0.021516,0.388782,0.02253,0.000411,0.000213,0.69888,0.366403,0.166783,0.94597,0.287129,0.003403,0.023632
3,Andorra,Europe,Southern Europe,AND,High income,0.053246,0.803976,0.019778,3e-05,0.007571,0.006507,0.046064,0.000131,0.376144,0.555845,0.48436,0.344595,0.326316,0.232989,0.628525,0.532639,0.004732,0.015008,0.42795,0.124223,0.166783,0.513507,0.069307,0.000878,0.006142
4,Angola,Africa,Sub-Saharan Africa,AGO,Lower middle income,0.914152,0.02643,0.003062,0.022174,0.022966,0.021337,0.053001,0.005121,0.009628,0.059747,0.429414,0.062405,0.098969,0.004893,0.645943,0.01314,0.067718,0.013463,0.260342,0.286854,0.083304,0.675709,0.475248,3.6e-05,0.000308


In [6]:
# Feature matrix: every column from the 0-14 age-share column through
# 'growth_2' (label slices are inclusive on both ends), with 'SI_mean1' and
# 'growth_2' removed afterwards.
X = (
    df.loc[:, 'Population aged 0 to 14 years old (percentage)':'growth_2']
    .drop(columns=['SI_mean1', 'growth_2'])
)

In [7]:
# Summary statistics for the selected features; in the recorded output every
# column has min 0 and max 1.
X.describe()

Unnamed: 0,Population aged 0 to 14 years old (percentage),Population aged 60+ years old (percentage),Population density,Population mid-year estimates (millions),Population mid-year estimates for females (millions),Population mid-year estimates for males (millions),Sex ratio (males per 100 females),GDP(USD - billion),Current health expenditure per capita (current US$),Current health expenditure (% of GDP),Domestic general government health expenditure (% of current health expenditure),Domestic general government health expenditure (% of GDP),Domestic general government health expenditure (% of general government expenditure),Domestic general government health expenditure per capita (current US$),Domestic private health expenditure (% of current health expenditure),Domestic private health expenditure per capita (current US$),External health expenditure (% of current health expenditure),External health expenditure per capita (current US$),SI_mean2,min_SI,max_SI,Days_diff,growth_1
count,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0
mean,0.367942,0.35893,0.023767,0.036581,0.038275,0.036999,0.072878,0.029397,0.128902,0.319369,0.555938,0.260446,0.26348,0.115449,0.462784,0.092986,0.125294,0.05771,0.450342,0.144682,0.835098,0.387861,0.021907
std,0.276107,0.275259,0.0863,0.120148,0.119888,0.120752,0.105442,0.106052,0.198111,0.182862,0.23416,0.187382,0.159142,0.191901,0.233895,0.135287,0.227805,0.132705,0.172718,0.150058,0.166002,0.2325,0.09321
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.117425,0.100498,0.003059,0.003398,0.003906,0.003688,0.046974,0.000818,0.011413,0.169906,0.401121,0.118198,0.156497,0.004945,0.272967,0.013559,0.004407,0.00437,0.332801,0.041652,0.756754,0.178218,0.000304
50%,0.322885,0.285244,0.010014,0.008013,0.008268,0.007802,0.056441,0.003294,0.038547,0.30873,0.584667,0.231951,0.239367,0.027077,0.432414,0.034515,0.021448,0.015008,0.436304,0.166783,0.837798,0.351485,0.001751
75%,0.558923,0.626101,0.019776,0.026483,0.027864,0.026937,0.066333,0.017392,0.138785,0.429803,0.740843,0.356831,0.349732,0.112658,0.608149,0.127518,0.104613,0.049035,0.562268,0.166783,0.94597,0.581683,0.008655
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [258]:
# List the feature column names currently held in X.
X.columns

Index(['Population aged 0 to 14 years old (percentage)',
       'Population aged 60+ years old (percentage)', 'Population density',
       'Population mid-year estimates (millions)',
       'Population mid-year estimates for females (millions)',
       'Population mid-year estimates for males (millions)',
       'Sex ratio (males per 100 females)', 'GDP(USD - billion)',
       'Current health expenditure per capita (current US$)',
       'Current health expenditure (% of GDP)',
       'Domestic general government health expenditure (% of current health expenditure)',
       'Domestic general government health expenditure (% of GDP)',
       'Domestic general government health expenditure (% of general government expenditure)',
       'Domestic general government health expenditure per capita (current US$)',
       'Domestic private health expenditure (% of current health expenditure)',
       'Domestic private health expenditure per capita (current US$)',
       'External health expend

In [8]:
# Remove the ten health-expenditure features from X, leaving the population,
# sex-ratio, GDP, SI, Days_diff and growth_1 columns.
# Note: X is rebound here, so re-running this cell on the already-reduced
# frame raises KeyError (the labels no longer exist).
X = X.drop(
    labels=[
        'Current health expenditure per capita (current US$)',
        'Current health expenditure (% of GDP)',
        'Domestic general government health expenditure (% of current health expenditure)',
        'Domestic general government health expenditure (% of GDP)',
        'Domestic general government health expenditure (% of general government expenditure)',
        'Domestic general government health expenditure per capita (current US$)',
        'Domestic private health expenditure (% of current health expenditure)',
        'Domestic private health expenditure per capita (current US$)',
        'External health expenditure (% of current health expenditure)',
        'External health expenditure per capita (current US$)',
    ],
    axis="columns",
)

In [17]:
# Fit a 3-component PCA on the reduced feature matrix X.
pca = PCA(n_components=3)

# Left as a bare expression so the fitted estimator's repr is shown as the
# cell output.
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [18]:
# Project X onto the three fitted principal components.
x_pca = pca.transform(X)

In [19]:
# Plotting frame: country names alongside the three PCA coordinates, so that
# hover labels in the scatter plots can show which point is which country.
dfX = df['Country Name'].to_frame().assign(
    pca1=x_pca[:, 0],
    pca2=x_pca[:, 1],
    pca3=x_pca[:, 2],
)

In [20]:
# 2-D view of the first two principal components; hovering a point shows the
# country name. The title distinguishes this figure from the other
# projections plotted in this notebook.
fig = px.scatter(
    dfX,
    x="pca1",
    y="pca2",
    hover_name="Country Name",
    title="PCA projection of country features (pca1 vs pca2)",
)

fig.show()

In [21]:
# 3-D view of all three principal components; hovering a point shows the
# country name. The title distinguishes this figure from the other
# projections plotted in this notebook.
fig = px.scatter_3d(
    dfX,
    x='pca1',
    y='pca2',
    z='pca3',
    hover_name='Country Name',
    title='PCA projection of country features (3 components)',
)
fig.show()

In [59]:
# Embed the feature matrix X into 3 dimensions with t-SNE for visual
# inspection. t-SNE is stochastic, so the seed is fixed to make the embedding
# (and every plot built from it) reproducible across kernel restarts.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter`;
# switch the keyword when the environment is upgraded.
tsne = TSNE(
    n_components=3,
    verbose=0,
    perplexity=30,
    n_iter=300,
    random_state=42,
)
tsne_results = tsne.fit_transform(X)

In [60]:
# Attach the three t-SNE coordinates to the plotting frame; transposing the
# (rows x 3) result yields one array per embedding axis, in order.
dfX['first'], dfX['second'], dfX['third'] = tsne_results.T

In [58]:
# 2-D view of the first two t-SNE coordinates; hovering a point shows the
# country name. The title distinguishes this figure from the PCA plots.
fig = px.scatter(
    dfX,
    x="first",
    y="second",
    hover_name="Country Name",
    title="t-SNE embedding of country features (first vs second)",
)

fig.show()

In [61]:
# 3-D view of all three t-SNE coordinates; hovering a point shows the
# country name. The title distinguishes this figure from the PCA plots.
fig = px.scatter_3d(
    dfX,
    x='first',
    y='second',
    z='third',
    hover_name='Country Name',
    title='t-SNE embedding of country features (3 components)',
)
fig.show()