In [278]:
# !pip install plotly
# !pip install hvplot
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans
df = pd.DataFrame
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [279]:
file_path = "country_indicators.csv"
indic_df = pd.read_csv(file_path)
indic_df

Unnamed: 0,DV_phys_or_sex_partner_p,DM_age_mean,DM_age_marr_mean,ED_educ_years_mean,FF_TFR,country_name
0,,27.78,18.58,6.29,3.575,Colombia
1,,28.67,20.28,6.16,2.752,Colombia
2,,28.89,19.96,6.40,2.546,Colombia
3,,28.52,19.80,6.79,2.835,Colombia
4,,28.59,20.34,8.79,2.447,Colombia
...,...,...,...,...,...,...
7365,21.19,27.52,18.29,4.35,5.318,Senegal
7366,38.76,29.29,18.30,3.99,4.649,Sierra Leone
7367,59.93,28.82,18.84,3.56,4.460,Sierra Leone
7368,40.98,29.31,19.15,3.88,4.970,Sierra Leone


In [280]:
indic_df = indic_df.dropna(how="any")
indic_df

Unnamed: 0,DV_phys_or_sex_partner_p,DM_age_mean,DM_age_marr_mean,ED_educ_years_mean,FF_TFR,country_name
1403,34.24,29.43,19.32,7.96,2.709,Colombia
1404,43.26,29.78,20.13,7.35,2.783,Colombia
1405,39.72,30.05,20.48,7.86,2.536,Colombia
1406,49.00,29.48,19.68,7.47,2.551,Colombia
1407,42.68,30.10,20.89,9.49,2.422,Colombia
...,...,...,...,...,...,...
7365,21.19,27.52,18.29,4.35,5.318,Senegal
7366,38.76,29.29,18.30,3.99,4.649,Sierra Leone
7367,59.93,28.82,18.84,3.56,4.460,Sierra Leone
7368,40.98,29.31,19.15,3.88,4.970,Sierra Leone


In [281]:
indic_df = indic_df.set_index('country_name')

In [282]:
indic_df

Unnamed: 0_level_0,DV_phys_or_sex_partner_p,DM_age_mean,DM_age_marr_mean,ED_educ_years_mean,FF_TFR
country_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Colombia,34.24,29.43,19.32,7.96,2.709
Colombia,43.26,29.78,20.13,7.35,2.783
Colombia,39.72,30.05,20.48,7.86,2.536
Colombia,49.00,29.48,19.68,7.47,2.551
Colombia,42.68,30.10,20.89,9.49,2.422
...,...,...,...,...,...
Senegal,21.19,27.52,18.29,4.35,5.318
Sierra Leone,38.76,29.29,18.30,3.99,4.649
Sierra Leone,59.93,28.82,18.84,3.56,4.460
Sierra Leone,40.98,29.31,19.15,3.88,4.970


In [283]:
indic_scaled = StandardScaler().fit_transform(indic_df)
print(indic_scaled[0:5])

[[ 2.19844281e-01  3.34111827e-01 -2.77590667e-04  5.29729543e-01
  -1.05979877e+00]
 [ 8.34759540e-01  5.35128819e-01  5.76152046e-01  3.03458000e-01
  -1.00865664e+00]
 [ 5.93429161e-01  6.90199070e-01  8.25226581e-01  4.92635848e-01
  -1.17936078e+00]
 [ 1.22606925e+00  3.62828540e-01  2.55913359e-01  3.47970435e-01
  -1.16899413e+00]
 [ 7.95219534e-01  7.18915783e-01  1.11699961e+00  1.09726309e+00
  -1.25814731e+00]]


In [284]:
# Initialize PCA model
pca = PCA(n_components=2)

In [285]:
# Get two principal components for the data
indic_pca = pca.fit_transform(indic_scaled)

In [286]:
# Transform PCA data to a DataFrame
indic_pca = pd.DataFrame(
    data=indic_pca, columns=["principal component 1","principal component 2"]
)
indic_pca.head()

Unnamed: 0,principal component 1,principal component 2
0,0.934209,-0.354148
1,1.078247,-0.809714
2,1.48948,-0.658216
3,0.909939,-1.132377
4,1.996145,-0.982361


In [287]:
# Find the best value for K
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(indic_pca)
    inertia.append(km.inertia_)

# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
indic_df_elbow = pd.DataFrame(elbow_data)
indic_df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [288]:
# Initialize the K-means model
model = KMeans(n_clusters=3, random_state=0)

# Fit the model
model.fit(indic_pca)

# Predict clusters
predictions = model.predict(indic_pca)

# Add the predicted class columns
indic_pca["class"] = model.labels_
indic_pca.head()

Unnamed: 0,principal component 1,principal component 2,class
0,0.934209,-0.354148,2
1,1.078247,-0.809714,2
2,1.48948,-0.658216,2
3,0.909939,-1.132377,2
4,1.996145,-0.982361,2


In [289]:
indic_pca.hvplot.scatter(
    x="principal component 1",
    y="principal component 2",
    hover_cols=["class"],
    by="class",
)

In [290]:
indic_df = indic_df.reset_index()

In [291]:
indic_df.shape

(2256, 6)

In [292]:
indic_pca.shape

(2256, 3)

In [293]:
print(indic_df.index.values)

[   0    1    2 ... 2253 2254 2255]


In [294]:
print(indic_pca.index.values)

[   0    1    2 ... 2253 2254 2255]


In [295]:
clustered_df = indic_df.join(indic_pca)
print(clustered_df.shape)
clustered_df

(2256, 9)


Unnamed: 0,country_name,DV_phys_or_sex_partner_p,DM_age_mean,DM_age_marr_mean,ED_educ_years_mean,FF_TFR,principal component 1,principal component 2,class
0,Colombia,34.24,29.43,19.32,7.96,2.709,0.934209,-0.354148,2
1,Colombia,43.26,29.78,20.13,7.35,2.783,1.078247,-0.809714,2
2,Colombia,39.72,30.05,20.48,7.86,2.536,1.489480,-0.658216,2
3,Colombia,49.00,29.48,19.68,7.47,2.551,0.909939,-1.132377,2
4,Colombia,42.68,30.10,20.89,9.49,2.422,1.996145,-0.982361,2
...,...,...,...,...,...,...,...,...,...
2251,Senegal,21.19,27.52,18.29,4.35,5.318,-1.372330,0.909303,0
2252,Sierra Leone,38.76,29.29,18.30,3.99,4.649,-1.063171,-0.383649,1
2253,Sierra Leone,59.93,28.82,18.84,3.56,4.460,-1.136629,-1.425892,1
2254,Sierra Leone,40.98,29.31,19.15,3.88,4.970,-0.888892,-0.444863,1


In [296]:
country_demo = "CountryDemographics.csv"
country_demo_df = pd.read_csv(country_demo)
country_demo_df

Unnamed: 0.1,Unnamed: 0,country_name,country_code,latitude,longitude,GDP
0,0,Austria,AT,47.516231,14.550072,3.711230e+11
1,1,Australia,AU,-25.274398,133.775136,1.058007e+12
2,2,Bangladesh,BD,23.684994,90.356331,1.645154e+11
3,3,Belgium,BE,50.503887,4.469936,4.491997e+11
4,4,Burkina Faso,BF,12.238333,-1.561593,1.044752e+10
...,...,...,...,...,...,...
315,315,United States,US,37.090240,-95.712891,1.590805e+13
316,316,Uruguay,UY,-32.522779,-55.765835,3.968593e+10
317,317,South Africa,ZA,-30.559482,22.937506,3.242687e+11
318,318,Zambia,ZM,-13.133897,27.849332,1.698932e+10


In [297]:
# combined_df = clustered_df


clustered_df = clustered_df.set_index('country_name')
country_demo_df = country_demo_df.set_index('country_name')
combined_df = clustered_df.join(country_demo_df,on = "country_name")
combined_df = combined_df.drop(columns= ['country_code','principal component 1','principal component 2','Unnamed: 0'])

In [298]:
combined_df

Unnamed: 0_level_0,DV_phys_or_sex_partner_p,DM_age_mean,DM_age_marr_mean,ED_educ_years_mean,FF_TFR,class,latitude,longitude,GDP
country_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Colombia,34.24,29.43,19.32,7.96,2.709,2,4.570868,-74.297333,2.446231e+11
Colombia,34.24,29.43,19.32,7.96,2.709,2,4.570868,-74.297333,2.446231e+11
Colombia,34.24,29.43,19.32,7.96,2.709,2,4.570868,-74.297333,2.446231e+11
Colombia,34.24,29.43,19.32,7.96,2.709,2,4.570868,-74.297333,2.446231e+11
Colombia,43.26,29.78,20.13,7.35,2.783,2,4.570868,-74.297333,2.446231e+11
...,...,...,...,...,...,...,...,...,...
Sierra Leone,40.98,29.31,19.15,3.88,4.970,1,8.460555,-11.779889,2.891161e+09
Sierra Leone,59.51,27.56,19.80,7.24,2.931,2,8.460555,-11.779889,2.891161e+09
Sierra Leone,59.51,27.56,19.80,7.24,2.931,2,8.460555,-11.779889,2.891161e+09
Sierra Leone,59.51,27.56,19.80,7.24,2.931,2,8.460555,-11.779889,2.891161e+09


In [300]:
combined_df.to_csv('Women_well_being_clustered_data.csv')