In [None]:
import warnings
warnings.filterwarnings('ignore')

# Import our dependencies
import numpy as np
import pandas as pd
import hvplot.pandas
import plotly.express as px
from functools import reduce

import datetime
import os
from dotenv import load_dotenv
from pymongo import MongoClient

import seaborn as sns
import sweetviz as sv

from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA, FastICA
from sklearn.cluster import AgglomerativeClustering, Birch, KMeans, SpectralClustering
from sklearn.mixture import GaussianMixture

In [None]:
# Load the config from the .env file: load_dotenv() (python-dotenv) is called first
# so that the lookup below can see MONGODB_URI when it is defined there.
load_dotenv()
# Indexing os.environ (rather than .get) raises KeyError when MONGODB_URI is unset.
MONGODB_URI = os.environ['MONGODB_URI']

In [None]:
# Open a client against the MongoDB deployment named by the connection string
client = MongoClient(host=MONGODB_URI)

In [None]:
# Handle to the project database
db = client.get_database('ExpectLife')

In [None]:
# Handle to the collection that is read into a DataFrame in the next cell
data = db.get_collection('clustering_final_system_coded_input')

In [None]:
# Pull every document of the collection into memory and tabulate them
combined_df = pd.DataFrame(data=[record for record in data.find()])
combined_df

In [None]:
# Remove the database '_id' column, then renumber the rows with a fresh 0..n-1 index
combined_df = combined_df.drop(columns=['_id']).reset_index(drop=True)
combined_df

In [None]:
# One-column frame of country names, indexed by those same names (drop=False keeps
# 'country' as a regular column too) so it lines up with the other frames when merged.
country_df = combined_df[['country']].set_index('country', drop=False)
country_df

In [None]:
# Move the country names into the index so they stay out of the feature columns.
# combined_df is reassigned to itself here, so re-running the cell would look up a
# 'country' column that has already been consumed and raise KeyError. Skip the move
# when the index is already 'country'; a genuinely missing column still raises.
if combined_df.index.name != 'country':
    combined_df = combined_df.set_index('country')
combined_df

# Encoding

In [None]:
# One-hot encode the text columns s1..s5; all other columns pass through unchanged
encode_df = pd.get_dummies(
    data=combined_df,
    columns=['s1', 's2', 's3', 's4', 's5'],
)
encode_df

In [None]:
# Remember the encoded column labels: scaling returns a bare array without them
col_names = encode_df.columns.to_list()

# Scaling

In [None]:
# Rescale every encoded column with MinMaxScaler (range scaling, not z-score standardization)
scaler = MinMaxScaler()
scaler.fit(encode_df)

# The transform yields a NumPy array, so the row and column labels are lost here
encode_scaled_nda = scaler.transform(encode_df)
encode_scaled_nda

In [None]:
# Wrap the scaled array (nda = Numpy Data Array) in a DataFrame again, restoring the
# row index from encode_df; the columns stay positional at this point.
scale_encode_df = pd.DataFrame(data=encode_scaled_nda, index=encode_df.index)
scale_encode_df

In [None]:
# Put the saved column labels back on the scaled frame
scale_encode_df = scale_encode_df.set_axis(col_names, axis='columns')
scale_encode_df

## PCA

In [None]:
# Starter hyper-parameters: the PCA cell reads n_comp and the elbow-curve cell reads rs.
# ms and eps are not referenced by any cell visible in this notebook.
n_comp = 10  # n_components
nc = 10      # n_clusters
rs = 42      # random_state
ms = 20      # min_samples
eps = 0.65   # eps

In [None]:
# Reduce the scaled, encoded features to n_comp principal components.
# random_state is pinned because PCA may select a randomized SVD solver depending on
# the data shape; with the seed fixed, the components are reproducible across runs.
pca = PCA(n_components=n_comp, random_state=rs)

# Fit on the scaled array and project it onto the principal components
demo_pca = pca.fit_transform(encode_scaled_nda)
demo_pca

In [None]:
# Create a DataFrame with the principal components.
# Column labels pc1..pcN take N from the array PCA actually produced rather than from
# n_comp: a later cell reassigns n_comp, which would make a re-run of this cell fail
# with a column-count mismatch.
columnz = [f'pc{i}' for i in range(1, demo_pca.shape[1] + 1)]

# Rows are labelled with the same index as combined_df so the frames can be merged later
pcs_df = pd.DataFrame(data=demo_pca, columns=columnz, index=combined_df.index)
pcs_df

## Compute an Elbow Curve

In [None]:
# Sweep k = 1..14: fit K-Means on the principal components and record each inertia
k = list(range(1, 15))
inertia = []

for i in k:
    km = KMeans(n_clusters=i, random_state=rs).fit(pcs_df)
    inertia.append(km.inertia_)

# Tabulate the sweep and plot inertia against k to look for the elbow
elbow_data = {'k': k, 'inertia': inertia}
elbow_df = pd.DataFrame(data=elbow_data)

elbow_df.hvplot(x='k', y='inertia', xticks=k, title='Elbow Curve')


In [None]:
# Hyper-parameters for the clustering models below: nc and rs are read by the
# clustering cells, n_comp by the Gaussian mixture cell.
# ms and eps are not referenced by any cell visible in this notebook.
n_comp = 8   # n_components
nc = 6       # n_clusters
rs = 42      # random_state
ms = 20      # min_samples
eps = 0.05   # eps

## KMeans

In [None]:
# Initialize the K-Means model.
model = KMeans(n_clusters=nc, random_state=rs)

# Work on a copy so pcs_df itself never gains a 'class' column
km_pcs_df = pcs_df.copy()

# Fit and label in a single pass: fit_predict returns the fitted model's labels_,
# so no separate predict() call over the training data is needed.
pred = model.fit_predict(km_pcs_df)

# Add the predicted class column
km_pcs_df['class'] = pred
km_pcs_df.head()

In [None]:
# Remove the listed feature columns from the scaled frame: combined_df, concatenated
# below, already carries columns with these labels.
pca_scenc_df = scale_encode_df.drop(
    columns=[
        'both_sexes_lex', 'female_lex', 'male_lex', 'GDP(M$)',
        'daily calories (2018)', 'daily plant protein (g  2013)',
        'daily animal protein (g  2013)', 'population', 'sss_depth',
        'govt_he', 'private_he', 'govt_he_gdp',
        'tot_alcohol_consumption', 'tobacco_use_%',
    ]
)

# Place country names, original features, remaining scaled columns and the
# K-Means-labelled principal components side by side, then drop the text columns.
frames = [country_df, combined_df, pca_scenc_df, km_pcs_df]
clustered_df = (
    pd.concat(frames, axis=1, join='outer')
    .set_axis(encode_df.index, axis=0)
    .drop(columns=['s1', 's2', 's3', 's4', 's5'])
)

# Report the dimensions, then display the merged frame
print(clustered_df.shape)
clustered_df

In [None]:
# 3-D scatter of clustered_df coloured by the 'class' label; hovering shows the country
# name plus the both_sexes_lex, population and GDP(M$) columns.
fig = px.scatter_3d(
    clustered_df,
    x='sss_depth',
    y='population',
    z='govt_he',
    color='class',
    hover_name='country',
    hover_data=['both_sexes_lex', 'population', 'GDP(M$)'],
    width=800,
)
fig.show()

In [None]:
# Marker size follows sss_depth, multiplied by 30
sizez = 30 * clustered_df['sss_depth']

# both_sexes_lex per cluster label, one series per class
clustered_df.hvplot.scatter(
    x='class',
    y='both_sexes_lex',
    size=sizez,
    hover_cols=['country'],
    line_color='#c994c7',
    hover_line_color='magenta',
    by='class',
)

## Agglomerative Clustering

In [None]:
# Complete-linkage agglomerative clustering into nc groups
model = AgglomerativeClustering(linkage='complete', n_clusters=nc)

# Cluster the principal components; fit_predict hands back the fitted labels
pred = model.fit_predict(pcs_df)

# Labelled copy of the principal components (pcs_df itself is left untouched)
ac_pcs_df = pcs_df.assign(**{'class': pred})
ac_pcs_df.head()

In [None]:
# Remove the listed feature columns from the scaled frame: combined_df, concatenated
# below, already carries columns with these labels.
pca_ac_scenc_df = scale_encode_df.drop(
    columns=[
        'both_sexes_lex', 'female_lex', 'male_lex', 'GDP(M$)',
        'daily calories (2018)', 'daily plant protein (g  2013)',
        'daily animal protein (g  2013)', 'population', 'sss_depth',
        'govt_he', 'private_he', 'govt_he_gdp',
        'tot_alcohol_consumption', 'tobacco_use_%',
    ]
)

# Place country names, original features, remaining scaled columns and the
# agglomerative-labelled principal components side by side, then drop the text columns.
frames = [country_df, combined_df, pca_ac_scenc_df, ac_pcs_df]
clustered_df = (
    pd.concat(frames, axis=1, join='outer')
    .set_axis(encode_df.index, axis=0)
    .drop(columns=['s1', 's2', 's3', 's4', 's5'])
)

# Report the dimensions, then display the merged frame
print(clustered_df.shape)
clustered_df

In [None]:
# 3-D scatter of clustered_df coloured by the 'class' label; hovering shows the country
# name plus the both_sexes_lex, population and GDP(M$) columns.
fig = px.scatter_3d(
    clustered_df,
    x='sss_depth',
    y='population',
    z='govt_he',
    color='class',
    hover_name='country',
    hover_data=['both_sexes_lex', 'population', 'GDP(M$)'],
    width=800,
)
fig.show()

In [None]:
# Marker size follows sss_depth, multiplied by 30
sizez = 30 * clustered_df['sss_depth']

# both_sexes_lex per cluster label, one series per class
clustered_df.hvplot.scatter(
    x='class',
    y='both_sexes_lex',
    size=sizez,
    hover_cols=['country'],
    line_color='#c994c7',
    hover_line_color='magenta',
    by='class',
)

## Agglomerative Clustering - Ward

In [None]:
# Agglomerative clustering into nc groups; 'ward' is the estimator's default linkage,
# spelled out here to match the section title.
model = AgglomerativeClustering(n_clusters=nc, linkage='ward')

# Cluster the principal components; fit_predict hands back the fitted labels
pred = model.fit_predict(pcs_df)

# Labelled copy of the principal components (pcs_df itself is left untouched)
acw_pcs_df = pcs_df.assign(**{'class': pred})
acw_pcs_df.head()

In [None]:
# Remove the listed feature columns from the scaled frame: combined_df, concatenated
# below, already carries columns with these labels.
pca_acw_scenc_df = scale_encode_df.drop(
    columns=[
        'both_sexes_lex', 'female_lex', 'male_lex', 'GDP(M$)',
        'daily calories (2018)', 'daily plant protein (g  2013)',
        'daily animal protein (g  2013)', 'population', 'sss_depth',
        'govt_he', 'private_he', 'govt_he_gdp',
        'tot_alcohol_consumption', 'tobacco_use_%',
    ]
)

# Place country names, original features, remaining scaled columns and the
# Ward-labelled principal components side by side, then drop the text columns.
frames = [country_df, combined_df, pca_acw_scenc_df, acw_pcs_df]
clustered_df = (
    pd.concat(frames, axis=1, join='outer')
    .set_axis(encode_df.index, axis=0)
    .drop(columns=['s1', 's2', 's3', 's4', 's5'])
)

# Report the dimensions, then display the merged frame
print(clustered_df.shape)
clustered_df

In [None]:
# 3-D scatter of clustered_df coloured by the 'class' label; hovering shows the country
# name plus the both_sexes_lex, population and GDP(M$) columns.
fig = px.scatter_3d(
    clustered_df,
    x='sss_depth',
    y='population',
    z='govt_he',
    color='class',
    hover_name='country',
    hover_data=['both_sexes_lex', 'population', 'GDP(M$)'],
    width=800,
)
fig.show()

In [None]:
# Marker size follows sss_depth, multiplied by 30
sizez = 30 * clustered_df['sss_depth']

# both_sexes_lex per cluster label, one series per class
clustered_df.hvplot.scatter(
    x='class',
    y='both_sexes_lex',
    size=sizez,
    hover_cols=['country'],
    line_color='#c994c7',
    hover_line_color='magenta',
    by='class',
)

## BIRCH

In [None]:
# BIRCH with a fine threshold, a branching factor of 45 and nc final clusters
model = Birch(n_clusters=nc, threshold=0.01, branching_factor=45)

# Cluster the principal components; fit_predict hands back the fitted labels
pred = model.fit_predict(pcs_df)

# Labelled copy of the principal components (pcs_df itself is left untouched)
b_pcs_df = pcs_df.assign(**{'class': pred})
b_pcs_df.head()

In [None]:
# Remove the listed feature columns from the scaled frame: combined_df, concatenated
# below, already carries columns with these labels.
pca_b_scenc_df = scale_encode_df.drop(
    columns=[
        'both_sexes_lex', 'female_lex', 'male_lex', 'GDP(M$)',
        'daily calories (2018)', 'daily plant protein (g  2013)',
        'daily animal protein (g  2013)', 'population', 'sss_depth',
        'govt_he', 'private_he', 'govt_he_gdp',
        'tot_alcohol_consumption', 'tobacco_use_%',
    ]
)

# Place country names, original features, remaining scaled columns and the
# BIRCH-labelled principal components side by side, then drop the text columns.
frames = [country_df, combined_df, pca_b_scenc_df, b_pcs_df]
clustered_df = (
    pd.concat(frames, axis=1, join='outer')
    .set_axis(encode_df.index, axis=0)
    .drop(columns=['s1', 's2', 's3', 's4', 's5'])
)

# Report the dimensions, then display the merged frame
print(clustered_df.shape)
clustered_df

In [None]:
# 3-D scatter of clustered_df coloured by the 'class' label; hovering shows the country
# name plus the both_sexes_lex, population and GDP(M$) columns.
fig = px.scatter_3d(
    clustered_df,
    x='sss_depth',
    y='population',
    z='govt_he',
    color='class',
    hover_name='country',
    hover_data=['both_sexes_lex', 'population', 'GDP(M$)'],
    width=800,
)
fig.show()

In [None]:
# Marker size follows sss_depth, multiplied by 30
sizez = 30 * clustered_df['sss_depth']

# both_sexes_lex per cluster label, one series per class
clustered_df.hvplot.scatter(
    x='class',
    y='both_sexes_lex',
    size=sizez,
    hover_cols=['country'],
    line_color='#c994c7',
    hover_line_color='magenta',
    by='class',
)

## Spectral Clustering

In [None]:
# Spectral clustering into nc groups using the ARPACK eigen-solver and QR-based label
# assignment. random_state is pinned to rs, like the K-Means cells, so that any
# randomly initialised step of the estimator gives the same labels on every run.
model = SpectralClustering(
    n_clusters=nc,
    eigen_solver='arpack',
    assign_labels='cluster_qr',
    random_state=rs,
)

# Work on a copy so pcs_df itself never gains a 'class' column
sc_pcs_df = pcs_df.copy()

# Fit the model and get one label per row (fit_predict returns the fitted labels_)
pred = model.fit_predict(sc_pcs_df)

# Add the predicted class column
sc_pcs_df['class'] = pred
sc_pcs_df.head()

In [None]:
# Remove the listed feature columns from the scaled frame: combined_df, concatenated
# below, already carries columns with these labels.
pca_sc_scenc_df = scale_encode_df.drop(
    columns=[
        'both_sexes_lex', 'female_lex', 'male_lex', 'GDP(M$)',
        'daily calories (2018)', 'daily plant protein (g  2013)',
        'daily animal protein (g  2013)', 'population', 'sss_depth',
        'govt_he', 'private_he', 'govt_he_gdp',
        'tot_alcohol_consumption', 'tobacco_use_%',
    ]
)

# Place country names, original features, remaining scaled columns and the
# spectral-labelled principal components side by side, then drop the text columns.
frames = [country_df, combined_df, pca_sc_scenc_df, sc_pcs_df]
clustered_df = (
    pd.concat(frames, axis=1, join='outer')
    .set_axis(encode_df.index, axis=0)
    .drop(columns=['s1', 's2', 's3', 's4', 's5'])
)

# Report the dimensions, then display the merged frame
print(clustered_df.shape)
clustered_df

In [None]:
# 3-D scatter of clustered_df coloured by the 'class' label; hovering shows the country
# name plus the both_sexes_lex, population and GDP(M$) columns.
fig = px.scatter_3d(
    clustered_df,
    x='sss_depth',
    y='population',
    z='govt_he',
    color='class',
    hover_name='country',
    hover_data=['both_sexes_lex', 'population', 'GDP(M$)'],
    width=800,
)
fig.show()

In [None]:
# Marker size follows sss_depth, multiplied by 30
sizez = 30 * clustered_df['sss_depth']

# both_sexes_lex per cluster label, one series per class
clustered_df.hvplot.scatter(
    x='class',
    y='both_sexes_lex',
    size=sizez,
    hover_cols=['country'],
    line_color='#c994c7',
    hover_line_color='magenta',
    by='class',
)

## Gaussian Mixture Model

In [None]:
# Gaussian mixture with diagonal covariances, 3 restarts and up to 200 EM iterations.
# init_params='random_from_data' picks the starting means at random, so random_state is
# pinned to rs; without it the cluster labels change on every run.
# NOTE(review): the number of mixture components is n_comp (8 in the parameter cell),
# whereas the other models in this notebook use nc (6) clusters - confirm this is intended.
model = GaussianMixture(
    n_components=n_comp,
    covariance_type='diag',
    n_init=3,
    max_iter=200,
    init_params='random_from_data',
    random_state=rs,
)

# Work on a copy so pcs_df itself never gains a 'class' column
g_pcs_df = pcs_df.copy()

# Fit the model and assign each row to a mixture component
pred = model.fit_predict(g_pcs_df)

# Add the predicted class column
g_pcs_df['class'] = pred
g_pcs_df.head()

In [None]:
# Remove the listed feature columns from the scaled frame: combined_df, concatenated
# below, already carries columns with these labels.
pca_g_scenc_df = scale_encode_df.drop(
    columns=[
        'both_sexes_lex', 'female_lex', 'male_lex', 'GDP(M$)',
        'daily calories (2018)', 'daily plant protein (g  2013)',
        'daily animal protein (g  2013)', 'population', 'sss_depth',
        'govt_he', 'private_he', 'govt_he_gdp',
        'tot_alcohol_consumption', 'tobacco_use_%',
    ]
)

# Place country names, original features, remaining scaled columns and the
# mixture-labelled principal components side by side, then drop the text columns.
frames = [country_df, combined_df, pca_g_scenc_df, g_pcs_df]
clustered_df = (
    pd.concat(frames, axis=1, join='outer')
    .set_axis(encode_df.index, axis=0)
    .drop(columns=['s1', 's2', 's3', 's4', 's5'])
)

# Report the dimensions, then display the merged frame
print(clustered_df.shape)
clustered_df

In [None]:
# 3-D scatter of clustered_df coloured by the 'class' label; hovering shows the country
# name plus the both_sexes_lex, population and GDP(M$) columns.
fig = px.scatter_3d(
    clustered_df,
    x='sss_depth',
    y='population',
    z='govt_he',
    color='class',
    hover_name='country',
    hover_data=['both_sexes_lex', 'population', 'GDP(M$)'],
    width=800,
)
fig.show()

In [None]:
# Marker size follows sss_depth, multiplied by 30
sizez = 30 * clustered_df['sss_depth']

# both_sexes_lex per cluster label, one series per class
clustered_df.hvplot.scatter(
    x='class',
    y='both_sexes_lex',
    size=sizez,
    hover_cols=['country'],
    line_color='#c994c7',
    hover_line_color='magenta',
    by='class',
)