In [1]:
# Import the dependencies
import os
from dotenv import load_dotenv
import pymongo as pym
from pymongo import MongoClient
import pymongoarrow as pma
from pymongoarrow.api import write
import pandas as pd

import re
from sklearn.cluster import HDBSCAN
from sklearn.preprocessing import MaxAbsScaler
from sklearn.decomposition import PCA
import sweetviz as sv

In [2]:
# load the config from the .env file
load_dotenv()
MONGODB_URI = os.environ['MONGODB_URI']

# Connect to the database engine
client = MongoClient(MONGODB_URI)

# connect to the project db
db = client['ExpectLifeRedux']

# get a reference to the data collection
data = db['ELR_Data']

In [3]:
# Create a dataframe from the collection
combined_df = pd.DataFrame(list(data.find()))
combined_df.head(5)

Unnamed: 0,_id,Country_Year,Country,Year,Gov Type,SSS Depth,SSS Type,Avg Rainfall (mm/yr),Pop Density (#/sq km),Total Population (M),...,Current HE per capita (PPP Intl $),Capital HE (% GDP),Domestic General Gov HE (% GDP),Domestic General Gov HE per capita (PPP Intl $),Domestic Private HE per capita (PPP Intl $),GDP growth per capita %,GDP per capita (PPP Intl $),Daily Caloric Supply,Tobacco use (% adults),Alcohol Use per capita (liters)
0,656c8a4891b27078d5aba381,Albania_1990,Albania,1990,parliamentary republic,2,"Universal medical benefits,Social insurance sy...",1485.0,119.946788,3.286542,...,222.786533,0.005188,2.765835,103.662764,115.435286,-11.187905,2549.746801,2568.0,35.0,6.57
1,656c8a4891b27078d5aba382,Albania_1991,Albania,1991,parliamentary republic,2,"Universal medical benefits,Social insurance sy...",1485.0,119.225912,3.26679,...,222.786533,0.005188,2.765835,103.662764,115.435286,-27.566821,1909.31916,2572.0,35.0,6.57
2,656c8a4891b27078d5aba383,Albania_1992,Albania,1992,parliamentary republic,2,"Universal medical benefits,Social insurance sy...",1485.0,118.505073,3.247039,...,222.786533,0.005188,2.765835,103.662764,115.435286,-6.622551,1823.503609,2654.0,35.0,6.57
3,656c8a4891b27078d5aba384,Albania_1993,Albania,1993,parliamentary republic,2,"Universal medical benefits,Social insurance sy...",1485.0,117.784197,3.227287,...,222.786533,0.005188,2.765835,103.662764,115.435286,10.229949,2057.692048,2795.0,35.0,6.57
4,656c8a4891b27078d5aba385,Albania_1994,Albania,1994,parliamentary republic,2,"Universal medical benefits,Social insurance sy...",1485.0,117.063358,3.207536,...,222.786533,0.005188,2.765835,103.662764,115.435286,8.969762,2290.143917,2877.0,35.0,6.57


In [4]:
# Drop the database id data and refresh the index
combined_df = combined_df.drop(['_id'], axis=1)
combined_df = combined_df.reset_index(drop=True)

# Create a new DataFrame for the country_years.  Apply the country_years as the new index for later merging.
country_year_df = pd.DataFrame()
country_year_df['Country_Year'] = combined_df['Country_Year']
country_year_df['Country'] = combined_df['Country']
country_year_df['Year'] = combined_df['Year']
country_year_df = country_year_df.set_index('Country_Year', drop=False)

# Set the DataFrame index to the country names to get them out of the way
combined_df= combined_df.set_index('Country_Year')
ori_df = combined_df.copy()
combined_df.head(5)

Unnamed: 0_level_0,Country,Year,Gov Type,SSS Depth,SSS Type,Avg Rainfall (mm/yr),Pop Density (#/sq km),Total Population (M),Total Labor Force (M),LEx years,...,Current HE per capita (PPP Intl $),Capital HE (% GDP),Domestic General Gov HE (% GDP),Domestic General Gov HE per capita (PPP Intl $),Domestic Private HE per capita (PPP Intl $),GDP growth per capita %,GDP per capita (PPP Intl $),Daily Caloric Supply,Tobacco use (% adults),Alcohol Use per capita (liters)
Country_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Albania_1990,Albania,1990,parliamentary republic,2,"Universal medical benefits,Social insurance sy...",1485.0,119.946788,3.286542,1.374478,73.144,...,222.786533,0.005188,2.765835,103.662764,115.435286,-11.187905,2549.746801,2568.0,35.0,6.57
Albania_1991,Albania,1991,parliamentary republic,2,"Universal medical benefits,Social insurance sy...",1485.0,119.225912,3.26679,1.429833,73.378,...,222.786533,0.005188,2.765835,103.662764,115.435286,-27.566821,1909.31916,2572.0,35.0,6.57
Albania_1992,Albania,1992,parliamentary republic,2,"Universal medical benefits,Social insurance sy...",1485.0,118.505073,3.247039,1.438342,73.715,...,222.786533,0.005188,2.765835,103.662764,115.435286,-6.622551,1823.503609,2654.0,35.0,6.57
Albania_1993,Albania,1993,parliamentary republic,2,"Universal medical benefits,Social insurance sy...",1485.0,117.784197,3.227287,1.413557,73.939,...,222.786533,0.005188,2.765835,103.662764,115.435286,10.229949,2057.692048,2795.0,35.0,6.57
Albania_1994,Albania,1994,parliamentary republic,2,"Universal medical benefits,Social insurance sy...",1485.0,117.063358,3.207536,1.391914,74.131,...,222.786533,0.005188,2.765835,103.662764,115.435286,8.969762,2290.143917,2877.0,35.0,6.57


In [5]:
# Create a new DataFrame for the SSS data.
SSS_df = combined_df[['SSS Depth', 'SSS Type']].copy()
SSS_df

Unnamed: 0_level_0,SSS Depth,SSS Type
Country_Year,Unnamed: 1_level_1,Unnamed: 2_level_1
Albania_1990,2,"Universal medical benefits,Social insurance sy..."
Albania_1991,2,"Universal medical benefits,Social insurance sy..."
Albania_1992,2,"Universal medical benefits,Social insurance sy..."
Albania_1993,2,"Universal medical benefits,Social insurance sy..."
Albania_1994,2,"Universal medical benefits,Social insurance sy..."
...,...,...
Zimbabwe_2018,1,Employer-liability system (cash sickness benef...
Zimbabwe_2019,1,Employer-liability system (cash sickness benef...
Zimbabwe_2020,1,Employer-liability system (cash sickness benef...
Zimbabwe_2021,1,Employer-liability system (cash sickness benef...


In [6]:
#create a list of SSS component phrases to match
component_terms = ["community-based social insurance","employer-liability system (cash benefits)","employer-liability system (cash maternity benefits)","employer-liability system (cash medical benefits)","employer-liability system (cash sickness benefits)","employer-liability system (maternity benefits)","employer-liability system (medical benefits)","employer-liability system (parental leave)","employer-liability system (paternity benefits)","employer-liability system (paternity leave)","employer-liability system (sickness benefits)","employer-liability system through private carriers","employer-liability system","employment-related system (cash benefits)","employment-related system (cash maternity benefits)","employment-related system (cash sickness benefits)","employment-related system (parental benefits)","employment-related system (tuberculosis benefits)","mandatory health insurance (medical benefits)","mandatory health insurance through private carriers (medical benefits)","mandatory individual account system (medical benefits)","mandatory private insurance (cash benefits)","mandatory private insurance (medical benefits)","mandatory private insurance","mandatory private pension scheme","open pension funds","provident fund (birth grant)","provident fund (medical benefits)","Retirement pension scheme","social assistance system (birth grant)","social assistance system (cash benefits)","social assistance system (cash birth grants)","social assistance system (cash maternity benefits)","social assistance system (cash sickness benefits)","social assistance system (funeral grant)","social assistance system (long-term care benefits)","social assistance system (maternity benefits)","social assistance system (medical benefits)","social assistance system (parental leave)","social assistance system","social insurance system (cash benefits)","social insurance system (cash maternity benefits)","social insurance system (cash medical benefits)","social insurance system (cash parental benefits)","social insurance system (cash sickness benefits)","social insurance system (cash sickness)","social insurance system (child care benefits)","social insurance system (in-kind benefits)","social insurance system (maternity benefits)","social insurance system (maternity medical benefits)","social insurance system (medical benefits)","social insurance system (prenatal benefits)","social insurance system","universal (adoption grant)","universal (birth grant)","universal (cash maternity benefits)","universal (cash parental grants)","universal (paid parental leave)","universal (parental benefits)","universal (pregnancy grant)","universal (prenatal care grant)","universal medical benefits","voluntary community-based social insurance system (medical benefits)","voluntary pension scheme"]

In [7]:
# Create a regex pattern from the list of components that will match across word boundaries
regex_pattern = r'(' + '|'.join(map(re.escape, sorted(component_terms, key=len, reverse=True))) + r')'

# Ensure the pattern is compiled for efficiency
compiled_pattern = re.compile(regex_pattern, flags=re.IGNORECASE)

# Function to match components using regex
def find_components(system_type, pattern):
    # Find all matches of the pattern within the system_type string
    matches = re.findall(pattern, system_type)
    return list(set(matches))  # return unique matches only

# Apply this function to the 'system_type' column
SSS_df['components_matched'] = SSS_df['SSS Type'].apply(lambda x: find_components(x, compiled_pattern))
SSS_df

Unnamed: 0_level_0,SSS Depth,SSS Type,components_matched
Country_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Albania_1990,2,"Universal medical benefits,Social insurance sy...","[Social insurance system (cash benefits), Univ..."
Albania_1991,2,"Universal medical benefits,Social insurance sy...","[Social insurance system (cash benefits), Univ..."
Albania_1992,2,"Universal medical benefits,Social insurance sy...","[Social insurance system (cash benefits), Univ..."
Albania_1993,2,"Universal medical benefits,Social insurance sy...","[Social insurance system (cash benefits), Univ..."
Albania_1994,2,"Universal medical benefits,Social insurance sy...","[Social insurance system (cash benefits), Univ..."
...,...,...,...
Zimbabwe_2018,1,Employer-liability system (cash sickness benef...,[Employer-liability system (maternity benefits...
Zimbabwe_2019,1,Employer-liability system (cash sickness benef...,[Employer-liability system (maternity benefits...
Zimbabwe_2020,1,Employer-liability system (cash sickness benef...,[Employer-liability system (maternity benefits...
Zimbabwe_2021,1,Employer-liability system (cash sickness benef...,[Employer-liability system (maternity benefits...


In [8]:
# components_matched contains lists of matched SSS component texts
components_df = SSS_df['components_matched'].str.join('|').str.get_dummies('|')

# Concatenate with original df
encoded_df = pd.concat([SSS_df, components_df], axis=1)


In [9]:
encoded_df.head(2)

Unnamed: 0_level_0,SSS Depth,SSS Type,components_matched,Community-based social insurance,Employer-liability system,Employer-liability system (cash benefits),Employer-liability system (cash maternity benefits),Employer-liability system (cash medical benefits),Employer-liability system (cash sickness benefits),Employer-liability system (maternity benefits),...,Universal (birth grant),Universal (cash maternity benefits),Universal (cash parental grants),Universal (paid parental leave),Universal (parental benefits),Universal (pregnancy grant),Universal (prenatal care grant),Universal medical benefits,Voluntary community-based social insurance system (medical benefits),Voluntary pension scheme
Country_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Albania_1990,2,"Universal medical benefits,Social insurance sy...","[Social insurance system (cash benefits), Univ...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
Albania_1991,2,"Universal medical benefits,Social insurance sy...","[Social insurance system (cash benefits), Univ...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [10]:
# Drop the text columns
encoded_df = encoded_df.drop(['SSS Type','components_matched'], axis=1)
encoded_df.head(2)

Unnamed: 0_level_0,SSS Depth,Community-based social insurance,Employer-liability system,Employer-liability system (cash benefits),Employer-liability system (cash maternity benefits),Employer-liability system (cash medical benefits),Employer-liability system (cash sickness benefits),Employer-liability system (maternity benefits),Employer-liability system (medical benefits),Employer-liability system (parental leave),...,Universal (birth grant),Universal (cash maternity benefits),Universal (cash parental grants),Universal (paid parental leave),Universal (parental benefits),Universal (pregnancy grant),Universal (prenatal care grant),Universal medical benefits,Voluntary community-based social insurance system (medical benefits),Voluntary pension scheme
Country_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Albania_1990,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
Albania_1991,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [11]:
# Collect the column labels so they can be reapplied after data scaling
encoded_col_names = encoded_df.columns.tolist()

In [12]:
# Standardize the data with MaxAbsScaler(). Using this scaler to maintain integrity of sparse encoded values.
scaler = MaxAbsScaler()

encode_scaled_nda = scaler.fit_transform(encoded_df)
encode_scaled_nda

array([[0.4, 0. , 0. , ..., 1. , 0. , 0. ],
       [0.4, 0. , 0. , ..., 1. , 0. , 0. ],
       [0.4, 0. , 0. , ..., 1. , 0. , 0. ],
       ...,
       [0.2, 0. , 0. , ..., 0. , 0. , 0. ],
       [0.2, 0. , 0. , ..., 0. , 0. , 0. ],
       [0.2, 0. , 0. , ..., 0. , 0. , 0. ]])

In [13]:
# Convert the scaled-encoded data back to a DataFrame (nda = Numpy Data Array)
sss_se_df = pd.DataFrame(encode_scaled_nda, index=encoded_df.index)

# Apply the column labels to ensure the data is properly identified
sss_se_df = sss_se_df.set_axis(encoded_col_names, axis=1)
sss_se_df

Unnamed: 0_level_0,SSS Depth,Community-based social insurance,Employer-liability system,Employer-liability system (cash benefits),Employer-liability system (cash maternity benefits),Employer-liability system (cash medical benefits),Employer-liability system (cash sickness benefits),Employer-liability system (maternity benefits),Employer-liability system (medical benefits),Employer-liability system (parental leave),...,Universal (birth grant),Universal (cash maternity benefits),Universal (cash parental grants),Universal (paid parental leave),Universal (parental benefits),Universal (pregnancy grant),Universal (prenatal care grant),Universal medical benefits,Voluntary community-based social insurance system (medical benefits),Voluntary pension scheme
Country_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Albania_1990,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Albania_1991,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Albania_1992,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Albania_1993,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Albania_1994,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zimbabwe_2018,0.2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zimbabwe_2019,0.2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zimbabwe_2020,0.2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zimbabwe_2021,0.2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# define the model
SSS_model = HDBSCAN(n_jobs=-1)

# Fit the model - Make predictions
SSS_pred = SSS_model.fit_predict(sss_se_df)

# Add the predicted class columns
sss_se_df['SSS Class'] = SSS_model.labels_

In [15]:
sss_se_df

Unnamed: 0_level_0,SSS Depth,Community-based social insurance,Employer-liability system,Employer-liability system (cash benefits),Employer-liability system (cash maternity benefits),Employer-liability system (cash medical benefits),Employer-liability system (cash sickness benefits),Employer-liability system (maternity benefits),Employer-liability system (medical benefits),Employer-liability system (parental leave),...,Universal (cash maternity benefits),Universal (cash parental grants),Universal (paid parental leave),Universal (parental benefits),Universal (pregnancy grant),Universal (prenatal care grant),Universal medical benefits,Voluntary community-based social insurance system (medical benefits),Voluntary pension scheme,SSS Class
Country_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Albania_1990,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,81
Albania_1991,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,81
Albania_1992,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,81
Albania_1993,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,81
Albania_1994,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,81
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zimbabwe_2018,0.2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,77
Zimbabwe_2019,0.2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,77
Zimbabwe_2020,0.2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,77
Zimbabwe_2021,0.2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,77


In [29]:
# Create a dictionary to send to MongoDB - Alternate method, not necessary when using PyMongoArrow
#sss_dict = sss_se_df.to_dict('records')
#sss_dict

In [30]:
# load the config from the .env file
load_dotenv()
MONGODB_URI = os.environ['MONGODB_URI']

# Connect to the database engine
client = MongoClient(MONGODB_URI)

# connect to the project db
db = client['ExpectLifeRedux']

# get a reference to the data collection
SSS_data = db['SSS_Clusters']

# write the data to the database (alternate method)
#SSS_data.insert_many(sss_dict)

In [28]:
# prefered method - use PyMongoArrow
write(db.SSS_Clusters, sss_se_df)

  if _pandas_api.is_sparse(col):


{'insertedCount': 6105}

In [None]:
# Add the SSS_Class to the original dataframe
combined_df['SSS Class'] = sss_se_df['SSS_class']
combined_df