In [1]:
# Import the dependencies
import os
from dotenv import load_dotenv
import pymongo as pym
from pymongo import MongoClient
import pymongoarrow as pma
from pymongoarrow.api import write
import pandas as pd

import re
from sklearn.cluster import HDBSCAN
from sklearn.preprocessing import MaxAbsScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import sweetviz as sv

In [2]:
# load the config from the .env file
load_dotenv()
MONGODB_URI = os.environ['MONGODB_URI']

# Connect to the database engine
client = MongoClient(MONGODB_URI)

# connect to the project db
db = client['ExpectLifeRedux']

# get a reference to the data collection
data = db['ELR_Data']

In [3]:
# Create a dataframe from the collection
combined_df = pd.DataFrame(list(data.find()))
combined_df.head(5)

Unnamed: 0,_id,Country_Year,Country,Year,Gov Type,SSS Depth,SSS Type,Avg Rainfall (mm/yr),Pop Density (#/sq km),Total Population (M),...,Current HE per capita (PPP Intl $),Capital HE (% GDP),Domestic General Gov HE (% GDP),Domestic General Gov HE per capita (PPP Intl $),Domestic Private HE per capita (PPP Intl $),GDP growth per capita %,GDP per capita (PPP Intl $),Daily Caloric Supply,Tobacco use (% adults),Alcohol Use per capita (liters)
0,656c8a4891b27078d5aba381,Albania_1990,Albania,1990,parliamentary republic,2,"Universal medical benefits,Social insurance sy...",1485.0,119.946788,3.286542,...,222.786533,0.005188,2.765835,103.662764,115.435286,-11.187905,2549.746801,2568.0,35.0,6.57
1,656c8a4891b27078d5aba382,Albania_1991,Albania,1991,parliamentary republic,2,"Universal medical benefits,Social insurance sy...",1485.0,119.225912,3.26679,...,222.786533,0.005188,2.765835,103.662764,115.435286,-27.566821,1909.31916,2572.0,35.0,6.57
2,656c8a4891b27078d5aba383,Albania_1992,Albania,1992,parliamentary republic,2,"Universal medical benefits,Social insurance sy...",1485.0,118.505073,3.247039,...,222.786533,0.005188,2.765835,103.662764,115.435286,-6.622551,1823.503609,2654.0,35.0,6.57
3,656c8a4891b27078d5aba384,Albania_1993,Albania,1993,parliamentary republic,2,"Universal medical benefits,Social insurance sy...",1485.0,117.784197,3.227287,...,222.786533,0.005188,2.765835,103.662764,115.435286,10.229949,2057.692048,2795.0,35.0,6.57
4,656c8a4891b27078d5aba385,Albania_1994,Albania,1994,parliamentary republic,2,"Universal medical benefits,Social insurance sy...",1485.0,117.063358,3.207536,...,222.786533,0.005188,2.765835,103.662764,115.435286,8.969762,2290.143917,2877.0,35.0,6.57


In [4]:
# Drop the database id data and refresh the index
combined_df = combined_df.drop(['_id'], axis=1)
combined_df = combined_df.reset_index(drop=True)

# Create a new DataFrame for the country_years.  Apply the country_years as the new index for later merging.
country_year_df = pd.DataFrame()
country_year_df['Country_Year'] = combined_df['Country_Year']
country_year_df['Country'] = combined_df['Country']
country_year_df['Year'] = combined_df['Year']
country_year_df = country_year_df.set_index('Country_Year', drop=False)

# Set the DataFrame index to the country names to get them out of the way
combined_df= combined_df.set_index('Country_Year')
ori_df = combined_df.copy()
combined_df.head(5)

Unnamed: 0_level_0,Country,Year,Gov Type,SSS Depth,SSS Type,Avg Rainfall (mm/yr),Pop Density (#/sq km),Total Population (M),Total Labor Force (M),LEx years,...,Current HE per capita (PPP Intl $),Capital HE (% GDP),Domestic General Gov HE (% GDP),Domestic General Gov HE per capita (PPP Intl $),Domestic Private HE per capita (PPP Intl $),GDP growth per capita %,GDP per capita (PPP Intl $),Daily Caloric Supply,Tobacco use (% adults),Alcohol Use per capita (liters)
Country_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Albania_1990,Albania,1990,parliamentary republic,2,"Universal medical benefits,Social insurance sy...",1485.0,119.946788,3.286542,1.374478,73.144,...,222.786533,0.005188,2.765835,103.662764,115.435286,-11.187905,2549.746801,2568.0,35.0,6.57
Albania_1991,Albania,1991,parliamentary republic,2,"Universal medical benefits,Social insurance sy...",1485.0,119.225912,3.26679,1.429833,73.378,...,222.786533,0.005188,2.765835,103.662764,115.435286,-27.566821,1909.31916,2572.0,35.0,6.57
Albania_1992,Albania,1992,parliamentary republic,2,"Universal medical benefits,Social insurance sy...",1485.0,118.505073,3.247039,1.438342,73.715,...,222.786533,0.005188,2.765835,103.662764,115.435286,-6.622551,1823.503609,2654.0,35.0,6.57
Albania_1993,Albania,1993,parliamentary republic,2,"Universal medical benefits,Social insurance sy...",1485.0,117.784197,3.227287,1.413557,73.939,...,222.786533,0.005188,2.765835,103.662764,115.435286,10.229949,2057.692048,2795.0,35.0,6.57
Albania_1994,Albania,1994,parliamentary republic,2,"Universal medical benefits,Social insurance sy...",1485.0,117.063358,3.207536,1.391914,74.131,...,222.786533,0.005188,2.765835,103.662764,115.435286,8.969762,2290.143917,2877.0,35.0,6.57


In [5]:
# Create a new DataFrame for the SSS data.
SSS_df = combined_df[['SSS Depth', 'SSS Type']].copy()
SSS_df

Unnamed: 0_level_0,SSS Depth,SSS Type
Country_Year,Unnamed: 1_level_1,Unnamed: 2_level_1
Albania_1990,2,"Universal medical benefits,Social insurance sy..."
Albania_1991,2,"Universal medical benefits,Social insurance sy..."
Albania_1992,2,"Universal medical benefits,Social insurance sy..."
Albania_1993,2,"Universal medical benefits,Social insurance sy..."
Albania_1994,2,"Universal medical benefits,Social insurance sy..."
...,...,...
Zimbabwe_2018,1,Employer-liability system (cash sickness benef...
Zimbabwe_2019,1,Employer-liability system (cash sickness benef...
Zimbabwe_2020,1,Employer-liability system (cash sickness benef...
Zimbabwe_2021,1,Employer-liability system (cash sickness benef...


In [6]:
#create a list of SSS component phrases to match
sss_component_terms = ["community-based social insurance","employer-liability system (cash benefits)","employer-liability system (cash maternity benefits)","employer-liability system (cash medical benefits)","employer-liability system (cash sickness benefits)","employer-liability system (maternity benefits)","employer-liability system (medical benefits)","employer-liability system (parental leave)","employer-liability system (paternity benefits)","employer-liability system (paternity leave)","employer-liability system (sickness benefits)","employer-liability system through private carriers","employer-liability system","employment-related system (cash benefits)","employment-related system (cash maternity benefits)","employment-related system (cash sickness benefits)","employment-related system (parental benefits)","employment-related system (tuberculosis benefits)","mandatory health insurance (medical benefits)","mandatory health insurance through private carriers (medical benefits)","mandatory individual account system (medical benefits)","mandatory private insurance (cash benefits)","mandatory private insurance (medical benefits)","mandatory private insurance","mandatory private pension scheme","open pension funds","provident fund (birth grant)","provident fund (medical benefits)","Retirement pension scheme","social assistance system (birth grant)","social assistance system (cash benefits)","social assistance system (cash birth grants)","social assistance system (cash maternity benefits)","social assistance system (cash sickness benefits)","social assistance system (funeral grant)","social assistance system (long-term care benefits)","social assistance system (maternity benefits)","social assistance system (medical benefits)","social assistance system (parental leave)","social assistance system","social insurance system (cash benefits)","social insurance system (cash maternity benefits)","social insurance system (cash medical benefits)","social insurance system (cash parental benefits)","social insurance system (cash sickness benefits)","social insurance system (cash sickness)","social insurance system (child care benefits)","social insurance system (in-kind benefits)","social insurance system (maternity benefits)","social insurance system (maternity medical benefits)","social insurance system (medical benefits)","social insurance system (prenatal benefits)","social insurance system","universal (adoption grant)","universal (birth grant)","universal (cash maternity benefits)","universal (cash parental grants)","universal (paid parental leave)","universal (parental benefits)","universal (pregnancy grant)","universal (prenatal care grant)","universal medical benefits","voluntary community-based social insurance system (medical benefits)","voluntary pension scheme"]

In [8]:
# Create a regex pattern from the list of components that will match across word boundaries
regex_pattern = r'(' + '|'.join(map(re.escape, sorted(sss_component_terms, key=len, reverse=True))) + r')'

# Ensure the pattern is compiled for efficiency
compiled_pattern = re.compile(regex_pattern, flags=re.IGNORECASE)

# Function to match components using regex
def find_components(government_type, pattern):
    # Find all matches of the pattern within the government_type string
    matches = re.findall(pattern, government_type)
    return list(set(matches))  # return unique matches only

# Apply this function to the 'system_type' column
SSS_df['components_matched'] = SSS_df['SSS Type'].apply(lambda x: find_components(x, compiled_pattern))
SSS_df

Unnamed: 0_level_0,SSS Depth,SSS Type,components_matched
Country_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Albania_1990,2,"Universal medical benefits,Social insurance sy...","[Universal medical benefits, Social insurance ..."
Albania_1991,2,"Universal medical benefits,Social insurance sy...","[Universal medical benefits, Social insurance ..."
Albania_1992,2,"Universal medical benefits,Social insurance sy...","[Universal medical benefits, Social insurance ..."
Albania_1993,2,"Universal medical benefits,Social insurance sy...","[Universal medical benefits, Social insurance ..."
Albania_1994,2,"Universal medical benefits,Social insurance sy...","[Universal medical benefits, Social insurance ..."
...,...,...,...
Zimbabwe_2018,1,Employer-liability system (cash sickness benef...,[Employer-liability system (maternity benefits...
Zimbabwe_2019,1,Employer-liability system (cash sickness benef...,[Employer-liability system (maternity benefits...
Zimbabwe_2020,1,Employer-liability system (cash sickness benef...,[Employer-liability system (maternity benefits...
Zimbabwe_2021,1,Employer-liability system (cash sickness benef...,[Employer-liability system (maternity benefits...


In [9]:
# components_matched contains lists of matched SSS component texts
sss_components_df = SSS_df['components_matched'].str.join('|').str.get_dummies('|')

# Concatenate with original df
sss_encoded_df = pd.concat([SSS_df, sss_components_df], axis=1)


In [10]:
sss_encoded_df.head(2)

Unnamed: 0_level_0,SSS Depth,SSS Type,components_matched,Community-based social insurance,Employer-liability system,Employer-liability system (cash benefits),Employer-liability system (cash maternity benefits),Employer-liability system (cash medical benefits),Employer-liability system (cash sickness benefits),Employer-liability system (maternity benefits),...,Universal (birth grant),Universal (cash maternity benefits),Universal (cash parental grants),Universal (paid parental leave),Universal (parental benefits),Universal (pregnancy grant),Universal (prenatal care grant),Universal medical benefits,Voluntary community-based social insurance system (medical benefits),Voluntary pension scheme
Country_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Albania_1990,2,"Universal medical benefits,Social insurance sy...","[Universal medical benefits, Social insurance ...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
Albania_1991,2,"Universal medical benefits,Social insurance sy...","[Universal medical benefits, Social insurance ...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [11]:
# Drop the text columns
sss_encoded_df = sss_encoded_df.drop(['SSS Type','components_matched'], axis=1)
sss_encoded_df.head(2)

Unnamed: 0_level_0,SSS Depth,Community-based social insurance,Employer-liability system,Employer-liability system (cash benefits),Employer-liability system (cash maternity benefits),Employer-liability system (cash medical benefits),Employer-liability system (cash sickness benefits),Employer-liability system (maternity benefits),Employer-liability system (medical benefits),Employer-liability system (parental leave),...,Universal (birth grant),Universal (cash maternity benefits),Universal (cash parental grants),Universal (paid parental leave),Universal (parental benefits),Universal (pregnancy grant),Universal (prenatal care grant),Universal medical benefits,Voluntary community-based social insurance system (medical benefits),Voluntary pension scheme
Country_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Albania_1990,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
Albania_1991,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [12]:
# Collect the column labels so they can be reapplied after data scaling
sss_encoded_col_names = sss_encoded_df.columns.tolist()

In [13]:
# Standardize the data with MaxAbsScaler(). Using this scaler to maintain integrity of sparse encoded values.
scaler = MaxAbsScaler()

sss_encode_scaled_nda = scaler.fit_transform(sss_encoded_df)
sss_encode_scaled_nda

array([[0.4, 0. , 0. , ..., 1. , 0. , 0. ],
       [0.4, 0. , 0. , ..., 1. , 0. , 0. ],
       [0.4, 0. , 0. , ..., 1. , 0. , 0. ],
       ...,
       [0.2, 0. , 0. , ..., 0. , 0. , 0. ],
       [0.2, 0. , 0. , ..., 0. , 0. , 0. ],
       [0.2, 0. , 0. , ..., 0. , 0. , 0. ]])

In [14]:
# Convert the scaled-encoded data back to a DataFrame (nda = Numpy Data Array)
sss_se_df = pd.DataFrame(sss_encode_scaled_nda, index=sss_encoded_df.index)

# Apply the column labels to ensure the data is properly identified
sss_se_df = sss_se_df.set_axis(sss_encoded_col_names, axis=1)
sss_se_df

Unnamed: 0_level_0,SSS Depth,Community-based social insurance,Employer-liability system,Employer-liability system (cash benefits),Employer-liability system (cash maternity benefits),Employer-liability system (cash medical benefits),Employer-liability system (cash sickness benefits),Employer-liability system (maternity benefits),Employer-liability system (medical benefits),Employer-liability system (parental leave),...,Universal (birth grant),Universal (cash maternity benefits),Universal (cash parental grants),Universal (paid parental leave),Universal (parental benefits),Universal (pregnancy grant),Universal (prenatal care grant),Universal medical benefits,Voluntary community-based social insurance system (medical benefits),Voluntary pension scheme
Country_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Albania_1990,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Albania_1991,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Albania_1992,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Albania_1993,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Albania_1994,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zimbabwe_2018,0.2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zimbabwe_2019,0.2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zimbabwe_2020,0.2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zimbabwe_2021,0.2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# define the model
SSS_model = HDBSCAN(n_jobs=-1)

# Fit the model - Make predictions
SSS_pred = SSS_model.fit_predict(sss_se_df)

# Add the predicted class columns
sss_se_df['SSS Class'] = SSS_model.labels_

In [16]:
sss_se_df

Unnamed: 0_level_0,SSS Depth,Community-based social insurance,Employer-liability system,Employer-liability system (cash benefits),Employer-liability system (cash maternity benefits),Employer-liability system (cash medical benefits),Employer-liability system (cash sickness benefits),Employer-liability system (maternity benefits),Employer-liability system (medical benefits),Employer-liability system (parental leave),...,Universal (cash maternity benefits),Universal (cash parental grants),Universal (paid parental leave),Universal (parental benefits),Universal (pregnancy grant),Universal (prenatal care grant),Universal medical benefits,Voluntary community-based social insurance system (medical benefits),Voluntary pension scheme,SSS Class
Country_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Albania_1990,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,81
Albania_1991,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,81
Albania_1992,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,81
Albania_1993,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,81
Albania_1994,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,81
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zimbabwe_2018,0.2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,77
Zimbabwe_2019,0.2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,77
Zimbabwe_2020,0.2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,77
Zimbabwe_2021,0.2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,77


In [None]:
# Create a dictionary to send to MongoDB - Alternate method, not necessary when using PyMongoArrow
#sss_dict = sss_se_df.to_dict('records')
#sss_dict

In [None]:
# load the config from the .env file
load_dotenv()
MONGODB_URI = os.environ['MONGODB_URI']

# Connect to the database engine
client = MongoClient(MONGODB_URI)

# connect to the project db
db = client['ExpectLifeRedux']

# get a reference to the data collection
SSS_data = db['SSS_Clusters']

# write the data to the database (alternate method)
#SSS_data.insert_many(sss_dict)

In [None]:
# prefered method - use PyMongoArrow
write(db.SSS_Clusters, sss_se_df)

In [17]:
# Create a new dataframe for the Government system data
gov_df = combined_df[['Gov Type']].copy()
gov_df

Unnamed: 0_level_0,Gov Type
Country_Year,Unnamed: 1_level_1
Albania_1990,parliamentary republic
Albania_1991,parliamentary republic
Albania_1992,parliamentary republic
Albania_1993,parliamentary republic
Albania_1994,parliamentary republic
...,...
Zimbabwe_2018,presidential republic
Zimbabwe_2019,presidential republic
Zimbabwe_2020,presidential republic
Zimbabwe_2021,presidential republic


In [18]:
#create a list of component phrases to match
component_terms = ['absolute monarchy or sultanate','absolute monarchy','communist party-led state','communist state','constitutional federal republic','constitutional monarchy','dictatorship','federal parliamentary constitutional monarchy','federal parliamentary democracy under a constitutional monarchy','federal parliamentary republic','federal presidential republic','federal republic','in transition','mixed presidential-parliamentary system','parliamentary constitutional monarchy','parliamentary democracy under a constitutional monarchy','parliamentary democracy with limited self-government','parliamentary democracy','parliamentary republic','presidential limited democracy','presidential republic','previously parliamentary republic','semi-presidential federation','semi-presidential republic','theocratic republic','unitary parliamentary republic']

In [19]:
# Create a regex pattern from the list of components that will match across word boundaries
regex_pattern = r'(' + '|'.join(map(re.escape, sorted(component_terms, key=len, reverse=True))) + r')'

# Ensure the pattern is compiled for efficiency
compiled_pattern = re.compile(regex_pattern, flags=re.IGNORECASE)

# Function to match components using regex
def find_components(system_type, pattern):
    # Find all matches of the pattern within the system_type string
    matches = re.findall(pattern, system_type)
    return list(set(matches))  # return unique matches only

# Apply this function to the 'system_type' column
gov_df['components_matched'] = gov_df['Gov Type'].apply(lambda x: find_components(x, compiled_pattern))
gov_df

Unnamed: 0_level_0,Gov Type,components_matched
Country_Year,Unnamed: 1_level_1,Unnamed: 2_level_1
Albania_1990,parliamentary republic,[parliamentary republic]
Albania_1991,parliamentary republic,[parliamentary republic]
Albania_1992,parliamentary republic,[parliamentary republic]
Albania_1993,parliamentary republic,[parliamentary republic]
Albania_1994,parliamentary republic,[parliamentary republic]
...,...,...
Zimbabwe_2018,presidential republic,[presidential republic]
Zimbabwe_2019,presidential republic,[presidential republic]
Zimbabwe_2020,presidential republic,[presidential republic]
Zimbabwe_2021,presidential republic,[presidential republic]


In [20]:
# components_matched contains lists of matched gov component texts
gov_components_df = gov_df['components_matched'].str.join('|').str.get_dummies('|')

# Concatenate with original df
gov_encoded_df = pd.concat([gov_df, gov_components_df], axis=1)

In [21]:
# Drop the text columns
gov_encoded_df = gov_encoded_df.drop(['Gov Type','components_matched'], axis=1)
gov_encoded_df

Unnamed: 0_level_0,absolute monarchy,absolute monarchy or sultanate,communist party-led state,communist state,constitutional federal republic,constitutional monarchy,dictatorship,federal parliamentary constitutional monarchy,federal parliamentary democracy under a constitutional monarchy,federal parliamentary republic,...,parliamentary democracy under a constitutional monarchy,parliamentary democracy with limited self-government,parliamentary republic,presidential limited democracy,presidential republic,previously parliamentary republic,semi-presidential federation,semi-presidential republic,theocratic republic,unitary parliamentary republic
Country_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Albania_1990,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
Albania_1991,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
Albania_1992,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
Albania_1993,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
Albania_1994,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zimbabwe_2018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
Zimbabwe_2019,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
Zimbabwe_2020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
Zimbabwe_2021,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [22]:
# Collect the column labels so they can be reapplied after data scaling
gov_encoded_col_names = gov_encoded_df.columns.tolist()

In [23]:
# Standardize the data with MaxAbsScaler(). Using this scaler to maintain integrity of sparse encoded values.
gov_scale = MaxAbsScaler()

gov_encoded_scaled_nda = gov_scale.fit_transform(gov_encoded_df)
gov_encoded_scaled_nda

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [24]:
# Convert the scaled-encoded data back to a DataFrame (nda = Numpy Data Array)
gov_s_e_df = pd.DataFrame(gov_encoded_scaled_nda, index=gov_encoded_df.index)

# Apply the column labels to ensure the data is properly identified
gov_s_e_df = gov_s_e_df.set_axis(gov_encoded_col_names, axis=1)
gov_s_e_df

Unnamed: 0_level_0,absolute monarchy,absolute monarchy or sultanate,communist party-led state,communist state,constitutional federal republic,constitutional monarchy,dictatorship,federal parliamentary constitutional monarchy,federal parliamentary democracy under a constitutional monarchy,federal parliamentary republic,...,parliamentary democracy under a constitutional monarchy,parliamentary democracy with limited self-government,parliamentary republic,presidential limited democracy,presidential republic,previously parliamentary republic,semi-presidential federation,semi-presidential republic,theocratic republic,unitary parliamentary republic
Country_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Albania_1990,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Albania_1991,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Albania_1992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Albania_1993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Albania_1994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zimbabwe_2018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
Zimbabwe_2019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
Zimbabwe_2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
Zimbabwe_2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# define the model
gov_model = HDBSCAN(n_jobs=-1)

# Fit the model - Make predictions
gov_pred = gov_model.fit_predict(gov_s_e_df)

# Add the predicted class columns
gov_s_e_df['Gov Class'] = gov_model.labels_

In [26]:
gov_s_e_df

Unnamed: 0_level_0,absolute monarchy,absolute monarchy or sultanate,communist party-led state,communist state,constitutional federal republic,constitutional monarchy,dictatorship,federal parliamentary constitutional monarchy,federal parliamentary democracy under a constitutional monarchy,federal parliamentary republic,...,parliamentary democracy with limited self-government,parliamentary republic,presidential limited democracy,presidential republic,previously parliamentary republic,semi-presidential federation,semi-presidential republic,theocratic republic,unitary parliamentary republic,Gov Class
Country_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Albania_1990,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24
Albania_1991,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24
Albania_1992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24
Albania_1993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24
Albania_1994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zimbabwe_2018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,14
Zimbabwe_2019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,14
Zimbabwe_2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,14
Zimbabwe_2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,14


In [None]:
# load the config from the .env file
load_dotenv()
MONGODB_URI = os.environ['MONGODB_URI']

# Connect to the database engine
client = MongoClient(MONGODB_URI)

# connect to the project db
db = client['ExpectLifeRedux']

# get a reference to the data collection
gov_data = db['Gov_Clusters']

# write the data to the database (alternate method)
#SSS_data.insert_many(sss_dict)

In [None]:
# prefered method - use PyMongoArrow
write(db.Gov_Clusters, gov_s_e_df)

In [27]:
# Add the cluster data to the original data set
combined_df['SSS Class'] = sss_se_df['SSS Class']
combined_df['Gov Class'] = gov_s_e_df['Gov Class']
combined_df

Unnamed: 0_level_0,Country,Year,Gov Type,SSS Depth,SSS Type,Avg Rainfall (mm/yr),Pop Density (#/sq km),Total Population (M),Total Labor Force (M),LEx years,...,Domestic General Gov HE (% GDP),Domestic General Gov HE per capita (PPP Intl $),Domestic Private HE per capita (PPP Intl $),GDP growth per capita %,GDP per capita (PPP Intl $),Daily Caloric Supply,Tobacco use (% adults),Alcohol Use per capita (liters),SSS Class,Gov Class
Country_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Albania_1990,Albania,1990,parliamentary republic,2,"Universal medical benefits,Social insurance sy...",1485.0,119.946788,3.286542,1.374478,73.144,...,2.765835,103.662764,115.435286,-11.187905,2549.746801,2568.0,35.0,6.57,81,24
Albania_1991,Albania,1991,parliamentary republic,2,"Universal medical benefits,Social insurance sy...",1485.0,119.225912,3.266790,1.429833,73.378,...,2.765835,103.662764,115.435286,-27.566821,1909.319160,2572.0,35.0,6.57,81,24
Albania_1992,Albania,1992,parliamentary republic,2,"Universal medical benefits,Social insurance sy...",1485.0,118.505073,3.247039,1.438342,73.715,...,2.765835,103.662764,115.435286,-6.622551,1823.503609,2654.0,35.0,6.57,81,24
Albania_1993,Albania,1993,parliamentary republic,2,"Universal medical benefits,Social insurance sy...",1485.0,117.784197,3.227287,1.413557,73.939,...,2.765835,103.662764,115.435286,10.229949,2057.692048,2795.0,35.0,6.57,81,24
Albania_1994,Albania,1994,parliamentary republic,2,"Universal medical benefits,Social insurance sy...",1485.0,117.063358,3.207536,1.391914,74.131,...,2.765835,103.662764,115.435286,8.969762,2290.143917,2877.0,35.0,6.57,81,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zimbabwe_2018,Zimbabwe,2018,presidential republic,1,Employer-liability system (cash sickness benef...,657.0,38.909614,15.052184,5.770226,61.414,...,1.594279,61.475349,64.993823,2.909395,2457.309859,1908.0,12.1,4.67,77,14
Zimbabwe_2019,Zimbabwe,2019,presidential republic,1,Employer-liability system (cash sickness benef...,657.0,39.691374,15.354608,5.912685,61.292,...,0.522077,18.839995,31.431890,-8.177320,2296.845429,1908.0,12.1,4.67,77,14
Zimbabwe_2020,Zimbabwe,2020,presidential republic,1,Employer-liability system (cash sickness benef...,657.0,40.505793,15.669666,6.008633,61.124,...,0.756783,26.096745,26.263578,-9.670405,2101.804597,1908.0,11.7,4.67,77,14
Zimbabwe_2021,Zimbabwe,2021,presidential republic,1,Employer-liability system (cash sickness benef...,657.0,40.505793,15.993524,6.200666,59.253,...,0.756783,26.096745,26.263578,6.271613,2333.973632,1908.0,11.7,4.67,77,14


In [28]:
# Drop the text and extra colums
combined_df = combined_df.drop(['Country', 'Year', 'Gov Type', 'SSS Type'], axis=1)
combined_df

Unnamed: 0_level_0,SSS Depth,Avg Rainfall (mm/yr),Pop Density (#/sq km),Total Population (M),Total Labor Force (M),LEx years,Doctors (#/10k pop),Electricity Access (% Pop),Current HE (% GDP),Current HE per capita (PPP Intl $),...,Domestic General Gov HE (% GDP),Domestic General Gov HE per capita (PPP Intl $),Domestic Private HE per capita (PPP Intl $),GDP growth per capita %,GDP per capita (PPP Intl $),Daily Caloric Supply,Tobacco use (% adults),Alcohol Use per capita (liters),SSS Class,Gov Class
Country_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Albania_1990,2,1485.0,119.946788,3.286542,1.374478,73.144,13.71,100.000000,5.944186,222.786533,...,2.765835,103.662764,115.435286,-11.187905,2549.746801,2568.0,35.0,6.57,81,24
Albania_1991,2,1485.0,119.225912,3.266790,1.429833,73.378,14.52,100.000000,5.944186,222.786533,...,2.765835,103.662764,115.435286,-27.566821,1909.319160,2572.0,35.0,6.57,81,24
Albania_1992,2,1485.0,118.505073,3.247039,1.438342,73.715,15.93,100.000000,5.944186,222.786533,...,2.765835,103.662764,115.435286,-6.622551,1823.503609,2654.0,35.0,6.57,81,24
Albania_1993,2,1485.0,117.784197,3.227287,1.413557,73.939,13.68,100.000000,5.944186,222.786533,...,2.765835,103.662764,115.435286,10.229949,2057.692048,2795.0,35.0,6.57,81,24
Albania_1994,2,1485.0,117.063358,3.207536,1.391914,74.131,12.85,100.000000,5.944186,222.786533,...,2.765835,103.662764,115.435286,8.969762,2290.143917,2877.0,35.0,6.57,81,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zimbabwe_2018,1,657.0,38.909614,15.052184,5.770226,61.414,2.01,45.400288,4.670418,180.091128,...,1.594279,61.475349,64.993823,2.909395,2457.309859,1908.0,12.1,4.67,77,14
Zimbabwe_2019,1,657.0,39.691374,15.354608,5.912685,61.292,1.95,46.682095,3.659765,132.068691,...,0.522077,18.839995,31.431890,-8.177320,2296.845429,1908.0,12.1,4.67,77,14
Zimbabwe_2020,1,657.0,40.505793,15.669666,6.008633,61.124,1.89,52.747667,3.425581,118.127063,...,0.756783,26.096745,26.263578,-9.670405,2101.804597,1908.0,11.7,4.67,77,14
Zimbabwe_2021,1,657.0,40.505793,15.993524,6.200666,59.253,1.89,48.979927,3.425581,118.127063,...,0.756783,26.096745,26.263578,6.271613,2333.973632,1908.0,11.7,4.67,77,14


### Clustering Trial ###

In [29]:
cluster_df = combined_df.copy()

In [30]:
# Collect the column labels so they can be reapplied after data scaling
cluster_names = cluster_df.columns.tolist()

In [31]:
# Standardize the data with MaxAbsScaler(). Using this scaler to maintain integrity of sparse encoded values.
cluster_scale = MaxAbsScaler()

cluster_scaled_nda = cluster_scale.fit_transform(cluster_df)
cluster_scaled_nda

array([[0.4       , 0.45833333, 0.00655855, ..., 0.3204878 , 0.92045455,
        0.96      ],
       [0.4       , 0.45833333, 0.00651914, ..., 0.3204878 , 0.92045455,
        0.96      ],
       [0.4       , 0.45833333, 0.00647972, ..., 0.3204878 , 0.92045455,
        0.96      ],
       ...,
       [0.2       , 0.20277778, 0.00221481, ..., 0.22780488, 0.875     ,
        0.56      ],
       [0.2       , 0.20277778, 0.00221481, ..., 0.22780488, 0.875     ,
        0.56      ],
       [0.2       , 0.20277778, 0.00221481, ..., 0.22780488, 0.875     ,
        0.56      ]])

In [32]:
# Convert the scaled data back to a DataFrame (nda = Numpy Data Array)
cluster_s_df = pd.DataFrame(cluster_scaled_nda, index=cluster_df.index)

# Apply the column labels to ensure the data is properly identified
cluster_s_df = cluster_s_df.set_axis(cluster_names, axis=1)
cluster_s_df

Unnamed: 0_level_0,SSS Depth,Avg Rainfall (mm/yr),Pop Density (#/sq km),Total Population (M),Total Labor Force (M),LEx years,Doctors (#/10k pop),Electricity Access (% Pop),Current HE (% GDP),Current HE per capita (PPP Intl $),...,Domestic General Gov HE (% GDP),Domestic General Gov HE per capita (PPP Intl $),Domestic Private HE per capita (PPP Intl $),GDP growth per capita %,GDP per capita (PPP Intl $),Daily Caloric Supply,Tobacco use (% adults),Alcohol Use per capita (liters),SSS Class,Gov Class
Country_Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Albania_1990,0.4,0.458333,0.006559,0.002319,0.001758,0.855510,0.162691,1.000000,0.291190,0.019038,...,0.204715,0.015604,0.020498,-0.079641,0.015622,0.661004,0.510949,0.320488,0.920455,0.96
Albania_1991,0.4,0.458333,0.006519,0.002305,0.001829,0.858247,0.172303,1.000000,0.291190,0.019038,...,0.204715,0.015604,0.020498,-0.196233,0.011698,0.662033,0.510949,0.320488,0.920455,0.96
Albania_1992,0.4,0.458333,0.006480,0.002291,0.001840,0.862188,0.189035,1.000000,0.291190,0.019038,...,0.204715,0.015604,0.020498,-0.047142,0.011172,0.683140,0.510949,0.320488,0.920455,0.96
Albania_1993,0.4,0.458333,0.006440,0.002277,0.001808,0.864808,0.162335,1.000000,0.291190,0.019038,...,0.204715,0.015604,0.020498,0.072821,0.012607,0.719434,0.510949,0.320488,0.920455,0.96
Albania_1994,0.4,0.458333,0.006401,0.002263,0.001780,0.867054,0.152486,1.000000,0.291190,0.019038,...,0.204715,0.015604,0.020498,0.063851,0.014031,0.740541,0.510949,0.320488,0.920455,0.96
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zimbabwe_2018,0.2,0.202778,0.002128,0.010621,0.007380,0.718313,0.023852,0.454003,0.228792,0.015389,...,0.118002,0.009254,0.011541,0.020710,0.015055,0.491120,0.176642,0.227805,0.875000,0.56
Zimbabwe_2019,0.2,0.202778,0.002170,0.010835,0.007563,0.716886,0.023140,0.466821,0.179282,0.011286,...,0.038642,0.002836,0.005581,-0.058210,0.014072,0.491120,0.176642,0.227805,0.875000,0.56
Zimbabwe_2020,0.2,0.202778,0.002215,0.011057,0.007685,0.714921,0.022428,0.527477,0.167810,0.010094,...,0.056014,0.003928,0.004664,-0.068838,0.012877,0.491120,0.170803,0.227805,0.875000,0.56
Zimbabwe_2021,0.2,0.202778,0.002215,0.011286,0.007931,0.693037,0.022428,0.489799,0.167810,0.010094,...,0.056014,0.003928,0.004664,0.044644,0.014300,0.491120,0.170803,0.227805,0.875000,0.56


In [33]:
# define the model
cluster_model = HDBSCAN(n_jobs=-1)

# Fit the model - Make predictions
cluster_pred = cluster_model.fit_predict(cluster_s_df)

# Add the predicted class columns
cluster_s_df['Cluster Label'] = cluster_model.labels_

In [None]:
# Generate a sweetviz analysis report
cluster_report = sv.analyze(cluster_s_df)

cluster_report.show_html(filepath='Cluster_HDBSCAN-1.html', layout='vertical')

### Supervised Trial ###

In [36]:
supervised_df = combined_df.copy()

In [37]:
# Create our features
X = supervised_df.drop(['LEx years'], axis=1)

# Create our target
y = supervised_df['LEx years']

In [38]:
# Separate the training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [39]:
# scale the data to eliminate bias
scaler = MaxAbsScaler()

# Fitting the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

# Do the scaling
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [40]:
# Instantiate the model
rfr_model = RandomForestRegressor(n_jobs=-1, random_state=42)

# Fit the model
rfr_model.fit(X_train_scaled, y_train)

# Make predictions with the test data
pred = rfr_model.predict(X_test_scaled)

In [41]:
# Get the score
rfr_model.score(X, y)



-0.35116268564657616

In [42]:
# List the features sorted in descending order by feature importance
# Calculate the feature importance
importances = rfr_model.feature_importances_

# Sort the features by their importance
sorted(zip(rfr_model.feature_importances_, X.columns), reverse=True)

[(0.25833557672914614, 'Electricity Access (% Pop)'),
 (0.17390285804539196, 'Total Population (M)'),
 (0.1448246101197424, 'GDP per capita (PPP Intl $)'),
 (0.0939981057992943, 'Pop Density (#/sq km)'),
 (0.09095324809964955, 'Domestic Private HE per capita (PPP Intl $)'),
 (0.05716629610139142, 'Domestic General Gov HE per capita (PPP Intl $)'),
 (0.0569540915770096, 'Current HE per capita (PPP Intl $)'),
 (0.029534356735232394, 'Doctors (#/10k pop)'),
 (0.01663069603256838, 'Alcohol Use per capita (liters)'),
 (0.015164333300203432, 'SSS Class'),
 (0.013122733064003403, 'Domestic General Gov HE (% GDP)'),
 (0.010870586329556777, 'Gov Class'),
 (0.008525262480473727, 'Current HE (% GDP)'),
 (0.007442603546652017, 'Tobacco use (% adults)'),
 (0.004987290410624012, 'Avg Rainfall (mm/yr)'),
 (0.004213888568505164, 'Daily Caloric Supply'),
 (0.004207281946164564, 'Total Labor Force (M)'),
 (0.00402041176010319, 'Capital HE (% GDP)'),
 (0.0028402978553303563, 'GDP growth per capita %'),
 