In [11]:
# Import our dependencies
import numpy as np
import pandas as pd
import hvplot.pandas
import plotly.express as px
from collections import Counter

from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.combine import SMOTEENN

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.feature_extraction import FeatureHasher
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced


# Note:  SSS Data has been standardized and formatted into lists

In [2]:
#  Import and read the charity_data.csv
systems_df = pd.read_csv("../Clean_Data/transformed/country_social_security_systems_transform-expand.csv")

# Drop the unnamed:0 column
systems_df = systems_df.drop(['Unnamed: 0'], axis=1)

systems_df

Unnamed: 0,Albania_CSS,Algeria_CSS,Andorra_CSS,Angola_CSS,Antigua and Barbuda_CSS,Argentina_CSS,Armenia_CSS,Aruba_CSS,Australia_CSS,Austria_CSS,...,Ukraine_CSS,United Kingdom_CSS,United States of America_CSS,Uruguay_CSS,Uzbekistan_CSS,Vanuatu_CSS,Vietnam_CSS,Yemen_CSS,Zambia_CSS,Zimbabwe_CSS
0,"['Universal medical benefits','Social insuranc...",['Social insurance system'],['Social insurance system'],"['Universal medical benefits','Social insuranc...",['Social insurance system'],"['Universal medical benefits','Social insuranc...",['Universal (birth or adoption grant and medic...,['Social insurance system'],"['Universal medical benefits','Employment-rela...",['Social insurance system'],...,"['Universal medical benefits','Social insuranc...","['Universal medical benefits','Social insuranc...","['Social insurance','Mandatory private insuran...",['Social insurance (cash sickness and medical ...,"['Universal medical benefits','Social insuranc...",['Employer-liability system'],['Social insurance system'],"['Universal medical benefits','Employer-liabil...","['Universal medical benefits','Employer-liabil...",['Employer-liability system (cash sickness and...
1,"['Universal medical benefits','Social insuranc...",['Social insurance system'],['Social insurance system'],"['Universal medical benefits','Social insuranc...",['Social insurance system'],"['Universal medical benefits','Social insuranc...",['Universal (birth or adoption grant and medic...,['Social insurance system'],"['Universal medical benefits','Employment-rela...",['Social insurance system'],...,"['Universal medical benefits','Social insuranc...","['Universal medical benefits','Social insuranc...","['Social insurance','Mandatory private insuran...",['Social insurance (cash sickness and medical ...,"['Universal medical benefits','Social insuranc...",['Employer-liability system'],['Social insurance system'],"['Universal medical benefits','Employer-liabil...","['Universal medical benefits','Employer-liabil...",['Employer-liability system (cash sickness and...
2,"['Universal medical benefits','Social insuranc...",['Social insurance system'],['Social insurance system'],"['Universal medical benefits','Social insuranc...",['Social insurance system'],"['Universal medical benefits','Social insuranc...",['Universal (birth or adoption grant and medic...,['Social insurance system'],"['Universal medical benefits','Employment-rela...",['Social insurance system'],...,"['Universal medical benefits','Social insuranc...","['Universal medical benefits','Social insuranc...","['Social insurance','Mandatory private insuran...",['Social insurance (cash sickness and medical ...,"['Universal medical benefits','Social insuranc...",['Employer-liability system'],['Social insurance system'],"['Universal medical benefits','Employer-liabil...","['Universal medical benefits','Employer-liabil...",['Employer-liability system (cash sickness and...
3,"['Universal medical benefits','Social insuranc...",['Social insurance system'],['Social insurance system'],"['Universal medical benefits','Social insuranc...",['Social insurance system'],"['Universal medical benefits','Social insuranc...",['Universal (birth or adoption grant and medic...,['Social insurance system'],"['Universal medical benefits','Employment-rela...",['Social insurance system'],...,"['Universal medical benefits','Social insuranc...","['Universal medical benefits','Social insuranc...","['Social insurance','Mandatory private insuran...",['Social insurance (cash sickness and medical ...,"['Universal medical benefits','Social insuranc...",['Employer-liability system'],['Social insurance system'],"['Universal medical benefits','Employer-liabil...","['Universal medical benefits','Employer-liabil...",['Employer-liability system (cash sickness and...
4,"['Universal medical benefits','Social insuranc...",['Social insurance system'],['Social insurance system'],"['Universal medical benefits','Social insuranc...",['Social insurance system'],"['Universal medical benefits','Social insuranc...",['Universal (birth or adoption grant and medic...,['Social insurance system'],"['Universal medical benefits','Employment-rela...",['Social insurance system'],...,"['Universal medical benefits','Social insuranc...","['Universal medical benefits','Social insuranc...","['Social insurance','Mandatory private insuran...",['Social insurance (cash sickness and medical ...,"['Universal medical benefits','Social insuranc...",['Employer-liability system'],['Social insurance system'],"['Universal medical benefits','Employer-liabil...","['Universal medical benefits','Employer-liabil...",['Employer-liability system (cash sickness and...
5,"['Universal medical benefits','Social insuranc...",['Social insurance system'],['Social insurance system'],"['Universal medical benefits','Social insuranc...",['Social insurance system'],"['Universal medical benefits','Social insuranc...",['Universal (birth or adoption grant and medic...,['Social insurance system'],"['Universal medical benefits','Employment-rela...",['Social insurance system'],...,"['Universal medical benefits','Social insuranc...","['Universal medical benefits','Social insuranc...","['Social insurance','Mandatory private insuran...",['Social insurance (cash sickness and medical ...,"['Universal medical benefits','Social insuranc...",['Employer-liability system'],['Social insurance system'],"['Universal medical benefits','Employer-liabil...","['Universal medical benefits','Employer-liabil...",['Employer-liability system (cash sickness and...
6,"['Universal medical benefits','Social insuranc...",['Social insurance system'],['Social insurance system'],"['Universal medical benefits','Social insuranc...",['Social insurance system'],"['Universal medical benefits','Social insuranc...",['Universal (birth or adoption grant and medic...,['Social insurance system'],"['Universal medical benefits','Employment-rela...",['Social insurance system'],...,"['Universal medical benefits','Social insuranc...","['Universal medical benefits','Social insuranc...","['Social insurance','Mandatory private insuran...",['Social insurance (cash sickness and medical ...,"['Universal medical benefits','Social insuranc...",['Employer-liability system'],['Social insurance system'],"['Universal medical benefits','Employer-liabil...","['Universal medical benefits','Employer-liabil...",['Employer-liability system (cash sickness and...
7,"['Universal medical benefits','Social insuranc...",['Social insurance system'],['Social insurance system'],"['Universal medical benefits','Social insuranc...",['Social insurance system'],"['Universal medical benefits','Social insuranc...",['Universal (birth or adoption grant and medic...,['Social insurance system'],"['Universal medical benefits','Employment-rela...",['Social insurance system'],...,"['Universal medical benefits','Social insuranc...","['Universal medical benefits','Social insuranc...","['Social insurance','Mandatory private insuran...",['Social insurance (cash sickness and medical ...,"['Universal medical benefits','Social insuranc...",['Employer-liability system'],['Social insurance system'],"['Universal medical benefits','Employer-liabil...","['Universal medical benefits','Employer-liabil...",['Employer-liability system (cash sickness and...
8,"['Universal medical benefits','Social insuranc...",['Social insurance system'],['Social insurance system'],"['Universal medical benefits','Social insuranc...",['Social insurance system'],"['Universal medical benefits','Social insuranc...",['Universal (birth or adoption grant and medic...,['Social insurance system'],"['Universal medical benefits','Employment-rela...",['Social insurance system'],...,"['Universal medical benefits','Social insuranc...","['Universal medical benefits','Social insuranc...","['Social insurance','Mandatory private insuran...",['Social insurance (cash sickness and medical ...,"['Universal medical benefits','Social insuranc...",['Employer-liability system'],['Social insurance system'],"['Universal medical benefits','Employer-liabil...","['Universal medical benefits','Employer-liabil...",['Employer-liability system (cash sickness and...
9,"['Universal medical benefits','Social insuranc...",['Social insurance system'],['Social insurance system'],"['Universal medical benefits','Social insuranc...",['Social insurance system'],"['Universal medical benefits','Social insuranc...",['Universal (birth or adoption grant and medic...,['Social insurance system'],"['Universal medical benefits','Employment-rela...",['Social insurance system'],...,"['Universal medical benefits','Social insuranc...","['Universal medical benefits','Social insuranc...","['Social insurance','Mandatory private insuran...",['Social insurance (cash sickness and medical ...,"['Universal medical benefits','Social insuranc...",['Employer-liability system'],['Social insurance system'],"['Universal medical benefits','Employer-liabil...","['Universal medical benefits','Employer-liabil...",['Employer-liability system (cash sickness and...


In [3]:
gdp_df = pd.read_csv('../Clean_Data/dropped_empty_columns/gdp_clean_transform-dropped_empty.csv')
gdp_df

Unnamed: 0,Year,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Aruba,Australia,...,Ukraine,United Kingdom,United States of America,Uruguay,Uzbekistan,Vanuatu,Vietnam,Yemen,Zambia,Zimbabwe
0,2000,3480355000.0,54790390000.0,1429049000.0,9129595000.0,826370400.0,284203800000.0,1911564000.0,1873184000.0,415576200000.0,...,32375280000.0,1662127000000.0,10250947997000,22823260000.0,13760510000.0,272014700.0,31172520000.0,9652436000.0,3600683000.0,6689958000.0
1,2001,3922101000.0,54744710000.0,1546926000.0,8936064000.0,800481500.0,268696800000.0,2118468000.0,1896648000.0,379083900000.0,...,39309580000.0,1643908000000.0,10581929774000,20898790000.0,11401420000.0,257926900.0,32685200000.0,9861560000.0,4094481000.0,6777385000.0
2,2002,4348068000.0,56760360000.0,1755910000.0,15285590000.0,814381500.0,97724000000.0,2376335000.0,1962011000.0,395342700000.0,...,43956370000.0,1784077000000.0,10929112955000,13606490000.0,9687789000.0,262596600.0,35064110000.0,10694630000.0,4193846000.0,6342116000.0
3,2003,5611496000.0,67863830000.0,2361727000.0,17812700000.0,856396300.0,127587000000.0,2807061000.0,2044134000.0,467390800000.0,...,52010240000.0,2057094000000.0,11456442041000,12045630000.0,10134450000.0,314471300.0,39552510000.0,11777970000.0,4901840000.0,5727592000.0
4,2004,7184686000.0,85332580000.0,2894922000.0,23552050000.0,919729600.0,164657900000.0,3576615000.0,2254749000.0,614166300000.0,...,67220150000.0,2421814000000.0,12217193198000,13686330000.0,12030020000.0,364996900.0,45427850000.0,13872790000.0,6221078000.0,5805598000.0
5,2005,8052074000.0,103198200000.0,3159905000.0,36970920000.0,1022963000.0,198737100000.0,4900470000.0,2359777000.0,695075200000.0,...,89239370000.0,2544829000000.0,13039199193000,17362860000.0,14307510000.0,394962600.0,57633260000.0,16746340000.0,8331870000.0,5755215000.0
6,2006,8896073000.0,117027300000.0,3456442000.0,52381010000.0,1157663000.0,232557300000.0,6384452000.0,2469832000.0,747556200000.0,...,111884800000.0,2717060000000.0,13815586948000,19579460000.0,17330830000.0,439376800.0,66371660000.0,19061980000.0,12756860000.0,5443896000.0
7,2007,10677320000.0,134977100000.0,3952601000.0,65266450000.0,1312759000.0,287530500000.0,9206302000.0,2677654000.0,853955400000.0,...,148733900000.0,3106182000000.0,14474226905000,23410570000.0,22311390000.0,516392900.0,77414430000.0,21650530000.0,14056960000.0,5291950000.0
8,2008,12881350000.0,171000700000.0,4085631000.0,88538610000.0,1370070000.0,361558000000.0,11662040000.0,2843017000.0,1055127000000.0,...,188111100000.0,2938882000000.0,14769857911000,30366210000.0,29549440000.0,590748200.0,99130300000.0,26910850000.0,17910860000.0,4415703000.0
9,2009,12044210000.0,137211000000.0,3674410000.0,70307170000.0,1228330000.0,332976500000.0,8647937000.0,2553631000.0,928043000000.0,...,121552800000.0,2425798000000.0,14478064934000,31660910000.0,33689220000.0,592622500.0,106014700000.0,25130270000.0,15328340000.0,9665793000.0


In [4]:
# Drop the year column
gdp_df = gdp_df.drop(['Year'], axis=1)

# Add a suffix to the column names
gdp_df = gdp_df.add_suffix('_GDP')
gdp_df

Unnamed: 0,Albania_GDP,Algeria_GDP,Andorra_GDP,Angola_GDP,Antigua and Barbuda_GDP,Argentina_GDP,Armenia_GDP,Aruba_GDP,Australia_GDP,Austria_GDP,...,Ukraine_GDP,United Kingdom_GDP,United States of America_GDP,Uruguay_GDP,Uzbekistan_GDP,Vanuatu_GDP,Vietnam_GDP,Yemen_GDP,Zambia_GDP,Zimbabwe_GDP
0,3480355000.0,54790390000.0,1429049000.0,9129595000.0,826370400.0,284203800000.0,1911564000.0,1873184000.0,415576200000.0,197289600000.0,...,32375280000.0,1662127000000.0,10250947997000,22823260000.0,13760510000.0,272014700.0,31172520000.0,9652436000.0,3600683000.0,6689958000.0
1,3922101000.0,54744710000.0,1546926000.0,8936064000.0,800481500.0,268696800000.0,2118468000.0,1896648000.0,379083900000.0,197508800000.0,...,39309580000.0,1643908000000.0,10581929774000,20898790000.0,11401420000.0,257926900.0,32685200000.0,9861560000.0,4094481000.0,6777385000.0
2,4348068000.0,56760360000.0,1755910000.0,15285590000.0,814381500.0,97724000000.0,2376335000.0,1962011000.0,395342700000.0,214394900000.0,...,43956370000.0,1784077000000.0,10929112955000,13606490000.0,9687789000.0,262596600.0,35064110000.0,10694630000.0,4193846000.0,6342116000.0
3,5611496000.0,67863830000.0,2361727000.0,17812700000.0,856396300.0,127587000000.0,2807061000.0,2044134000.0,467390800000.0,262273600000.0,...,52010240000.0,2057094000000.0,11456442041000,12045630000.0,10134450000.0,314471300.0,39552510000.0,11777970000.0,4901840000.0,5727592000.0
4,7184686000.0,85332580000.0,2894922000.0,23552050000.0,919729600.0,164657900000.0,3576615000.0,2254749000.0,614166300000.0,301457600000.0,...,67220150000.0,2421814000000.0,12217193198000,13686330000.0,12030020000.0,364996900.0,45427850000.0,13872790000.0,6221078000.0,5805598000.0
5,8052074000.0,103198200000.0,3159905000.0,36970920000.0,1022963000.0,198737100000.0,4900470000.0,2359777000.0,695075200000.0,316092300000.0,...,89239370000.0,2544829000000.0,13039199193000,17362860000.0,14307510000.0,394962600.0,57633260000.0,16746340000.0,8331870000.0,5755215000.0
6,8896073000.0,117027300000.0,3456442000.0,52381010000.0,1157663000.0,232557300000.0,6384452000.0,2469832000.0,747556200000.0,336280100000.0,...,111884800000.0,2717060000000.0,13815586948000,19579460000.0,17330830000.0,439376800.0,66371660000.0,19061980000.0,12756860000.0,5443896000.0
7,10677320000.0,134977100000.0,3952601000.0,65266450000.0,1312759000.0,287530500000.0,9206302000.0,2677654000.0,853955400000.0,389185600000.0,...,148733900000.0,3106182000000.0,14474226905000,23410570000.0,22311390000.0,516392900.0,77414430000.0,21650530000.0,14056960000.0,5291950000.0
8,12881350000.0,171000700000.0,4085631000.0,88538610000.0,1370070000.0,361558000000.0,11662040000.0,2843017000.0,1055127000000.0,432051900000.0,...,188111100000.0,2938882000000.0,14769857911000,30366210000.0,29549440000.0,590748200.0,99130300000.0,26910850000.0,17910860000.0,4415703000.0
9,12044210000.0,137211000000.0,3674410000.0,70307170000.0,1228330000.0,332976500000.0,8647937000.0,2553631000.0,928043000000.0,401758700000.0,...,121552800000.0,2425798000000.0,14478064934000,31660910000.0,33689220000.0,592622500.0,106014700000.0,25130270000.0,15328340000.0,9665793000.0


In [5]:
# read in the Life Expectancy data
le_df = pd.read_csv('../Clean_Data/transformed/le_clean-transform.csv')
le_df

Unnamed: 0,Year,Albania,Algeria,Angola,Antigua and Barbuda,Argentina,Armenia,Aruba,Australia,Austria,...,United Kingdom,United States of America,Uruguay,Uzbekistan,Vanuatu,Venezuela,Vietnam,Yemen,Zambia,Zimbabwe
0,2000,73.96,70.64,46.52,73.94,73.58,71.41,73.79,79.23,78.13,...,77.74,76.64,74.73,67.16,67.36,72.11,73.03,60.68,44.0,44.65
1,2001,74.29,71.12,47.06,74.17,73.76,71.8,73.85,79.63,78.58,...,77.99,76.84,74.94,67.37,67.58,72.29,73.23,61.22,44.62,44.01
2,2002,74.58,71.61,47.7,74.4,73.93,72.11,73.94,79.94,78.68,...,78.14,76.94,75.15,67.61,67.79,72.47,73.44,61.78,45.4,43.52
3,2003,74.83,72.1,48.44,74.61,74.11,72.35,74.04,80.24,78.63,...,78.45,77.04,75.37,67.87,67.99,72.62,73.65,62.36,46.32,43.2
4,2004,75.04,72.59,49.26,74.82,74.28,72.51,74.16,80.49,79.18,...,78.75,77.49,75.6,68.14,68.18,72.75,73.88,62.93,47.35,43.07
5,2005,75.23,73.07,50.17,75.02,74.45,72.63,74.29,80.84,79.33,...,79.05,77.49,75.83,68.41,68.35,72.85,74.09,63.48,48.5,43.24
6,2006,75.42,73.52,51.14,75.2,74.62,72.72,74.43,81.04,79.88,...,79.25,77.69,76.05,68.68,68.52,72.94,74.3,64.0,49.76,43.85
7,2007,75.65,73.94,52.18,75.38,74.79,72.82,74.58,81.29,80.18,...,79.45,77.99,76.25,68.94,68.67,73.01,74.47,64.47,51.13,44.95
8,2008,75.91,74.31,53.24,75.54,74.95,72.95,74.73,81.4,80.43,...,79.6,78.04,76.44,69.19,68.82,73.07,74.63,64.89,52.61,46.5
9,2009,76.22,74.64,54.31,75.68,75.12,73.12,74.87,81.54,80.33,...,80.05,78.39,76.6,69.44,68.97,73.12,74.75,65.26,54.13,48.45


In [6]:
# Drop the country column to align
le_df = le_df.drop(['Year','Slovakia','Venezuela'], axis=1)

# Add a suffix to the column names
le_df = le_df.add_suffix('_LEx')
le_df

Unnamed: 0,Albania_LEx,Algeria_LEx,Angola_LEx,Antigua and Barbuda_LEx,Argentina_LEx,Armenia_LEx,Aruba_LEx,Australia_LEx,Austria_LEx,Azerbaijan_LEx,...,Ukraine_LEx,United Kingdom_LEx,United States of America_LEx,Uruguay_LEx,Uzbekistan_LEx,Vanuatu_LEx,Vietnam_LEx,Yemen_LEx,Zambia_LEx,Zimbabwe_LEx
0,73.96,70.64,46.52,73.94,73.58,71.41,73.79,79.23,78.13,66.76,...,67.68,77.74,76.64,74.73,67.16,67.36,73.03,60.68,44.0,44.65
1,74.29,71.12,47.06,74.17,73.76,71.8,73.85,79.63,78.58,67.05,...,67.84,77.99,76.84,74.94,67.37,67.58,73.23,61.22,44.62,44.01
2,74.58,71.61,47.7,74.4,73.93,72.11,73.94,79.94,78.68,67.39,...,68.28,78.14,76.94,75.15,67.61,67.79,73.44,61.78,45.4,43.52
3,74.83,72.1,48.44,74.61,74.11,72.35,74.04,80.24,78.63,67.79,...,68.21,78.45,77.04,75.37,67.87,67.99,73.65,62.36,46.32,43.2
4,75.04,72.59,49.26,74.82,74.28,72.51,74.16,80.49,79.18,68.25,...,68.19,78.75,77.49,75.6,68.14,68.18,73.88,62.93,47.35,43.07
5,75.23,73.07,50.17,75.02,74.45,72.63,74.29,80.84,79.33,68.75,...,67.96,79.05,77.49,75.83,68.41,68.35,74.09,63.48,48.5,43.24
6,75.42,73.52,51.14,75.2,74.62,72.72,74.43,81.04,79.88,69.26,...,68.08,79.25,77.69,76.05,68.68,68.52,74.3,64.0,49.76,43.85
7,75.65,73.94,52.18,75.38,74.79,72.82,74.58,81.29,80.18,69.75,...,68.22,79.45,77.99,76.25,68.94,68.67,74.47,64.47,51.13,44.95
8,75.91,74.31,53.24,75.54,74.95,72.95,74.73,81.4,80.43,70.2,...,68.25,79.6,78.04,76.44,69.19,68.82,74.63,64.89,52.61,46.5
9,76.22,74.64,54.31,75.68,75.12,73.12,74.87,81.54,80.33,70.6,...,69.19,80.05,78.39,76.6,69.44,68.97,74.75,65.26,54.13,48.45


In [7]:
le_gdp_df = le_df.merge(gdp_df,left_index=True, right_index=True)

In [13]:
# Create a FeatureHasher instance
fh = FeatureHasher(input_type='string')

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(fh.fit_transform(systems_df))

encode_df.head()

Unnamed: 0,0
0,"(0, 98813)\t-1.0\n (0, 196981)\t-1.0\n (0,..."
1,"(0, 196981)\t-1.0\n (0, 226289)\t1.0\n (0,..."
2,"(0, 196981)\t-1.0\n (0, 226289)\t2.0\n (0,..."
3,"(0, 196981)\t-1.0\n (0, 296051)\t1.0\n (0,..."
4,"(0, 15072)\t2.0\n (0, 98813)\t-1.0\n (0, 1..."


In [14]:
le_gdp_css_df = le_gdp_df.merge(encode_df,left_index=True, right_index=True)
le_gdp_css_df.dropna(how='all')

# Writing out a CSV allows us to manually review the data
le_gdp_css_df.to_csv('../Clean_Data/le_gdp_css_df.csv', index=False)

le_gdp_css_df

Unnamed: 0,Albania_LEx,Algeria_LEx,Angola_LEx,Antigua and Barbuda_LEx,Argentina_LEx,Armenia_LEx,Aruba_LEx,Australia_LEx,Austria_LEx,Azerbaijan_LEx,...,United Kingdom_GDP,United States of America_GDP,Uruguay_GDP,Uzbekistan_GDP,Vanuatu_GDP,Vietnam_GDP,Yemen_GDP,Zambia_GDP,Zimbabwe_GDP,0
0,73.96,70.64,46.52,73.94,73.58,71.41,73.79,79.23,78.13,66.76,...,1662127000000.0,10250947997000,22823260000.0,13760510000.0,272014700.0,31172520000.0,9652436000.0,3600683000.0,6689958000.0,"(0, 98813)\t-1.0\n (0, 196981)\t-1.0\n (0,..."
1,74.29,71.12,47.06,74.17,73.76,71.8,73.85,79.63,78.58,67.05,...,1643908000000.0,10581929774000,20898790000.0,11401420000.0,257926900.0,32685200000.0,9861560000.0,4094481000.0,6777385000.0,"(0, 196981)\t-1.0\n (0, 226289)\t1.0\n (0,..."
2,74.58,71.61,47.7,74.4,73.93,72.11,73.94,79.94,78.68,67.39,...,1784077000000.0,10929112955000,13606490000.0,9687789000.0,262596600.0,35064110000.0,10694630000.0,4193846000.0,6342116000.0,"(0, 196981)\t-1.0\n (0, 226289)\t2.0\n (0,..."
3,74.83,72.1,48.44,74.61,74.11,72.35,74.04,80.24,78.63,67.79,...,2057094000000.0,11456442041000,12045630000.0,10134450000.0,314471300.0,39552510000.0,11777970000.0,4901840000.0,5727592000.0,"(0, 196981)\t-1.0\n (0, 296051)\t1.0\n (0,..."
4,75.04,72.59,49.26,74.82,74.28,72.51,74.16,80.49,79.18,68.25,...,2421814000000.0,12217193198000,13686330000.0,12030020000.0,364996900.0,45427850000.0,13872790000.0,6221078000.0,5805598000.0,"(0, 15072)\t2.0\n (0, 98813)\t-1.0\n (0, 1..."
5,75.23,73.07,50.17,75.02,74.45,72.63,74.29,80.84,79.33,68.75,...,2544829000000.0,13039199193000,17362860000.0,14307510000.0,394962600.0,57633260000.0,16746340000.0,8331870000.0,5755215000.0,"(0, 196981)\t-1.0\n (0, 226289)\t1.0\n (0,..."
6,75.42,73.52,51.14,75.2,74.62,72.72,74.43,81.04,79.88,69.26,...,2717060000000.0,13815586948000,19579460000.0,17330830000.0,439376800.0,66371660000.0,19061980000.0,12756860000.0,5443896000.0,"(0, 196981)\t-1.0\n (0, 226289)\t1.0\n (0,..."
7,75.65,73.94,52.18,75.38,74.79,72.82,74.58,81.29,80.18,69.75,...,3106182000000.0,14474226905000,23410570000.0,22311390000.0,516392900.0,77414430000.0,21650530000.0,14056960000.0,5291950000.0,"(0, 15072)\t1.0\n (0, 98813)\t-1.0\n (0, 1..."
8,75.91,74.31,53.24,75.54,74.95,72.95,74.73,81.4,80.43,70.2,...,2938882000000.0,14769857911000,30366210000.0,29549440000.0,590748200.0,99130300000.0,26910850000.0,17910860000.0,4415703000.0,"(0, 15072)\t1.0\n (0, 196981)\t-1.0\n (0, ..."
9,76.22,74.64,54.31,75.68,75.12,73.12,74.87,81.54,80.33,70.6,...,2425798000000.0,14478064934000,31660910000.0,33689220000.0,592622500.0,106014700000.0,25130270000.0,15328340000.0,9665793000.0,"(0, 15072)\t1.0\n (0, 196981)\t-1.0\n (0, ..."


In [15]:
# manually clean the dataframe
# Needed to manually remove the rows for 2000 and 2021 - due to missing values.
le_gdp_css_df = pd.read_csv('../Clean_Data/le_gdp_css_df.csv')
le_gdp_css_df

Unnamed: 0,Albania_LEx,Algeria_LEx,Angola_LEx,Antigua and Barbuda_LEx,Argentina_LEx,Armenia_LEx,Aruba_LEx,Australia_LEx,Austria_LEx,Azerbaijan_LEx,...,United Kingdom_GDP,United States of America_GDP,Uruguay_GDP,Uzbekistan_GDP,Vanuatu_GDP,Vietnam_GDP,Yemen_GDP,Zambia_GDP,Zimbabwe_GDP,SSS
0,74.29,71.12,47.06,74.17,73.76,71.8,73.85,79.63,78.58,67.05,...,1643908000000.0,10581929774000,20898790000.0,11401420000.0,257926900.0,32685200000.0,9861560000.0,4094481000.0,6777385000.0,"(0, 196981)\t-1.0\n (0, 226289)\t1.0\n (0,..."
1,74.58,71.61,47.7,74.4,73.93,72.11,73.94,79.94,78.68,67.39,...,1784077000000.0,10929112955000,13606490000.0,9687789000.0,262596600.0,35064110000.0,10694630000.0,4193846000.0,6342116000.0,"(0, 196981)\t-1.0\n (0, 226289)\t2.0\n (0,..."
2,74.83,72.1,48.44,74.61,74.11,72.35,74.04,80.24,78.63,67.79,...,2057094000000.0,11456442041000,12045630000.0,10134450000.0,314471300.0,39552510000.0,11777970000.0,4901840000.0,5727592000.0,"(0, 196981)\t-1.0\n (0, 296051)\t1.0\n (0,..."
3,75.04,72.59,49.26,74.82,74.28,72.51,74.16,80.49,79.18,68.25,...,2421814000000.0,12217193198000,13686330000.0,12030020000.0,364996900.0,45427850000.0,13872790000.0,6221078000.0,5805598000.0,"(0, 15072)\t2.0\n (0, 98813)\t-1.0\n (0, 1..."
4,75.23,73.07,50.17,75.02,74.45,72.63,74.29,80.84,79.33,68.75,...,2544829000000.0,13039199193000,17362860000.0,14307510000.0,394962600.0,57633260000.0,16746340000.0,8331870000.0,5755215000.0,"(0, 196981)\t-1.0\n (0, 226289)\t1.0\n (0,..."
5,75.42,73.52,51.14,75.2,74.62,72.72,74.43,81.04,79.88,69.26,...,2717060000000.0,13815586948000,19579460000.0,17330830000.0,439376800.0,66371660000.0,19061980000.0,12756860000.0,5443896000.0,"(0, 196981)\t-1.0\n (0, 226289)\t1.0\n (0,..."
6,75.65,73.94,52.18,75.38,74.79,72.82,74.58,81.29,80.18,69.75,...,3106182000000.0,14474226905000,23410570000.0,22311390000.0,516392900.0,77414430000.0,21650530000.0,14056960000.0,5291950000.0,"(0, 15072)\t1.0\n (0, 98813)\t-1.0\n (0, 1..."
7,75.91,74.31,53.24,75.54,74.95,72.95,74.73,81.4,80.43,70.2,...,2938882000000.0,14769857911000,30366210000.0,29549440000.0,590748200.0,99130300000.0,26910850000.0,17910860000.0,4415703000.0,"(0, 15072)\t1.0\n (0, 196981)\t-1.0\n (0, ..."
8,76.22,74.64,54.31,75.68,75.12,73.12,74.87,81.54,80.33,70.6,...,2425798000000.0,14478064934000,31660910000.0,33689220000.0,592622500.0,106014700000.0,25130270000.0,15328340000.0,9665793000.0,"(0, 15072)\t1.0\n (0, 196981)\t-1.0\n (0, ..."
9,76.56,74.94,55.35,75.82,75.28,73.33,75.02,81.7,80.58,70.94,...,2491110000000.0,15048964444000,40284480000.0,49765680000.0,670713200.0,147198900000.0,30906750000.0,20265560000.0,12041660000.0,"(0, 98813)\t-1.0\n (0, 196981)\t-1.0\n (0,..."


In [16]:
# Standardize the data with StandardScaler().
scaler = StandardScaler()

le_gdp_css_scaled_nda = scaler.fit_transform(le_gdp_css_df)

le_gdp_css_scaled_nda

ValueError: could not convert string to float: '  (0, 196981)\t-1.0\n  (0, 226289)\t1.0\n  (0, 334826)\t-1.0\n  (0, 354738)\t1.0\n  (0, 364501)\t-1.0\n  (0, 386348)\t-2.0\n  (0, 715523)\t1.0\n  (0, 803687)\t1.0\n  (0, 849870)\t1.0\n  (0, 879148)\t1.0'

In [None]:
# Using PCA to reduce dimension to three principal components.

pca = PCA(n_components=3)

# Get three principal components for the demographics data

demo_pca = pca.fit_transform(le_gdp_css_scaled_nda)

In [None]:
# Create a DataFrame with the three principal components.
pcs_df = pd.DataFrame(data=demo_pca, columns=['pc1', 'pc2', 'pc3'], index=le_gdp_css_df.index)
pcs_df

In [None]:
# Create an elbow curve to find the best value for K.
inertia = []
k= list(range(1,11))

# Calculate the inertia for a range of k values

for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)
    
# Create the elbow curve
elbow_data = {'k': k, 'inertia': inertia}
elbow_df = pd.DataFrame(elbow_data)

elbow_df.hvplot(x='k', y='inertia', xticks=k, title='Elbow Curve')


In [None]:
# Initialize the K-Means model.
model = KMeans(n_clusters=3, random_state=0)

# Fit the model
model.fit(pcs_df)

# Make predictions
pred = model.predict(pcs_df)

# Add the predicted class columns
pcs_df['class'] = model.labels_
pcs_df.head()

In [None]:
# Create a new DataFrame including predicted clusters and demographic features.
# Concatentate the crypto_df and pcs_df DataFrames.
frames = [le_gdp_css_df, pcs_df]
clustered_df = pd.concat(frames, axis=1, join='outer')
clustered_df.index = le_gdp_css_df.index

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

In [None]:
# Creating a 3D-Scatter with the PCA data and the clusters

fig = px.scatter_3d(clustered_df, x='pc1', y='pc2', z='pc3', color='class', symbol='class', width=800)
fig.update_layout(legend=dict(x=0,y=1))
fig.show()

In [None]:
clustered_df.to_csv('clustered_le_gdp_css_df.csv', index=False)