In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import euclidean_distances
from sklearn import preprocessing

In [2]:
# Countries/regions of interest; the cells below select rows against this list.
region_list = [
    'United States', 'United Kingdom', 'Germany', 'France', 'Australia',
    'South Africa', 'India', 'Chile', 'Mexico', 'Spain',
    'Brazil', 'Portugal', 'Egypt', 'Qatar',
    'Russia', 'Israel', 'Japan', 'China', 'Hong Kong',
]

In [3]:
def fix_regions(df):
    """Harmonise region names in ``df.index`` with the spellings used in ``region_list``.

    The following substring replacements are applied, in order, to every
    string label of the index:

    * 'Great Britain'             -> 'United Kingdom'
    * 'Hong Kong SAR'             -> 'Hong Kong'
    * 'Hong Kong S.A.R. of China' -> 'Hong Kong'
    * 'Russian Federation'        -> 'Russia'

    Labels that are not strings (e.g. NaN coming from blank spreadsheet rows)
    are passed through unchanged instead of raising ``AttributeError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds region names. It is modified in place: the
        index is replaced by a plain list of labels, so any index name is
        dropped.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    replacements = (
        ('Great Britain', 'United Kingdom'),
        ('Hong Kong SAR', 'Hong Kong'),
        ('Hong Kong S.A.R. of China', 'Hong Kong'),
        ('Russian Federation', 'Russia'),
    )

    def _normalise(label):
        # Only strings have .replace(); leave NaN / numeric labels untouched.
        if not isinstance(label, str):
            return label
        for old, new in replacements:
            label = label.replace(old, new)
        return label

    df.index = [_normalise(label) for label in df.index]
    return df

# Individual level

## Life expectancy  
Missing data: no   
Year: 2021

In [25]:
# Load the table indexed by its first column, keep the regions of interest
# (in region_list order), keep only the 2021 rows, and shorten the value column name.
life_expectancy = (
    pd.read_csv('dataset/raw_data/life-expectancy.csv', index_col=0)
    .loc[region_list]
    .loc[lambda d: d.Year == 2021]
    .rename(columns={'Life expectancy at birth (historical)': 'life_expectancy'})
)

# Pairwise Euclidean distances on the single life_expectancy column,
# labelled on both axes by region.
life_expectancy_dist = pd.DataFrame(
    euclidean_distances(life_expectancy[['life_expectancy']]),
    index=life_expectancy.index,
    columns=life_expectancy.index,
)
life_expectancy_dist.to_csv('rdm/checked/life_expectancy_dist.csv')

# Regions of interest with no row left after filtering.
set(region_list) - set(life_expectancy.index)

set()

## SWB  
Missing data: Qatar   
Year: 2022

In [91]:
# Qatar is left out of the selection for this measure.
regions = [name for name in region_list if name not in ['Qatar']]

# Read the sheet, harmonise the Hong Kong label, and keep the selected regions.
SWB = (
    pd.read_excel('dataset/raw_data/Appendix_2_Data_for_Figure_2.1.xls', index_col=0)
    .assign(Country=lambda d: d['Country'].replace({'Hong Kong S.A.R. of China': 'Hong Kong'}))
    .loc[lambda d: d['Country'].isin(regions)]
)

# Pairwise Euclidean distances on the 'Happiness score' column, labelled by country.
SWB_dist = pd.DataFrame(
    euclidean_distances(SWB[['Happiness score']]),
    index=SWB['Country'],
    columns=SWB['Country'],
)
SWB_dist.to_csv('rdm/checked/SWB_dist.csv')

# Regions of interest that are absent from the selection.
set(region_list) - set(SWB['Country'])

{'Qatar'}

## Positive affect
Missing data: Qatar  
Year: 2021

In [93]:
# Read only the columns needed for this measure and index the rows by country name.
positive_affect = pd.read_excel('dataset/raw_data/DataForTable2.1.xls',
                                usecols=['Country name', 'year', 'Positive affect'])
positive_affect.index = positive_affect['Country name']
positive_affect = fix_regions(positive_affect)

# Drop incomplete rows, then keep the 2021 observations.
positive_affect = positive_affect.dropna()
positive_affect = positive_affect[positive_affect.year == 2021]

# Restrict to the regions of interest, as the other measures do. Without this
# step every country with a 2021 value ended up in the saved distance matrix.
positive_affect = positive_affect[positive_affect.index.isin(region_list)]

# Pairwise Euclidean distances on the 'Positive affect' column, labelled by region.
positive_affect_dist = pd.DataFrame(
    euclidean_distances(positive_affect[['Positive affect']]),
    index=positive_affect.index,
    columns=positive_affect.index,
)
positive_affect_dist.to_csv('rdm/checked/positive_affect_dist.csv')

# Regions of interest with no usable 2021 value.
set(region_list) - set(positive_affect.index)

{'Qatar'}

## IQ  
Missing data: no    
Year: 2023

In [101]:
# Load the IQ table and keep the rows whose 'country' is a region of interest.
IQ = pd.read_csv('dataset/raw_data/IQ_raw.csv', index_col=0)
IQ = IQ[IQ['country'].isin(region_list)]

# Pairwise Euclidean distances on the 'iq' column, labelled by country.
IQ_dist = pd.DataFrame(
    euclidean_distances(IQ[['iq']]),
    index=IQ['country'],
    columns=IQ['country'],
)
IQ_dist.to_csv('rdm/checked/IQ_dist.csv')

# Regions of interest that are absent from the table.
set(region_list) - set(IQ['country'])

set()

## Early childhood development (ECD)

In [131]:
# Read the workbook with the first column as index and spreadsheet row 8
# (header=7, zero-based) as the header row; discard rows whose index label is
# missing before the labels are passed to fix_regions.
child_develop = fix_regions(
    pd.read_excel(
        'dataset/raw_data/XLS_Early-childhood-development-database-May-2022.xlsx',
        index_col=0,
        header=7,
    ).loc[lambda d: ~d.index.isna()]
)

# Keep only the regions of interest.
child_develop = child_develop[child_develop.index.isin(region_list)]

# Regions of interest that are absent from the workbook.
set(region_list) - set(child_develop.index)

{'Hong Kong'}

In [132]:
# Display the region-filtered early-childhood-development table for inspection.
# NOTE(review): the rows visible in the saved output hold only '-' or empty
# cells; presumably that is why no distance matrix is built for this measure — confirm.
child_develop

Unnamed: 0,Children developmentally on track (%)\n(2013-2021)*,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Source,Unnamed: 8,Children on track in development status measured by ECDI (%)\n(2013-2021)*,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Source.1
Australia,-,,-,,-,,,,,,,,,,
Brazil,-,,-,,-,,,,,,,,,,
Chile,-,,-,,-,,,,,,,,,,
China,-,,-,,-,,,,,,,,,,
Egypt,-,,-,,-,,,,,,,,,,
France,-,,-,,-,,,,,,,,,,
Germany,-,,-,,-,,,,,,,,,,
India,-,,-,,-,,,,,,,,,,
Israel,-,,-,,-,,,,,,,,,,
Japan,-,,-,,-,,,,,,,,,,


## Youth suicide rate

In [4]:
# Load sheet 'Table S3', name the unnamed first column 'Country', harmonise
# the Russia label, drop rows without a country, and keep the regions of interest.
SCDI = (
    pd.read_excel('dataset/raw_data/sustainability-10-01563-s001.xlsx',
                  sheet_name='Table S3', header=3)
    .rename(columns={'Unnamed: 0': 'Country'})
    .assign(Country=lambda d: d['Country'].replace({'Russian Federation': 'Russia'}))
    .loc[lambda d: ~d.Country.isna()]
    .loc[lambda d: d.Country.isin(region_list)]
)

young_suicide = SCDI[['Country', 'Suicide rate, 15-29 year-olds, per 100000']]

# Pairwise Euclidean distances on the suicide-rate column, labelled by country.
young_suicide_dist = pd.DataFrame(
    euclidean_distances(young_suicide[['Suicide rate, 15-29 year-olds, per 100000']]),
    index=young_suicide['Country'],
    columns=young_suicide['Country'],
)
young_suicide_dist.to_csv('rdm/checked/young_suicide_dist.csv')

# Regions of interest that are absent from the sheet.
set(region_list) - set(SCDI.Country)

{'Hong Kong'}

# Interpersonal level

## Marriage rate   
Missing data: India  
Year: 2005  
Many countries have gaps in this series; 2005 is the year for which data are available for the largest number of the selected countries.

In [51]:
# India is left out of the selection for this measure.
regions = [name for name in region_list if name not in ['India']]

# Load the table indexed by its first column, select the regions, keep the
# 2005 rows, and shorten the value column name.
marriage = (
    pd.read_csv('dataset/OurData/marriage-rate-per-1000-inhabitants.csv', index_col=0)
    .loc[regions]
    .loc[lambda d: d.Year == 2005]
    .rename(columns={'Crude marriage rate (per 1,000 inhabitants)': 'marriage'})
)

# Pairwise Euclidean distances on the 'marriage' column, labelled by region.
marriage_dist = pd.DataFrame(
    euclidean_distances(marriage[['marriage']]),
    index=marriage.index,
    columns=marriage.index,
)
marriage_dist.to_csv('rdm/checked/marriage_dist.csv')

# Regions of interest with no 2005 row.
set(region_list) - set(marriage.index)

{'India'}

## Divorce rate   
Missing data: Chile, India  
Year: 2005  
Many countries have gaps in this series; 2005 is the year for which data are available for the largest number of the selected countries.

In [52]:
# India is left out of the label-based selection for this measure.
regions = [name for name in region_list if name not in ['India']]

# Load the table indexed by its first column, select the regions, keep the
# 2005 rows, and shorten the value column name.
divorce = (
    pd.read_csv('dataset/OurData/divorces-per-1000-people.csv', index_col=0)
    .loc[regions]
    .loc[lambda d: d.Year == 2005]
    .rename(columns={'Crude divorce rate (per 1,000 inhabitants)': 'divorce'})
)

# Pairwise Euclidean distances on the 'divorce' column, labelled by region.
divorce_dist = pd.DataFrame(
    euclidean_distances(divorce[['divorce']]),
    index=divorce.index,
    columns=divorce.index,
)
divorce_dist.to_csv('rdm/checked/divorce_dist.csv')

# Regions of interest with no 2005 row.
set(region_list) - set(divorce.index)

{'Chile', 'India'}

## Trust your family  
Missing data: Israel, France   
Year: mean(2010-2022)   
The collection years vary among different countries, so we selected a year range that covers as many countries as possible. If a country or region has multiple data collection points within the selected year range, the data will be averaged.

In [72]:
# --- All regions except Portugal (handled separately below) and Israel ---
trust_family_others = pd.read_csv('dataset/WVS/WVS_final_mean.csv', index_col=0)
trust_family_others = trust_family_others[['Year', 'How much do you trust your family (4-point scale)']]
# Harmonise region labels, then drop incomplete rows. dropna() returns a new
# frame, which avoids mutating a column slice in place.
trust_family_others = fix_regions(trust_family_others).dropna()

regions = [i for i in region_list if i not in ['Portugal', 'Israel']]
trust_family_others = trust_family_others.loc[regions]

# Keep survey years 2010-2022 (both bounds included by the strict comparisons).
trust_family_others = trust_family_others[(trust_family_others.Year > 2009)
                                          & (trust_family_others.Year < 2023)]

# Average the remaining observations per region. assign() builds a new frame
# instead of writing a column into a filtered slice (SettingWithCopyWarning).
trust_family_others = (
    trust_family_others
    .rename(columns={'How much do you trust your family (4-point scale)': 'Trust'})
    .assign(Country=lambda d: d.index)
    .groupby('Country')[['Trust']]
    .mean()
)

# Portugal: 2014
trust_family_portugal = pd.read_csv('dataset/WVS/Portugal_final_mean.csv', index_col=0)
trust_family_portugal = trust_family_portugal.rename(columns={'Trust: Your family (B)': 'Trust'})
trust_family_portugal = trust_family_portugal[['Trust']]
trust_family = pd.concat([trust_family_others, trust_family_portugal])

# Pairwise Euclidean distances on the 'Trust' column, labelled by region.
trust_family_dist = pd.DataFrame(
    euclidean_distances(trust_family[['Trust']]),
    index=trust_family.index,
    columns=trust_family.index,
)
trust_family_dist.to_csv('rdm/checked/trust_family_dist.csv')

# Regions of interest with no value in the combined table.
set(region_list) - set(trust_family.index)

{'France', 'Israel'}

## Trust your neighborhood  
Missing data: Israel, France   
Year: mean(2010-2022)   
The collection years vary among different countries, so we selected a year range that covers as many countries as possible. If a country or region has multiple data collection points within the selected year range, the data will be averaged.

In [77]:
# --- All regions except Portugal (handled separately below) and Israel ---
trust_neighborhood_others = pd.read_csv('dataset/WVS/WVS_final_mean.csv', index_col=0)
trust_neighborhood_others = trust_neighborhood_others[['Year', 'Trust: Your neighborhood (B)']]
# Harmonise region labels, then drop incomplete rows. dropna() returns a new
# frame, which avoids mutating a column slice in place.
trust_neighborhood_others = fix_regions(trust_neighborhood_others).dropna()

regions = [i for i in region_list if i not in ['Portugal', 'Israel']]
trust_neighborhood_others = trust_neighborhood_others.loc[regions]

# Keep survey years 2010-2022 (both bounds included by the strict comparisons).
trust_neighborhood_others = trust_neighborhood_others[(trust_neighborhood_others.Year > 2009)
                                                      & (trust_neighborhood_others.Year < 2023)]

# Average the remaining observations per region. assign() builds a new frame
# instead of writing a column into a filtered slice (SettingWithCopyWarning).
trust_neighborhood_others = (
    trust_neighborhood_others
    .rename(columns={'Trust: Your neighborhood (B)': 'Trust'})
    .assign(Country=lambda d: d.index)
    .groupby('Country')[['Trust']]
    .mean()
)

# Portugal: 2014
trust_neighborhood_portugal = pd.read_csv('dataset/WVS/Portugal_final_mean.csv', index_col=0)
trust_neighborhood_portugal = trust_neighborhood_portugal.rename(columns={'Trust: Your neighborhood (B)': 'Trust'})
trust_neighborhood_portugal = trust_neighborhood_portugal[['Trust']]
trust_neighborhood = pd.concat([trust_neighborhood_others, trust_neighborhood_portugal])

# Pairwise Euclidean distances on the 'Trust' column, labelled by region.
trust_neighborhood_dist = pd.DataFrame(
    euclidean_distances(trust_neighborhood[['Trust']]),
    index=trust_neighborhood.index,
    columns=trust_neighborhood.index,
)
trust_neighborhood_dist.to_csv('rdm/checked/trust_neighborhood_dist.csv')

# Regions of interest with no value in the combined table.
set(region_list) - set(trust_neighborhood.index)

{'France', 'Israel'}

## Social support 
Missing data: Qatar     
Year: 2021

In [94]:
# Read only the columns needed for this measure and index the rows by country name.
social_support = pd.read_excel('dataset/raw_data/DataForTable2.1.xls',
                               usecols=['Country name', 'year', 'Social support'])
social_support.index = social_support['Country name']
social_support = fix_regions(social_support)

# Drop incomplete rows, then keep the 2021 observations.
social_support = social_support.dropna()
social_support = social_support[social_support.year == 2021]

# Restrict to the regions of interest, as the other measures do. Without this
# step every country with a 2021 value ended up in the saved distance matrix.
social_support = social_support[social_support.index.isin(region_list)]

# Pairwise Euclidean distances on the 'Social support' column, labelled by region.
social_support_dist = pd.DataFrame(
    euclidean_distances(social_support[['Social support']]),
    index=social_support.index,
    columns=social_support.index,
)
social_support_dist.to_csv('rdm/checked/social_support_dist.csv')

# Regions of interest with no usable 2021 value.
set(region_list) - set(social_support.index)

{'Qatar'}

## Intergroup differentiation
Missing data: no     
Year: 2020

In [103]:
ECD = pd.read_csv('dataset/ECD/ECD Data v3.csv', index_col=0)

# Keep the country and prejudice columns for the regions of interest and
# collapse fully identical (Country, prejudice) rows.
prejudice = (
    ECD.loc[ECD['Country'].isin(region_list), ['Country', 'prejudice']]
    .drop_duplicates()
)

# Pairwise Euclidean distances on the 'prejudice' column, labelled by country.
prejudice_dist = pd.DataFrame(
    euclidean_distances(prejudice[['prejudice']]),
    index=prejudice['Country'],
    columns=prejudice['Country'],
)
prejudice_dist.to_csv('rdm/checked/prejudice_ECD_dist.csv')

# Regions of interest that are absent from the table.
set(region_list) - set(prejudice['Country'])

set()

# Societal level   


## HDI
Missing data: no  
Year: 2021

In [5]:
# Read the HDI table, name the unnamed country column, harmonise the
# Hong Kong and Russia labels, and keep the regions of interest.
HDI = (
    pd.read_excel('dataset/raw_data/HDR21-22_Statistical_Annex_HDI_Table.xlsx',
                  header=4, index_col=0)
    .rename(columns={'Unnamed: 1': 'Country'})
    .assign(Country=lambda d: d['Country'].replace({
        'Hong Kong, China (SAR)': 'Hong Kong',
        'Russian Federation': 'Russia',
    }))
    .loc[lambda d: d['Country'].isin(region_list)]
)

# Report regions of interest that are absent from the table.
missing_regions = set(region_list) - set(HDI['Country'])
print(f'Missing data: {missing_regions}')

# Pairwise Euclidean distances on the HDI column, labelled by country.
# The column label used here ends with a trailing space.
HDI_dist = pd.DataFrame(
    euclidean_distances(HDI[['Human Development Index (HDI) ']]),
    index=HDI['Country'],
    columns=HDI['Country'],
)
HDI_dist.to_csv('rdm/checked/HDI_dist.csv')

Missing data: set()


## Gender Inequality Index
Missing data: Hong Kong   
Year: 2021

In [67]:
# Hong Kong is left out of the selection for this measure.
regions = [name for name in region_list if name not in ['Hong Kong']]

# Read the GII table, name the unnamed country column, harmonise the
# Hong Kong and Russia labels, and keep the selected regions.
Gender_equal = (
    pd.read_excel('dataset/raw_data/HDR21-22_Statistical_Annex_GII_Table.xlsx',
                  header=3, index_col=0)
    .rename(columns={'Unnamed: 1': 'Country'})
    .assign(Country=lambda d: d['Country'].replace({
        'Hong Kong, China (SAR)': 'Hong Kong',
        'Russian Federation': 'Russia',
    }))
    .loc[lambda d: d['Country'].isin(regions)]
)

# Report regions of interest that are absent from the selection.
missing_regions = set(region_list) - set(Gender_equal['Country'])
print(f'Missing data: {missing_regions}')

# Pairwise Euclidean distances on the 'Gender Inequality Index' column, labelled by country.
Gender_equal_dist = pd.DataFrame(
    euclidean_distances(Gender_equal[['Gender Inequality Index']]),
    index=Gender_equal['Country'],
    columns=Gender_equal['Country'],
)
Gender_equal_dist.to_csv('rdm/checked/Gender_equal_dist.csv')

Missing data: {'Hong Kong'}


## Democracy  
Missing data: no   
Year: 2019

In [26]:
ECD = pd.read_csv('dataset/ECD/ECD Data v3.csv', index_col=0)

# Keep the country and democracy columns for the regions of interest, collapse
# fully identical rows, then drop rows with a missing value.
democracy = (
    ECD.loc[ECD['Country'].isin(region_list), ['Country', 'democracy']]
    .drop_duplicates()
    .dropna()
)

# Pairwise Euclidean distances on the 'democracy' column, labelled by country.
democracy_dist = pd.DataFrame(
    euclidean_distances(democracy[['democracy']]),
    index=democracy['Country'],
    columns=democracy['Country'],
)
democracy_dist.to_csv('rdm/checked/democracy_dist.csv')

# Regions of interest with no democracy value.
set(region_list) - set(democracy['Country'])

set()

## Corruption  
Missing data: no   
Year: 2022

In [29]:
# Load the corruption table with its first column as the index.
# NOTE(review): unlike the sibling cells, no region_list filter is applied
# here; presumably the CSV already contains only the regions of interest — confirm.
corruption = pd.read_csv('dataset/raw_data/corruption perceptions index.csv', index_col=0)

# Pairwise Euclidean distances on the 'CPI' column, labelled by the table's index.
corruption_dist = pd.DataFrame(
    euclidean_distances(corruption[['CPI']]),
    index=corruption.index,
    columns=corruption.index,
)
corruption_dist.to_csv('rdm/checked/corruption_dist.csv')

# Regions of interest that are absent from the table.
set(region_list) - set(corruption.index)

set()

## Confidence in national government
Missing data: China, Qatar  
Year: 2021

In [6]:
# Read only the columns needed for this measure and index the rows by country name.
Institutional_trust = pd.read_excel('dataset/raw_data/DataForTable2.1.xls',
                                    usecols=['Country name', 'year', 'Confidence in national government'])
Institutional_trust.index = Institutional_trust['Country name']
Institutional_trust = fix_regions(Institutional_trust)

# Drop incomplete rows, then keep the 2021 observations.
Institutional_trust = Institutional_trust.dropna()
Institutional_trust = Institutional_trust[Institutional_trust.year == 2021]

# Restrict to the regions of interest, as the other measures do. Without this
# step every country with a 2021 value ended up in the saved distance matrix.
Institutional_trust = Institutional_trust[Institutional_trust.index.isin(region_list)]

# Pairwise Euclidean distances on the confidence column, labelled by region.
Institutional_trust_dist = pd.DataFrame(
    euclidean_distances(Institutional_trust[['Confidence in national government']]),
    index=Institutional_trust.index,
    columns=Institutional_trust.index,
)
Institutional_trust_dist.to_csv('rdm/checked/Institutional_trust_dist.csv')

# Regions of interest with no usable 2021 value.
set(region_list) - set(Institutional_trust.index)

{'China', 'Qatar'}

## SCDI  
Missing data: Hong Kong  
Year: 2015

In [5]:
# Load sheet 'Table S3', name the unnamed first column 'Country', harmonise
# the Russia label, drop rows without a country, and keep the regions of interest.
SCDI = (
    pd.read_excel('dataset/raw_data/sustainability-10-01563-s001.xlsx',
                  sheet_name='Table S3', header=3)
    .rename(columns={'Unnamed: 0': 'Country'})
    .assign(Country=lambda d: d['Country'].replace({'Russian Federation': 'Russia'}))
    .loc[lambda d: ~d.Country.isna()]
    .loc[lambda d: d.Country.isin(region_list)]
)

Sustained_CDI = SCDI[['Country', 'SCDI score']]

# Pairwise Euclidean distances on the 'SCDI score' column, labelled by country.
Sustained_CDI_dist = pd.DataFrame(
    euclidean_distances(Sustained_CDI[['SCDI score']]),
    index=Sustained_CDI['Country'],
    columns=Sustained_CDI['Country'],
)
Sustained_CDI_dist.to_csv('rdm/checked/Sustained_CDI_dist.csv')

# Regions of interest that are absent from the sheet.
set(region_list) - set(SCDI.Country)

{'Hong Kong'}