In [16]:
%pip install thefuzz
%pip install python-levenshtein
%pip install vega-datasets

# IMPORTANT always restart from here!
import pandas as pd 
import numpy as np
import altair as alt
from thefuzz import fuzz

# parameters

countriesOfInterest = ['United States of America', 'Germany', 'Russian Federation', 'Belarus', 'China', 'Colombia', 'Brazil', 'Algeria', 'Egypt', 'South Africa']

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Collecting vega-datasets
  Using cached vega_datasets-0.9.0-py3-none-any.whl (210 kB)
Installing collected packages: vega-datasets
Successfully installed vega-datasets-0.9.0
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
# I. Prepare CPI dataset

cpi = pd.read_csv('../0_datasets/cpi.csv', sep=',', na_values=['-'])
cpi.head()

Unnamed: 0,Jurisdiction,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
0,Afghanistan,,,,,,,,2.5,,1.8,1.5,1.3,1.4,1.52,8.0,8.0,12.0,11.0
1,Albania,,2.3,,,2.5,2.5,2.5,2.4,2.6,2.9,3.4,3.2,3.3,3.05,33.0,31.0,33.0,36.0
2,Algeria,,,,,,2.6,2.7,2.8,3.1,3.0,3.2,2.8,2.9,2.9,34.0,36.0,36.0,36.0
3,Angola,,,1.7,,1.7,1.8,2.0,2.0,2.2,2.2,1.9,1.9,1.9,2.01,22.0,23.0,19.0,15.0
4,Argentina,3.0,3.0,3.5,3.5,2.8,2.5,2.5,2.8,2.9,2.9,2.9,2.9,2.9,3.0,35.0,34.0,34.0,32.0


In [92]:
# A) Normalize values between (1..0), rename 'Jurisdiction' to 'Country'
# From 2012 onwards, the methology changed (scala went from 10-0 to 100-0)
years100 = ['2012', '2013', '2014', '2015']
cpi_norm = cpi.drop('Jurisdiction', axis=1)
cpi_norm[years100] = cpi_norm[years100].applymap(lambda v: v / 10.0)
cpi_norm /= 10.0
cpi_norm['Country'] = cpi['Jurisdiction']

# B) Long Format
# Long format is more desireable, as we want to join it with WVS later on
cpi_norm = cpi_norm.melt(id_vars=['Country'], var_name='Year', value_name='Value')
cpi_norm['Country'] = cpi_norm['Country'].replace({'USA': 'United States of America', 'Russia': 'Russian Federation'})

# C) Filter countries and invert values from (1..0) to (0..1) [1 is best]
#cpi_norm = cpi_norm.loc[cpi_norm['Country'].isin(countriesOfInterest)]
cpi_norm['Value'] = 1 - cpi_norm['Value']
 
# D) add constant Variables column
cpi_norm['Variable'] = 'Corruption Perception Index'

cpi_norm

Unnamed: 0,Country,Year,Value,Variable
0,Afghanistan,1998,,Corruption Perception Index
1,Albania,1998,,Corruption Perception Index
2,Algeria,1998,,Corruption Perception Index
3,Angola,1998,,Corruption Perception Index
4,Argentina,1998,0.70,Corruption Perception Index
...,...,...,...,...
4081,Viet Nam,2015,,Corruption Perception Index
4082,Yemen,2015,0.82,Corruption Perception Index
4083,Yugoslavia,2015,,Corruption Perception Index
4084,Zambia,2015,0.62,Corruption Perception Index


In [4]:
# II. Prepare WVS-Variables dataset

wvs_vars = pd.read_csv('../0_datasets/wvs-variables.csv')
wvs_vars.head()

Unnamed: 0,Variable,Title,WVS7,WVS6,WVS5,WVS4,WVS3,WVS2,WVS1
0,A001,Important in life: Family,Q1,V4,V4,V4,V4,V5,
1,A002,Important in life: Friends,Q2,V5,V5,V5,V5,V6,
2,A003,Important in life: Leisure time,Q3,V6,V6,V6,V6,V7,
3,A004,Important in life: Politics,Q4,V7,V7,V7,V7,V8,
4,A005,Important in life: Work,Q5,V8,V8,V8,V8,V4,


In [5]:
# A) Remove unwanted prefixes 
# We want to use the Title as human-readable labels later on
wvs_vars['Title'].replace(r'[^ ]+-\ ', '', regex=True, inplace=True)

# B) Drop unneeded internal variables
wvs_vars = wvs_vars.drop(wvs_vars.loc[:, 'WVS7':'WVS1'], axis=1)

# C) Set the variable name as key
wvs_vars.set_index('Variable', inplace=True)
wvs_vars.head()

Unnamed: 0_level_0,Title
Variable,Unnamed: 1_level_1
A001,Important in life: Family
A002,Important in life: Friends
A003,Important in life: Leisure time
A004,Important in life: Politics
A005,Important in life: Work


In [6]:
# D) some exploration: grouping variable names by levensthein distance

# code übernommen von https://stackoverflow.com/questions/35171710/how-to-group-words-whose-levenshtein-distance-is-more-than-80-percent-in-python

combined_list = wvs_vars['Title'].tolist()

grs = list() # groups of names with distance > 80
for name in combined_list:
    for g in grs:
        if all(fuzz.partial_ratio(name, w) > 55 for w in g):
            g.append(name)
            break
    else:
        grs.append([name, ])

for group in grs:
    print('-')
    for e in group:
        print('    ', wvs_vars.loc[wvs_vars['Title'] == e].index[0], e)

-
     A001 Important in life: Family
     A002 Important in life: Friends
     A003 Important in life: Leisure time
     A004 Important in life: Politics
     A005 Important in life: Work
     A006 Important in life: Religion
     A029 Important child qualities: independence
     A030 Important child qualities: Hard work
     A032 Important child qualities: feeling of responsibility
     A034 Important child qualities: imagination
     A035 Important child qualities: tolerance and respect for other people
     A038 Important child qualities: thrift saving money and things
     A039 Important child qualities: determination perseverance
     A040 Important child qualities: religious faith
     A041 Important child qualities: unselfishness
     A042 Important child qualities: obedience
     A043B Important child qualities: Self-expression
-
     A008 Feeling of happiness
     E036 Private vs state ownership of business
-
     A009 State of health (subjective)
     X045 Social class (subj

     F128 Justifiable: Adultery
     F129 Justifiable: Throwing away litter
     F132 Justifiable: Having casual sex
     F136 Justifiable: Political assassination
     F144_02 Justifiable: Death penalty
     F199 Justifiable: For a man to beat his wife
-
     F001 Thinking about meaning and purpose of life
-
     F004 Life is meaningful because God exits
     F007 Death has meaning if you believe in God
-
     F005 Try to get the best out of life
     F036 Churches give answers: the problems of family life
-
     F006 Death is inevitable
     F008 Death is a natural resting point
-
     F022 Statement: good and evil
     F048 Churches speak out on: ecology and environmental issues
-
     F024 Belong to religious denomination
     F025 Religious denominations - major groups
     F025_WVS Religious denomination - detailed list
     F027 Which former religious denomination
-
     F028 How often do you attend religious services
     F029 Raised religiously
     F031 Important: Religious s

In [7]:
# III. Prepare WVS dataset

# A) read country, year and selected variables in chunks

wvs_keepVars = ['A001', 'A002', 'A003', 'A004', 'A005', 'A006', 'A008', 'E114', 'E117']

wvs_countryVar = 'S003'
wvs_yearVar = 'S020'
wvs_countryYear = pd.DataFrame(columns = [wvs_countryVar, wvs_yearVar]).astype({(wvs_yearVar): np.uint16})
wvs_varData = pd.DataFrame(columns = wvs_keepVars).astype(np.float16)

chunksize = 2000
with pd.read_csv('../0_datasets/wvs.csv', sep=';', na_values=['', ' '], chunksize=chunksize) as reader:
    for chunk in reader:
        wvs_countryYear = pd.concat([wvs_countryYear, chunk[[wvs_countryVar, wvs_yearVar]]]).astype({(wvs_yearVar): np.uint16})
        wvs_varData = pd.concat([wvs_varData, chunk[wvs_keepVars]]).astype(np.float16)

wvs_varData

Unnamed: 0,A001,A002,A003,A004,A005,A006,A008,E114,E117
0,1.0,2.0,2.0,4.0,2.0,2.0,3.0,3.0,1.0
1,1.0,2.0,2.0,3.0,2.0,2.0,2.0,3.0,2.0
2,1.0,2.0,2.0,4.0,1.0,1.0,3.0,2.0,1.0
3,1.0,2.0,2.0,3.0,2.0,2.0,3.0,2.0,1.0
4,1.0,2.0,2.0,4.0,1.0,1.0,3.0,-2.0,1.0
...,...,...,...,...,...,...,...,...,...
440050,1.0,3.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0
440051,1.0,1.0,2.0,1.0,1.0,1.0,2.0,4.0,1.0
440052,1.0,3.0,2.0,2.0,2.0,2.0,3.0,3.0,1.0
440053,1.0,2.0,2.0,1.0,1.0,1.0,4.0,1.0,2.0


In [8]:
# B) normalize data by min-max normalization

wvs_varData_norm = wvs_varData.copy()
wvs_varData_norm[wvs_varData.lt(0)] = np.nan # filter out (negative) marker values

for col in wvs_varData.columns:
    wvs_varData_norm[col] = (wvs_varData_norm[col] - wvs_varData_norm[col].min()) / (wvs_varData_norm[col].max() - wvs_varData_norm[col].min())
    wvs_varData_norm[col] = 1 - wvs_varData_norm[col]
    
# C) concat country+year+variable columns, also replace variable titles
wvs = pd.concat([wvs_countryYear, wvs_varData_norm], axis=1)

wvs.rename(columns=wvs_vars.loc[wvs.columns]['Title'], inplace=True)
wvs.rename(columns={'ISO 3166-1 numeric country code': 'Country', 'Year survey': 'Year'}, inplace=True)

wvs

Unnamed: 0,Country,Year,Important in life: Family,Important in life: Friends,Important in life: Leisure time,Important in life: Politics,Important in life: Work,Important in life: Religion,Feeling of happiness,Political system: Having a strong leader,Political system: Having a democratic political system
0,8,1998,1.0,0.666992,0.666992,0.000000,0.666992,0.666992,0.333496,0.333496,1.000000
1,8,1998,1.0,0.666992,0.666992,0.333496,0.666992,0.666992,0.666992,0.333496,0.666992
2,8,1998,1.0,0.666992,0.666992,0.000000,1.000000,1.000000,0.333496,0.666992,1.000000
3,8,1998,1.0,0.666992,0.666992,0.333496,0.666992,0.666992,0.333496,0.666992,1.000000
4,8,1998,1.0,0.666992,0.666992,0.000000,1.000000,1.000000,0.333496,,1.000000
...,...,...,...,...,...,...,...,...,...,...,...
440050,716,2020,1.0,0.333496,1.000000,1.000000,1.000000,0.666992,0.666992,1.000000,1.000000
440051,716,2020,1.0,1.000000,0.666992,1.000000,1.000000,1.000000,0.666992,0.000000,1.000000
440052,716,2020,1.0,0.333496,0.666992,0.666992,0.666992,0.666992,0.333496,0.333496,1.000000
440053,716,2020,1.0,0.666992,0.666992,1.000000,1.000000,1.000000,0.000000,1.000000,0.666992


In [9]:
# D) convert from wide into long format

wvs = wvs.melt(id_vars=['Year', 'Country'], var_name='Variable', value_name='Value')

wvs.head()

Unnamed: 0,Year,Country,Variable,Value
0,1998,8,Important in life: Family,1.0
1,1998,8,Important in life: Family,1.0
2,1998,8,Important in life: Family,1.0
3,1998,8,Important in life: Family,1.0
4,1998,8,Important in life: Family,1.0


In [10]:
# E) summarize data by taking arithmetic mean

wvsSummary = wvs.groupby(['Year', 'Country', 'Variable']).mean().reset_index()
wvs = None # free memory
wvsSummary.head()

Unnamed: 0,Year,Country,Variable,Value
0,1981,36,Feeling of happiness,0.768555
1,1981,36,Important in life: Family,
2,1981,36,Important in life: Friends,
3,1981,36,Important in life: Leisure time,
4,1981,36,Important in life: Politics,


In [11]:
# IV. Prepare country-codes.csv

# A) read dataset
countryCodes = pd.read_csv('../0_datasets/country-codes.csv')
countryCodes.head()

Unnamed: 0,name,alpha-2,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,,142.0,34.0,
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX,Europe,Northern Europe,,150.0,154.0,
2,Albania,AL,ALB,8,ISO 3166-2:AL,Europe,Southern Europe,,150.0,39.0,
3,Algeria,DZ,DZA,12,ISO 3166-2:DZ,Africa,Northern Africa,,2.0,15.0,
4,American Samoa,AS,ASM,16,ISO 3166-2:AS,Oceania,Polynesia,,9.0,61.0,


In [12]:
# B) set iso 3166-2 code as index to country name

countryCodes = countryCodes[['name', 'country-code']].set_index('country-code')
countryCodes.head()

Unnamed: 0_level_0,name
country-code,Unnamed: 1_level_1
4,Afghanistan
248,Åland Islands
8,Albania
12,Algeria
16,American Samoa


In [13]:
# C) replace country codes with country names in WVS Summary

wvsSummaryCC = wvsSummary.copy()
wvsSummaryCC['Country'] = countryCodes.loc[wvsSummary['Country']]['name'].values
wvsSummaryCC = wvsSummaryCC.loc[wvsSummaryCC['Country'].isin(countriesOfInterest)]
wvsSummaryCC = pd.concat([wvsSummaryCC, cpi_norm], axis=0)
wvsSummaryCC

Unnamed: 0,Year,Country,Variable,Value
63,1982,South Africa,Feeling of happiness,0.682617
64,1982,South Africa,Important in life: Family,
65,1982,South Africa,Important in life: Friends,
66,1982,South Africa,Important in life: Leisure time,
67,1982,South Africa,Important in life: Politics,
...,...,...,...,...
3928,2015,Egypt,Corruption Perception Index,0.360000
3941,2015,Germany,Corruption Perception Index,0.810000
4024,2015,Russian Federation,Corruption Perception Index,0.290000
4044,2015,South Africa,Corruption Perception Index,0.440000


In [14]:
# V. showcase

alt.data_transformers.enable('default', max_rows=None)

nearest = alt.selection(type='single', nearest=True, on='mouseover',
                            fields=['Country', 'Value', 'Variable', 'Year'])

alt.Chart(wvsSummaryCC).mark_line().encode(
    alt.X('Year:O'),
    alt.Y('Value:Q', axis=alt.Axis(labelAngle=-45)),
    #alt.Color('Variable:N'),
    alt.Color('Country:N'),
    alt.Tooltip(['Country', 'Value', 'Variable', 'Year'])
).facet(
    #column='Country:N'
    facet='Variable:N',
    columns=2
).add_selection(
    nearest
).resolve_scale(x='independent', y='independent')

# TODO nicer tooltips:
# https://stackoverflow.com/questions/53287928/tooltips-in-altair-line-charts

In [94]:
cpi2001 = cpi_norm.loc[cpi_norm['Year'] == '2001']
cpi2001

countryCodes_noindex = countryCodes.reset_index().rename(columns={'name': 'Country'})

lookupData = pd.merge(cpi2001, countryCodes_noindex, on='Country')
lookupData

Unnamed: 0,Country,Year,Value,Variable,country-code
0,Afghanistan,2001,,Corruption Perception Index,4
1,Albania,2001,,Corruption Perception Index,8
2,Algeria,2001,,Corruption Perception Index,12
3,Angola,2001,,Corruption Perception Index,24
4,Argentina,2001,0.65,Corruption Perception Index,32
...,...,...,...,...,...
162,Vanuatu,2001,,Corruption Perception Index,548
163,Viet Nam,2001,,Corruption Perception Index,704
164,Yemen,2001,,Corruption Perception Index,887
165,Zambia,2001,0.74,Corruption Perception Index,894


In [95]:
# TODO how do we feed CPI values into this?
# https://altair-viz.github.io/gallery/choropleth.html
# https://groups.google.com/g/vega-js/c/JD3TjOlRbUQ

import altair as alt
from vega_datasets import data

#lookupData = wvsSummary.loc[wvsSummary['Variable'] == 'Feeling of happiness'].loc[wvsSummary['Year'] == 2001]
#lookupData = cpi_norm.loc[cpi_norm['Year'] == 2001]
source = alt.topo_feature(data.world_110m.url, 'countries')

alt.Chart(source).mark_geoshape(
    fill='#666666',
    stroke='white'
) + alt.Chart(source, title='Corruption Perception Worldwide').mark_geoshape(
    #fill='#666666',
    stroke='white'
).project(
    'naturalEarth1'
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(data=lookupData, key='country-code', fields=['Value'])
).encode(
    color=alt.Color('Value:Q',  scale=alt.Scale(scheme='reds'))
).properties(
    width=800,
    height=400
)

In [154]:
#happiness2001 = wvsSummaryCC.loc[wvsSummaryCC['Year'] == 2006]
happiness = wvsSummaryCC.loc[wvsSummaryCC['Variable'] == 'Feeling of happiness']

#happiness2001['country-code'] = happiness2001['country-code'].astype(np.int64)
#happiness2001['country-code']
lookupData = pd.merge(happiness, countryCodes_noindex, on='Country')
lookupData

Unnamed: 0,Year,Country,Variable,Value,country-code
0,1982,South Africa,Feeling of happiness,0.682617,710
1,1990,South Africa,Feeling of happiness,0.65625,710
2,1996,South Africa,Feeling of happiness,0.71875,710
3,2001,South Africa,Feeling of happiness,0.739258,710
4,2006,South Africa,Feeling of happiness,0.744141,710
5,2013,South Africa,Feeling of happiness,0.708984,710
6,1982,United States of America,Feeling of happiness,0.739258,840
7,1990,United States of America,Feeling of happiness,0.759766,840
8,1995,United States of America,Feeling of happiness,0.799316,840
9,1999,United States of America,Feeling of happiness,0.777344,840


In [156]:
source = alt.topo_feature(data.world_110m.url, 'countries')

alt.Chart(source).mark_geoshape(
    fill='#666666',
    stroke='white'
) + alt.Chart(source, title='Feeling of happiness').mark_geoshape(
    #fill='#666666',
    stroke='white'
).project(
    'naturalEarth1'
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(data=lookupData, key='country-code', fields=['Value'])
).encode(
    color=alt.Color('Value:Q',  scale=alt.Scale(scheme='blues'))
).properties(
    width=800,
    height=400
)

In [176]:
politics = wvsSummaryCC.loc[wvsSummaryCC['Variable'] == 'Political system: Having a democratic political system']
lookupData = pd.merge(politics, countryCodes_noindex, on='Country')
lookupData

Unnamed: 0,Year,Country,Variable,Value,country-code
0,1982,South Africa,Political system: Having a democratic politica...,,710
1,1990,South Africa,Political system: Having a democratic politica...,,710
2,1996,South Africa,Political system: Having a democratic politica...,0.787598,710
3,2001,South Africa,Political system: Having a democratic politica...,0.777344,710
4,2006,South Africa,Political system: Having a democratic politica...,0.797363,710
5,2013,South Africa,Political system: Having a democratic politica...,0.668457,710
6,1982,United States of America,Political system: Having a democratic politica...,,840
7,1990,United States of America,Political system: Having a democratic politica...,,840
8,1995,United States of America,Political system: Having a democratic politica...,0.809082,840
9,1999,United States of America,Political system: Having a democratic politica...,0.786621,840


In [178]:
source = alt.topo_feature(data.world_110m.url, 'countries')

alt.Chart(source).mark_geoshape(
    fill='#666666',
    stroke='white'
) + alt.Chart(source, title='Political system: Having a democratic political system').mark_geoshape(
    #fill='#666666',
    stroke='white'
).project(
    'naturalEarth1'
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(data=lookupData, key='country-code', fields=['Value'])
).encode(
    color=alt.Color('Value:Q',  scale=alt.Scale(scheme='greens'))
).properties(
    width=800,
    height=400
)