In [77]:
import pandas as pd
from pandas import DataFrame
from typing import Set, Any


def remove_others(df: DataFrame, columns: Set[Any]):
    cols_total: Set[Any] = set(df.columns)
    diff: Set[Any] = cols_total - columns
    df.drop(diff, axis=1, inplace=True)

# Opeing csv as a dataframe
df_GDP = pd.read_csv('Data/GDP.csv')
df_GDP
remove_others(df_GDP, {"Country Name", "Country Code", "2020"})
df_GDP = df_GDP.rename(columns={"2020": "GDP (current US$)"})
df_GDP


Unnamed: 0,Country Name,Country Code,GDP (current US$)
0,Aruba,ABW,2.496648e+09
1,Africa Eastern and Southern,AFE,9.216460e+11
2,Afghanistan,AFG,2.011614e+10
3,Africa Western and Central,AFW,7.844460e+11
4,Angola,AGO,5.361907e+10
...,...,...,...
261,Kosovo,XKX,7.716925e+09
262,"Yemen, Rep.",YEM,1.884051e+10
263,South Africa,ZAF,3.354420e+11
264,Zambia,ZMB,1.811063e+10


In [43]:
# Opening the HCI data as a dataframe

df_HCI = pd.read_csv('Data/HCI.csv')
df_HCI
remove_others(df_HCI, {"Country Name", "Country Code", "2020"})
df_HCI = df_HCI.rename(columns={"2020": "Human Capital Index (HCI)"})
df_HCI


Unnamed: 0,Country Name,Country Code,Human Capital Index (HCI)
0,Aruba,ABW,
1,Afghanistan,AFG,0.400284
2,Angola,AGO,0.362405
3,Albania,ALB,0.634251
4,Andorra,AND,
...,...,...,...
212,Kosovo,XKX,0.567332
213,"Yemen, Rep.",YEM,0.372784
214,South Africa,ZAF,0.425453
215,Zambia,ZMB,0.396928


In [44]:
# Opening the population csv file
df_Population = pd.read_csv('Data/Population.csv')
df_Population
remove_others(df_Population, {"Country Name", "Country Code", "2020"})
df_Population= df_Population.rename(columns={"2020": "Population"})
df_Population


Unnamed: 0,Country Name,Country Code,Population
0,Aruba,ABW,106766.0
1,Africa Eastern and Southern,AFE,677243299.0
2,Afghanistan,AFG,38928341.0
3,Africa Western and Central,AFW,458803476.0
4,Angola,AGO,32866268.0
...,...,...,...
261,Kosovo,XKX,1790133.0
262,"Yemen, Rep.",YEM,29825968.0
263,South Africa,ZAF,59308690.0
264,Zambia,ZMB,18383956.0


In [46]:
# Opeingn the tax reveneu as percentage of gdp csv and renaming the column

df_TaxRevenuePercentage = pd.read_csv('Data/Tax revenue percentage of GDP.csv')
df_TaxRevenuePercentage
remove_others(df_TaxRevenuePercentage, {"Country Name", "Country Code", "2020"})
df_TaxRevenuePercentage = df_TaxRevenuePercentage.rename(columns={"2020": "Tax revenue % of GDP"})
df_TaxRevenuePercentage


Unnamed: 0,Country Name,Country Code,Tax revenue % of GDP
0,Aruba,ABW,
1,Africa Eastern and Southern,AFE,
2,Afghanistan,AFG,
3,Africa Western and Central,AFW,
4,Angola,AGO,
...,...,...,...
261,Kosovo,XKX,
262,"Yemen, Rep.",YEM,
263,South Africa,ZAF,23.453792
264,Zambia,ZMB,16.442597


In [80]:
from functools import reduce

# Merging the dataframes togther by country code
dfs = [df_GDP,df_TaxRevenuePercentage, df_Population, df_HCI]
merged_df = reduce(lambda left, right: pd.merge(left, right, on=['Country Name', 'Country Code'], how='outer'), dfs)


In [81]:
merged_df
# Checking the dataframe

Unnamed: 0,Country Name,Country Code,GDP (current US$),Tax revenue % of GDP,Population,Human Capital Index (HCI)
0,Aruba,ABW,2.496648e+09,,106766.0,
1,Africa Eastern and Southern,AFE,9.216460e+11,,677243299.0,
2,Afghanistan,AFG,2.011614e+10,,38928341.0,0.400284
3,Africa Western and Central,AFW,7.844460e+11,,458803476.0,
4,Angola,AGO,5.361907e+10,,32866268.0,0.362405
...,...,...,...,...,...,...
263,South Africa,ZAF,3.354420e+11,23.453792,59308690.0,0.425453
264,Zambia,ZMB,1.811063e+10,16.442597,18383956.0,0.396928
265,Zimbabwe,ZWE,1.805117e+10,,14862927.0,0.466893
266,Czech Republic,CZE,,,,0.752257


In [82]:
df_clean = merged_df.dropna()
# Dropping rows with NaN values

In [83]:
df_clean = df_clean.reset_index(drop=True)
df_clean

Unnamed: 0,Country Name,Country Code,GDP (current US$),Tax revenue % of GDP,Population,Human Capital Index (HCI)
0,Albania,ALB,1.513187e+10,17.017907,2837849.0,0.634251
1,United Arab Emirates,ARE,3.588690e+11,0.664230,9890400.0,0.673012
2,Argentina,ARG,3.895910e+11,10.758498,45376763.0,0.602145
3,Armenia,ARM,1.264121e+10,21.861263,2963234.0,0.578999
4,Australia,AUS,1.327840e+12,22.596026,25693267.0,0.770240
...,...,...,...,...,...,...
106,Uzbekistan,UZB,5.989431e+10,14.867431,34232050.0,0.622806
107,Vanuatu,VUT,8.968279e+08,14.237408,307150.0,0.454659
108,Samoa,WSM,8.071475e+08,26.284608,198410.0,0.548422
109,South Africa,ZAF,3.354420e+11,23.453792,59308690.0,0.425453


In [102]:
# Calculating per capita data

df_clean['Total tax revenue per capita'] = df_clean['GDP (current US$)'] * df_clean['Tax revenue % of GDP'] / 100 / df_clean['Population']
df_clean['GDP per capita'] = df_clean['GDP (current US$)'] / df_clean['Population']
df_clean


Unnamed: 0,Country Name,Country Code,GDP (current US$),Tax revenue % of GDP,Population,Human Capital Index (HCI),Total tax revenue per capita,GDP per capita
0,Albania,ALB,1.513187e+10,17.017907,2837849.0,0.634251,907.422108,5332.160475
1,United Arab Emirates,ARE,3.588690e+11,0.664230,9890400.0,0.673012,241.013105,36284.578986
2,Argentina,ARG,3.895910e+11,10.758498,45376763.0,0.602145,923.691726,8585.693960
3,Armenia,ARM,1.264121e+10,21.861263,2963234.0,0.578999,932.605418,4266.018074
4,Australia,AUS,1.327840e+12,22.596026,25693267.0,0.770240,11677.731525,51680.465548
...,...,...,...,...,...,...,...,...
106,Uzbekistan,UZB,5.989431e+10,14.867431,34232050.0,0.622806,260.128880,1749.655815
107,Vanuatu,VUT,8.968279e+08,14.237408,307150.0,0.454659,415.709070,2919.836800
108,Samoa,WSM,8.071475e+08,26.284608,198410.0,0.548422,1069.278580,4068.078865
109,South Africa,ZAF,3.354420e+11,23.453792,59308690.0,0.425453,1326.515037,5655.865945


In [103]:
import json
# Exporting dataframe as a json.
exported_values = list(df_clean.T.to_dict().values())
open('World bank data.json' , 'w').write(json.dumps(exported_values))


31431