# **Cleaning master data**

Here I create the master dataset using the World Bank API, IMF World Economic Outlook (WEO) data, Economic Complexity data, and V-Dem democracy indices.

In [1]:
import wbgapi as wb # World bank
import pandas as pd
from weo import download, WEO # IMF data
import requests, io
import rdata   # or pyreadr
import os

### Variable selection

In [None]:

# World Bank indicator codes, keyed to their official descriptions.
# Insertion order is the download order.
# Not requested here: NY.GDP.PCAP.CD (GDP per capita, current US$ - taken from the
# IMF request instead) and NY.GNP.ATLS.CD (GNI, Atlas method, current US$).
_wb_indicator_labels = {
    # Resource intensity
    'NY.GDP.TOTL.RT.ZS': 'Total natural resources rents (% of GDP)',
    # Sector structure
    'NV.IND.MANF.ZS': 'Manufacturing, value added (% of GDP)',
    'NV.IND.TOTL.ZS': 'Industry (including construction), value added (% of GDP)',
    'TX.VAL.TECH.MF.ZS': 'High-technology exports (% of manufactured exports)',
    'NV.AGR.TOTL.ZS': 'Agriculture, forestry, and fishing, value added (% of GDP)',
    'NV.SRV.TOTL.ZS': 'Services, value added (% of GDP)',
    # Rents by resource type
    'NY.GDP.MINR.RT.ZS': 'Mineral rents (% of GDP)',
    'NY.GDP.NGAS.RT.ZS': 'Natural gas rents (% of GDP)',
    'NY.GDP.PETR.RT.ZS': 'Oil rents (% of GDP)',
    # Adjusted savings
    'NY.ADJ.SVNG.CD': 'Adjusted savings: total (current US$)',
    'NY.ADJ.DRES.GN.ZS': 'Adjusted savings: natural resources depletion (% of GNI)',
    # CPIA ratings
    'IQ.CPA.HRES.XQ': 'CPIA building human resources rating (1=low to 6=high)',
    'IQ.CPA.FINQ.XQ': 'CPIA quality of budgetary and financial management rating (1=low to 6=high)',
    'IQ.CPA.TRAN.XQ': 'CPIA transparency, accountability and corruption in the public sector rating (1=low to 6=high)',
    # IMF credit
    'DT.DOD.DIMF.CD': 'Use of IMF credit (DOD, current US$)',
}

# Plain list of codes, in the order declared above.
wb_variables = list(_wb_indicator_labels)

# IMF WEO series, each identified by a (subject, unit) pair.
_imf_gdp_per_capita = (
    "Gross domestic product per capita, constant prices",
    "Purchasing power parity; 2017 international dollar",
)
_imf_government_revenue = ("General government revenue", "Percent of GDP")

imf_variables = [_imf_gdp_per_capita, _imf_government_revenue]

# Economic Complexity: the single column kept from the ECI rankings file.
eci_variables = ['eci']

# V-Dem high-level democracy indices, keyed to their descriptions.
# Mid-level component indices are not selected.
_vdem_index_labels = {
    'v2x_polyarchy': 'Electoral democracy index',
    'v2x_libdem': 'Liberal democracy index',
    'v2x_partipdem': 'Participatory democracy index',
    'v2x_delibdem': 'Deliberative democracy index',
    'v2x_egaldem': 'Egalitarian democracy index',
}

# Plain list of column names, in the order declared above.
vdem_variables = list(_vdem_index_labels)

# Penn World Table: identifier columns followed by the selected series.
# Every entry needs its own trailing comma: adjacent string literals without a
# comma are silently concatenated by Python into a single element.
pwt_variables = [
    "country",
    "countrycode",
    "year",
    'hc',     # Human capital index
    'cn',     # Capital stock (national accounts prices)
    'ctfp',   # TFP level (constant national prices)
    'cwtfp',  # Welfare-relevant TFP
]


### World Bank API

In [None]:
def download_wb_indicators(indicators, start_year, end_year):
    """Download World Bank indicators for every non-aggregate economy, in long format.

    Parameters
    ----------
    indicators : iterable of str
        World Bank indicator codes; each one is fetched with its own request.
    start_year, end_year : int
        First and last year to request (both inclusive).

    Returns
    -------
    pandas.DataFrame
        One row per (economy, year, indicator) observation that has a value,
        with columns "Country Code", "Year", "Variable" and "Value". The
        columns are present even when no observation was returned.
    """
    columns = ["Country Code", "Year", "Variable", "Value"]
    final_rows = []

    # Keep only real economies; entries flagged as aggregates are skipped.
    economies = [c['id'] for c in wb.economy.list() if not c.get("aggregate", False)]
    years = range(start_year, end_year + 1)

    for indicator in indicators:
        print(f"Downloading {indicator} ...")

        for row in wb.data.fetch(indicator, economy=economies, time=years):
            iso = row.get("economy")
            value = row.get("value")

            # Skip incomplete observations before touching the time field.
            if iso is None or value is None:
                continue

            final_rows.append({
                "Country Code": iso,
                # Time labels arrive with a "YR" prefix (e.g. "YR2020").
                "Year": int(str(row.get("time")).replace("YR", "")),
                "Variable": indicator,
                "Value": value
            })

    # Passing `columns` keeps the schema stable for an empty download.
    return pd.DataFrame(final_rows, columns=columns)

# Fetch every selected World Bank indicator for 1990 through 2024.
wb_df = download_wb_indicators(
    indicators=wb_variables,
    start_year=1990,
    end_year=2024,
)

In [None]:
# Full World Bank economy listing (countries and aggregates together).
all_economies = wb.economy.list()

# Drop every entry flagged as an aggregate, leaving only real countries.
countries = [econ for econ in all_economies if not econ.get("aggregate", False)]

# Code -> name lookup table, one row per country.
country_names = pd.DataFrame(
    [(econ["id"], econ["value"]) for econ in countries],
    columns=["Country Code", "Country Name"],
)

country_names

Unnamed: 0,Country Code,Country Name
0,ABW,Aruba
1,AFG,Afghanistan
2,AGO,Angola
3,ALB,Albania
4,AND,Andorra
...,...,...
212,XKX,Kosovo
213,YEM,"Yemen, Rep."
214,ZAF,South Africa
215,ZMB,Zambia


#### IMF

In [None]:
# Download the WEO vintage and open it; `download` returns the local file path first.
path, _ = download(2024, "Apr")  # or adjust to whichever vintage you want
w = WEO(path)

# Reshape each (subject, unit) series into long format.
# After `reset_index()` the former index lands in a column named "index". That
# column holds the years, and the remaining column labels are the country codes,
# so they are named accordingly here instead of being mislabelled and swapped later.
frames = []
for subj, unit in imf_variables:
    weo_wide = w.get(subj, unit).reset_index().rename(columns={"index": "Year"})
    weo_long = weo_wide.melt(id_vars="Year", var_name="Country Code", value_name="Value")
    weo_long["Variable"] = subj
    frames.append(weo_long)

imf_df = pd.concat(frames, ignore_index=True)
print(imf_df.head())

# Remove the downloaded WEO file from the code folder. Using the returned path
# keeps this in sync with the vintage requested above.
os.remove(path)

imf_df

Already downloaded 2024-Apr WEO dataset at weo_2024_1.csv


WEO_ParsingError: Subject must be one of 
Gross domestic product, constant prices, Gross domestic product, current prices, Gross domestic product, deflator, Gross domestic product per capita, constant prices, Gross domestic product per capita, current prices, Output gap in percent of potential GDP, Gross domestic product based on purchasing-power-parity (PPP) share of world total, Implied PPP conversion rate, Total investment, Gross national savings, Inflation, average consumer prices, Inflation, end of period consumer prices, Volume of imports of goods and services, Volume of Imports of goods, Volume of exports of goods and services, Volume of exports of goods, Unemployment rate, Employment, Population, General government revenue, General government total expenditure, General government net lending/borrowing, General government structural balance, General government primary net lending/borrowing, General government net debt, General government gross debt, Gross domestic product corresponding to fiscal year, current prices, Current account balance
Provided subject: Gross fixed capital formation, General government, Constant prices

#### Economic Complexity

In [None]:
# ECI rankings, read from the raw CSV on GitHub.
# Note: If error, update the raw link in here: https://github.com/AyaanTigdikar/Capstone/blob/main/rawdata/growth_proj_eci_rankings.csv
_eci_source = pd.read_csv('https://raw.githubusercontent.com/AyaanTigdikar/Capstone/refs/heads/main/rawdata/growth_proj_eci_rankings.csv')
_eci_source = _eci_source.rename(columns={'country_iso3_code': 'country_code', 'eci_hs92': 'eci'})
_eci_source = _eci_source.drop(columns=['eci_rank_hs92'])

# Country-to-group membership table; only the continent rows are needed.
continent_labels = pd.read_csv('../../data/location_group_member.csv')
_is_continent = continent_labels['group_type'] == 'continent'
_continent_lookup = continent_labels.loc[_is_continent, ['group_name', 'country_id']]

# One row per country code, with its continent attached.
eci_clean = _eci_source.merge(_continent_lookup, on='country_id', how='left')
eci_clean = eci_clean.rename(columns={'group_name': 'continent'})
eci_clean = eci_clean.drop_duplicates(subset='country_code', keep='first')

# Long-format ECI table with the column layout shared by the other sources.
eci_df = (
    _eci_source
      .assign(Variable='Economic Complexity')
      .rename(columns={
          'country_code': 'Country Code',
          'year': 'Year',
          'eci': 'Value'
      })
      .loc[:, ['Country Code', 'Year', 'Variable' , 'Value']]
)

### V-Dem (Varieties of Democracy)

In [None]:
# 1. Download the RData file from the GitHub "raw" URL
url = ("https://raw.githubusercontent.com/vdeminstitute/vdemdata/master/data/vdem.RData")
resp = requests.get(url, timeout=120)  # fail instead of hanging if the server stalls
resp.raise_for_status()  # check download succeeded

# 2. Write the payload to a scratch file, parse it, and always clean up,
#    even if parsing raises.
try:
    with open("vdem.RData", "wb") as f:
        f.write(resp.content)

    # Load the .RData file
    vdem_r = rdata.read_rda("vdem.RData")
finally:
    # Remove the RData file from the code folder
    if os.path.exists("vdem.RData"):
        os.remove("vdem.RData")

# 3. Pull out the "vdem" object; indexing raises a KeyError straight away if it is missing.
vdem = vdem_r["vdem"]

# 4. Keep the identifier columns plus the selected indices
var_list = ['country_name', 'country_text_id', 'year'] + vdem_variables

vdem = vdem[var_list]

vdem = vdem.rename(columns={
                   'country_name': 'Country Name',
                   'country_text_id': 'Country Code',
                   'year': 'Year',
})

# 5. Reshape to long format. `value_vars` reuses `vdem_variables` so the
#    selection is declared in one place only.
vdem_df = pd.melt(
    vdem,
    id_vars=['Country Code','Year'],
    value_vars=vdem_variables,
    var_name='Variable',
    value_name='Value'
)

# 6. Keep observations from 1990 onwards
vdem_df = vdem_df[vdem_df['Year'] >= 1990]



### Penn World Table

In [24]:
# NOTE(review): this section is labelled Penn World Table, but the file name in
# the URL below reads like an IMF FAD ICSD export - confirm which source is intended.
_pwt_source_url = 'https://raw.githubusercontent.com/AyaanTigdikar/Capstone/refs/heads/main/rawdata/dataset_2025-12-23T17_49_47.423426845Z_DEFAULT_INTEGRATION_IMF.FAD_ICSD_1.0.0.csv'

# Metadata columns that are dropped from the table.
_pwt_metadata_columns = ['DATASET', 'SERIES_CODE', 'OBS_MEASURE','FREQUENCY','SCALE']

pwt_df = pd.read_csv(_pwt_source_url)
pwt_df = pwt_df.rename(columns={'COUNTRY': 'Country Name', 'INDICATOR': 'Variable'})
pwt_df = pwt_df.drop(columns=_pwt_metadata_columns)

pwt_df


Unnamed: 0,Country Name,Variable,1960,1961,1962,1963,1964,1965,1966,1967,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Honduras,"Gross fixed capital formation, General governm...",12.756312,13.026658,13.023451,13.616134,13.546257,12.574494,12.517730,12.507399,...,3.243165,3.278092,2.863663,3.710152,2.910118,2.557573,3.082348,3.801338,3.696413,3.400802
1,"Gambia, The","Gross fixed capital formation, Private sector,...",2.465086,2.721014,2.778718,2.856070,2.668935,2.518140,2.587320,2.690813,...,3.380725,3.670095,3.830114,4.440723,4.654298,6.195676,7.863171,5.505053,5.919426,6.300903
2,"Mauritania, Islamic Republic of","Gross fixed capital formation, Public private ...",,,,,,,,,...,0.000000,0.000000,0.194247,0.182304,0.169919,0.173352,0.183410,0.000000,0.000000,0.434214
3,Argentina,"Gross fixed capital formation, General governm...",1.306319,1.315757,1.359688,1.467980,1.416598,1.349154,1.399816,1.416332,...,2.382360,2.495679,2.254370,2.513430,2.541412,2.551094,2.504813,2.666863,2.440963,1.864936
4,Ireland,"Gross fixed capital formation, Private sector,...",12.740883,14.184085,15.815464,16.895134,17.849979,19.269285,18.540708,18.577796,...,14.185109,14.693995,17.678960,16.889427,18.366733,22.508676,33.560451,31.582582,26.502588,45.153927
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483,Cameroon,"Gross fixed capital formation, Private sector,...",4.713765,4.832491,4.881736,4.918539,4.932433,4.902847,4.858413,5.682614,...,12.486059,12.291298,12.094671,11.575095,11.904741,12.144837,11.053298,10.800065,12.422193,12.522708
484,St. Lucia,"Gross fixed capital formation, Private sector,...",,,,,,,,,...,19.734484,19.774293,19.124330,18.174141,14.945514,16.027401,17.708351,17.805430,18.235845,18.666254
485,"Egypt, Arab Republic of","Gross fixed capital formation, Private sector,...",0.565651,0.560639,0.571490,0.575546,0.509820,0.504807,0.526585,0.552463,...,5.549182,5.093437,4.729500,4.051543,3.764704,3.766882,3.820166,3.804432,3.483659,4.334251
486,Kenya,"Gross fixed capital formation, General governm...",3.631978,3.917251,3.763381,3.751719,3.682317,3.652224,3.394409,3.386676,...,4.208784,4.010175,4.372360,4.463024,3.938021,4.851337,6.456296,5.833935,5.369810,5.519358


## Merging and final cleaning

In [None]:
# Raw series identifiers -> readable variable labels.
rename_map = {
    # World Bank: sector structure
    "NV.IND.MANF.ZS": "Manufacturing",
    "NV.IND.TOTL.ZS": "Industry",
    "TX.VAL.TECH.MF.ZS": "High-tech exports",
    "NV.AGR.TOTL.ZS": "Agriculture",
    "NV.SRV.TOTL.ZS": "Services",
    # World Bank: resource rents
    'NY.GDP.TOTL.RT.ZS': 'Total natural resources rents (% of GDP)',
    'NY.GDP.MINR.RT.ZS': 'Mineral rents (% of GDP)',
    'NY.GDP.NGAS.RT.ZS': 'Natural gas rents (% of GDP)',
    'NY.GDP.PETR.RT.ZS': 'Oil rents (% of GDP)',
    # World Bank: adjusted savings, CPIA ratings, IMF credit
    'NY.ADJ.SVNG.CD': 'Adjusted savings: total (current US$)',
    'NY.ADJ.DRES.GN.ZS': 'Adjusted savings: natural resources depletion (% of GNI)',
    'IQ.CPA.HRES.XQ': 'CPIA building human resources rating (1=low to 6=high)',
    'IQ.CPA.FINQ.XQ': 'CPIA quality of budgetary and financial management rating (1=low to 6=high)',
    'IQ.CPA.TRAN.XQ': 'CPIA transparency, accountability and corruption in the public sector rating (1=low to 6=high)',
    'DT.DOD.DIMF.CD': 'Use of IMF credit (DOD, current US$)',
    # Economic Complexity and IMF
    "Economic Complexity": "Economic Complexity Index",
    "Gross domestic product per capita, constant prices": "GDP per capita (constant prices, PPP)",
    "General government revenue": "Government revenue",
    # V-Dem democracy indices
    'v2x_polyarchy': 'electoral_dem',
    'v2x_libdem': 'liberal_dem',
    'v2x_partipdem': 'participatory_dem',
    'v2x_delibdem': 'deliberative_dem',
    'v2x_egaldem': 'egalitarian_dem',
}


def _year_as_int(value):
    """Return the calendar year of a date-like value, or cast a plain number to int."""
    if hasattr(value, 'year'):
        return value.year
    return int(value)


# 1. Stack the four long-format sources on top of each other.
_stacked = pd.concat([wb_df, eci_df, imf_df, vdem_df])

# 2. Swap raw identifiers for readable labels.
_labelled = _stacked.assign(Variable=_stacked["Variable"].replace(rename_map))

# 3. Attach country names (codes missing from the lookup keep a missing name).
_named = _labelled.merge(country_names, how='left', on='Country Code')

# 4. Normalise Year to a plain integer, then keep 1990-2024 (bounds inclusive).
_named['Year'] = _named['Year'].map(_year_as_int)
final_df = _named[_named['Year'].between(1990, 2024)]


In [None]:
# Display the merged long-format table.
final_df

Unnamed: 0,Country Code,Year,Variable,Value,Country Name
0,ZWE,2021,NY.GDP.TOTL.RT.ZS,6.398452,Zimbabwe
1,ZWE,2020,NY.GDP.TOTL.RT.ZS,4.746668,Zimbabwe
2,ZWE,2019,NY.GDP.TOTL.RT.ZS,4.715765,Zimbabwe
3,ZWE,2018,NY.GDP.TOTL.RT.ZS,3.378189,Zimbabwe
4,ZWE,2017,NY.GDP.TOTL.RT.ZS,6.095448,Zimbabwe
...,...,...,...,...,...
116888,ZZB,2020,egalitarian_dem,0.249000,
116889,ZZB,2021,egalitarian_dem,0.256000,
116890,ZZB,2022,egalitarian_dem,0.265000,
116891,ZZB,2023,egalitarian_dem,0.262000,


#### To wide

In [None]:
# Reshape long -> wide: one row per (country, year), one column per variable.
_pivot_spec = {
    "index": ['Country Code', 'Year'],
    "columns": 'Variable',
    "values": 'Value',
}
_wide_indexed = final_df.pivot(**_pivot_spec)

# Bring the (Country Code, Year) keys back as ordinary columns.
final_df_wide = _wide_indexed.reset_index()

In [None]:
# Display the wide-format table.
final_df_wide

Variable,Country Code,Year,Adjusted savings: natural resources depletion (% of GNI),Adjusted savings: total (current US$),Agriculture,Economic Complexity Index,"GDP per capita (constant prices, PPP)",Government revenue,High-tech exports,Industry,...,Mineral rents (% of GDP),Natural gas rents (% of GDP),Oil rents (% of GDP),Services,Total natural resources rents (% of GDP),deliberative_dem,egalitarian_dem,electoral_dem,liberal_dem,participatory_dem
0,ABW,1990,0.001568,,,,32967.377,,,,...,0.0,0.0,0.0,,0.001552,,,,,
1,ABW,1991,0.001649,,,,34292.324,,,,...,0.0,0.0,0.0,,0.001634,,,,,
2,ABW,1992,0.001393,,,,35073.111,,,,...,0.0,0.0,0.0,,0.001379,,,,,
3,ABW,1993,0.000941,,,,35244.590,,,,...,0.0,0.0,0.0,,0.000925,,,,,
4,ABW,1994,0.000953,,,,36215.205,,,,...,0.0,0.0,0.0,,0.000937,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7512,ZZB,2020,,,,,,,,,...,,,,,,0.263,0.249,0.271,0.219,0.174
7513,ZZB,2021,,,,,,,,,...,,,,,,0.273,0.256,0.285,0.232,0.180
7514,ZZB,2022,,,,,,,,,...,,,,,,0.278,0.265,0.294,0.240,0.182
7515,ZZB,2023,,,,,,,,,...,,,,,,0.281,0.262,0.298,0.242,0.186


## Saving data

In [None]:
# Write both layouts of the master dataset to the working-data folder.
_outputs = (
    (final_df, '../workingdata/master_data_long.csv'),
    (final_df_wide, '../workingdata/master_data_wide.csv'),
)
for _frame, _target in _outputs:
    _frame.to_csv(_target)