# **Cleaning master data**

Here I create the master dataset by combining data from the World Bank API, the IMF (WEO and ICSD), the Economic Complexity Index, V-Dem, the Penn World Table and CEPII.

In [32]:
import wbgapi as wb # World bank
import pandas as pd
from weo import download, WEO # IMF data
import requests, io
import rdata   # or pyreadr
import os
import re

### Variable selection

In [33]:

# World Bank
# Series codes requested one by one from the World Bank API by download_wb_indicators().
wb_variables = [
#    'NY.GDP.PCAP.CD', # GDP per capita (current US$) # Not requested here: GDP per capita comes from the IMF WEO request instead
#    'NY.GNP.ATLS.CD', # GNI, Atlas method (current US$)

    'NY.GDP.TOTL.RT.ZS', # Total natural resources rents (% of GDP)

    'NV.IND.MANF.ZS',        # Manufacturing, value added (% of GDP)
    'NV.IND.TOTL.ZS',        # Industry (including construction), value added (% of GDP)
    'TX.VAL.TECH.MF.ZS',     # High-technology exports (% of manufactured exports)
    'NV.AGR.TOTL.ZS',        # Agriculture, forestry, and fishing, value added (% of GDP)
    'NV.SRV.TOTL.ZS',        # Services, value added (% of GDP)
    'NY.GDP.MINR.RT.ZS',     # Mineral rents (% of GDP)
    'NY.GDP.NGAS.RT.ZS',     # Natural gas rents (% of GDP)
    'NY.GDP.PETR.RT.ZS',      # Oil rents (% of GDP)
    
    'NY.ADJ.SVNG.CD', #  Adjusted savings: total (current US$)
    'NY.ADJ.ICTR.GN.ZS', # Adjusted savings: gross savings (% of GNI)
    'NY.ADJ.DRES.GN.ZS', # Adjusted savings: natural resources depletion (% of GNI)    
#    'IQ.CPA.HRES.XQ', # CPIA building human resources rating (1=low to 6=high)',
#    'IQ.CPA.FINQ.XQ', # 'CPIA quality of budgetary and financial management rating (1=low to 6=high)',
#    'IQ.CPA.TRAN.XQ', #'CPIA transparency, accountability and corruption in the public sector rating (1=low to 6=high)',
    'DT.DOD.DIMF.CD', # Use of IMF credit (DOD, current US$)
    'SL.IND.EMPL.ZS', # Employment in industry (% of total employment)
    'SL.SRV.EMPL.ZS', # Employment in services (% of total employment)
    'SL.AGR.EMPL.ZS', # Employment in agriculture (% of total employment)

    'EG.ELC.ACCS.ZS', # Access to electricity (% of population)
    'IT.CEL.SETS.P2', # Mobile cellular subscriptions (per 100 people)
    'FS.AST.PRVT.GD.ZS', # Domestic credit to private sector (% of GDP)
    'FR.INR.RINR', # Real interest rate (%)
    'FR.INR.LEND', # Lending interest rate (%)
    'FP.CPI.TOTL.ZG', # Inflation, consumer prices (annual %)
    # NOTE(review): this series returns the urban population as a headcount (values in the
    # millions in the wide table), not a share. The "% of total population" series is
    # presumably 'SP.URB.TOTL.IN.ZS' - confirm which one is intended before relabelling.
    'SP.URB.TOTL', # Urban population (headcount; currently labelled "% of total population" downstream)
    'SP.DYN.LE00.IN', # Life expectancy at birth, total (years)
    
    'NE.TRD.GNFS.ZS' # Trade (% of GDP)
]

# IMF 
# (subject, unit) pairs; each pair is passed to WEO.get(subject, unit) in the IMF cell.
imf_variables = [
    ("Gross domestic product per capita, constant prices",
     "Purchasing power parity; 2017 international dollar"),
    
    ("General government revenue",
     "Percent of GDP"),
    
    ('General government net debt',
     'Percent of GDP'),
    
    ('General government structural balance', 
     'Percent of potential GDP')
]

# IMF-ICSD
# Series-code fragments; rows are kept when SERIES_CODE contains one of them.
imf_icsd_variables = [
    'P51G_S13_Q_POGDP_PT.A', # Gross fixed capital formation, General government, Constant prices, Percent of GDP
    'P51G_PS_Q_POGDP_PT.A', # Gross fixed capital formation, Private sector, Constant prices, Percent of GDP
    'P51G_PUPVT_Q_POGDP_PT.A' # Gross fixed capital formation, Public private partnership, Constant prices, Percent of GDP
]

# Economic Complexity
# NOTE(review): this list is not referenced by the loading cells visible in this notebook.
eci_variables = [
    'eci'
 ]

# V Democracy
# Column names selected from the V-Dem country-year table.
vdem_variables = [
    # High-Level Democracy Indices
    'v2x_polyarchy', # Electoral democracy index
    'v2x_libdem', # Liberal democracy index
    'v2x_partipdem', # Participatory democracy index
    'v2x_delibdem', # Deliberative democracy index
    'v2x_egaldem', # Egalitarian democracy index
    
    'v2xnp_client', #  Clientelism Index (D)  
    'v2x_corr', # Political corruption index (D) 
    'v2x_rule', # Rule of law index (D) 
    'v2x_accountability', # Accountability index
    'v2xcl_prpty', # Property rights (D) 
    'e_wbgi_pve', # Political stability — estimate (E) 
    'e_civil_war' # Civil war (E)
]

# Penn World Table - FLAG THESE ARE IN NATIONAL CURRENCY
# Column names selected from the 'Data' sheet of the PWT workbook.
# NOTE(review): in the PWT variable legend the c-prefixed series (cn, ctfp, cwtfp, csh_*)
# are measured at current PPPs; the national-prices counterparts are rnna / rtfpna.
# Confirm the descriptions below against the legend.
pwt_variables = [
    'hc', # Human capital index
    'cn', # Capital stock (national accounts prices) Unit: constant local currency (real terms)
    'ctfp', # TFP level (constant national prices) Unit: index
    'cwtfp', # Welfare-relevant TFP    
    'csh_c', # Share of consumption in GDP
    "csh_i", # Share of investment in GDP
    "csh_g", # Share of government spending in GDP
    "delta" # Capital depreciation rate
]


# CEPII
# Column names selected from the geo_cepii sheet.
cepii_variables = [
    'landlocked', # dummy
]

### World Bank API

In [34]:

def download_wb_indicators(indicators, start_year, end_year):
    """Download World Bank indicators for every non-aggregate economy, in long format.

    Parameters
    ----------
    indicators : iterable of str
        World Bank series codes (e.g. ``'NV.IND.MANF.ZS'``); fetched one at a time.
    start_year, end_year : int
        First and last year requested; both ends are inclusive.

    Returns
    -------
    pandas.DataFrame
        One row per (economy, year, indicator) observation, with the columns
        ``Country Code``, ``Year``, ``Variable`` and ``Value``. Observations
        whose economy or value is missing are skipped. The four columns are
        present even when nothing was downloaded, so downstream concatenation
        and merges keep working on an empty result.
    """
    columns = ["Country Code", "Year", "Variable", "Value"]

    # Keep real economies only; regional/income aggregates are flagged with "aggregate"
    economies = [c['id'] for c in wb.economy.list() if not c.get("aggregate", False)]
    years = range(start_year, end_year + 1)

    final_rows = []
    for indicator in indicators:
        print(f"Downloading {indicator} ...")

        for row in wb.data.fetch(indicator, economy=economies, time=years):
            iso = row.get("economy")
            value = row.get("value")

            # Skip incomplete observations before parsing the year label
            if iso is None or value is None:
                continue

            final_rows.append({
                "Country Code": iso,
                # The API labels years as "YR2019"; keep only the number
                "Year": int(row.get("time").replace("YR", "")),
                "Variable": indicator,
                "Value": value
            })

    return pd.DataFrame(final_rows, columns=columns)

# Fetch every selected World Bank indicator for 1995-2019 (both years inclusive)
wb_df = download_wb_indicators(wb_variables, start_year=1995, end_year=2019)

# Preview the long-format result
wb_df

Downloading NY.GDP.TOTL.RT.ZS ...
Downloading NV.IND.MANF.ZS ...
Downloading NV.IND.TOTL.ZS ...
Downloading TX.VAL.TECH.MF.ZS ...
Downloading NV.AGR.TOTL.ZS ...
Downloading NV.SRV.TOTL.ZS ...
Downloading NY.GDP.MINR.RT.ZS ...
Downloading NY.GDP.NGAS.RT.ZS ...
Downloading NY.GDP.PETR.RT.ZS ...
Downloading NY.ADJ.SVNG.CD ...
Downloading NY.ADJ.ICTR.GN.ZS ...
Downloading NY.ADJ.DRES.GN.ZS ...
Downloading DT.DOD.DIMF.CD ...
Downloading SL.IND.EMPL.ZS ...
Downloading SL.SRV.EMPL.ZS ...
Downloading SL.AGR.EMPL.ZS ...
Downloading EG.ELC.ACCS.ZS ...
Downloading IT.CEL.SETS.P2 ...
Downloading FS.AST.PRVT.GD.ZS ...
Downloading FR.INR.RINR ...
Downloading FR.INR.LEND ...
Downloading FP.CPI.TOTL.ZG ...
Downloading SP.URB.TOTL ...
Downloading SP.DYN.LE00.IN ...
Downloading NE.TRD.GNFS.ZS ...


Unnamed: 0,Country Code,Year,Variable,Value
0,ZWE,2019,NY.GDP.TOTL.RT.ZS,4.715765
1,ZWE,2018,NY.GDP.TOTL.RT.ZS,3.378189
2,ZWE,2017,NY.GDP.TOTL.RT.ZS,6.095448
3,ZWE,2016,NY.GDP.TOTL.RT.ZS,4.495414
4,ZWE,2015,NY.GDP.TOTL.RT.ZS,4.606185
...,...,...,...,...
108406,ABW,1999,NE.TRD.GNFS.ZS,164.559014
108407,ABW,1998,NE.TRD.GNFS.ZS,163.267360
108408,ABW,1997,NE.TRD.GNFS.ZS,168.781911
108409,ABW,1996,NE.TRD.GNFS.ZS,175.344130


In [35]:
# Every economy known to the World Bank API (countries and aggregates alike)
all_economies = wb.economy.list()

# Discard the aggregates so that only real economies remain
countries = [econ for econ in all_economies if not econ.get("aggregate", False)]

# Lookup table: ISO code -> World Bank display name
country_names = pd.DataFrame.from_records(
    [(econ["id"], econ["value"]) for econ in countries],
    columns=["Country Code", "Country Name"],
)

country_names

Unnamed: 0,Country Code,Country Name
0,ABW,Aruba
1,AFG,Afghanistan
2,AGO,Angola
3,ALB,Albania
4,AND,Andorra
...,...,...
212,XKX,Kosovo
213,YEM,"Yemen, Rep."
214,ZAF,South Africa
215,ZMB,Zambia


#### IMF

In [36]:
# Download the April 2024 World Economic Outlook release and open it
path, _ = download(2024, "Apr")  # Last dataset
w = WEO(path)

# WEO.get() returns one wide table per series: years in the index, one column per
# country code. Reshape each to long format with the final column names directly.
frames = []
for subj, unit in imf_variables:
    df = w.get(subj, unit).reset_index().rename(columns={"index": "Year"})
    df_long = df.melt(id_vars="Year", var_name="Country Code", value_name="Value")
    df_long["Variable"] = subj
    frames.append(df_long)

imf_df = pd.concat(frames, ignore_index=True)

# Remove the downloaded WEO file from the code folder; use the path returned by
# download() so this keeps working if the release or file name changes
os.remove(path)

imf_df

weo_2024_1.csv 19.2Mb
Downloaded 2024-Apr WEO dataset
  COUNTRY YEAR  VALUE                                          INDICATOR
0    1980  AFG    NaN  Gross domestic product per capita, constant pr...
1    1981  AFG    NaN  Gross domestic product per capita, constant pr...
2    1982  AFG    NaN  Gross domestic product per capita, constant pr...
3    1983  AFG    NaN  Gross domestic product per capita, constant pr...
4    1984  AFG    NaN  Gross domestic product per capita, constant pr...


Unnamed: 0,Year,Country Code,Value,Variable
0,1980,AFG,,"Gross domestic product per capita, constant pr..."
1,1981,AFG,,"Gross domestic product per capita, constant pr..."
2,1982,AFG,,"Gross domestic product per capita, constant pr..."
3,1983,AFG,,"Gross domestic product per capita, constant pr..."
4,1984,AFG,,"Gross domestic product per capita, constant pr..."
...,...,...,...,...
39195,2025,ZWE,,General government structural balance
39196,2026,ZWE,,General government structural balance
39197,2027,ZWE,,General government structural balance
39198,2028,ZWE,,General government structural balance


#### IMF-ICSD

In [37]:
# Regex alternation of the selected series codes, escaped so that "." matches literally
pattern = "|".join(map(re.escape, imf_icsd_variables))

imf_icsd_df = (
    pd.read_csv("https://raw.githubusercontent.com/AyaanTigdikar/Capstone/main/rawdata/dataset_2025-12-23T17_49_47.423426845Z_DEFAULT_INTEGRATION_IMF.FAD_ICSD_1.0.0.csv")
    .rename(columns={
        "COUNTRY": "Country Name",
        "INDICATOR": "Variable"
    })
    .loc[lambda df: df["SERIES_CODE"].str.contains(pattern, na=False)] # Keep only selected variables
    .assign(Country_Code=lambda df: df["SERIES_CODE"].str[:3]) # Create Country Code variable with first 3 chars of SERIES_CODE
    .drop(columns=[
        "DATASET","SERIES_CODE", "OBS_MEASURE", "FREQUENCY", "SCALE"])
    .rename(columns={'Country_Code': 'Country Code'}))

# Year columns of the wide file are strings; generate 1995-2019 instead of typing
# them by hand (the hand-written list contained '2012' twice)
icsd_year_columns = [str(year) for year in range(1995, 2020)]

imf_icsd_df = pd.melt(
    imf_icsd_df,
    id_vars=['Country Code','Variable'],
    value_vars=icsd_year_columns,
    var_name='Year',
    value_name='Value'
)

imf_icsd_df


Unnamed: 0,Country Code,Variable,Year,Value
0,HND,"Gross fixed capital formation, General governm...",1995,10.037575
1,GMB,"Gross fixed capital formation, Private sector,...",1995,5.484142
2,MRT,"Gross fixed capital formation, Public private ...",1995,0.000000
3,ARG,"Gross fixed capital formation, General governm...",1995,1.295047
4,IRL,"Gross fixed capital formation, Private sector,...",1995,16.384628
...,...,...,...,...
12195,CMR,"Gross fixed capital formation, Private sector,...",2019,12.522708
12196,LCA,"Gross fixed capital formation, Private sector,...",2019,18.666254
12197,EGY,"Gross fixed capital formation, Private sector,...",2019,4.334251
12198,KEN,"Gross fixed capital formation, General governm...",2019,5.519358


#### Economic Complexity

In [38]:
# ECI rankings: drop the rank column and shorten the identifier/value column names
eci_df = (
    pd.read_csv('https://raw.githubusercontent.com/AyaanTigdikar/Capstone/refs/heads/main/rawdata/growth_proj_eci_rankings.csv') # Note: If error, update the raw link in here: https://github.com/AyaanTigdikar/Capstone/blob/main/rawdata/growth_proj_eci_rankings.csv
      .drop(columns=['eci_rank_hs92'])
      .rename(columns={'country_iso3_code': 'country_code', 'eci_hs92': 'eci'})
)

# Continent membership table (local file)
continent_labels = pd.read_csv('../../data/location_group_member.csv')
continent_lookup = continent_labels.loc[
    continent_labels['group_type'] == 'continent',
    ['group_name', 'country_id'],
]

# One row per country with its continent attached.
# NOTE(review): eci_clean is not used by any later cell visible in this notebook,
# yet it makes the cell depend on a local CSV - confirm it is still needed.
eci_clean = (
    eci_df
      .merge(continent_lookup, on='country_id', how='left')
      .rename(columns={'group_name': 'continent'})
      .drop_duplicates(subset='country_code', keep='first')
)

# Long format shared by all sources: Country Code / Year / Variable / Value
eci_df = (
    eci_df
      .assign(Variable='Economic Complexity')
      .rename(columns={
          'country_code': 'Country Code',
          'year': 'Year',
          'eci': 'Value'
      })
      [['Country Code', 'Year', 'Variable' , 'Value']]
)



### V-Dem (Varieties of Democracy)

In [39]:
# 1. Download the RData file from the GitHub "raw" URL
url = ("https://raw.githubusercontent.com/vdeminstitute/vdemdata/master/data/vdem.RData")
# (connect, read) timeouts so a stalled connection cannot hang the notebook
resp = requests.get(url, timeout=(10, 300))
resp.raise_for_status()  # check download succeeded

# 2. Write the payload to a temporary file, parse it, and always clean up the
#    file afterwards - even when parsing fails
try:
    with open("vdem.RData", "wb") as f:
        f.write(resp.content)

    # Load the .RData file
    vdem_r = rdata.read_rda("vdem.RData")
finally:
    # Remove RData from code folder
    if os.path.exists("vdem.RData"):
        os.remove("vdem.RData")

# Read R data as a df
vdem = vdem_r.get("vdem")

# 3. Keep the identifier columns plus the selected indices
var_list = ['country_name', 'country_text_id', 'year'] + vdem_variables

vdem = vdem[var_list]

vdem = vdem.rename(columns={
                   'country_name': 'Country Name',
                   'country_text_id': 'Country Code',
                   'year': 'Year',
})

# 4. Long format: one row per country, year and index
vdem_df = pd.melt(
    vdem,
    id_vars=['Country Code','Year'],
    value_vars= vdem_variables,
    var_name='Variable',
    value_name='Value'
)

# Only the lower bound is applied here; the upper bound is applied after merging
vdem_df = vdem_df[vdem_df['Year'] >= 1995]

vdem_df



Unnamed: 0,Country Code,Year,Variable,Value
206,MEX,1995.0,v2x_polyarchy,0.480
207,MEX,1996.0,v2x_polyarchy,0.508
208,MEX,1997.0,v2x_polyarchy,0.556
209,MEX,1998.0,v2x_polyarchy,0.598
210,MEX,1999.0,v2x_polyarchy,0.602
...,...,...,...,...
333552,ZZB,2020.0,e_civil_war,
333553,ZZB,2021.0,e_civil_war,
333554,ZZB,2022.0,e_civil_war,
333555,ZZB,2023.0,e_civil_war,


### Penn World Table

In [40]:
# Penn World Table 11.0
url = "https://raw.githubusercontent.com/AyaanTigdikar/Capstone/main/rawdata/pwt110.xlsx"

# Read the 'Data' sheet, keep identifiers plus the selected series, reshape to
# long format and restrict to the 1995-2019 window (both ends inclusive)
pwt_df = (
    pd.read_excel(url, engine="openpyxl", sheet_name='Data')
    .rename(columns={'countrycode': 'Country Code',
                     'country': 'Country Name',
                     'year': 'Year'})
    .loc[:, ['Country Code', 'Country Name', 'Year'] + pwt_variables]
    .melt(
        id_vars=['Country Code', 'Year'],
        value_vars=pwt_variables,
        var_name='Variable',
        value_name='Value',
    )
    .loc[lambda long_df: long_df['Year'].between(1995, 2019)]
)

pwt_df

Unnamed: 0,Country Code,Year,Variable,Value
45,ABW,1995,hc,
46,ABW,1996,hc,
47,ABW,1997,hc,
48,ABW,1998,hc,
49,ABW,1999,hc,
...,...,...,...,...
109511,ZWE,2015,delta,0.054977
109512,ZWE,2016,delta,0.056414
109513,ZWE,2017,delta,0.057745
109514,ZWE,2018,delta,0.059023


### CEPII

In [41]:

url = "https://raw.githubusercontent.com/AyaanTigdikar/Capstone/main/rawdata/geo_cepii.xls"

# The landlocked dummy has no time dimension, so it is repeated for every year
years = pd.DataFrame({"Year": range(1995, 2020)}) # Expanding df to all years 

cepii_df = (
    pd.read_excel(url, sheet_name='geo_cepii')
    .rename(columns={'iso3': 'Country Code'})
    .drop_duplicates(subset=["Country Code"])   # one row per country code
    .loc[:, ['Country Code'] + cepii_variables]
    .assign(Variable='Landlocked')
    .rename(columns={'landlocked': 'Value'})
    .merge(years, how="cross")                  # country x year grid
)

cepii_df

Unnamed: 0,Country Code,Value,Variable,Year
0,ABW,0,Landlocked,1995
1,ABW,0,Landlocked,1996
2,ABW,0,Landlocked,1997
3,ABW,0,Landlocked,1998
4,ABW,0,Landlocked,1999
...,...,...,...,...
5620,ZWE,1,Landlocked,2015
5621,ZWE,1,Landlocked,2016
5622,ZWE,1,Landlocked,2017
5623,ZWE,1,Landlocked,2018


## Merging and final cleaning

In [42]:
# Stack every source; all of them are already in long format
long_sources = [wb_df, eci_df, imf_df, imf_icsd_df, vdem_df, pwt_df, cepii_df]
final_df = pd.concat(long_sources)

# Variable renames, grouped by source and merged into a single lookup below.

# WBI
wb_labels = {
    "NV.IND.MANF.ZS": "Manufacturing",
    "NV.IND.TOTL.ZS": "Industry",
    "TX.VAL.TECH.MF.ZS": "High-tech exports",
    "NV.AGR.TOTL.ZS": "Agriculture",
    "NV.SRV.TOTL.ZS": "Services",
    'NY.GDP.TOTL.RT.ZS': 'Total natural resources rents (% of GDP)',
    'NY.GDP.MINR.RT.ZS': 'Mineral rents (% of GDP)',
    'NY.GDP.NGAS.RT.ZS': 'Natural gas rents (% of GDP)',
    'NY.GDP.PETR.RT.ZS': 'Oil rents (% of GDP)',
    'NY.ADJ.SVNG.CD': 'Adjusted savings: total (current US$)',
    'NY.ADJ.ICTR.GN.ZS': 'Adjusted savings: gross savings (% of GNI)',
    'NY.ADJ.DRES.GN.ZS': 'Adjusted savings: natural resources depletion (% of GNI)',
    'IQ.CPA.HRES.XQ': 'CPIA building human resources rating (1=low to 6=high)',
    'IQ.CPA.FINQ.XQ': 'CPIA quality of budgetary and financial management rating (1=low to 6=high)',
    'IQ.CPA.TRAN.XQ': 'CPIA transparency, accountability and corruption in the public sector rating (1=low to 6=high)',
    'DT.DOD.DIMF.CD': 'Use of IMF credit (DOD, current US$)',
    'SL.IND.EMPL.ZS': 'Employment in industry (% of total employment)',
    'SL.SRV.EMPL.ZS': 'Employment in services (% of total employment)',
    'SL.AGR.EMPL.ZS': 'Employment in agriculture (% of total employment)',
    'EG.ELC.ACCS.ZS': 'Access to electricity (% of population)',
    'IT.CEL.SETS.P2': 'Mobile cellular subscriptions (per 100 people)',
    'FS.AST.PRVT.GD.ZS': 'Domestic credit to private sector (% of GDP)',
    'FR.INR.RINR': 'Real interest rate (%)',
    'FR.INR.LEND': 'Lending interest rate (%)',
    'FP.CPI.TOTL.ZG': 'Inflation, consumer prices (annual %)',
    # NOTE(review): the values of this series in the wide table are headcounts, not
    # percentages - confirm whether the series code or this label should change.
    'SP.URB.TOTL': 'Urban population (% of total population)',
    'SP.DYN.LE00.IN': 'Life expectancy at birth, total (years)',
    'NE.TRD.GNFS.ZS': 'Trade (% of GDP)',
}

# ECI
eci_labels = {
    "Economic Complexity": "Economic Complexity Index",
}

# IMF (the remaining WEO series keep their original subject names)
imf_labels = {
    "Gross domestic product per capita, constant prices": "GDP per capita (constant prices, PPP)",
    "General government revenue": "Government revenue",
}

# IMF-ICSD: already with a descriptive name, nothing to rename

# VDEM
vdem_labels = {
    'v2x_polyarchy': 'electoral_dem',          # Electoral democracy index
    'v2x_libdem': 'liberal_dem',               # Liberal democracy index
    'v2x_partipdem': 'participatory_dem',      # Participatory democracy index
    'v2x_delibdem': 'deliberative_dem',        # Deliberative democracy index
    'v2x_egaldem': 'egalitarian_dem',          # Egalitarian democracy index
    'v2xnp_client': 'Clientelism index',
    'v2x_corr': 'Political corruption index',
    'v2x_rule': 'Rule of law index',
    'v2x_accountability': 'Accountability index',
    'v2xcl_prpty': 'Property rights',
    'e_wbgi_pve': 'Political stability — estimate',
    'e_civil_war': 'Civil war',
}

# PWT
pwt_labels = {
    'hc': 'Human capital index',
    'cn': 'Capital stock (national accounts prices)',
    'ctfp': 'TFP level (constant national prices)',
    'cwtfp': 'Welfare-relevant TFP',
    'csh_c': 'Share of consumption in GDP',
    "csh_i": 'Share of investment in GDP',
    "csh_g": 'Share of government spending in GDP',
    "delta": 'Capital depreciation rate',
}

# CEPII
cepii_labels = {
    'landlocked': 'Landlocked',
}

rename_map = {
    **wb_labels,
    **eci_labels,
    **imf_labels,
    **vdem_labels,
    **pwt_labels,
    **cepii_labels,
}

# Variables without an entry in the lookup keep their current name
final_df["Variable"] = final_df["Variable"].replace(rename_map)

def _as_year(value):
    """Return the year of ``value`` as an int (objects with a ``.year`` attribute, numbers or numeric strings)."""
    return value.year if hasattr(value, 'year') else int(value)


# Areas that are not countries and must be dropped:
not_countries = [
    "HKG",  # Hong Kong
    "MAC",  # Macao
    "PRI",  # Puerto Rico
    "VIR",  # U.S. Virgin Islands
    "GUM",  # Guam
    "ASM",  # American Samoa
    "CYM",  # Cayman Islands
    "BMU",  # Bermuda
    "GRL",  # Greenland
    "MAF",  # Saint Martin (French part)
    "SXM",  # Sint Maarten (Dutch part)
    "CUW",  # Curaçao
    "ABW",  # Aruba
    "FRO",  # Faroe Islands
    "MNP",  # Northern Mariana Islands
    "PYF",  # French Polynesia
]

# Countries names (codes unknown to the World Bank list get a missing name)
final_df = final_df.merge(country_names, how='left', on='Country Code')

# The sources disagree on the type of Year (int, float, string, period-like), so
# normalise it to a plain integer before filtering
final_df['Year'] = final_df['Year'].apply(_as_year)

# Keep 1995-2019 (both inclusive) and drop the non-country areas in one pass
in_period = final_df['Year'].between(1995, 2019)
is_excluded_area = final_df["Country Code"].isin(not_countries)
final_df = final_df[in_period & ~is_excluded_area]


### Important variables only

In [43]:
# Selected variables
important_vars = [
    # V dem indexes
    'Rule of law index',
    'Property rights',
    'Political stability — estimate',
    
    # Macro
    "GDP per capita (constant prices, PPP)",
    
    # GDP structure
    "Economic Complexity Index", 
    'Agriculture',
    "Industry",
    'Manufacturing',
    'Services',

    # Finance    
    'Adjusted savings: gross savings (% of GNI)',
    'Gross fixed capital formation, General government, Constant prices, Percent of GDP',
    'Gross fixed capital formation, Private sector, Constant prices, Percent of GDP',
    'Gross fixed capital formation, Public private partnership, Constant prices, Percent of GDP',
    'Capital depreciation rate',
    
    # Economics+
    'Human capital index',    
    'Life expectancy at birth, total (years)'
    'Mobile cellular subscriptions (per 100 people)',
    'Urban population (% of total population)', 
    
    # Resources
    'Mineral rents (% of GDP)',
    'Natural gas rents (% of GDP)', 'Oil rents (% of GDP)',
    'Total natural resources rents (% of GDP)'   
]

final_df["Important_vars"] = final_df["Variable"].isin(important_vars).astype(int)

#### Reshape to wide format

In [44]:
final_df  # preview the long-format master table before reshaping

Unnamed: 0,Country Code,Year,Variable,Value,Country Name,Important_vars
0,ZWE,2019,Total natural resources rents (% of GDP),4.715765,Zimbabwe,1
1,ZWE,2018,Total natural resources rents (% of GDP),3.378189,Zimbabwe,1
2,ZWE,2017,Total natural resources rents (% of GDP),6.095448,Zimbabwe,1
3,ZWE,2016,Total natural resources rents (% of GDP),4.495414,Zimbabwe,1
4,ZWE,2015,Total natural resources rents (% of GDP),4.606185,Zimbabwe,1
...,...,...,...,...,...,...
270630,ZWE,2015,Landlocked,1.000000,Zimbabwe,0
270631,ZWE,2016,Landlocked,1.000000,Zimbabwe,0
270632,ZWE,2017,Landlocked,1.000000,Zimbabwe,0
270633,ZWE,2018,Landlocked,1.000000,Zimbabwe,0


In [45]:
# One row per country-year, one column per variable.
# Country Name is part of the row key; it is missing for codes that are absent
# from the World Bank country list.
id_columns = ['Country Code', 'Country Name', 'Year']

wide_table = final_df.pivot(index=id_columns, columns='Variable', values='Value')
final_df_wide = wide_table.reset_index()

In [46]:
final_df_wide  # preview the wide table (one column per variable)

Variable,Country Code,Country Name,Year,Access to electricity (% of population),Accountability index,Adjusted savings: gross savings (% of GNI),Adjusted savings: natural resources depletion (% of GNI),Adjusted savings: total (current US$),Agriculture,Capital depreciation rate,...,Total natural resources rents (% of GDP),Trade (% of GDP),Urban population (% of total population),"Use of IMF credit (DOD, current US$)",Welfare-relevant TFP,deliberative_dem,egalitarian_dem,electoral_dem,liberal_dem,participatory_dem
0,AFG,Afghanistan,1995,,-1.118,,,,,,...,,,3043587.0,,,0.027,0.041,0.094,0.022,0.025
1,AFG,Afghanistan,1996,,-1.582,,,,,,...,,,3190220.0,,,0.008,0.036,0.076,0.019,0.013
2,AFG,Afghanistan,1997,,-1.697,,,,,,...,,,3338510.0,,,0.007,0.040,0.073,0.022,0.013
3,AFG,Afghanistan,1998,,-1.695,,,,,,...,,,3493981.0,,,0.007,0.040,0.073,0.022,0.013
4,AFG,Afghanistan,1999,,-1.689,,,,,,...,,,3657429.0,,,0.007,0.040,0.073,0.022,0.013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5783,ZZB,,2015,,0.100,,,,,,...,,,,,,0.249,0.257,0.288,0.232,0.196
5784,ZZB,,2016,,0.016,,,,,,...,,,,,,0.233,0.241,0.251,0.211,0.176
5785,ZZB,,2017,,0.007,,,,,,...,,,,,,0.239,0.237,0.265,0.218,0.180
5786,ZZB,,2018,,0.020,,,,,,...,,,,,,0.255,0.235,0.268,0.223,0.182


In [47]:
final_df_wide.shape # 3 identifier columns (Country Code, Country Name, Year) + 54 variable columns = 57

(5788, 57)

## Saving data

In [48]:
# Write both layouts of the master dataset to the working-data folder.
# The DataFrame index is written as the first (unnamed) CSV column.
outputs = {
    '../workingdata/master_data_long.csv': final_df,
    '../workingdata/master_data_wide.csv': final_df_wide,
}

for out_path, frame in outputs.items():
    frame.to_csv(out_path)

In [49]:
# Variables to include:
# Urbanization / infrastructure
# Poor geography
# Real interest rate