# **Cleaning master data**

Here I build the master dataset from the World Bank API, the IMF World Economic Outlook (WEO), and Economic Complexity Index (ECI) data.

In [47]:
import os

import pandas as pd
import requests
import wbgapi as wb # World bank
from weo import download, WEO # IMF data

### Variable selection

In [48]:
# World Bank series codes to download.
# Not requested here:
#   'NY.GDP.PCAP.CD'  GDP per capita (current US$) -- downloaded in the IMF request instead
#   'NY.GNP.ATLS.CD'  GNI, Atlas method (current US$) -- currently disabled
wb_variables = [
    # Resource intensity: total natural resources rents (% of GDP)
    "NY.GDP.TOTL.RT.ZS",
    # Manufacturing, value added (% of GDP)
    "NV.IND.MANF.ZS",
    # Industry (including construction), value added (% of GDP)
    "NV.IND.TOTL.ZS",
    # High-technology exports (% of manufactured exports)
    "TX.VAL.TECH.MF.ZS",
    # Agriculture, forestry, and fishing, value added (% of GDP)
    "NV.AGR.TOTL.ZS",
    # Services, value added (% of GDP)
    "NV.SRV.TOTL.ZS",
    # Mineral rents (% of GDP)
    "NY.GDP.MINR.RT.ZS",
    # Natural gas rents (% of GDP)
    "NY.GDP.NGAS.RT.ZS",
    # Oil rents (% of GDP)
    "NY.GDP.PETR.RT.ZS",
]

# IMF WEO series, each given as a (subject, unit) pair.
imf_variables = [
    (
        "Gross domestic product per capita, constant prices",
        "Purchasing power parity; 2017 international dollar",
    ),
    (
        "General government revenue",
        "Percent of GDP",
    ),
]

# Economic Complexity columns of interest.
eci_variables = ["eci"]

### World Bank API

In [49]:
def download_wb_indicators(indicators, start_year, end_year):
    """Download World Bank indicators for all non-aggregate economies, in long format.

    Parameters
    ----------
    indicators : str or iterable of str
        World Bank series code(s), e.g. ``'NV.IND.MANF.ZS'``. A single code may
        be passed as a plain string.
    start_year, end_year : int
        First and last year to request (both inclusive).

    Returns
    -------
    pandas.DataFrame
        One row per (economy, year, indicator) observation that has an economy
        code, a time label and a non-missing value. Columns are always
        ``Country Code``, ``Year``, ``Variable`` and ``Value``, even when no
        observation was returned.
    """
    columns = ["Country Code", "Year", "Variable", "Value"]

    # A bare string would otherwise be iterated character by character.
    if isinstance(indicators, str):
        indicators = [indicators]

    # Keep only economies that are not flagged as aggregates.
    economies = [c['id'] for c in wb.economy.list() if not c.get("aggregate", False)]
    years = range(start_year, end_year + 1)

    final_rows = []
    for indicator in indicators:
        print(f"Downloading {indicator} ...")

        for row in wb.data.fetch(indicator, economy=economies, time=years):
            iso = row.get("economy")
            period = row.get("time")
            value = row.get("value")

            # Skip incomplete observations *before* parsing the year, so a row
            # without a time label cannot raise while being discarded anyway.
            if iso is None or period is None or value is None:
                continue

            final_rows.append({
                "Country Code": iso,
                # Time labels arrive as e.g. "YR2020"; strip the prefix.
                "Year": int(str(period).replace("YR", "")),
                "Variable": indicator,
                "Value": value
            })

    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(final_rows, columns=columns)

In [50]:
# Fetch every selected World Bank series for 1990-2024 (both years inclusive).
wb_df = download_wb_indicators(
    indicators=wb_variables,
    start_year=1990,
    end_year=2024,
)

Downloading NY.GDP.TOTL.RT.ZS ...
Downloading NV.IND.MANF.ZS ...
Downloading NV.IND.TOTL.ZS ...
Downloading TX.VAL.TECH.MF.ZS ...
Downloading NV.AGR.TOTL.ZS ...
Downloading NV.SRV.TOTL.ZS ...
Downloading NY.GDP.MINR.RT.ZS ...
Downloading NY.GDP.NGAS.RT.ZS ...
Downloading NY.GDP.PETR.RT.ZS ...


In [51]:
# Full World Bank economy list: individual countries as well as aggregates.
all_economies = wb.economy.list()

# Drop every entry flagged as an aggregate, keeping real countries only.
countries = [econ for econ in all_economies if not econ.get("aggregate", False)]

# Lookup table from ISO code to display name, one row per country.
country_names = pd.DataFrame(
    [(econ["id"], econ["value"]) for econ in countries],
    columns=["Country Code", "Country Name"],
)

country_names

Unnamed: 0,Country Code,Country Name
0,ABW,Aruba
1,AFG,Afghanistan
2,AGO,Angola
3,ALB,Albania
4,AND,Andorra
...,...,...
212,XKX,Kosovo
213,YEM,"Yemen, Rep."
214,ZAF,South Africa
215,ZMB,Zambia


#### Economic Complexity

In [54]:
# ECI
# The raw link used to embed a GitHub access token directly in the notebook.
# Credentials must not be committed, so the token is now read from the
# ECI_RAW_TOKEN environment variable and appended only when it is set.
# If the download fails, open
# https://github.com/AyaanTigdikar/Capstone/blob/main/rawdata/growth_proj_eci_rankings.csv
# generate a fresh raw link, and export its `token` query parameter as
# ECI_RAW_TOKEN before starting the kernel.
eci_url = 'https://raw.githubusercontent.com/AyaanTigdikar/Capstone/refs/heads/main/rawdata/growth_proj_eci_rankings.csv'
eci_token = os.environ.get('ECI_RAW_TOKEN')
if eci_token:
    eci_url = f'{eci_url}?token={eci_token}'

eci_df = (
    pd.read_csv(eci_url)
      .rename(columns={'country_iso3_code': 'country_code', 'eci_hs92': 'eci'})
      .drop(columns=['eci_rank_hs92'])
)

# Country codes
continent_labels = pd.read_csv('../../data/location_group_member.csv')

# One row per country with its continent label attached.
# NOTE(review): eci_clean is not used by any later cell visible in this
# notebook -- confirm whether it is still needed.
eci_clean = (
    eci_df
      .merge(
          continent_labels[continent_labels['group_type'] == 'continent'][['group_name', 'country_id']],
          on='country_id',
          how='left'
      )
      .rename(columns={'group_name': 'continent'})
      .drop_duplicates(subset='country_code', keep='first')
)

# Reshape to the shared long schema: Country Code / Year / Variable / Value.
eci_df['Variable'] = 'Economic Complexity'

eci_df = eci_df.rename(columns={
  'country_code': 'Country Code',
  'year': 'Year',
  'eci': 'Value'
})

eci_df = eci_df[['Country Code', 'Year', 'Variable' , 'Value']]

#### IMF

In [55]:
# Fetch the April 2024 WEO vintage (change the arguments for another vintage).
path, _ = download(2024, "Apr")
w = WEO(path)

# One long-format frame per (subject, unit) pair.
# NOTE: the column labels come out swapped at this stage -- "COUNTRY" holds the
# years (the former index) and "YEAR" holds the country codes (the melted
# column names). The next cell's rename maps them to the correct final names.
frames = [
    w.get(subj, unit)
     .reset_index()
     .rename(columns={"index": "COUNTRY"})
     .melt(id_vars="COUNTRY", var_name="YEAR", value_name="VALUE")
     .assign(INDICATOR=subj)
    for subj, unit in imf_variables
]

imf_df = pd.concat(frames, ignore_index=True)
print(imf_df.head())

Already downloaded 2024-Apr WEO dataset at weo_2024_1.csv
  COUNTRY YEAR  VALUE                                          INDICATOR
0    1980  AFG    NaN  Gross domestic product per capita, constant pr...
1    1981  AFG    NaN  Gross domestic product per capita, constant pr...
2    1982  AFG    NaN  Gross domestic product per capita, constant pr...
3    1983  AFG    NaN  Gross domestic product per capita, constant pr...
4    1984  AFG    NaN  Gross domestic product per capita, constant pr...


In [56]:
# Map the WEO working labels onto the shared long schema. "COUNTRY" actually
# holds years and "YEAR" holds country codes, hence the crossed mapping.
imf_df = imf_df.rename(
    columns=dict(
        COUNTRY='Year',
        YEAR='Country Code',
        INDICATOR='Variable',
        VALUE='Value',
    )
)
imf_df

Unnamed: 0,Year,Country Code,Value,Variable
0,1980,AFG,,"Gross domestic product per capita, constant pr..."
1,1981,AFG,,"Gross domestic product per capita, constant pr..."
2,1982,AFG,,"Gross domestic product per capita, constant pr..."
3,1983,AFG,,"Gross domestic product per capita, constant pr..."
4,1984,AFG,,"Gross domestic product per capita, constant pr..."
...,...,...,...,...
19595,2025,ZWE,16.465,General government revenue
19596,2026,ZWE,16.462,General government revenue
19597,2027,ZWE,16.380,General government revenue
19598,2028,ZWE,16.381,General government revenue


## Merging and final cleaning

In [64]:
# Human-readable labels for the variables.
# NOTE(review): the four resource-rent codes (NY.GDP.TOTL.RT.ZS,
# NY.GDP.MINR.RT.ZS, NY.GDP.NGAS.RT.ZS, NY.GDP.PETR.RT.ZS) have no entry here,
# so they keep their raw World Bank codes in the output -- confirm this is intended.
rename_map = {
    "NV.IND.MANF.ZS": "Manufacturing",
    "NV.IND.TOTL.ZS": "Industry",
    "TX.VAL.TECH.MF.ZS": "High-tech exports",
    "NV.AGR.TOTL.ZS": "Agriculture",
    "NV.SRV.TOTL.ZS": "Services",
    "Economic Complexity": "Economic Complexity Index",
    "Gross domestic product per capita, constant prices": "GDP per capita (constant prices, PPP)",
    "General government revenue": "Government revenue",
}

# Stack the three long-format sources (World Bank, ECI, IMF).
final_df = pd.concat([wb_df, eci_df, imf_df])

# Relabel the variables, then attach country names by ISO code.
final_df = (
    final_df
      .assign(Variable=final_df["Variable"].replace(rename_map))
      .merge(country_names, how='left', on='Country Code')
)

# Normalise Year to a plain int: objects exposing `.year` (e.g. a pandas
# Period) contribute that attribute, everything else goes through int().
final_df = final_df.assign(
    Year=final_df['Year'].apply(
        lambda value: value.year if hasattr(value, 'year') else int(value)
    )
)

# Keep the 1990-2024 window (both bounds inclusive).
final_df = final_df.loc[final_df['Year'].between(1990, 2024)]


In [65]:
# Preview the merged long-format dataset; as the last expression of the cell it is rendered as output.
final_df

Unnamed: 0,Country Code,Year,Variable,Value,Country Name
0,ZWE,2021,NY.GDP.TOTL.RT.ZS,6.398452,Zimbabwe
1,ZWE,2020,NY.GDP.TOTL.RT.ZS,4.746668,Zimbabwe
2,ZWE,2019,NY.GDP.TOTL.RT.ZS,4.715765,Zimbabwe
3,ZWE,2018,NY.GDP.TOTL.RT.ZS,3.378189,Zimbabwe
4,ZWE,2017,NY.GDP.TOTL.RT.ZS,6.095448,Zimbabwe
...,...,...,...,...,...
76130,ZWE,2020,Government revenue,13.275000,Zimbabwe
76131,ZWE,2021,Government revenue,15.348000,Zimbabwe
76132,ZWE,2022,Government revenue,16.599000,Zimbabwe
76133,ZWE,2023,Government revenue,16.456000,Zimbabwe


## Saving data

In [66]:
# Persist the master dataset as CSV.
# NOTE(review): to_csv writes the DataFrame index as an unnamed first column by
# default -- confirm downstream readers expect it (otherwise pass index=False).
final_df.to_csv('../workingdata/master_data.csv')