# **Cleaning master data**

Here I create the master dataset using World Bank APIs, FMI and Economic Complexity data.

In [119]:
import wbgapi as wb # World bank
import pandas as pd
from weo import download, WEO # IMF data
import requests

### World Bank API

In [120]:
wb_variables = [
    # GDP/GNI
#    'NY.GDP.PCAP.CD', # GDP per capita (current US$)
#    'NY.GNP.ATLS.CD', # GNI, Atlas method (current US$)

    # Resource Intensity
    'NY.GDP.TOTL.RT.ZS', # Total natural resources rents (% of GDP)

    # Manufacturing & Industrial Output0
    'NV.IND.MANF.ZS',        # Manufacturing, value added (% of GDP)
    'NV.IND.TOTL.ZS',        # Industry (including construction), value added (% of GDP)
 #  'TX.VAL.MANF.ZS.UN',     # Manufactures exports (% of merchandise exports)
    'TX.VAL.TECH.MF.ZS',     # High-technology exports (% of manufactured exports)
    'NV.AGR.TOTL.ZS',        # Agriculture, forestry, and fishing, value added (% of GDP)
    'NV.SRV.TOTL.ZS',        # Services, value added (% of GDP)

    # Employment
#    'SL.IND.EMPL.ZS',       # Employment in industry (% of total employment)
#    'SL.SRV.EMPL.ZS',       # Employment in services (% of total employment)
#    'SL.AGR.EMPL.ZS',       # Employment in agriculture (% of total)
    
    # Trade Sophistication
#    'NE.TRD.GNFS.ZS',       # Trade (% of GDP)
    
#    'TX.VAL.FUEL.ZS.UN', # Fuel exports (% of merchandise exports)
#    'TX.VAL.MMTL.ZS.UN', # Ores and metals exports (% of merchandise exports)
]

In [121]:
def download_wb_indicators(indicators, start_year, end_year):
    final_rows = []

    # Get all real countries
    economies = [c['id'] for c in wb.economy.list() if not c.get("aggregate", False)]

    for indicator in indicators:
        print(f"Downloading {indicator} ...")
        raw = wb.data.fetch(indicator, economy=economies, time=range(start_year, end_year + 1))
        
        for row in raw:
            iso = row.get("economy")
            year = int(row.get("time").replace("YR", ""))
            value = row.get("value")
            
            if iso is None or value is None:
                continue

            final_rows.append({
                "country": iso,
                "year": year,
                "indicator": indicator,
                "value": value
            })

    df = pd.DataFrame(final_rows)
    return df

In [122]:
wb_df = download_wb_indicators(wb_variables, start_year=1990, end_year=2024)

Downloading NY.GDP.TOTL.RT.ZS ...
Downloading NV.IND.MANF.ZS ...
Downloading NV.IND.TOTL.ZS ...
Downloading TX.VAL.TECH.MF.ZS ...
Downloading NV.AGR.TOTL.ZS ...
Downloading NV.SRV.TOTL.ZS ...


In [123]:
wb_df = wb_df.rename(columns={
    'country': 'Country Code',
    'indicator': 'Variable',
    'year': 'Year',
    'value': 'Value'
}
                     )
wb_df

Unnamed: 0,Country Code,Year,Variable,Value
0,ZWE,2021,NY.GDP.TOTL.RT.ZS,6.398452
1,ZWE,2020,NY.GDP.TOTL.RT.ZS,4.746668
2,ZWE,2019,NY.GDP.TOTL.RT.ZS,4.715765
3,ZWE,2018,NY.GDP.TOTL.RT.ZS,3.378189
4,ZWE,2017,NY.GDP.TOTL.RT.ZS,6.095448
...,...,...,...,...
33802,ABW,1999,NV.SRV.TOTL.ZS,80.855383
33803,ABW,1998,NV.SRV.TOTL.ZS,79.465951
33804,ABW,1997,NV.SRV.TOTL.ZS,79.588257
33805,ABW,1996,NV.SRV.TOTL.ZS,80.975304


#### Economic Complexity

In [124]:
# ECI
eci_df = (
    pd.read_csv('../../data/growth_proj_eci_rankings.csv')
      .rename(columns={'country_iso3_code': 'country_code', 'eci_hs92': 'eci'})
      .drop(columns=['eci_rank_hs92'])
)
# Country codes
continent_labels = pd.read_csv('../../data/location_group_member.csv')

eci_clean = (
    eci_df
      .merge(
          continent_labels[continent_labels['group_type'] == 'continent'][['group_name', 'country_id']],
          on='country_id',
          how='left'
      )
      .rename(columns={'group_name': 'continent'})
      .drop_duplicates(subset='country_code', keep='first')
)

eci_df['Variable'] = 'Economic Complexity'

eci_df = eci_df.rename(columns={
  'country_code': 'Country Code',
  'year': 'Year',
  'eci': 'Value'
})

eci_df = eci_df[['Country Code', 'Year', 'Variable' , 'Value']]

eci_df

Unnamed: 0,Country Code,Year,Variable,Value
0,AFG,1995,Economic Complexity,-0.451
1,AFG,1996,Economic Complexity,-0.477
2,AFG,1997,Economic Complexity,0.063
3,AFG,1998,Economic Complexity,-0.278
4,AFG,1999,Economic Complexity,-0.249
...,...,...,...,...
4174,TWN,2019,Economic Complexity,2.098
4175,TWN,2020,Economic Complexity,2.172
4176,TWN,2021,Economic Complexity,2.031
4177,TWN,2022,Economic Complexity,2.043


#### IMF

In [125]:
subjects = [
    ("Gross domestic product per capita, constant prices", "Purchasing power parity; 2017 international dollar"),
    ("General government revenue", "Percent of GDP"),
]

path, _ = download(2024, "Apr")  # or adjust to whichever vintage you want
w = WEO(path)
frames = []
    
frames = []
for subj, unit in subjects:
    df = w.get(subj, unit).reset_index().rename(columns={"index": "COUNTRY"})
    df_long = df.melt(id_vars="COUNTRY", var_name="YEAR", value_name="VALUE")
    df_long["INDICATOR"] = subj
    frames.append(df_long)

imf_df = pd.concat(frames, ignore_index=True)
print(imf_df.head())

Already downloaded 2024-Apr WEO dataset at weo_2024_1.csv
  COUNTRY YEAR  VALUE                                          INDICATOR
0    1980  AFG    NaN  Gross domestic product per capita, constant pr...
1    1981  AFG    NaN  Gross domestic product per capita, constant pr...
2    1982  AFG    NaN  Gross domestic product per capita, constant pr...
3    1983  AFG    NaN  Gross domestic product per capita, constant pr...
4    1984  AFG    NaN  Gross domestic product per capita, constant pr...


In [126]:
imf_df = imf_df.rename(columns={
    'COUNTRY': 'Year',
    'YEAR': 'Country Code',
    'INDICATOR': 'Variable',
    'VALUE': 'Value'
})
imf_df

Unnamed: 0,Year,Country Code,Value,Variable
0,1980,AFG,,"Gross domestic product per capita, constant pr..."
1,1981,AFG,,"Gross domestic product per capita, constant pr..."
2,1982,AFG,,"Gross domestic product per capita, constant pr..."
3,1983,AFG,,"Gross domestic product per capita, constant pr..."
4,1984,AFG,,"Gross domestic product per capita, constant pr..."
...,...,...,...,...
19595,2025,ZWE,16.465,General government revenue
19596,2026,ZWE,16.462,General government revenue
19597,2027,ZWE,16.380,General government revenue
19598,2028,ZWE,16.381,General government revenue


## Merging and final cleaning

In [127]:
final_df = pd.concat([wb_df, eci_df, imf_df])

# Renaming variable codes
final_df['Variable'].unique()

rename_map = {
    "NV.IND.MANF.ZS": "Manufacturing",
    "NV.IND.TOTL.ZS": "Industry",
    "TX.VAL.TECH.MF.ZS": "High-tech exports",
    "NV.AGR.TOTL.ZS": "Agriculture",
    "NV.SRV.TOTL.ZS": "Services",
    "Economic Complexity": "Economic Complexity Index",
    "Gross domestic product per capita, constant prices": "GDP per capita (constant prices, PPP)",
    "General government revenue": "Government revenue",
}

final_df["Variable"] = final_df["Variable"].replace(rename_map)

final_df['Variable'].unique()


array(['NY.GDP.TOTL.RT.ZS', 'Manufacturing', 'Industry',
       'High-tech exports', 'Agriculture', 'Services',
       'Economic Complexity Index',
       'GDP per capita (constant prices, PPP)', 'Government revenue'],
      dtype=object)

## Saving data

In [None]:
final_df.to_csv('../../data/master_data.csv')