In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import requests
import numpy as np
import polars as pl
import polars.selectors as cs
from dataclasses import dataclass
from typing import Optional
from functools import cache

from synthete_analysis.constants import CONCENTRATED_IN_OIL, METRICS_FOR_ANALYSIS
from synthete_analysis.retrieval import query_imf
from synthete_analysis.cleanup import normalize_metric_dict, add_all_years

In [4]:
# Fetch the IMF DataMapper reference tables. Each response nests its payload
# under a key with the same name as the endpoint that was queried.
all_countries, indicators, groups = (
    query_imf(f"https://www.imf.org/external/datamapper/api/v1/{endpoint}")[endpoint]
    for endpoint in ("countries", "indicators", "groups")
)

In [115]:
all_countries

{'ABW': {'label': 'Aruba'},
 'AFG': {'label': 'Afghanistan'},
 'AGO': {'label': 'Angola'},
 'AIA': {'label': 'Anguilla'},
 'ALB': {'label': 'Albania'},
 'ARE': {'label': 'United Arab Emirates'},
 'ARG': {'label': 'Argentina'},
 'ARM': {'label': 'Armenia'},
 'ASM': {'label': 'American Samoa'},
 'ATG': {'label': 'Antigua and Barbuda'},
 'AUS': {'label': 'Australia'},
 'AUT': {'label': 'Austria'},
 'AZE': {'label': 'Azerbaijan'},
 'BDI': {'label': 'Burundi'},
 'BEL': {'label': 'Belgium'},
 'BEN': {'label': 'Benin'},
 'BFA': {'label': 'Burkina Faso'},
 'BGD': {'label': 'Bangladesh'},
 'BGR': {'label': 'Bulgaria'},
 'BHR': {'label': 'Bahrain'},
 'BHS': {'label': 'Bahamas, The'},
 'BIH': {'label': 'Bosnia and Herzegovina'},
 'BLR': {'label': 'Belarus'},
 'BLZ': {'label': 'Belize'},
 'BMU': {'label': 'Bermuda'},
 'BOL': {'label': 'Bolivia'},
 'BRA': {'label': 'Brazil'},
 'BRB': {'label': 'Barbados'},
 'BRN': {'label': 'Brunei Darussalam'},
 'BTN': {'label': 'Bhutan'},
 'BWA': {'label': 'Botsw

In [95]:
indicators

{'NGDP_RPCH': {'label': 'Real GDP growth',
  'description': "Gross domestic product is the most commonly used single measure of a country's overall economic activity. It represents the total value at constant prices of final goods and services produced within a country during a specified time period, such as one year.",
  'source': 'World Economic Outlook (April 2025)',
  'unit': 'Annual percent change',
  'dataset': 'WEO'},
 'NGDPD': {'label': 'GDP, current prices',
  'description': "Gross domestic product is the most commonly used single measure of a country's overall economic activity. It represents the total value at current prices of final goods and services produced within a country during a specified time period, such as one year.",
  'source': 'World Economic Outlook (April 2025)',
  'unit': 'Billions of U.S. dollars',
  'dataset': 'WEO'},
 'NGDPDPC': {'label': 'GDP per capita, current prices\n',
  'description': "Gross domestic product is the most commonly used single measure 

In [11]:
indicators['PPPPC']

{'label': 'GDP per capita, current prices',
 'description': 'Gross domestic product (GDP) is the most commonly used single measure of a country\'s overall economic activity. Per capita GDP represents the total value in PPP terms of final goods and services produced within a country during a specified time period divided by the average population for the same one year.\n\nPurchasing Power Parity (PPP) is a theory which relates changes in the nominal exchange rate between two countries currencies to changes in the countries\' price levels. More information on PPP methodology can be found on the World Economic Outlook FAQ - <a href="http://www.imf.org/external/pubs/ft/weo/faq.htm#q4d" target="new">click here</a>',
 'source': 'World Economic Outlook (April 2025)',
 'unit': 'Purchasing power parity; international dollars per capita',
 'dataset': 'WEO'}

# Justification for using PPPPC instead of NGDPDPC
Should I use PPPPC instead of NGDPDPC for the step "Calculate each country's average GDP per capita (1990-2019)"?

This is a fantastic and very important question. The choice between NGDPDPC and PPPPC for the initial sorting step fundamentally changes the nature of the "similarity" you are measuring for your peers.

There is no single "right" answer, but there is a more conceptually appropriate choice based on the goal of the synthetic peer methodology. Let's break down the implications of each option.
Option 1: Use NGDPDPC (GDP per capita, current US$)

    What it measures: The average market-exchange rate converted income of a country. It reflects the country's ability to engage with the global market (buy imported goods, service foreign debt, etc.).

    Pros:

        Relevance for fiscal policy: Government debt is often denominated in foreign currency (e.g., US dollars). The government's capacity to earn foreign currency through exports and taxes to service this debt is better reflected by NGDPDPC.

        Reflects global market integration: It captures a country's position in the global economic hierarchy as defined by market forces.

    Cons:

        Volatile: Highly sensitive to exchange rate fluctuations, which can be dramatic and not necessarily tied to real economic performance.

        Doesn't reflect domestic living costs: A country with a low cost of living might have a lower NGDPDPC but a relatively higher standard of living than the number suggests.

Option 2: Use PPPPC (GDP per capita, PPP)

    What it measures: The average purchasing power of income within a country's domestic economy. It adjusts for differences in the cost of living and is a better measure of the actual standard of living and domestic economic structure.

    Pros:

        Better measure of domestic economy: It more accurately reflects the size and structure of the domestic tax base, which is crucial for fiscal capacity.

        Stable: Not affected by exchange rate volatility, giving a smoother, more stable measure of economic level over time.

        Reflects real economic size: China and India have much larger economies measured at PPP than at market rates.

    Cons:

        Less relevant for external debt: A government's ability to service dollar-denominated debt is not directly related to its PPP-adjusted GDP.

Recommendation: Which One to Use?

The paper's methodology is designed to find countries that are structurally similar but have achieved better performance.

For this goal, using PPPPC for the initial sorting is the stronger and more conceptually robust choice.

Why?

    Structural Similarity: The core idea of "similarity" should be based on the internal structure and development of the economy—the productivity of its workers, the sophistication of its industries, and the size of its domestic market. PPPPC is a far superior measure for this. It filters out the noise of exchange rates and focuses on the real economic engine.

    Policy Lessons: The lessons you learn from synthetic peers are likely about domestic policy—institutions, education, infrastructure, industrial policy. These affect the real, productive capacity of the economy, which is best captured by PPP measures.

    Consistency with Growth Metric: You are already using PPPPC to measure performance. Using it for the initial level ensures you are comparing "apples to apples." You are finding countries that started with a similar real economic level and then seeing which ones achieved the best real growth.

Conclusion:

While using NGDPDPC is defensible (especially if your analysis is specifically about external debt sustainability), for the general purpose of identifying synthetic peers based on economic structure and performance, you should use PPPPC for both steps.

Final Algorithm Recommendation:

    Calculate Average Level (for Sorting):

        Use PPPPC (Average 1990-2019)

    Find Neighborhood:

        Sort countries by average PPPPC. Select 20 above and 20 below.

    Calculate Performance (for Ranking):

        Use PPPPC to calculate growth: (PPPPC_2019 / PPPPC_1990) - 1

This approach ensures your peer groups are formed based on real economic similarity and ranked by real economic performance, which is the most meaningful comparison for policy analysis.

In [171]:
# --- retrieval ---
# Download the chosen indicator; the per-country series sit under
# "values" -> <indicator code> in the response.
METRIC = "PPPPC"
gdp_per_capita = query_imf(
    f"https://www.imf.org/external/datamapper/api/v1/{METRIC}"
)["values"][METRIC]

# --- cleanup ---
# Keep 1990-2019 only, restricted to codes present in `all_countries` and not
# listed in CONCENTRATED_IN_OIL, then forward-fill missing values per country
# in chronological order.
gdp_per_capita_df = (
    pl.DataFrame(normalize_metric_dict(gdp_per_capita))
    .pipe(add_all_years)
    .filter(
        pl.col("year").is_between(1990, 2019)
        & pl.col("country").is_in(all_countries.keys())
        & ~pl.col("country").is_in(CONCENTRATED_IN_OIL)
    )
    .sort("year")
    .with_columns(pl.col("value").fill_null(strategy="forward").over("country"))
)
# Forward fill matches the paper's results in 4/8 cases; backward fill in only 3/8.

# --- analysis ---
# One row per country: the mean of `value` over the retained years.
average_gdp_per_capita_df = gdp_per_capita_df.group_by("country").agg(
    pl.col("value").mean().alias("average_gdp_per_capita")
)
# NOTE(review): the original note here read "important that index is added last";
# presumably the row index must only be attached after sorting by the average
# (as get_synthete_peers does) -- confirm.

In [172]:
gdp_per_capita['ZWE']

{'1998': 2812.986,
 '1999': 2828.804,
 '2000': 2777.836,
 '2001': 2834.765,
 '2002': 2664.11,
 '2003': 2274.309,
 '2004': 2172.226,
 '2005': 2056.724,
 '2006': 2014.11,
 '2007': 1993.349,
 '2008': 1688.888,
 '2009': 1808.571,
 '2010': 2176.304,
 '2011': 2512.216,
 '2012': 2847.536,
 '2013': 2874.793,
 '2014': 2919.43,
 '2015': 2937.466,
 '2016': 2926.183,
 '2017': 3070.167,
 '2018': 2838.724,
 '2019': 3251.507,
 '2020': 3499.647,
 '2021': 4182.605,
 '2022': 4660.594,
 '2023': 4965.087,
 '2024': 5075.699,
 '2025': 5407.359,
 '2026': 5661.688,
 '2027': 5851.14,
 '2028': 6051.366,
 '2029': 6246.941,
 '2030': 6450.939}

In [173]:
gdp_per_capita_df.filter(pl.col("country") == "UVK")

country,year,value
str,i64,f64
"""UVK""",1990,
"""UVK""",1991,
"""UVK""",1992,
"""UVK""",1993,
"""UVK""",1994,
…,…,…
"""UVK""",2015,9638.614
"""UVK""",2016,10346.144
"""UVK""",2017,10976.415
"""UVK""",2018,11724.238


In [174]:
# analysis
# Growth is measured between the two endpoint years only: the value stored for
# 1990 and the value stored for 2019 (these are NOT "first/last non-null"
# observations). A country whose 1990 or 2019 value is null is dropped.
real_gdp_per_capita_growth_df = (
    gdp_per_capita_df.filter(pl.col("year").is_between(1990, 2019))
    .group_by("country")
    .agg(
        # Endpoint values for the growth calculation
        start_value=pl.col("value").filter(pl.col("year") == 1990).first(),
        end_value=pl.col("value").filter(pl.col("year") == 2019).first(),
        # Number of non-null observations in the window
        valid_years=pl.col("value").drop_nulls().count(),
    )
    .filter(
        pl.col("start_value").is_not_null(),
        pl.col("end_value").is_not_null(),
        pl.col("start_value") > 0,  # guards the division below
        pl.col("valid_years") >= 2,  # at least two data points
    )
    .select(
        "country",
        # CAGR: (end / start) ** (1 / number_of_years) - 1
        ((pl.col("end_value") / pl.col("start_value")).pow(1 / (2019 - 1990)) - 1).alias(
            "cagr"
        ),
    )
)

In [175]:
# Inner join on country: only countries that have both an average level and a
# CAGR survive.
all_data_df = average_gdp_per_capita_df.join(
    real_gdp_per_capita_growth_df,
    on="country",
    how="inner",
)

In [176]:
all_data_df

country,average_gdp_per_capita,cagr
str,f64,f64
"""JOR""",7714.044433,0.023217
"""GTM""",6833.832233,0.035338
"""PRY""",8922.0549,0.033595
"""COM""",2487.9221,0.019427
"""STP""",2435.087367,0.038656
…,…,…
"""MAR""",5143.9457,0.04076
"""BGR""",13143.2887,0.034914
"""BWA""",10242.743033,0.031811
"""SWE""",35928.8195,0.036301


In [179]:
def get_synthete_peers(target_country, df: "pl.DataFrame", n_neighbors=20, n_top=5, n_drop=1) -> list:
    """Return the synthete peer countries of ``target_country``.

    Countries are ranked by ``average_gdp_per_capita``. The ``n_neighbors``
    countries ranked directly below the target and the ``n_neighbors`` ranked
    directly above it form two neighbourhoods. Within each neighbourhood the
    countries are ordered by ``cagr`` (highest first), the ``n_drop`` fastest
    growers are discarded as outliers, and the remainder of the top ``n_top``
    are kept. With the defaults this yields up to 4 + 4 peers.

    Args:
        target_country: Country code to look up in the ``country`` column.
        df: Frame with the columns ``country``, ``average_gdp_per_capita`` and
            ``cagr``. Other columns (including a pre-existing ``index``) are
            ignored. Assumes one row per country.
        n_neighbors: Size of each neighbourhood (default 20).
        n_top: How many of the fastest growers to consider per neighbourhood
            (default 5).
        n_drop: How many of the very fastest growers to discard per
            neighbourhood (default 1).

    Returns:
        Country codes of the lower-neighbourhood peers followed by the
        upper-neighbourhood peers; an empty list if ``target_country`` is not
        in ``df``. Fewer peers are returned when the target sits near either
        end of the ranking.

    Raises:
        ValueError: If ``n_neighbors`` or ``n_drop`` is negative, or
            ``n_top < n_drop``.
    """
    if n_neighbors < 0 or n_drop < 0 or n_top < n_drop:
        raise ValueError("require n_neighbors >= 0 and 0 <= n_drop <= n_top")

    # Select only the needed columns before numbering the rows: a frame that
    # already carries an "index" column (e.g. from a previous ranking cell)
    # would otherwise make with_row_index fail on a duplicate column name.
    ranked = (
        df.select("country", "average_gdp_per_capita", "cagr")
        .sort("average_gdp_per_capita")
        .with_row_index(name="index")  # position in the GDP-per-capita ranking
    )

    target_row = ranked.filter(pl.col("country") == target_country)
    if target_row.is_empty():
        return []
    target_index = target_row.select("index").item()

    # tail/head simply return fewer rows when the target is near an edge.
    lower_neighbors = ranked.filter(pl.col("index") < target_index).tail(n_neighbors)
    upper_neighbors = ranked.filter(pl.col("index") > target_index).head(n_neighbors)

    def _best_growers(neighbors):
        # Rows without a growth figure cannot be ranked; drop them so a null
        # is never mistaken for the "fastest grower" that gets discarded.
        return (
            neighbors.drop_nulls("cagr")
            .sort("cagr", descending=True)
            .slice(n_drop, n_top - n_drop)
            .get_column("country")
            .to_list()
        )

    return _best_growers(lower_neighbors) + _best_growers(upper_neighbors)

In [180]:
# Example: get synthete peers for "JAM" and print each peer's entry from all_countries
# (an earlier version of this comment said "SVN", but the call below uses "JAM")
peers = get_synthete_peers("JAM",all_data_df)
for country in peers:
    print(all_countries[country])

{'label': 'Vietnam'}
{'label': 'Bhutan'}
{'label': 'Sri Lanka'}
{'label': 'Cabo Verde'}
{'label': 'Dominican Republic'}
{'label': 'Mauritius'}
{'label': 'Thailand'}
{'label': 'Lebanon'}


In [None]:
{'label': 'Vietnam'}
{'label': 'Bhutan'}
{'label': 'Sri Lanka'}
{'label': 'Cabo Verde'}
{'label': 'Dominican Republic'}
{'label': 'Mauritius'}
{'label': 'Thailand'}
{'label': 'Lebanon'}

In [76]:
# Rank countries by average GDP per capita and attach the rank as "index".
# Any "index" column left over from a previous run of this cell is dropped
# first, so re-running the cell does not fail on a duplicate column name.
all_data_df = (
    all_data_df.select(pl.exclude("index"))
    .sort("average_gdp_per_capita")
    .with_row_index(name="index")
)

In [126]:
all_data_df

country,average_gdp_per_capita,cagr
str,f64,f64
"""STP""",2435.087367,0.038656
"""SEN""",2260.2798,0.031569
"""BRB""",14035.807833,0.022815
"""MAR""",5143.9457,0.04076
"""PER""",7463.932433,0.052029
…,…,…
"""ZMB""",2218.653233,0.027904
"""SLV""",5998.6574,0.039364
"""ESP""",28854.690667,0.035053
"""SDN""",3404.581,0.03564


In [159]:
all_data_df.filter(pl.col("country") == "SRB")

country,average_gdp_per_capita,cagr
str,f64,f64
"""SRB""",11176.511567,0.040004


In [139]:
"KOS" in all_countries.keys()

True

# You're here: SRB and XKX are missing from all_data_df. They are missing because those countries have no 1990 data and we only fill forward, so their 1990 start value stays null and they are filtered out of the CAGR step. The author applies some kind of patch for missing data, which might be worth exploring. Also, 4/8 is pretty close.

In [100]:
# Scratch: isolate the row for "JAM" (the same lookup get_synthete_peers performs)
target_row = all_data_df.filter(pl.col("country") == "JAM")

In [101]:
target_row

country,average_gdp_per_capita,cagr
str,f64,f64
"""JAM""",4023.807567,0.037794


In [79]:
# Scratch: read the target's rank from the "index" column; requires all_data_df
# to already carry that column and target_row to hold exactly one row
target_index = target_row.select("index").item()

In [80]:
target_index

65

In [81]:
# Scratch: the (up to) 20 rows ranked immediately below the target by "index"
lower_neighbors = all_data_df.filter((pl.col("index") < target_index)).tail(20)

In [82]:
lower_neighbors

index,country,average_gdp_per_capita,cagr
u32,str,f64,f64
45,"""VNM""",4630.4988,0.078499
46,"""CPV""",4877.636233,0.057361
47,"""SYR""",5125.4937,0.027982
48,"""MAR""",5143.9457,0.04076
49,"""BOL""",5332.1143,0.041861
…,…,…,…
60,"""BLZ""",7237.5579,0.036392
61,"""PER""",7463.932433,0.052029
62,"""GUY""",7492.5202,0.053604
63,"""TUN""",7605.971433,0.045515


In [83]:
# Scratch: the (up to) 20 rows ranked immediately above the target by "index"
upper_neighbors = all_data_df.filter((pl.col("index") > target_index)).head(20)

In [84]:
upper_neighbors

index,country,average_gdp_per_capita,cagr
u32,str,f64,f64
66,"""EGY""",8257.881633,0.043039
67,"""DMA""",8543.763,0.040641
68,"""FJI""",8741.7601,0.034609
69,"""VCT""",8777.4392,0.043863
70,"""PRY""",8922.0549,0.033595
…,…,…,…
81,"""BGR""",13143.2887,0.034914
82,"""BRB""",14035.807833,0.022815
83,"""MDV""",14090.2214,0.06436
84,"""URY""",14153.466633,0.046285


In [85]:
# Scratch: order the lower neighbours by CAGR (highest first), skip the single
# fastest grower, and keep the next four
lower_best = lower_neighbors.sort("cagr", descending=True).slice(
        1, 4
    )  # 5-1 = 4 remaining

In [86]:
lower_best

index,country,average_gdp_per_capita,cagr
u32,str,f64,f64
45,"""VNM""",4630.4988,0.078499
50,"""BTN""",5607.724567,0.070506
57,"""LKA""",6927.5847,0.065782
46,"""CPV""",4877.636233,0.057361


In [87]:
 # Scratch: order the upper neighbours by CAGR (highest first), skip the single
 # fastest grower, and keep the next four
 upper_best = upper_neighbors.sort("cagr", descending=True).slice(1, 4)

In [88]:
upper_best

index,country,average_gdp_per_capita,cagr
u32,str,f64,f64
72,"""DOM""",9830.0648,0.060628
80,"""MUS""",12810.1491,0.055486
77,"""THA""",10910.088933,0.055249
85,"""LBN""",14486.09,0.051348
