In [4]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
from synthete_analysis.constants import CONCENTRATED_IN_OIL, METRICS_FOR_ANALYSIS
from synthete_analysis.retrieval import query_imf
from synthete_analysis.cleanup import normalize_metric_dict, add_all_years

In [6]:
import requests
import numpy as np
import polars as pl
import polars.selectors as cs
from dataclasses import dataclass
from typing import Optional
from functools import cache

In [7]:
all_countries = query_imf("https://www.imf.org/external/datamapper/api/v1/countries")[
    "countries"
]
indicators = query_imf("https://www.imf.org/external/datamapper/api/v1/indicators")
groups = query_imf("https://www.imf.org/external/datamapper/api/v1/groups")["groups"]

In [8]:
# retrieval
gdp_per_capita = query_imf("https://www.imf.org/external/datamapper/api/v1/NGDPDPC")[
    "values"
]["NGDPDPC"]

# cleanup
gdp_per_capita_df = pl.DataFrame(normalize_metric_dict(gdp_per_capita))
gdp_per_capita_df = add_all_years(gdp_per_capita_df)
gdp_per_capita_df = (
    gdp_per_capita_df.filter(
        (pl.col("year") >= 1990)
        & (pl.col("year") <= 2019)
        & (pl.col("country").is_in(all_countries.keys()))
        & (~pl.col("country").is_in(CONCENTRATED_IN_OIL))
    )
    .sort("year")
    .with_columns(pl.col("value").fill_null(strategy="forward").over("country"))
)

# analysis
average_gdp_per_capita_df = (
    gdp_per_capita_df.group_by(pl.col("country"))
    .agg(pl.col("value").mean())
    .sort("value")
    .with_row_index(name="index")
)

In [9]:
# retrieval
real_gdp_per_capita_growth = query_imf(
    "https://www.imf.org/external/datamapper/api/v1/NGDPRPC_PCH"
)["values"]["NGDPRPC_PCH"]

# cleanup
real_gdp_per_capita_growth_df = pl.DataFrame(
    normalize_metric_dict(real_gdp_per_capita_growth)
)
real_gdp_per_capita_growth_df = add_all_years(real_gdp_per_capita_growth_df)
real_gdp_per_capita_growth_df = (
    real_gdp_per_capita_growth_df.filter(
        (pl.col("year") >= 1990)
        & (pl.col("year") <= 2019)
        & (pl.col("country").is_in(all_countries.keys()))
        & (~pl.col("country").is_in(CONCENTRATED_IN_OIL))
    )
    .sort("year")
    .with_columns(pl.col("value").fill_null(strategy="forward").over("country"))
)

# analysis
average_real_gdp_per_capita_growth_df = (
    real_gdp_per_capita_growth_df.group_by(pl.col("country"))
    .agg(pl.col("value").mean())
    .sort("value")
    .with_row_index(name="index")
)
# countries_missing_too_much_gdp_growth_data = set(real_gdp_per_capita_growth_df.group_by("country").agg(
#     pl.col("value").is_null().sum().alias("null_count")
# ).sort("null_count", descending=True).filter(pl.col("null_count") > 10)['country'].to_list())

In [10]:
# missing_too_much_gdp_data = set(gdp_per_capita_df.group_by("country").agg(
#     pl.col("value").is_null().sum().alias("null_count")
# ).sort("null_count", descending=True).filter(pl.col("null_count") > 10)['country'].to_list())
# # missing over 1/3 of the data points
# """
# at the time of this analysis  https://data.imf.org/en/Datasets/WEO/Changes-to-the-Database
# For Montenegro, historical data prior to 2023 for population and GDP per capita are excluded from publication pending the final release of population estimates from the 2023 Census of Population, Households, and Dwellings.
# """

# data cleanup / validation

In [21]:
gdp_per_capita_df.null_count()

country,year,value
u32,u32,u32
0,0,291


In [22]:
gdp_per_capita_df.group_by("country").agg(
    pl.col("value").is_null().sum().alias("null_count")
).filter(pl.col("null_count") > 1).sort(by="null_count", descending=True)

country,null_count
str,u32
"""MNE""",30
"""SOM""",22
"""SSD""",21
"""AND""",20
"""SMR""",14
…,…
"""HRV""",2
"""ARM""",2
"""UZB""",2
"""ERI""",2


# By filtering to 1990 to 2019 null count drops by 1/3

In [23]:
gdp_per_capita_df.filter(
    (pl.col("year") >= 1990) & (pl.col("year") <= 2019)
).null_count()

country,year,value
u32,u32,u32
0,0,291


In [24]:
gdp_per_capita_df = gdp_per_capita_df.filter(
    (pl.col("year") >= 1990) & (pl.col("year") <= 2019)
)

# remove groups and only keep countries

In [25]:
gdp_per_capita_df = gdp_per_capita_df.filter(
    pl.col("country").is_in(all_countries.keys())
)

# as part of the analysis recommends removing concentrated in oil countries

In [26]:
gdp_per_capita_df = gdp_per_capita_df.filter(
    ~pl.col("country").is_in(CONCENTRATED_IN_OIL)
)

# using fill forward for null values

In [27]:
gdp_per_capita_df = gdp_per_capita_df.sort("year").with_columns(
    pl.col("value").fill_null(strategy="forward").over("country")
)

In [28]:
gdp_per_capita_df.null_count()

country,year,value
u32,u32,u32
0,0,291


In [29]:
gdp_per_capita_df.group_by("country").agg(
    pl.col("value").is_null().sum().alias("null_count")
).filter(pl.col("null_count") > 1).sort(by="null_count", descending=True)

country,null_count
str,u32
"""MNE""",30
"""SOM""",22
"""SSD""",21
"""AND""",20
"""SMR""",14
…,…
"""TJK""",2
"""ARM""",2
"""MDA""",2
"""LVA""",2


# countries missing too much data

In [30]:
# gdp_per_capita_df = gdp_per_capita_df.filter(~pl.col("country").is_in(missing_too_much_data))

In [31]:
gdp_per_capita_df.null_count()

country,year,value
u32,u32,u32
0,0,291


# how will remaining nulls be handled?

In [34]:
real_gdp_per_capita_growth = query_imf(
    "https://www.imf.org/external/datamapper/api/v1/NGDPRPC_PCH"
)["values"]["NGDPRPC_PCH"]
real_gdp_per_capita_growth_df = pl.DataFrame(
    normalize_metric_dict(real_gdp_per_capita_growth)
)

In [35]:
real_gdp_per_capita_growth_df

country,year,value
str,i64,f64
"""AGO""",2004,7.580882
"""AGO""",2005,10.160323
"""AGO""",2006,7.893987
"""AGO""",2007,8.961208
"""AGO""",2008,6.796743
…,…,…
"""ZWE""",2022,4.012554
"""ZWE""",2023,2.832781
"""ZWE""",2024,-0.18579
"""ZWE""",2025,3.777606


In [37]:
real_gdp_per_capita_growth_df

country,year,value
str,i64,f64
"""AGO""",2004,7.580882
"""AGO""",2005,10.160323
"""AGO""",2006,7.893987
"""AGO""",2007,8.961208
"""AGO""",2008,6.796743
…,…,…
"""ZWE""",2022,4.012554
"""ZWE""",2023,2.832781
"""ZWE""",2024,-0.18579
"""ZWE""",2025,3.777606


Summary of the Steps as an Algorithm:

    Calculate each country's average GDP per capita (1990-2019).

    For each country:

        Find 20 GDP neighbors above, 20 GDP neighbors below.

        From each side, select 5 highest-growth countries (1990-2019 real GDP per capita growth).

        Drop the highest-growth country from both sides to reduce outliers.

        Remaining 8 countries = synthete peers.

    Compute for these peers:

        Average primary balance → Synthete Central Value.

        2nd highest primary balance → Upper Band.

        2nd lowest primary balance → Lower Band.

    Repeat globally for all countries.

    Aggregate the synthete bands across GDP per capita levels.

This process is detailed explicitly in:

    Section V (i) “Synthete construction”

Further detailed in the Jamaica example and global aggregation
.

In [39]:
target_row = sorted_gdp.filter(pl.col("country") == "USA")
target_index = target_row.select("index").item()
lower_neighbors = sorted_gdp.filter((pl.col("index") < target_index)).tail(20)
upper_neighbors = sorted_gdp.filter((pl.col("index") > target_index)).head(20)

# missing over 1/3 of the data points

NameError: name 'sorted_gdp' is not defined

In [None]:
import polars as pl

# Example DataFrames
gdp_df = pl.DataFrame(
    {
        "country": [
            "GIN",
            "BFA",
            "SYC",
            "AUS",
            "SVN",
            "LUX",
            "LSO",
            "MEX",
            "KGZ",
            "ITA",
        ],
        "value": [
            630.88,
            482.55,
            10565.74,
            37592.28,
            17652.11,
            79613.13,
            817.79,
            7950.16,
            705.17,
            28729.07,
        ],
    }
)

growth_df = pl.DataFrame(
    {
        "country": [
            "GIN",
            "BFA",
            "SYC",
            "AUS",
            "SVN",
            "LUX",
            "LSO",
            "MEX",
            "KGZ",
            "ITA",
        ],
        "growth": [2.1, 3.0, 4.5, 1.8, 2.9, 1.5, 2.3, 2.7, 3.5, 1.9],
    }
)

# Sort GDP per capita and assign index
sorted_gdp = gdp_df.sort("value").with_row_count(name="index")


# Function to get synthete peers for a country
def get_synthete_peers(target_country):
    target_row = sorted_gdp.filter(pl.col("country") == target_country)
    if target_row.is_empty():
        return []

    target_index = target_row.select("index").item()

    # Get 20 below and 20 above (handling edge cases with slice)
    lower_neighbors = sorted_gdp.filter((pl.col("index") < target_index)).tail(20)
    upper_neighbors = sorted_gdp.filter((pl.col("index") > target_index)).head(20)

    # Join with growth data
    lower_joined = lower_neighbors.join(growth_df, on="country")
    upper_joined = upper_neighbors.join(growth_df, on="country")

    # Select top 5 by growth, drop the top 1
    lower_best = lower_joined.sort("growth", descending=True).slice(
        1, 4
    )  # 5-1 = 4 remaining
    upper_best = upper_joined.sort("growth", descending=True).slice(1, 4)

    synthete_peers = (
        lower_best.select("country").to_series().to_list()
        + upper_best.select("country").to_series().to_list()
    )
    return synthete_peers


# Example: Get synthete peers for "SVN"
peers = get_synthete_peers("SVN")
print(peers)