Pipeline to determine candidates for further analysis based on GDP in years 2014-2024

In [35]:
!pip install polars




[notice] A new release of pip is available: 25.1.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [36]:
from pathlib import Path
import polars as pl

PROJECT_ROOT = Path(Path.cwd())
DATA_DIR = PROJECT_ROOT / "data"

gdp_data_raw = pl.read_csv(DATA_DIR / "gdp.csv")
print(gdp_data_raw)

shape: (217, 70)
┌────────────┬────────────┬────────────┬────────────┬───┬───────────┬───────────┬───────────┬──────┐
│ Country    ┆ Country    ┆ Indicator  ┆ Indicator  ┆ … ┆ 2022      ┆ 2023      ┆ 2024      ┆      │
│ Name       ┆ Code       ┆ Name       ┆ Code       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---  │
│ ---        ┆ ---        ┆ ---        ┆ ---        ┆   ┆ str       ┆ str       ┆ str       ┆ str  │
│ str        ┆ str        ┆ str        ┆ str        ┆   ┆           ┆           ┆           ┆      │
╞════════════╪════════════╪════════════╪════════════╪═══╪═══════════╪═══════════╪═══════════╪══════╡
│ Aruba      ┆ ABW        ┆ GDP        ┆ NY.GDP.MKT ┆ … ┆ 332403444 ┆ 383472961 ┆ 426565067 ┆ null │
│            ┆            ┆ (current   ┆ P.CD       ┆   ┆ 3.26581   ┆ 6.12027   ┆ 3.00236   ┆      │
│            ┆            ┆ US$)       ┆            ┆   ┆           ┆           ┆           ┆      │
│ Afghanista ┆ AFG        ┆ GDP        ┆ NY.GDP.MKT ┆ … ┆ 144972438 ┆ 1715

In [37]:
analysis_years = [str(year) for year in range(2014, 2025)]

headers = ["Country Name", "Country Code"] + analysis_years

gdp_in_time_range = (
    gdp_data_raw.select(headers)
)

print(gdp_in_time_range)

shape: (217, 13)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ Country   ┆ Country   ┆ 2014      ┆ 2015      ┆ … ┆ 2021      ┆ 2022      ┆ 2023      ┆ 2024     │
│ Name      ┆ Code      ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │
│ ---       ┆ ---       ┆ str       ┆ str       ┆   ┆ str       ┆ str       ┆ str       ┆ str      │
│ str       ┆ str       ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ Aruba     ┆ ABW       ┆ 279084972 ┆ 296290670 ┆ … ┆ 288090279 ┆ 332403444 ┆ 383472961 ┆ 42656506 │
│           ┆           ┆ 0.67039   ┆ 3.91061   ┆   ┆ 8.92288   ┆ 3.26581   ┆ 6.12027   ┆ 73.00236 │
│ Afghanist ┆ AFG       ┆ 204971285 ┆ 191342216 ┆ … ┆ 142599954 ┆ 144972438 ┆ 171522346 ┆          │
│ an        ┆           ┆ 55.6972   ┆ 44.7325   ┆   ┆ 41.0759   ┆ 72.1337 

In [38]:
EU27_2020_codes = [
    'AUT',  # Austria
    'BEL',  # Belgium
    'BGR',  # Bulgaria
    'HRV',  # Croatia
    'CYP',  # Cyprus
    'CZE',  # Czechia
    'DNK',  # Denmark
    'EST',  # Estonia
    'FIN',  # Finland
    'FRA',  # France
    'DEU',  # Germany
    'GRC',  # Greece
    'HUN',  # Hungary
    'IRL',  # Ireland
    'ITA',  # Italy
    'LVA',  # Latvia
    'LTU',  # Lithuania
    'LUX',  # Luxembourg
    'MLT',  # Malta
    'NLD',  # Netherlands
    'POL',  # Poland
    'PRT',  # Portugal
    'ROU',  # Romania
    'SVK',  # Slovakia
    'SVN',  # Slovenia
    'ESP',  # Spain
    'SWE'   # Sweden
]

gdp_without_eu_2020 = (
    gdp_in_time_range.remove(pl.col("Country Code").is_in(EU27_2020_codes))
)

print(gdp_without_eu_2020)

shape: (190, 13)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ Country   ┆ Country   ┆ 2014      ┆ 2015      ┆ … ┆ 2021      ┆ 2022      ┆ 2023      ┆ 2024     │
│ Name      ┆ Code      ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │
│ ---       ┆ ---       ┆ str       ┆ str       ┆   ┆ str       ┆ str       ┆ str       ┆ str      │
│ str       ┆ str       ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ Aruba     ┆ ABW       ┆ 279084972 ┆ 296290670 ┆ … ┆ 288090279 ┆ 332403444 ┆ 383472961 ┆ 42656506 │
│           ┆           ┆ 0.67039   ┆ 3.91061   ┆   ┆ 8.92288   ┆ 3.26581   ┆ 6.12027   ┆ 73.00236 │
│ Afghanist ┆ AFG       ┆ 204971285 ┆ 191342216 ┆ … ┆ 142599954 ┆ 144972438 ┆ 171522346 ┆          │
│ an        ┆           ┆ 55.6972   ┆ 44.7325   ┆   ┆ 41.0759   ┆ 72.1337 

In [63]:
top_25_economies_ex_eu27_2014_24 = {}

for year in analysis_years:
    top_25_gdp_values_for_year_ex_eu27 = (
        gdp_without_eu_2020
        .select(
            pl.col("Country Code"),
            pl.col("Country Name"),
            pl.col(year).cast(pl.Float64, strict=False).alias("GDP Value"),
        )
        .drop_nulls(pl.col("GDP Value"))
        .sort("GDP Value", descending=True)
        .head(30)
    )

    top_25_economies_ex_eu27_2014_24[year] = top_25_gdp_values_for_year_ex_eu27.select(pl.col("Country Code"))

unique_country_codes = (
    pl.concat(top_25_economies_ex_eu27_2014_24.values())
    .unique()
    ["Country Code"]
    .to_list()
)
print(len(unique_country_codes), unique_country_codes)

country_counts = (
    pl.concat(top_25_economies_ex_eu27_2014_24.values())
    .group_by("Country Code")
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Convert to list of tuples
result = country_counts.rows()
print(result)

32 ['AUS', 'ZAF', 'MYS', 'USA', 'CHN', 'BRA', 'HKG', 'CAN', 'JPN', 'CHE', 'KOR', 'MEX', 'THA', 'IDN', 'PAK', 'ARE', 'ISR', 'NGA', 'RUS', 'SGP', 'EGY', 'BGD', 'COL', 'GBR', 'TUR', 'NOR', 'IRN', 'ARG', 'VNM', 'IND', 'PHL', 'SAU']
[('GBR', 11), ('NOR', 11), ('PHL', 11), ('SAU', 11), ('KOR', 11), ('IDN', 11), ('CHN', 11), ('CHE', 11), ('IND', 11), ('MEX', 11), ('RUS', 11), ('TUR', 11), ('ZAF', 11), ('ARE', 11), ('JPN', 11), ('AUS', 11), ('CAN', 11), ('ARG', 11), ('MYS', 11), ('ISR', 11), ('SGP', 11), ('THA', 11), ('BRA', 11), ('USA', 11), ('NGA', 10), ('HKG', 10), ('IRN', 10), ('EGY', 8), ('BGD', 8), ('PAK', 7), ('COL', 7), ('VNM', 6)]
