In [8]:
import requests, pandas as pd, numpy as np
from sklearn.cluster import KMeans

In [2]:
url = "https://api.worldbank.org/v2/country/all/indicator/NY.GDP.MKTP.CD?mrnev=1&format=json&per_page=20000"
meta, data = requests.get(url).json()

rows = []
for d in data:
    c = d.get("country", {})
    if c.get("value") and d.get("value") is not None:
        rows.append({
            "Country": c["value"],
            "Year": int(d["date"]),
            "GDP (current US$)": d["value"]
        })

df = pd.DataFrame(rows)
# Optional: drop aggregates often tagged as regions/aggregates in the site table
drop_names = {"World","High income","Euro area","Europe & Central Asia","OECD members"}
df = df[~df["Country"].isin(drop_names)]
#df.to_csv("gdp_latest.csv", index=False)

In [3]:
df

Unnamed: 0,Country,Year,GDP (current US$)
0,Africa Eastern and Southern,2024,1.287677e+12
1,Africa Western and Central,2024,6.700257e+11
2,Arab World,2024,3.704768e+12
3,Caribbean small states,2024,9.157080e+10
4,Central Europe and the Baltics,2024,2.455559e+12
...,...,...,...
257,Virgin Islands (U.S.),2022,4.672000e+09
258,West Bank and Gaza,2024,1.371110e+10
259,"Yemen, Rep.",2018,2.160616e+10
260,Zambia,2024,2.632578e+10


In [12]:
exclude_names = [
    "Africa Eastern and Southern",
    "Africa Western and Central",
    "Arab World",
    "Caribbean small states",
    "Central Europe and the Baltics",
    "Early-demographic dividend",
    "East Asia & Pacific",
    "East Asia & Pacific (excluding high income)",
    "East Asia & Pacific (IDA & IBRD countries)",
    "Europe & Central Asia (excluding high income)",
    "Europe & Central Asia (IDA & IBRD countries)",
    "European Union",
    "Fragile and conflict affected situations",
    "Heavily indebted poor countries (HIPC)",
    "IBRD only",
    "IDA & IBRD total",
    "IDA blend",
    "IDA only",
    "IDA total",
    "Late-demographic dividend",
    "Latin America & Caribbean",
    "Latin America & Caribbean (excluding high income)",
    "Latin America & the Caribbean (IDA & IBRD countries)",
    "Least developed countries: UN classification",
    "Low & middle income",
    "Low income",
    "Lower middle income",
    "Middle East, North Africa, Afghanistan & Pakistan",
    "Middle East, North Africa, Afghanistan & Pakistan (excluding high income)",
    "Middle East, North Africa, Afghanistan & Pakistan (IDA & IBRD)",
    "Middle income",
    "North America",
    "Other small states",
    "Pacific island small states",
    "Post-demographic dividend",
    "Pre-demographic dividend",
    "Small states",
    "South Asia",
    "South Asia (IDA & IBRD)",
    "Sub-Saharan Africa",
    "Sub-Saharan Africa (excluding high income)",
    "Sub-Saharan Africa (IDA & IBRD countries)",
    "Upper middle income"
]


df = df[~df["Country"].isin(exclude_names)]

In [13]:
pd.set_option("display.float_format", "{:,.0f}".format)
print(df.head())

           Country  Year  GDP (current US$)  log_GDP  GDP_cluster
48     Afghanistan  2023     17,152,234,637       10            0
49         Albania  2024     27,177,735,528       10            0
50         Algeria  2024    263,619,794,507       11            1
51  American Samoa  2022        871,000,000        9            3
52         Andorra  2024      4,039,844,390       10            3


In [14]:
df["log_GDP"] = np.log10(df["GDP (current US$)"])

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["log_GDP"] = np.log10(df["GDP (current US$)"])


Unnamed: 0,Country,Year,GDP (current US$),log_GDP,GDP_cluster
48,Afghanistan,2023,17152234637,10,0
49,Albania,2024,27177735528,10,0
50,Algeria,2024,263619794507,11,1
51,American Samoa,2022,871000000,9,3
52,Andorra,2024,4039844390,10,3
...,...,...,...,...,...
257,Virgin Islands (U.S.),2022,4672000000,10,3
258,West Bank and Gaza,2024,13711100000,10,0
259,"Yemen, Rep.",2018,21606160663,10,0
260,Zambia,2024,26325775287,10,0


In [15]:
X = df[["log_GDP"]].dropna()
kmeans = KMeans(n_clusters=4, random_state=42)
df.loc[X.index, "GDP_cluster"] = kmeans.fit_predict(X)

In [16]:
df

Unnamed: 0,Country,Year,GDP (current US$),log_GDP,GDP_cluster
48,Afghanistan,2023,17152234637,10,1
49,Albania,2024,27177735528,10,1
50,Algeria,2024,263619794507,11,2
51,American Samoa,2022,871000000,9,0
52,Andorra,2024,4039844390,10,0
...,...,...,...,...,...
257,Virgin Islands (U.S.),2022,4672000000,10,0
258,West Bank and Gaza,2024,13711100000,10,1
259,"Yemen, Rep.",2018,21606160663,10,1
260,Zambia,2024,26325775287,10,1


In [17]:
df.to_csv(r"C:\Users\maxwell.bicking\Downloads\WBGDP2025.csv")