In [1]:
# 1. Install & mount
!pip -q install --upgrade pandas==2.2.2 requests
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

# 2. Imports & helper to fetch "csvfilewithlabels" (handles ZIP)
import requests, pathlib, io, zipfile, shutil, pandas as pd, re

def fetch_csv(url: str, dest: pathlib.Path):
    """Download ``url`` and store its CSV payload at ``dest``.

    The response body is checked for the ZIP signature ``PK\\x03\\x04``.
    For a ZIP payload the first member whose name ends in ``.csv`` (matched
    case-insensitively) is extracted to ``dest``; any other payload is written
    to ``dest`` unchanged.

    Args:
        url: Address fetched with ``requests.get`` (180 s timeout).
        dest: Target file path; missing parent directories are created.

    Raises:
        requests.HTTPError: Propagated from ``raise_for_status()`` when the
            server answers with an error status.
        ValueError: If the payload is a ZIP archive without a ``.csv`` member.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    response = requests.get(url, timeout=180)
    response.raise_for_status()
    payload = response.content

    # Anything that is not a ZIP archive is stored verbatim.
    if not payload.startswith(b"PK\x03\x04"):
        dest.write_bytes(payload)
        return

    with zipfile.ZipFile(io.BytesIO(payload)) as archive:
        # A default of None avoids leaking a bare StopIteration to the caller.
        member = next(
            (n for n in archive.namelist() if n.lower().endswith(".csv")),
            None,
        )
        if member is None:
            raise ValueError(f"ZIP archive from {url} contains no .csv member")
        with archive.open(member) as src, open(dest, "wb") as tgt:
            shutil.copyfileobj(src, tgt)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


ModuleNotFoundError: No module named 'google'

In [2]:
# Root folder under which each indicator gets its own raw/ and processed/ subfolders.
BASE = pathlib.Path("/content/drive/MyDrive/OECD/global/economy_external_sector")

# All queries share the same REST endpoint and the same response options.
_SDMX_DATA = "https://sdmx.oecd.org/public/rest/data/"
_CSV_OPTS = "&dimensionAtObservation=AllDimensions&format=csvfilewithlabels"

# One tuple per indicator:
#   slug            – folder / file-name stem for the indicator
#   URL             – download address
#   out_name        – column name given to OBS_VALUE
#   to_mn           – divide the values by 1e6 when True
#   has_currency    – take the "Currency" column from the raw file when True
#   has_measure_col – take the "Measure" column from the raw file when True
CONFIG = [
    (
        "Balance_of_Payments",
        _SDMX_DATA
        + "OECD.SDD.TPS,DSD_BOP@DF_BOP,1.0/.....Q.XDC.Y"
        + "?startPeriod=2024-Q1"
        + _CSV_OPTS,
        "bop_usd", True, True, True,
    ),
    (
        "GFCF",
        _SDMX_DATA
        + "OECD.SDD.NAD,DSD_NAMAIN10@DF_TABLE1_EXPENDITURE_GFCF_ASSET,2.0/"
        + "A....P51G....XDC.V.."
        + "?startPeriod=2020"
        + _CSV_OPTS,
        "gfcf_usd", True, True, False,
    ),
    (
        "PPP",
        _SDMX_DATA
        + "OECD.SDD.NAD,DSD_NAMAIN10@DF_TABLE4,2.0/A....PPP_B1GQ......."
        + "?startPeriod=2015"
        + _CSV_OPTS,
        "ppp_gdp", False, True, False,
    ),
    (
        "CPI",
        _SDMX_DATA
        + "OECD.SDD.TPS,DSD_PRICES@DF_PRICES_ALL,1.0/.Q.N.CPI.._T.N.GY+_Z"
        + "?startPeriod=2022-Q1"
        + _CSV_OPTS,
        "cpi_pct", False, False, False,
    ),
]

# Filled by the processing loop: out_name → path of the processed CSV.
processed = dict()

def time_col(df):
    """Pick the name of the time column to use for ``df``.

    Returns ``"TIME_PERIOD"`` when ``df.columns`` contains it; otherwise falls
    back to ``"Time period"`` without checking that this column exists.
    """
    if "TIME_PERIOD" not in df.columns:
        return "Time period"
    return "TIME_PERIOD"

# Download, trim and store every indicator listed in CONFIG.
# The raw CSV goes to BASE/<slug>/raw, the reduced table to BASE/<slug>/processed,
# and `processed` records out_name → processed file path.
for slug, url, out_name, to_mn, has_cur, has_meas in CONFIG:
    RAW = BASE/slug/"raw"
    PROC = BASE/slug/"processed"
    for d in (RAW, PROC):
        d.mkdir(parents=True, exist_ok=True)

    # fetch raw
    raw_f = RAW/f"OECD_{slug}_raw.csv"
    print(f"▶ {slug}: downloading…")
    fetch_csv(url, raw_f)

    # Remove old raw files only after the download succeeded, so a failed
    # request does not wipe the previously downloaded data.
    for f in RAW.glob("*.csv"):
        if f != raw_f:
            f.unlink()

    df = pd.read_csv(raw_f)
    tc = time_col(df)

    # select & rename: raw column label → harmonised column name
    cols = {"Reference area": "country", tc: "time_period", "OBS_VALUE": out_name}
    if has_cur:
        cols["Currency"] = "currency"
    if has_meas:
        cols["Measure"] = "measure"

    df = df[list(cols)].rename(columns=cols)

    # fill defaults: "X" is the placeholder for a key column the dataset lacks
    if not has_cur:
        df["currency"] = "X"
    if not has_meas:
        df["measure"] = "X"

    # units → millions
    # (assumes OBS_VALUE is reported in single units — TODO confirm per dataset)
    if to_mn:
        df[out_name] = df[out_name] / 1e6

    # save processed, then drop any other CSV left over from earlier runs
    proc_f = PROC/f"OECD_{slug}_clean.csv"
    df.to_csv(proc_f, index=False)
    for f in PROC.glob("*.csv"):
        if f != proc_f:
            f.unlink()

    processed[out_name] = proc_f
    print(f"   ✓ {slug} saved → columns: {list(df.columns)}  rows: {len(df)}")


NameError: name 'pathlib' is not defined

In [3]:
import functools

# Columns present in every processed file; they double as the join keys.
key_cols = ["country", "currency", "time_period", "measure"]


def _load_with_str_keys(csv_path):
    """Read one processed CSV and cast its key columns to ``str``."""
    frame = pd.read_csv(csv_path)
    for col in key_cols:
        frame[col] = frame[col].astype(str)
    return frame


def _outer_join(left, right):
    """Outer-merge two indicator frames on the shared key columns."""
    return pd.merge(left, right, on=key_cols, how="outer")


# load & harmonize: one frame per processed indicator file
frames = [_load_with_str_keys(csv_path) for csv_path in processed.values()]

# merge on 4 keys
combined = functools.reduce(_outer_join, frames)

# reorder / rename as spec: keys first, then the indicators under their final labels
column_labels = {
    "bop_usd": "BoP_Mn",
    "gfcf_usd": "GFCF_Mn",
    "ppp_gdp": "PPP",
    "cpi_pct": "CPI_%PA",
}
combined = combined[key_cols + list(column_labels)].rename(columns=column_labels)

# save combined (any CSV already in the output folder is removed first)
OUT = BASE / "combined_indicators"
OUT.mkdir(exist_ok=True)
for stale in OUT.glob("*.csv"):
    stale.unlink()
combined_path = OUT / "OECD_external_sector_combined.csv"
combined.to_csv(combined_path, index=False)

print("✅ combined saved →", combined_path,
      "| rows:", len(combined))
combined.head()


NameError: name 'processed' is not defined