In [1]:
!pip install -q --upgrade pandas==2.2.2 requests
import importlib, pandas as pd; importlib.reload(pd)
print("✅ pandas version:", pd.__version__)


✅ pandas version: 2.2.2


World Bank – Economic Indicators

In [2]:
# =======================================================================
# 0. Install deps & mount Google Drive
# =======================================================================
!pip -q install --upgrade pandas==2.2.2 requests   # pin to Colab‑compatible pandas

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# =======================================================================
# 1. Config – indicator list & folder root
# =======================================================================
import pathlib, datetime as dt, requests, zipfile, io, shutil, pandas as pd

# Date stamp (YYYYMMDD) embedded in the name of every raw download.
TODAY = f"{dt.date.today():%Y%m%d}"

# Drive folder that receives one sub-folder per indicator.
ROOT = pathlib.Path('/content/drive/MyDrive/WorldBank/India/Economic_Indicators')

# One entry per indicator: (World Bank code, folder‑slug, tidy value‑column name).
# NOTE(review): "gdp_usd_billions" suggests values in billions, but nothing in
# this notebook rescales the downloaded numbers — confirm the intended unit.
INDICATORS = [
    ("NY.GDP.MKTP.CD", "GDP_total", "gdp_usd_billions"),
    ("NY.GDP.PCAP.CD", "GDP_per_capita", "gdp_per_capita_usd"),
    ("NY.GDP.MKTP.KD.ZG", "GDP_growth", "gdp_growth_pct"),
    ("FP.CPI.TOTL.ZG", "Inflation_CPI", "inflation_pct"),
]

# =======================================================================
# 2. Helper: download -> raw CSV -> tidy -> processed CSV
# =======================================================================
def refresh_indicator(code, slug, value_col):
    """Download one World Bank indicator and store it as a raw and a tidy CSV.

    The bulk-download ZIP for ``code`` is fetched, its ``API_*.csv`` member is
    saved under ``ROOT/<slug>/raw``, and a long-format copy (one row per
    country/year that has a numeric value) is written to
    ``ROOT/<slug>/processed``.  Each folder ends up holding exactly one CSV.
    Older CSVs are removed only after their replacement has been written, so
    a failed run leaves the previous files in place.

    Parameters
    ----------
    code : str
        World Bank indicator code; used in the download URL and raw filename.
    slug : str
        Folder name under ``ROOT`` and prefix of the processed filename.
    value_col : str
        Name given to the value column of the tidy output.

    Returns
    -------
    tuple of pathlib.Path
        ``(raw_dir, proc_dir)``.

    Raises
    ------
    requests.HTTPError
        If the download responds with an error status.
    zipfile.BadZipFile
        If the response body is not a valid ZIP archive.
    ValueError
        If the archive has no ``API_*.csv`` member, or if no numeric
        observations remain after tidying.
    """
    raw_dir  = ROOT / slug / 'raw'
    proc_dir = ROOT / slug / 'processed'
    raw_dir.mkdir(parents=True, exist_ok=True)
    proc_dir.mkdir(parents=True, exist_ok=True)

    wb_url = f"https://api.worldbank.org/v2/en/indicator/{code}?downloadformat=csv"
    print(f"\n▶ {code}  ➜  {slug}")

    # --- (a) Download ZIP & write ONE raw CSV ---------------------------------
    resp = requests.get(wb_url, timeout=120)
    resp.raise_for_status()

    raw_csv = raw_dir / f"{code}_raw_{TODAY}.csv"
    # Stream into a temporary sibling first so an interrupted copy can never
    # leave a truncated file under the final name.
    tmp_csv = raw_csv.with_name(raw_csv.name + '.part')

    with zipfile.ZipFile(io.BytesIO(resp.content)) as z:
        csv_name = next(
            (n for n in z.namelist() if n.startswith('API_') and n.endswith('.csv')),
            None,
        )
        if csv_name is None:
            raise ValueError(
                f"No API_*.csv member in the archive for {code}: {z.namelist()}"
            )
        try:
            with z.open(csv_name) as src, open(tmp_csv, 'wb') as tgt:
                shutil.copyfileobj(src, tgt)
            tmp_csv.replace(raw_csv)
        finally:
            # No-op after a successful replace; cleans up after a failed copy.
            tmp_csv.unlink(missing_ok=True)

    # Only now that the new file is safely on disk, drop older raw files.
    for stale in raw_dir.glob('*.csv'):
        if stale != raw_csv:
            stale.unlink()
    print("   ✓ raw CSV:", raw_csv.name)

    # --- (b) Tidy all countries & years ---------------------------------------
    # The first four lines of the file are skipped before the header is read.
    df = pd.read_csv(raw_csv, skiprows=4)
    df = df.drop(columns=['Country Code', 'Indicator Name', 'Indicator Code'])

    tidy = (
        df.melt(id_vars='Country Name', var_name='year', value_name=value_col)
          .rename(columns={'Country Name': 'country'})
    )
    # Column labels that are not numbers become <NA> years and are dropped below.
    tidy['year'] = pd.to_numeric(tidy['year'], errors='coerce').astype('Int64')
    tidy[value_col] = pd.to_numeric(tidy[value_col], errors='coerce')
    tidy = tidy.dropna(subset=['year', value_col]).reset_index(drop=True)

    if tidy.empty:
        # Fail before touching the processed folder so the previous file survives.
        raise ValueError(f"No numeric observations found for {code} in {raw_csv.name}")

    # --- (c) write ONE processed file -----------------------------------------
    proc_csv = proc_dir / f"{slug}_{tidy['year'].min()}_{tidy['year'].max()}.csv"
    tidy.to_csv(proc_csv, index=False)

    for stale in proc_dir.glob('*.csv'):
        if stale != proc_csv:
            stale.unlink()
    print("   ✓ processed CSV:", proc_csv.name)
    return raw_dir, proc_dir

# =======================================================================
# 3. Run refresh for every indicator
# =======================================================================
# Refresh each configured indicator in turn.
for code, slug, valcol in INDICATORS:
    refresh_indicator(code=code, slug=slug, value_col=valcol)

# =======================================================================
# 4. Folder summary
# =======================================================================
# For every indicator folder under ROOT, list what sits in raw/ and processed/.
for path in ROOT.iterdir():
    if not path.is_dir():
        continue
    raws, pros = (
        [entry.name for entry in (path / sub).glob('*')]
        for sub in ('raw', 'processed')
    )
    print(f"\n📂 {path.name}\n   raw : {raws}\n   proc: {pros}")


Mounted at /content/drive

▶ NY.GDP.MKTP.CD  ➜  GDP_total


   ✓ raw CSV: NY.GDP.MKTP.CD_raw_20250516.csv
   ✓ processed CSV: GDP_total_1960_2023.csv

▶ NY.GDP.PCAP.CD  ➜  GDP_per_capita


   ✓ raw CSV: NY.GDP.PCAP.CD_raw_20250516.csv
   ✓ processed CSV: GDP_per_capita_1960_2023.csv

▶ NY.GDP.MKTP.KD.ZG  ➜  GDP_growth


   ✓ raw CSV: NY.GDP.MKTP.KD.ZG_raw_20250516.csv
   ✓ processed CSV: GDP_growth_1961_2023.csv

▶ FP.CPI.TOTL.ZG  ➜  Inflation_CPI


   ✓ raw CSV: FP.CPI.TOTL.ZG_raw_20250516.csv
   ✓ processed CSV: Inflation_CPI_1960_2024.csv

📂 GDP_per_capita
   raw : ['NY.GDP.PCAP.CD_raw_20250516.csv']
   proc: ['GDP_per_capita_1960_2023.csv']

📂 GDP_growth
   raw : ['NY.GDP.MKTP.KD.ZG_raw_20250516.csv']
   proc: ['GDP_growth_1961_2023.csv']

📂 Inflation_CPI
   raw : ['FP.CPI.TOTL.ZG_raw_20250516.csv']
   proc: ['Inflation_CPI_1960_2024.csv']

📂 GDP_total
   raw : ['NY.GDP.MKTP.CD_raw_20250516.csv']
   proc: ['GDP_total_1960_2023.csv']


Macroeconomics, Trade & Investment

In [3]:
import pathlib, datetime as dt, requests, zipfile, io, shutil, pandas as pd

# ---------------------------------------------------------------------------
# 1. Config – macro‑trade‑investment indicators
# ---------------------------------------------------------------------------
# Date stamp (YYYYMMDD) embedded in the name of every raw download.
TODAY = f"{dt.date.today():%Y%m%d}"

# Drive folder that receives one sub-folder per indicator.
ROOT = pathlib.Path('/content/drive/MyDrive/WorldBank/India/MacroEconomics_Trade_Investment')

# One entry per indicator: (World Bank code, folder slug, tidy value column).
# NOTE(review): "gni_usd_billions" suggests values in billions, but nothing in
# this notebook rescales the downloaded numbers — confirm the intended unit.
INDICATORS = [
    ("NY.GNP.MKTP.CD", "GNI_total", "gni_usd_billions"),
    ("NY.GNP.PCAP.KD.ZG", "GNI_per_capita_growth", "gni_per_capita_growth_pct"),
    ("BN.GSR.GNFS.CD", "Net_trade_goods_services", "net_trade_usd"),
    ("BN.KLT.DINV.CD", "FDI_net", "fdi_net_usd"),
]

# ---------------------------------------------------------------------------
# 2. Helper
# ---------------------------------------------------------------------------
def refresh(code, slug, value_col):
    """Download one World Bank indicator and store it as a raw and a tidy CSV.

    The bulk-download ZIP for ``code`` is fetched, its ``API_*.csv`` member is
    saved under ``ROOT/<slug>/raw``, and a long-format copy (one row per
    country/year that has a numeric value) is written to
    ``ROOT/<slug>/processed``.  Each folder ends up holding exactly one CSV.
    Older CSVs are removed only after their replacement has been written, so
    a failed run leaves the previous files in place.

    Parameters
    ----------
    code : str
        World Bank indicator code; used in the download URL and raw filename.
    slug : str
        Folder name under ``ROOT`` and prefix of the processed filename.
    value_col : str
        Name given to the value column of the tidy output.

    Returns
    -------
    tuple of pathlib.Path
        ``(raw_dir, proc_dir)``.

    Raises
    ------
    requests.HTTPError
        If the download responds with an error status.
    zipfile.BadZipFile
        If the response body is not a valid ZIP archive.
    ValueError
        If the archive has no ``API_*.csv`` member, or if no numeric
        observations remain after tidying.
    """
    raw_dir  = ROOT / slug / 'raw'
    proc_dir = ROOT / slug / 'processed'
    raw_dir.mkdir(parents=True, exist_ok=True)
    proc_dir.mkdir(parents=True, exist_ok=True)

    wb_url = f"https://api.worldbank.org/v2/en/indicator/{code}?downloadformat=csv"
    print(f"\n▶ {code}  ➜  {slug}")

    # --- download & save ONE raw CSV ----------------------------------------
    r = requests.get(wb_url, timeout=120)
    r.raise_for_status()

    raw_csv = raw_dir / f"{code}_raw_{TODAY}.csv"
    # Stream into a temporary sibling first so an interrupted copy can never
    # leave a truncated file under the final name.
    tmp_csv = raw_csv.with_name(raw_csv.name + '.part')

    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        csv_name = next(
            (n for n in z.namelist() if n.startswith('API_') and n.endswith('.csv')),
            None,
        )
        if csv_name is None:
            raise ValueError(
                f"No API_*.csv member in the archive for {code}: {z.namelist()}"
            )
        try:
            with z.open(csv_name) as src, open(tmp_csv, 'wb') as tgt:
                shutil.copyfileobj(src, tgt)
            tmp_csv.replace(raw_csv)
        finally:
            # No-op after a successful replace; cleans up after a failed copy.
            tmp_csv.unlink(missing_ok=True)

    # Only now that the new file is safely on disk, drop older raw files.
    for stale in raw_dir.glob('*.csv'):
        if stale != raw_csv:
            stale.unlink()
    print("   ✓ raw CSV:", raw_csv.name)

    # --- clean & reshape ----------------------------------------------------
    # The first four lines of the file are skipped before the header is read.
    df = pd.read_csv(raw_csv, skiprows=4)
    df = df.drop(columns=['Country Code', 'Indicator Name', 'Indicator Code'])

    tidy = (
        df.melt(id_vars='Country Name', var_name='year', value_name=value_col)
          .rename(columns={'Country Name': 'country'})
    )
    # Column labels that are not numbers become <NA> years and are dropped below.
    tidy['year']    = pd.to_numeric(tidy['year'], errors='coerce').astype('Int64')
    tidy[value_col] = pd.to_numeric(tidy[value_col], errors='coerce')
    tidy            = tidy.dropna(subset=['year', value_col]).reset_index(drop=True)

    if tidy.empty:
        # Fail before touching the processed folder so the previous file survives.
        raise ValueError(f"No numeric observations found for {code} in {raw_csv.name}")

    # --- write ONE processed CSV -------------------------------------------
    proc_csv = proc_dir / f"{slug}_{tidy['year'].min()}_{tidy['year'].max()}.csv"
    tidy.to_csv(proc_csv, index=False)

    for stale in proc_dir.glob('*.csv'):
        if stale != proc_csv:
            stale.unlink()
    print("   ✓ processed CSV:", proc_csv.name)
    return raw_dir, proc_dir

# ---------------------------------------------------------------------------
# 3. Refresh all four indicators
# ---------------------------------------------------------------------------
# Refresh each configured indicator in turn.
for code, slug, valcol in INDICATORS:
    refresh(code=code, slug=slug, value_col=valcol)

# ---------------------------------------------------------------------------
# 4. Quick folder summary
# ---------------------------------------------------------------------------
# For every indicator folder under ROOT, list what sits in raw/ and processed/.
for p in ROOT.iterdir():
    if not p.is_dir():
        continue
    raws  = list(map(lambda f: f.name, (p / 'raw').glob('*')))
    procs = list(map(lambda f: f.name, (p / 'processed').glob('*')))
    print(f"\n📂 {p.name}\n   raw : {raws}\n   proc: {procs}")



▶ NY.GNP.MKTP.CD  ➜  GNI_total


   ✓ raw CSV: NY.GNP.MKTP.CD_raw_20250516.csv
   ✓ processed CSV: GNI_total_1960_2023.csv

▶ NY.GNP.PCAP.KD.ZG  ➜  GNI_per_capita_growth


   ✓ raw CSV: NY.GNP.PCAP.KD.ZG_raw_20250516.csv
   ✓ processed CSV: GNI_per_capita_growth_1961_2023.csv

▶ BN.GSR.GNFS.CD  ➜  Net_trade_goods_services


   ✓ raw CSV: BN.GSR.GNFS.CD_raw_20250516.csv
   ✓ processed CSV: Net_trade_goods_services_1960_2024.csv

▶ BN.KLT.DINV.CD  ➜  FDI_net


   ✓ raw CSV: BN.KLT.DINV.CD_raw_20250516.csv
   ✓ processed CSV: FDI_net_1960_2024.csv

📂 GNI_total
   raw : ['NY.GNP.MKTP.CD_raw_20250516.csv']
   proc: ['GNI_total_1960_2023.csv']

📂 GNI_per_capita_growth
   raw : ['NY.GNP.PCAP.KD.ZG_raw_20250516.csv']
   proc: ['GNI_per_capita_growth_1961_2023.csv']

📂 Net_trade_goods_services
   raw : ['BN.GSR.GNFS.CD_raw_20250516.csv']
   proc: ['Net_trade_goods_services_1960_2024.csv']

📂 FDI_net
   raw : ['BN.KLT.DINV.CD_raw_20250516.csv']
   proc: ['FDI_net_1960_2024.csv']


Merging datasets

In [4]:
# =======================================================================
# Combine all processed CSVs → one wide dataset
# =======================================================================
import pandas as pd, pathlib, datetime as dt, functools

# -----------------------------------------------------------------------
# 1. Map each indicator folder to its final column name
# -----------------------------------------------------------------------
# Parent folder that holds one sub-folder per indicator family.
BASE_WB = pathlib.Path('/content/drive/MyDrive/WorldBank/India')

# indicator slug -> (column name in the combined dataset, family folder)
INDICATOR_MAP = {
    # Economic_Indicators
    'GDP_total': ('gdp_usd_billions', 'Economic_Indicators'),
    'GDP_per_capita': ('gdp_per_capita_usd', 'Economic_Indicators'),
    'GDP_growth': ('gdp_growth_pct', 'Economic_Indicators'),
    'Inflation_CPI': ('inflation_pct', 'Economic_Indicators'),

    # MacroEconomics_Trade_Investment
    'GNI_total': ('gni_usd_billions', 'MacroEconomics_Trade_Investment'),
    'GNI_per_capita_growth': ('gni_per_capita_growth_pct', 'MacroEconomics_Trade_Investment'),
    'Net_trade_goods_services': ('net_trade_usd', 'MacroEconomics_Trade_Investment'),
    'FDI_net': ('fdi_net_usd', 'MacroEconomics_Trade_Investment'),
}

# One (country, year, <value>) frame per indicator that has a processed CSV.
frames = []

for slug, (colname, family) in INDICATOR_MAP.items():
    proc_dir  = BASE_WB / family / slug / 'processed'
    csv_files = [*proc_dir.glob('*.csv')]
    if len(csv_files) == 0:
        print(f"⚠️  No processed CSV found for {slug}. Skipping.")
        continue

    # The first CSV found is used (the folder is expected to hold one file).
    csv_path = csv_files[0]
    df = pd.read_csv(csv_path)

    # Keep the two key columns plus the first remaining column, and give that
    # value column its final name.
    value_source = df.columns.difference(['country', 'year'])[0]
    df = df.loc[:, ['country', 'year', value_source]].rename(columns={value_source: colname})
    frames.append(df)

    print(f"✓ loaded {slug}: rows={len(df)}  →  column '{colname}'")

# -----------------------------------------------------------------------
# 2. Merge all on (country, year)
# -----------------------------------------------------------------------
def merge_two(left, right):
    """Outer-join two frames on their shared ``country`` and ``year`` columns."""
    return left.merge(right, how='outer', on=['country', 'year'])

# reduce() over an empty list raises an opaque TypeError, so fail with a
# message that says what is actually missing.
if not frames:
    raise RuntimeError("No processed indicator CSVs were loaded; nothing to merge.")

combined = functools.reduce(merge_two, frames)

# ensure consistent order: INDICATOR_MAP order, restricted to the columns that
# were actually loaded (an indicator skipped by the loader must not cause a
# KeyError here)
final_cols = ['country', 'year'] + [
    col for col, _family in INDICATOR_MAP.values() if col in combined.columns
]
combined = combined[final_cols]

print("\nMerged rows:", len(combined))

# -----------------------------------------------------------------------
# 3. Save combined data  ➜  always the same filename
# -----------------------------------------------------------------------
COMB_DIR = BASE_WB / 'Combined_Datasets'
COMB_DIR.mkdir(parents=True, exist_ok=True)

out_path = COMB_DIR / "India_combined_indicators.csv"   # ← static name
combined.to_csv(out_path, index=False)

# remove any other combined file(s) only after the new one has been written
for f in COMB_DIR.glob('*.csv'):
    if f != out_path:
        f.unlink()

print("🚚 Combined dataset saved →", out_path)


✓ loaded GDP_total: rows=14307  →  column 'gdp_usd_billions'
✓ loaded GDP_per_capita: rows=14312  →  column 'gdp_per_capita_usd'
✓ loaded GDP_growth: rows=13883  →  column 'gdp_growth_pct'


✓ loaded Inflation_CPI: rows=11182  →  column 'inflation_pct'
✓ loaded GNI_total: rows=13416  →  column 'gni_usd_billions'
✓ loaded GNI_per_capita_growth: rows=6719  →  column 'gni_per_capita_growth_pct'
✓ loaded Net_trade_goods_services: rows=7574  →  column 'net_trade_usd'
✓ loaded FDI_net: rows=7078  →  column 'fdi_net_usd'



Merged rows: 14778


🚚 Combined dataset saved → /content/drive/MyDrive/WorldBank/India/Combined_Datasets/India_combined_indicators.csv
