<a href="https://colab.research.google.com/github/AayushJindal1/NRI-ISCG/blob/main/RAW_file_fetch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install -q --upgrade pandas==2.2.2 requests
import importlib, pandas as pd; importlib.reload(pd)
print("✅ pandas version:", pd.__version__)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m100.9 MB/s[0m eta [36m0:00:00[0m
[?25h✅ pandas version: 2.2.2


GDP

In [13]:
import requests, zipfile, io, pathlib, datetime as dt, pandas as pd, shutil

# ---------------------------------------------------------------------------
# CONFIG  ➜ GDP (current US$)
# ---------------------------------------------------------------------------
# NOTE(review): assumes Google Drive is already mounted at /content/drive —
# otherwise mkdir(parents=True) below silently creates a local folder instead.
CODE       = "NY.GDP.MKTP.CD"
IND_FOLDER = "GDP"                               # folder-friendly slug
WB_URL     = f"https://api.worldbank.org/v2/en/indicator/{CODE}?downloadformat=csv"

today    = dt.date.today().strftime("%Y%m%d")
BASE     = pathlib.Path('/content/drive/MyDrive/WorldBank')
RAW_DIR  = BASE / 'India' / IND_FOLDER / 'raw'
PROC_DIR = BASE / 'India' / IND_FOLDER / 'processed'

RAW_DIR.mkdir(parents=True, exist_ok=True)
PROC_DIR.mkdir(parents=True, exist_ok=True)

# ---------------------------------------------------------------------------
# 1. Download ZIP into memory  →  extract the data CSV to RAW_DIR
#    (the ZIP itself is never written to disk)
# ---------------------------------------------------------------------------
print("▶ Fetching ZIP …")
resp = requests.get(WB_URL, timeout=120)
resp.raise_for_status()

raw_csv_path = RAW_DIR / f"gdp_raw_{today}.csv"
with zipfile.ZipFile(io.BytesIO(resp.content)) as z:
    # The data file is the member named API_*.csv; the archive's other
    # members are skipped.
    csv_name = next(
        (n for n in z.namelist() if n.startswith('API_') and n.endswith('.csv')),
        None,
    )
    if csv_name is None:
        raise RuntimeError(f"No 'API_*.csv' member in downloaded ZIP; members: {z.namelist()}")
    with z.open(csv_name) as src, open(raw_csv_path, 'wb') as tgt:
        shutil.copyfileobj(src, tgt)
print("✓ Extracted CSV →", raw_csv_path.name)

# ---------------------------------------------------------------------------
# 2. Clean / reshape (all countries, all years): wide → long
# ---------------------------------------------------------------------------
df = pd.read_csv(raw_csv_path, skiprows=4)                      # skip WB metadata lines
df = df.drop(columns=['Country Code', 'Indicator Name', 'Indicator Code'])

tidy = (
    df.melt(id_vars='Country Name', var_name='year', value_name='gdp_usd_billions')
      .rename(columns={'Country Name': 'country'})
)
# Column headers that are not years become <NA> and are dropped below.
tidy['year'] = pd.to_numeric(tidy['year'], errors='coerce').astype('Int64')
# NY.GDP.MKTP.CD is published in current US$; scale so the values match the
# "billions" in the column name.
tidy['gdp_usd_billions'] = pd.to_numeric(tidy['gdp_usd_billions'], errors='coerce') / 1e9
tidy = tidy.dropna(subset=['year', 'gdp_usd_billions']).reset_index(drop=True)

if tidy.empty:
    # Without this guard the file name below would be "global_gdp_<NA>_<NA>.csv".
    raise ValueError(f"No usable observations parsed from {raw_csv_path}")

# ---------------------------------------------------------------------------
# 3. Save the new processed file first, then remove any older CSVs, so a
#    failed write never leaves the processed folder empty
# ---------------------------------------------------------------------------
proc_file = PROC_DIR / f"global_gdp_{tidy['year'].min()}_{tidy['year'].max()}.csv"
tmp_file  = proc_file.with_name(proc_file.name + '.tmp')
tidy.to_csv(tmp_file, index=False)
tmp_file.replace(proc_file)                      # swap in only after a complete write

for old in PROC_DIR.glob('*.csv'):
    if old != proc_file:
        old.unlink()
print("🚚 Clean CSV saved →", proc_file.name)

# ---------------------------------------------------------------------------
# 4. Report folder contents (RAW_DIR keeps one dated raw CSV per download day)
# ---------------------------------------------------------------------------
print("\nRAW folder contents:", [p.name for p in RAW_DIR.glob('*')])
print("PROC folder contents:", [p.name for p in PROC_DIR.glob('*')])


▶ Fetching ZIP …
✓ Extracted CSV → gdp_raw_20250513.csv
🚚 Clean CSV saved → global_gdp_1960_2023.csv

RAW folder contents: ['gdp_raw_20250513.csv']
PROC folder contents: ['global_gdp_1960_2023.csv']


GDP PER CAPITA

In [14]:
import requests, zipfile, io, pathlib, datetime as dt, pandas as pd, shutil

# ---------------------------------------------------------------------------
# CONFIG for “GDP per Capita”
# ---------------------------------------------------------------------------
# NOTE(review): assumes Google Drive is already mounted at /content/drive —
# otherwise mkdir(parents=True) below silently creates a local folder instead.
CODE       = "NY.GDP.PCAP.CD"
IND_FOLDER = "GDP_per_capita"               # folder-friendly slug
WB_URL     = f"https://api.worldbank.org/v2/en/indicator/{CODE}?downloadformat=csv"

today   = dt.date.today().strftime("%Y%m%d")
BASE    = pathlib.Path('/content/drive/MyDrive/WorldBank')
RAW_DIR = BASE / 'India' / IND_FOLDER / 'raw'         # keep country in path if you like
PROC_DIR= BASE / 'India' / IND_FOLDER / 'processed'

RAW_DIR.mkdir(parents=True, exist_ok=True)
PROC_DIR.mkdir(parents=True, exist_ok=True)

# ---------------------------------------------------------------------------
# 1. Download ZIP into memory → save the data CSV to RAW_DIR
#    (the ZIP itself is never written to disk)
# ---------------------------------------------------------------------------
print(f"\n▶ Fetching {CODE} ZIP …")
resp = requests.get(WB_URL, timeout=120)
resp.raise_for_status()

raw_csv_path = RAW_DIR / f"{CODE}_raw_{today}.csv"
with zipfile.ZipFile(io.BytesIO(resp.content)) as z:
    # The data file is the member named API_*.csv; the archive's other
    # members are skipped.
    csv_name = next(
        (n for n in z.namelist() if n.startswith('API_') and n.endswith('.csv')),
        None,
    )
    if csv_name is None:
        raise RuntimeError(f"No 'API_*.csv' member in downloaded ZIP; members: {z.namelist()}")
    with z.open(csv_name) as src, open(raw_csv_path, 'wb') as tgt:
        shutil.copyfileobj(src, tgt)
print("✓ Extracted →", raw_csv_path.name)

# ---------------------------------------------------------------------------
# 2. Clean / reshape (all countries, all years): wide → long
# ---------------------------------------------------------------------------
df = pd.read_csv(raw_csv_path, skiprows=4)                      # skip WB metadata lines
df = df.drop(columns=['Country Code', 'Indicator Name', 'Indicator Code'])

tidy = (
    df.melt(id_vars='Country Name', var_name='year', value_name='gdp_per_capita_usd')
      .rename(columns={'Country Name': 'country'})
)

# Column headers that are not years become <NA> and are dropped below.
tidy['year']               = pd.to_numeric(tidy['year'], errors='coerce').astype('Int64')
tidy['gdp_per_capita_usd'] = pd.to_numeric(tidy['gdp_per_capita_usd'], errors='coerce')
tidy = tidy.dropna(subset=['year', 'gdp_per_capita_usd']).reset_index(drop=True)

if tidy.empty:
    # Without this guard the file name below would contain "<NA>_<NA>".
    raise ValueError(f"No usable observations parsed from {raw_csv_path}")

# ---------------------------------------------------------------------------
# 3. Save the new processed file first, then remove any older CSVs, so a
#    failed write never leaves the processed folder empty
# ---------------------------------------------------------------------------
proc_file = PROC_DIR / f"global_gdp_per_capita_{tidy['year'].min()}_{tidy['year'].max()}.csv"
tmp_file  = proc_file.with_name(proc_file.name + '.tmp')
tidy.to_csv(tmp_file, index=False)
tmp_file.replace(proc_file)                      # swap in only after a complete write

for old in PROC_DIR.glob('*.csv'):
    if old != proc_file:
        old.unlink()
print("🚚 Clean per‑capita GDP saved →", proc_file.name)

# ---------------------------------------------------------------------------
# 4. Report folder contents (RAW_DIR keeps one dated raw CSV per download day)
# ---------------------------------------------------------------------------
print("RAW  :", [p.name for p in RAW_DIR.glob('*')])
print("PROC :", [p.name for p in PROC_DIR.glob('*')])



▶ Fetching NY.GDP.PCAP.CD ZIP …
✓ Extracted → NY.GDP.PCAP.CD_raw_20250513.csv
🚚 Clean per‑capita GDP saved → global_gdp_per_capita_1960_2023.csv
RAW  : ['NY.GDP.PCAP.CD_raw_20250513.csv']
PROC : ['global_gdp_per_capita_1960_2023.csv']


GDP GROWTH

In [15]:
import requests, zipfile, io, pathlib, datetime as dt, pandas as pd, shutil

# ---------------------------------------------------------------------------
# CONFIG  ➜ GDP Growth (% annual) --------------------------------------------
# ---------------------------------------------------------------------------
# NOTE(review): assumes Google Drive is already mounted at /content/drive —
# otherwise mkdir(parents=True) below silently creates a local folder instead.
CODE        = "NY.GDP.MKTP.KD.ZG"
IND_FOLDER  = "GDP_growth"                       # folder-friendly slug
WB_URL      = f"https://api.worldbank.org/v2/en/indicator/{CODE}?downloadformat=csv"

today    = dt.date.today().strftime("%Y%m%d")
BASE     = pathlib.Path('/content/drive/MyDrive/WorldBank')
RAW_DIR  = BASE / 'India' / IND_FOLDER / 'raw'
PROC_DIR = BASE / 'India' / IND_FOLDER / 'processed'

RAW_DIR.mkdir(parents=True, exist_ok=True)
PROC_DIR.mkdir(parents=True, exist_ok=True)

# ---------------------------------------------------------------------------
# 1. Download ZIP into memory → save the data CSV to RAW_DIR
#    (the ZIP itself is never written to disk)
# ---------------------------------------------------------------------------
print(f"\n▶ Fetching {CODE} ZIP …")
resp = requests.get(WB_URL, timeout=120)
resp.raise_for_status()

raw_csv_path = RAW_DIR / f"{CODE}_raw_{today}.csv"
with zipfile.ZipFile(io.BytesIO(resp.content)) as z:
    # The data file is the member named API_*.csv; the archive's other
    # members are skipped.
    csv_name = next(
        (n for n in z.namelist() if n.startswith('API_') and n.endswith('.csv')),
        None,
    )
    if csv_name is None:
        raise RuntimeError(f"No 'API_*.csv' member in downloaded ZIP; members: {z.namelist()}")
    with z.open(csv_name) as src, open(raw_csv_path, 'wb') as tgt:
        shutil.copyfileobj(src, tgt)
print("✓ Extracted →", raw_csv_path.name)

# ---------------------------------------------------------------------------
# 2. Clean / reshape for *all* countries & years: wide → long
# ---------------------------------------------------------------------------
df = pd.read_csv(raw_csv_path, skiprows=4)                      # skip WB metadata lines
df = df.drop(columns=['Country Code', 'Indicator Name', 'Indicator Code'])

tidy = (
    df.melt(id_vars='Country Name', var_name='year', value_name='gdp_growth_pct')
      .rename(columns={'Country Name': 'country'})
)

# Column headers that are not years become <NA> and are dropped below.
tidy['year']            = pd.to_numeric(tidy['year'], errors='coerce').astype('Int64')
tidy['gdp_growth_pct']  = pd.to_numeric(tidy['gdp_growth_pct'], errors='coerce')
tidy = tidy.dropna(subset=['year', 'gdp_growth_pct']).reset_index(drop=True)

if tidy.empty:
    # Without this guard the file name below would contain "<NA>_<NA>".
    raise ValueError(f"No usable observations parsed from {raw_csv_path}")

# ---------------------------------------------------------------------------
# 3. Save the new processed file first, then remove any older CSVs, so a
#    failed write never leaves the processed folder empty
# ---------------------------------------------------------------------------
proc_file = PROC_DIR / f"global_gdp_growth_{tidy['year'].min()}_{tidy['year'].max()}.csv"
tmp_file  = proc_file.with_name(proc_file.name + '.tmp')
tidy.to_csv(tmp_file, index=False)
tmp_file.replace(proc_file)                      # swap in only after a complete write

for old in PROC_DIR.glob('*.csv'):
    if old != proc_file:
        old.unlink()
print("🚚 Clean GDP‑growth CSV saved →", proc_file.name)

# ---------------------------------------------------------------------------
# 4. Report folder contents (RAW_DIR keeps one dated raw CSV per download day)
# ---------------------------------------------------------------------------
print("RAW  :", [p.name for p in RAW_DIR.glob('*')])
print("PROC :", [p.name for p in PROC_DIR.glob('*')])



▶ Fetching NY.GDP.MKTP.KD.ZG ZIP …
✓ Extracted → NY.GDP.MKTP.KD.ZG_raw_20250513.csv
🚚 Clean GDP‑growth CSV saved → global_gdp_growth_1961_2023.csv
RAW  : ['NY.GDP.MKTP.KD.ZG_raw_20250513.csv']
PROC : ['global_gdp_growth_1961_2023.csv']


INFLATION

In [16]:
import requests, zipfile, io, pathlib, datetime as dt, pandas as pd, shutil

# ---------------------------------------------------------------------------
# CONFIG  ➜ Inflation (% annual CPI) -----------------------------------------
# ---------------------------------------------------------------------------
# NOTE(review): assumes Google Drive is already mounted at /content/drive —
# otherwise mkdir(parents=True) below silently creates a local folder instead.
CODE        = "FP.CPI.TOTL.ZG"
IND_FOLDER  = "Inflation_CPI"                       # folder-friendly slug
WB_URL      = f"https://api.worldbank.org/v2/en/indicator/{CODE}?downloadformat=csv"

today    = dt.date.today().strftime("%Y%m%d")
BASE     = pathlib.Path('/content/drive/MyDrive/WorldBank')
RAW_DIR  = BASE / 'India' / IND_FOLDER / 'raw'
PROC_DIR = BASE / 'India' / IND_FOLDER / 'processed'

RAW_DIR.mkdir(parents=True, exist_ok=True)
PROC_DIR.mkdir(parents=True, exist_ok=True)

# ---------------------------------------------------------------------------
# 1. Download ZIP into memory → save the data CSV to RAW_DIR
#    (the ZIP itself is never written to disk)
# ---------------------------------------------------------------------------
print(f"\n▶ Fetching {CODE} ZIP …")
resp = requests.get(WB_URL, timeout=120)
resp.raise_for_status()

raw_csv_path = RAW_DIR / f"{CODE}_raw_{today}.csv"
with zipfile.ZipFile(io.BytesIO(resp.content)) as z:
    # The data file is the member named API_*.csv; the archive's other
    # members are skipped.
    csv_name = next(
        (n for n in z.namelist() if n.startswith('API_') and n.endswith('.csv')),
        None,
    )
    if csv_name is None:
        raise RuntimeError(f"No 'API_*.csv' member in downloaded ZIP; members: {z.namelist()}")
    with z.open(csv_name) as src, open(raw_csv_path, 'wb') as tgt:
        shutil.copyfileobj(src, tgt)
print("✓ Extracted →", raw_csv_path.name)

# ---------------------------------------------------------------------------
# 2. Clean / reshape (all countries, all years): wide → long
# ---------------------------------------------------------------------------
df = pd.read_csv(raw_csv_path, skiprows=4)                      # skip WB metadata lines
df = df.drop(columns=['Country Code', 'Indicator Name', 'Indicator Code'])

tidy = (
    df.melt(id_vars='Country Name', var_name='year', value_name='inflation_pct')
      .rename(columns={'Country Name': 'country'})
)

# Column headers that are not years become <NA> and are dropped below.
tidy['year']          = pd.to_numeric(tidy['year'], errors='coerce').astype('Int64')
tidy['inflation_pct'] = pd.to_numeric(tidy['inflation_pct'], errors='coerce')
tidy = tidy.dropna(subset=['year', 'inflation_pct']).reset_index(drop=True)

if tidy.empty:
    # Without this guard the file name below would contain "<NA>_<NA>".
    raise ValueError(f"No usable observations parsed from {raw_csv_path}")

# ---------------------------------------------------------------------------
# 3. Save the new processed file first, then remove any older CSVs, so a
#    failed write never leaves the processed folder empty
# ---------------------------------------------------------------------------
proc_file = PROC_DIR / f"global_inflation_{tidy['year'].min()}_{tidy['year'].max()}.csv"
tmp_file  = proc_file.with_name(proc_file.name + '.tmp')
tidy.to_csv(tmp_file, index=False)
tmp_file.replace(proc_file)                      # swap in only after a complete write

for old in PROC_DIR.glob('*.csv'):
    if old != proc_file:
        old.unlink()
print("🚚 Clean inflation CSV saved →", proc_file.name)

# ---------------------------------------------------------------------------
# 4. Report folder contents (RAW_DIR keeps one dated raw CSV per download day)
# ---------------------------------------------------------------------------
print("RAW  :", [p.name for p in RAW_DIR.glob('*')])
print("PROC :", [p.name for p in PROC_DIR.glob('*')])



▶ Fetching FP.CPI.TOTL.ZG ZIP …
✓ Extracted → FP.CPI.TOTL.ZG_raw_20250513.csv
🚚 Clean inflation CSV saved → global_inflation_1960_2024.csv
RAW  : ['FP.CPI.TOTL.ZG_raw_20250513.csv']
PROC : ['global_inflation_1960_2024.csv']
