In [1]:
# ─── CELL 1: SETUP + CLEAN + DOWNLOAD RAW XLSX ────────────────────────

# 1) Mount Google Drive. force_remount=True lets this cell be re-run without
#    stopping on an "already mounted" error.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# 2) Define & create folder structure
import os, glob
BASE_DIR = '/content/drive/MyDrive/RBI/MI'
RAW_DIR  = os.path.join(BASE_DIR, 'RAW')    # browser download target
PROC_DIR = os.path.join(BASE_DIR, 'Processed')  # CSV output folder

for d in (RAW_DIR, PROC_DIR):
    os.makedirs(d, exist_ok=True)

# 3) Wipe out any existing files: everything in RAW/, only *.csv in Processed/.
#    Only regular files are deleted — os.remove() raises on a directory, so a
#    stray sub-folder inside RAW/ would otherwise abort the whole cell.
stale_patterns = (
    os.path.join(RAW_DIR, '*'),
    os.path.join(PROC_DIR, '*.csv'),
)
for pattern in stale_patterns:
    for f in glob.glob(pattern):
        if os.path.isfile(f):
            os.remove(f)

# 4) Install Chromium & Selenium + auto‐driver installer
!apt-get update -y > /dev/null
!apt-get install -y chromium-browser > /dev/null
!pip install selenium chromedriver-autoinstaller > /dev/null

# 5) Launch headless Chrome pointing downloads at RAW_DIR
import chromedriver_autoinstaller, time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

driver_path = chromedriver_autoinstaller.install()
opts = Options()
opts.binary_location = '/usr/bin/chromium-browser'
opts.add_argument('--headless')
opts.add_argument('--no-sandbox')
opts.add_argument('--disable-dev-shm-usage')
opts.add_experimental_option('prefs', {
    'download.default_directory': RAW_DIR,
    'download.prompt_for_download': False,
    'directory_upgrade': True
})

service = Service(driver_path)
driver = webdriver.Chrome(service=service, options=opts)
driver.implicitly_wait(10)

# 6) Navigate & click “50 Macroeconomic Indicators” to drop the .xlsx
DOWNLOAD_TIMEOUT_S = 60   # upper bound on how long to wait for the file
downloaded = []           # finished .xlsx paths found in RAW_DIR

# try/finally guarantees the browser is shut down even if the page layout
# changed and an element lookup raises — otherwise the Chromium process leaks.
try:
    driver.get('https://data.rbi.org.in/DBIE/#/dbie/home')
    time.sleep(5)  # give the single-page app a moment to render
    span = driver.find_element(By.XPATH, "//span[text()='50 Macroeconomic Indicators']")
    link = span.find_element(By.XPATH, "./parent::a")
    driver.execute_script("arguments[0].scrollIntoView(true);", link)
    # Click through JavaScript rather than a native WebDriver click.
    driver.execute_script("arguments[0].click();", link)

    # 7) Wait for download: poll until an .xlsx exists and Chrome has no
    #    in-progress (*.crdownload) file left, instead of sleeping blindly.
    deadline = time.monotonic() + DOWNLOAD_TIMEOUT_S
    while time.monotonic() < deadline:
        in_progress = glob.glob(os.path.join(RAW_DIR, '*.crdownload'))
        finished = glob.glob(os.path.join(RAW_DIR, '*.xlsx'))
        if finished and not in_progress:
            downloaded = finished
            break
        time.sleep(1)
finally:
    driver.quit()

# Fail loudly here rather than letting the processing cell discover the
# missing file later.
if not downloaded:
    raise TimeoutError(
        f"No completed .xlsx appeared in {RAW_DIR} within {DOWNLOAD_TIMEOUT_S}s"
    )
print("✅ Raw .xlsx downloaded to:", downloaded[0])


ModuleNotFoundError: No module named 'google'

In [2]:
# ─── CELL 2: PROCESS RAW .XLSX → 4 CSVs + merged.csv (Period = dd-mm-YYYY) ────
#
# Reads every sheet of the downloaded workbook, normalises the 'Period' column
# to dd-mm-YYYY text, tags each row with a 'Frequency' taken from the sheet
# name, writes one CSV per sheet and a concatenated merged.csv.

import pandas as pd, glob, os

RAW_DIR  = '/content/drive/MyDrive/RBI/MI/RAW'
PROC_DIR = '/content/drive/MyDrive/RBI/MI/Processed'

# The paths are redefined here so this cell can run on its own; for the same
# reason make sure the output folder exists before writing into it.
os.makedirs(PROC_DIR, exist_ok=True)

# 1) Locate the raw .xlsx (the most recently modified one if several exist,
#    rather than whichever glob happens to list first)
files = glob.glob(os.path.join(RAW_DIR, '*.xlsx'))
if not files:
    raise FileNotFoundError(f"No .xlsx found in {RAW_DIR}")
raw_path = max(files, key=os.path.getmtime)

# 2) Read every sheet, using row 4 as header, drop Unnamed cols
all_dfs = []

# The context manager closes the workbook handle once all sheets are read.
with pd.ExcelFile(raw_path, engine='openpyxl') as xls:
    for sheet in xls.sheet_names:
        # header=3 → the 4th spreadsheet row holds the column names
        df = pd.read_excel(xls, sheet_name=sheet, header=3)

        # Header cells are not guaranteed to be strings (numbers/dates are
        # possible), so convert before stripping instead of using .str.
        df.columns = [str(col).strip() for col in df.columns]
        keep = [not col.startswith('Unnamed') for col in df.columns]
        # .copy() gives an independent frame so the assignments below are
        # unambiguous and never touch the frame returned by read_excel.
        df = df.loc[:, keep].copy()

        # Reformat 'Period' → dd-mm-YYYY (string); unparseable values become ''.
        # Plain column replacement is used on purpose: `df.loc[:, 'Period'] = ...`
        # writes in place on pandas >= 2.0, and a datetime64 column would parse
        # the strings straight back into datetimes.
        if 'Period' in df.columns:
            df['Period'] = (
                pd.to_datetime(df['Period'], dayfirst=True, errors='coerce')
                  .dt.strftime('%d-%m-%Y')     # hyphens, dd-mm-YYYY
                  .fillna('')
                  .astype(str)
            )

        # Tag frequency from sheet name: the text after the last ' - '
        # (e.g. "Weekly"); a sheet name without ' - ' is used whole.
        freq = sheet.split(' - ')[-1].strip()
        df['Frequency'] = freq

        # Save each slice as CSV.
        # NOTE(review): two sheets sharing the same suffix would overwrite
        # each other's file — confirm sheet names are unique per frequency.
        out_csv = os.path.join(PROC_DIR, f"{freq}.csv")
        df.to_csv(out_csv, index=False)

        all_dfs.append(df)

# 3) Merge all slices and save merged.csv
merged = pd.concat(all_dfs, ignore_index=True)
merged.to_csv(os.path.join(PROC_DIR, 'merged.csv'), index=False)

print("✅ Processing complete. Files in Processed/:", os.listdir(PROC_DIR))


FileNotFoundError: No .xlsx found in /content/drive/MyDrive/RBI/MI/RAW