In [0]:
import logging
import os
import sys
from datetime import datetime

# Make sure the log directory exists (path is relative to the current working directory).
os.makedirs("logs", exist_ok=True)

# One log file per run, named after the start time at minute resolution.
run_ts = datetime.now().strftime("%Y%m%d_%H%M")
log_path = f"logs/run_{run_ts}.log"

# Configure logging (console + file) on a dedicated named logger.
logger = logging.getLogger("etl_logger")
logger.setLevel(logging.INFO)

# Re-running this cell must not stack handlers. Detach AND close the old ones:
# `logger.handlers.clear()` only drops the references, which leaves the log file
# opened by a previous run's FileHandler open for the lifetime of the kernel.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

fmt = logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")

# Console handler: writes to stdout.
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.INFO)
ch.setFormatter(fmt)

# File handler: explicit UTF-8 so non-ASCII log messages do not depend on the
# platform's default encoding.
fh = logging.FileHandler(log_path, encoding="utf-8")
fh.setLevel(logging.INFO)
fh.setFormatter(fmt)

logger.addHandler(ch)
logger.addHandler(fh)

logger.info("========== ETL RUN START ==========")
logger.info(f"Log file: {log_path}")
logger.info(f"Python: {sys.version.split()[0]}")


In [0]:
# Report the PySpark version when the package can be imported. Any failure is
# logged and otherwise ignored so the notebook keeps running without Spark.
try:
    import pyspark
    logger.info(f"PySpark version: {pyspark.__version__}")
except Exception as e:
    logger.info(f"PySpark not available (likely local Jupyter). Details: {e}")


# Report the Databricks workspace and cluster identifiers when a notebook
# context can be obtained through `dbutils`.
try:
    ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
    logger.info(f"Workspace: {ctx.workspaceId().get()}")
    logger.info(f"Cluster ID: {ctx.clusterId().get()}")
except NameError:
    # `dbutils` is not defined in this namespace — presumably not on Databricks.
    logger.info("Databricks context not available (ok if running outside Databricks).")
except Exception as e:
    # `dbutils` exists but the lookup itself failed. Stay best-effort (no crash),
    # but surface the cause instead of reporting it as "outside Databricks".
    logger.warning(f"Databricks context lookup failed: {type(e).__name__}: {e}")


In [0]:
import glob

# Display the (up to) three last run logs in lexical order of their file names.
run_logs = glob.glob("logs/run_*.log")
run_logs.sort()
run_logs[-3:]