In [0]:
import logging
import os
import sys
from datetime import datetime

# Run-scoped logging: every record goes to stdout and to a timestamped file
# under ./logs. The cell is safe to re-run in a live kernel.

# Ensure logs folder exists in repo
os.makedirs("logs", exist_ok=True)

# Timestamped log file name (minute resolution: runs started within the same
# minute resolve to the same path).
run_ts = datetime.now().strftime("%Y%m%d_%H%M")
log_path = f"logs/run_{run_ts}.log"

# Configure logging (console + file)
logger = logging.getLogger("etl_logger")
logger.setLevel(logging.INFO)

# getLogger() hands back the same logger object on every call, so on a re-run
# the handlers from the previous execution are still attached. Detach AND close
# them: merely clearing the list would leave the previous log file open.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Keep records out of the root logger. If root has handlers of its own (or
# gains them later, e.g. through logging.basicConfig), every message would
# otherwise be emitted a second time.
logger.propagate = False

fmt = logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")

# Console handler
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.INFO)
ch.setFormatter(fmt)

# File handler; explicit UTF-8 so non-ASCII text in a message (for example an
# exception detail) cannot fail on a platform whose default encoding is narrower.
fh = logging.FileHandler(log_path, encoding="utf-8")
fh.setLevel(logging.INFO)
fh.setFormatter(fmt)

logger.addHandler(ch)
logger.addHandler(fh)

logger.info("========== ETL RUN START ==========")
logger.info(f"Log file: {log_path}")
logger.info(f"Python: {sys.version.split()[0]}")


In [0]:
# Best-effort environment report. Neither probe may stop the run: any failure
# is logged, together with its cause, and execution continues.
try:
    import pyspark
    logger.info(f"PySpark version: {pyspark.__version__}")
except Exception as e:
    logger.info(f"PySpark not available (likely local Jupyter). Details: {e}")


# `dbutils` is not defined anywhere in this notebook; it is expected to be
# supplied by the Databricks runtime. The failure reason is logged so that a
# genuine error raised on Databricks is not mistaken for "running elsewhere".
try:
    ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
    logger.info(f"Workspace: {ctx.workspaceId().get()}")
    logger.info(f"Cluster ID: {ctx.clusterId().get()}")
except Exception as e:
    logger.info(
        "Databricks context not available (ok if running outside Databricks). "
        f"Details: {type(e).__name__}: {e}"
    )


In [0]:
import glob
# Show the last three run logs in name order. File names produced by the
# logging setup embed a %Y%m%d_%H%M timestamp, so name order is time order.
sorted(glob.iglob("logs/run_*.log"))[-3:]

In [0]:
import os
import random
import numpy as np

# Single seed shared by every generator below; change it here only.
SEED = 0

# PYTHONHASHSEED is read once, when the interpreter starts. Assigning it here
# cannot change hash() in the running kernel; it is exported so that child
# processes started afterwards inherit it.
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)  # seeds NumPy's legacy global generator

logger.info(f"Reproducibility: seeds set (PYTHONHASHSEED={SEED}, random={SEED}, numpy={SEED})")

# Be explicit about the limitation instead of implying hashing is now fixed.
if sys.flags.hash_randomization:
    logger.warning(
        "PYTHONHASHSEED was exported after interpreter start-up: it applies to "
        "child processes only. str/bytes hash() in this kernel keeps the hash seed "
        "it was started with; set PYTHONHASHSEED before launching the kernel to "
        "control it."
    )

In [0]:
%pip freeze > requirements.txt

In [0]:
import hashlib
import json
from pathlib import Path

def sha256_file(path: str) -> str:
    """Return the SHA-256 digest of the file at ``path`` as a hex string.

    The file is opened in binary mode and fed to the hash in reads of at most
    8192 bytes, so the whole file is never held in memory at once.

    Args:
        path: Location of the file to hash.

    Returns:
        The hexadecimal SHA-256 digest of the file's bytes.

    Raises:
        OSError: Propagated from ``open`` / ``read`` (for example
            ``FileNotFoundError`` when ``path`` does not exist).
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(8192)
            if not block:  # an empty bytes object marks end of file
                break
            digest.update(block)
    return digest.hexdigest()

# Input datasets whose exact contents are fingerprinted for this run.
files = ["data/menu_items.csv", "data/order_details.csv"]
hashes = {}

# Stop at the first missing input; otherwise store its digest under the bare
# file name (directory part dropped).
for csv_path in files:
    if Path(csv_path).exists():
        hashes[Path(csv_path).name] = sha256_file(csv_path)
    else:
        raise FileNotFoundError(f"Missing required file: {csv_path}")

# Persist the name -> digest mapping as indented JSON in the working directory.
Path("data_hashes.json").write_text(json.dumps(hashes, indent=2))

logger.info("Wrote data_hashes.json with SHA-256 hashes:")
logger.info(hashes)


In [0]:
import os
# Report whether each artifact produced by the earlier cells is on disk.
# Each entry is (label shown in the output, path that is checked).
artifact_checks = (
    ("requirements.txt", "requirements.txt"),
    ("data_hashes.json", "data_hashes.json"),
    ("logs folder", "logs"),
)
for label, target in artifact_checks:
    print(f"{label} exists:", os.path.exists(target))