In [0]:
# ---------------------------------------------------------
# 01_Bronze_Ingestion
# Goal: Ingest raw Lending Club data into the Bronze Delta Layer
# ---------------------------------------------------------

import os
import subprocess

# --- CONFIGURATION ---
# Path to the uploaded zip file in your Unity Catalog Volume
zip_file_path = "/Volumes/workspace/default/raw_data/archive.zip"
# Directory where we will extract the CSV
extract_path = "/Volumes/workspace/default/raw_data/"

print(f"üìÇ Source File: {zip_file_path}")

# --- STEP 1: UNZIP THE DATA ---
# Run the `unzip` CLI against the volume paths. The command is passed as an
# argument list (no shell), so each path reaches unzip as a single argument
# even if it contains spaces or shell metacharacters.
print("‚è≥ Unzipping file... (This may take 1-2 minutes)...")
try:
    # -n: never overwrite files that already exist; -d: destination directory
    subprocess.check_call(["unzip", "-n", zip_file_path, "-d", extract_path])
except (subprocess.CalledProcessError, OSError) as unzip_error:
    # Deliberately best-effort: a non-zero exit status (CalledProcessError) or
    # a missing `unzip` executable (OSError) only produces a warning here.
    # The CSV lookup that follows raises if nothing usable was extracted.
    print("‚ö†Ô∏è  Warning: Unzip might have failed or file already exists.")
    print(f"    Details: {unzip_error}")
else:
    print("‚úÖ Unzip Successful!")

# --- STEP 2: IDENTIFY THE CSV ---
# The zip usually contains 'accepted_2007_to_2018Q4.csv' or similar.
# os.listdir() returns entries in arbitrary order, so the candidates are
# sorted by name: the same entry is then selected on every run, even when the
# extraction directory holds more than one name ending in ".csv".
files = sorted(f for f in os.listdir(extract_path) if f.endswith(".csv"))
if not files:
    # FileNotFoundError is a subclass of Exception, so handlers written for
    # the generic Exception still catch it.
    raise FileNotFoundError(f"‚ùå No CSV found in {extract_path}. Check the upload.")

if len(files) > 1:
    # Make the selection visible instead of silently ignoring the others.
    print(f"Multiple CSV entries found: {files}. Using the first one by name.")

csv_name = files[0]
full_csv_path = os.path.join(extract_path, csv_name)
print(f"üìÑ Found CSV: {full_csv_path}")

# --- STEP 3: WRITE TO DELTA (BRONZE LAYER) ---
# Name under which the raw data is saved as a Delta table.
table_name = "bronze_lending_club"

# Load the extracted CSV through Spark. The first row is treated as the
# header, and column types are inferred from the data rather than declared.
print("üíæ Reading CSV into Spark DataFrame...")
csv_reader = spark.read.format("csv")
csv_reader = csv_reader.option("header", "true")
csv_reader = csv_reader.option("inferSchema", "true")
df_raw = csv_reader.load(full_csv_path)

# Persist the DataFrame as a Delta table. "overwrite" mode replaces whatever
# the table currently holds, so re-running this step does not append
# duplicate rows.
print(f"üî® Creating Delta Table: {table_name}...")
delta_writer = df_raw.write.format("delta").mode("overwrite")
delta_writer.saveAsTable(table_name)

# --- VALIDATION ---
# Read the table back by name and report its row count, so the figures come
# from what was saved rather than from the in-memory DataFrame.
separator = "-" * 30
bronze_table = spark.table(table_name)

print(separator)
print(f"üöÄ SUCCESS! Table '{table_name}' created.")
row_total = bronze_table.count()
print(f"üìä Total Rows: {row_total:,}")
print(separator)

# Show a five-row preview as a quick visual sanity check.
preview_rows = bronze_table.limit(5)
display(preview_rows)