In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # Bronze Layer - Data Ingestion
# MAGIC Load raw data from Workspace into Bronze layer

# COMMAND ----------

# Import configuration
import sys
sys.path.append("/Workspace/Users/yahyasanbati.mail@gmail.com/GREEN-IT-DATA-PLATFORM/X002_Databricks")
from config import *

import pandas as pd
from datetime import datetime
import os

# Opening banner, followed by the input and output locations taken from config
print("=" * 70, "BRONZE LAYER - DATA INGESTION", "=" * 70, sep="\n")
print(f"\nSource: {SOURCE_FILE}", f"Destination: {BRONZE_FILE}", sep="\n")

# COMMAND ----------

# Read the source parquet file into a pandas DataFrame (df_bronze).
# If the file is missing, report it and leave the notebook via dbutils.
print("\nLoading source data...")

if not os.path.exists(SOURCE_FILE):
    print(f"ERROR: Source file not found at {SOURCE_FILE}")
    dbutils.notebook.exit("Source file not found")
else:
    df_bronze = pd.read_parquet(SOURCE_FILE)
    # shape is (row count, column count)
    print(f"Loaded {df_bronze.shape[0]:,} rows x {df_bronze.shape[1]} columns")

# COMMAND ----------

# Show the first 10 rows, then list every column name with a 1-based index
print("\nData Preview:")
preview_rows = df_bronze.head(10)
display(preview_rows)

print("\nColumn Names:")
for position, column_name in enumerate(df_bronze.columns, start=1):
    print(f"  {position:2d}. {column_name}")

# COMMAND ----------

# Stamp every row with ingestion metadata (when, from where, which file)
print("\nAdding Bronze metadata...")

# NOTE(review): datetime.now() yields a naive timestamp in the cluster's local
# time zone — confirm whether downstream layers expect UTC.
bronze_metadata = {
    'bronze_ingestion_timestamp': datetime.now(),
    'bronze_source': 'workspace_github',
    'bronze_file_path': SOURCE_FILE,
}

# Each scalar is broadcast to all rows; columns are added in place, in order
for meta_column, meta_value in bronze_metadata.items():
    df_bronze[meta_column] = meta_value

print("Metadata columns added:")
for meta_column in bronze_metadata:
    print(f"  - {meta_column}")

# COMMAND ----------

# Data quality report: per-column NULL counts and fully duplicated rows
print("\nData Quality Checks:", "=" * 70, sep="\n")

# Count missing values per column and keep only the columns that have any
null_counts = df_bronze.isna().sum()
cols_with_nulls = null_counts[null_counts.gt(0)]

if cols_with_nulls.empty:
    print("\nNo NULL values found")
else:
    print("\nColumns with NULL values:")
    total_rows = len(df_bronze)
    for column_name, null_count in cols_with_nulls.items():
        # Share of rows in which this column is missing
        null_pct = (null_count / total_rows) * 100
        print(f"  - {column_name}: {null_count:,} ({null_pct:.2f}%)")

# Rows that repeat an earlier row across every column
duplicates = df_bronze.duplicated().sum()
print(f"\nDuplicate rows: {duplicates}")

print("=" * 70)

# COMMAND ----------

# Persist the Bronze DataFrame as a parquet file and report its on-disk size
print("\nSaving to Bronze layer...")

# Create the Bronze directory if needed; exist_ok avoids an error on re-runs
os.makedirs(BRONZE_PATH, exist_ok=True)
df_bronze.to_parquet(path=BRONZE_FILE, engine='pyarrow', index=False)

# Size in MB (bytes / 1024^2); file_size is read again by the validation summary
file_size = os.stat(BRONZE_FILE).st_size / (1024 ** 2)
print(f"Saved to: {BRONZE_FILE}", f"File size: {file_size:.2f} MB", sep="\n")

# COMMAND ----------

# Render the final Bronze DataFrame (including the added metadata columns)
# with the notebook's display(). The pandas DataFrame is passed as-is; this
# cell performs no explicit conversion to a Spark DataFrame.
print("\nFinal Bronze Data:")
display(df_bronze)

# COMMAND ----------

# Closing report: dataset dimensions, output file details and column dtypes
print("\n" + "=" * 70, "BRONZE LAYER VALIDATION", "=" * 70, sep="\n")

print(
    f"\nDataset Statistics:",
    f"  Total Rows: {len(df_bronze):,}",
    f"  Total Columns: {len(df_bronze.columns)}",
    f"  File Size: {file_size:.2f} MB",
    f"  Location: {BRONZE_FILE}",
    sep="\n",
)

# One line per column, pairing each column name with its pandas dtype
print("\nData Types:")
for column_name, column_dtype in zip(df_bronze.columns, df_bronze.dtypes):
    print(f"  {column_name}: {column_dtype}")

print("\n" + "=" * 70, "BRONZE LAYER COMPLETE", "=" * 70, sep="\n")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Bronze Layer Complete
# MAGIC 
# MAGIC Next: Run `02_silver_transformation` notebook