# Emissions 01: Bronze Data & Filters Testing

**Purpose**: Test reading One BI premium data and applying business filters

**Tests**:
1. Read `rf_fr1_prm_dtl_midcorp_m` from the bronze layer
2. Apply exclusions (intermediaries, guarantees, categories)
3. Verify filter impact (row counts before and after each filter)

---

In [None]:
import sys
from pathlib import Path

# Treat the directory two levels above the current working directory as the
# project root (assumes the notebook is launched from its own folder — TODO confirm).
project_root = Path.cwd().parent.parent

# Prepend the root to the import search path so project packages resolve first.
sys.path[:0] = [str(project_root)]
print(f"Project root: {project_root}")

In [None]:
from pyspark.sql import SparkSession
from azfr_fsspec_utils import fspath
import azfr_fsspec_abfs

# Activate the azfr_fsspec_abfs integration before touching any data.
# NOTE(review): presumably this registers the ABFS filesystem with fsspec — confirm.
azfr_fsspec_abfs.use()

# Get the existing Spark session or create one named "Emissions_Testing".
spark = (
    SparkSession.builder
    .appName("Emissions_Testing")
    .getOrCreate()
)

print(f"✓ Spark {spark.version}")

## 1. Load Configuration

In [None]:
from utils.loaders.config_loader import ConfigLoader
from src.reader import BronzeReader
import json

# Build the project configuration and the reader used by the following cells.
# Both constructors receive plain string paths located under <project_root>/config.
config = ConfigLoader(str(project_root.joinpath("config", "config.yml")))
bronze_reader = BronzeReader(
    spark,
    config,
    str(project_root.joinpath("config", "reading_config.json")),
)

# Load the emissions exclusion lists used by the filter cell below.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's default encoding (which differs between environments).
with open(
    project_root / "config" / "transformations" / "emissions_config.json",
    encoding="utf-8",
) as f:
    emissions_config = json.load(f)

# Quick sanity print: number of excluded intermediaries, and the full lists of
# excluded guarantees and categories.
print("Exclusions loaded:")
print(f"  Intermediaries: {len(emissions_config['excluded_intermediaries'])}")
print(f"  Guarantees: {emissions_config['excluded_guarantees']}")
print(f"  Categories: {emissions_config['excluded_categories']}")

## 2. Read One BI Premium Data

In [None]:
# Vision identifier handed to the reader.
# NOTE(review): looks like a YYYYMM period — confirm against BronzeReader.
VISION = "202509"

# Best-effort read: a failure in the read, the row count or the preview is
# reported and leaves df set to None so the following cells can skip their work.
try:
    df = bronze_reader.read_file_group('onebi_emissions', VISION)
    print(f"✓ Read {df.count():,} rows")
    print(f"  Columns: {len(df.columns)}")
    # Preview the key columns on the first five rows.
    df.select(['nopol', 'cdprod', 'noint', 'cdgarp']).show(n=5)
except Exception as err:
    print(f"⚠ Error reading data: {err}")
    df = None

## 3. Apply Exclusion Filters

In [None]:
from pyspark.sql.functions import col

# Apply the three exclusion filters in sequence and report the row count after
# each step. Every count() is a Spark action that runs a job, so each count is
# computed exactly once and reused.
#
# NOTE(review): in Spark SQL, `~col(x).isin(...)` evaluates to NULL when x is
# NULL, and filter() drops NULL predicates — so rows whose noint / cdgarp /
# cdcateg is NULL are removed as well. Confirm this matches the business rule.
if df is not None:
    count_before = df.count()

    # Filter 1: Excluded intermediaries
    df_f1 = df.filter(~col('noint').isin(emissions_config['excluded_intermediaries']))
    print(f"After intermediary filter: {df_f1.count():,}")

    # Filter 2: Excluded guarantees
    df_f2 = df_f1.filter(~col('cdgarp').isin(emissions_config['excluded_guarantees']))
    print(f"After guarantee filter: {df_f2.count():,}")

    # Filter 3: Excluded categories
    df_f3 = df_f2.filter(~col('cdcateg').isin(emissions_config['excluded_categories']))

    # The count after the last filter is also the final count; compute it once.
    count_after = df_f3.count()
    print(f"After category filter: {count_after:,}")

    print(f"\nTotal: {count_before:,} → {count_after:,} ({(count_before-count_after):,} filtered)")

    df_filtered = df_f3
else:
    print("⚠ No data to filter")
    # Always bind the result name so a re-run cannot leave a stale DataFrame
    # from an earlier execution (or an undefined name) for later cells.
    df_filtered = None

## Summary

In [None]:
# Closing banner for this notebook, followed by a pointer to the next one.
print("=" * 60, "EMISSIONS BRONZE TESTING COMPLETE", "=" * 60, sep="\n")
print("\n→ Next: Notebook 02 - Full Pipeline")