# Bronze Layer – CO₂ Emissions Data Validation

In [0]:
%sql
-- List the tables registered in the Bronze schema.
-- The schema is qualified with its catalog so this listing targets the same
-- schema the notebook reads from (workspace.co2_bronze.co_2_emissions_raw),
-- regardless of the session's current catalog.
SHOW TABLES IN workspace.co2_bronze;

database,tableName,isTemporary
co2_bronze,co_2_emissions_raw,False


## Read the Bronze Table


In [0]:
# Load the managed Bronze table through its fully qualified
# catalog.schema.table identifier, then render it for a visual check.
bronze_table_name = "workspace.co2_bronze.co_2_emissions_raw"
bronze_df = spark.table(bronze_table_name)
display(bronze_df)

country,region,year,sector,population,gdp_billion_usd,co2_emissions_million_tons,scenario
Canada,North America,2003,Residential,804261004.0,1116807.1930317215,1460.1,Policy_Reduction
Italy,Europe,2009,Transport,1097212890.0,2143141.14,4972.24,Baseline
Mexico,North America,2005,Industry,277845589.0,223462.33241965136,830.11,Policy_Reduction
India,Asia,1997,Transport,,1563310.68,2217.2,Baseline
Australia,Oceania,2001,Transport,,247242.95,1713.42,Baseline
Russia,Europe,2013,Industry,1474389018.0,2350898.333060052,,Policy_Reduction
Australia,Oceania,2007,Transport,642659702.0,282652.6,1670.36,Baseline
Italy,Europe,1998,Residential,925564796.0,1345878.282579947,1749.51,High_Growth
Russia,Europe,1999,Transport,1289238809.0,1655598.1777526625,637.09,Renewable_Transition
Russia,Europe,2005,Energy,1322613811.0,2526184.7994336616,2392.11,High_Growth


## Basic Row Count Check

In [0]:
# Count every row in the Bronze DataFrame (this triggers a Spark job)
# and report the total; the value is stored in `total_records`.
total_records = bronze_df.count()
print(f"Total records in Bronze table: {total_records}")

Total records in Bronze table: 18000


## Inspect Schema

In [0]:
# Print each column's name, data type and nullability as a tree.
bronze_df.printSchema()

root
 |-- country: string (nullable = true)
 |-- region: string (nullable = true)
 |-- year: long (nullable = true)
 |-- sector: string (nullable = true)
 |-- population: double (nullable = true)
 |-- gdp_billion_usd: double (nullable = true)
 |-- co2_emissions_million_tons: double (nullable = true)
 |-- scenario: string (nullable = true)



## Validate Time-Series Range (Year Column)

In [0]:
# Compute the earliest and latest values of the `year` column in one
# aggregation and print them as a single-row table.
year_range_exprs = ["min(year) as min_year", "max(year) as max_year"]
year_range_df = bronze_df.selectExpr(*year_range_exprs)
year_range_df.show()

+--------+--------+
|min_year|max_year|
+--------+--------+
|    1995|    2024|
+--------+--------+



## Validate Numeric Columns

In [0]:
from pyspark.sql.functions import col, count, when

# Measure columns that are checked for negative values.
numeric_columns = [
    "population",
    "gdp_billion_usd",
    "co2_emissions_million_tons"
]

# Count the negative values of every column in a single aggregation pass
# instead of launching one filter + count job per column.
# `when(cond, 1)` yields NULL when the condition is false or the value is
# NULL, and `count` skips NULLs, so only rows with a value < 0 are counted.
negative_counts = bronze_df.agg(
    *[
        count(when(col(name) < 0, 1)).alias(name)
        for name in numeric_columns
    ]
).first()

# Report one line per column, in the order listed above.
for col_name in numeric_columns:
    invalid_count = negative_counts[col_name]
    print(f"{col_name} - negative values count: {invalid_count}")

population - negative values count: 0
gdp_billion_usd - negative values count: 0
co2_emissions_million_tons - negative values count: 0


## Null Count Analysis

In [0]:
from pyspark.sql.functions import col, count, when

# Build one aggregate expression per column that counts its NULL entries.
# `when` produces a non-NULL marker only for NULL cells, and `count`
# ignores NULLs, so each expression yields the number of missing values.
null_counts = []
for column_name in bronze_df.columns:
    is_missing = col(column_name).isNull()
    null_counts.append(
        count(when(is_missing, column_name)).alias(column_name)
    )

# Evaluate all expressions together and render the single-row summary.
display(bronze_df.select(*null_counts))

country,region,year,sector,population,gdp_billion_usd,co2_emissions_million_tons,scenario
0,0,0,0,2060,2048,2056,0
