# GOLD Layer for Analytics

In [0]:
from pyspark.sql import functions as F


# Read Silver Table

In [0]:
# Load the cleaned Silver-layer table that the Gold aggregates in this
# notebook are built from.
try:
    silver_df = spark.table("co2_silver.co2_emissions_clean")
except Exception as read_error:
    # Show a readable message in the cell output, then re-raise the
    # original exception so the failure is not swallowed.
    print("Failed to read Silver table:", read_error)
    raise


# 1. YEAR-OVER-YEAR EMISSIONS TREND ANALYSIS

In [0]:
# Yearly emissions trend: one row per year holding that year's summed CO2
# emissions, rounded to 2 decimals and sorted by year ascending.
yearly_total = F.round(F.sum(F.col("co2_emissions_million_tons")), 2)

yearly_trend_df = (
    silver_df.groupBy("year")
    .agg(yearly_total.alias("total_co2_emissions"))
    .sort("year")
)


Save as GOLD Table

In [0]:
%python
#  create the schema
spark.sql("CREATE SCHEMA IF NOT EXISTS co2_gold")

yearly_trend_df.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("co2_gold.yearly_emissions_trend")

# 2. COUNTRY & REGIONAL EMISSION COMPARISON

In [0]:
# 1. Country-wise emissions: summed CO2 emissions per country (rounded to
# 2 decimals), ordered from the largest total to the smallest.
country_total = F.round(F.sum(F.col("co2_emissions_million_tons")), 2)

country_emissions_df = (
    silver_df.groupBy("country")
    .agg(country_total.alias("total_country_emissions"))
    .sort(F.col("total_country_emissions").desc())
)


In [0]:
# Persist the country totals as a Delta table, overwriting any existing contents.
(
    country_emissions_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("co2_gold.country_emissions_summary")
)


In [0]:
# 2. Regional emissions: summed CO2 emissions per region (rounded to
# 2 decimals), ordered from the largest total to the smallest.
region_total = F.round(F.sum(F.col("co2_emissions_million_tons")), 2)

regional_emissions_df = (
    silver_df.groupBy("region")
    .agg(region_total.alias("total_region_emissions"))
    .sort(F.col("total_region_emissions").desc())
)


In [0]:
# Persist the regional totals as a Delta table, overwriting any existing contents.
(
    regional_emissions_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("co2_gold.regional_emissions_summary")
)


# 3. IDENTIFICATION OF HIGH-EMISSION REGIONS


In [0]:
# High-emission threshold: the mean of the per-region totals, pulled back to
# the driver as a plain Python value.
avg_region_emission = (
    regional_emissions_df
    .select(F.avg("total_region_emissions"))
    .first()[0]
)

# Keep only the regions whose total is strictly above that mean.
high_emission_regions_df = regional_emissions_df.where(
    F.col("total_region_emissions") > avg_region_emission
)


In [0]:
# Save gold table
high_emission_regions_df.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("co2_gold.high_emission_regions")


# 4. CORRELATION: POPULATION vs CO₂ EMISSIONS

In [0]:
# Preparing Population vs Emissions Dataset
population_emission_df = (
    silver_df
    .groupBy("year")
    .agg(
        F.sum("population").alias("total_population"),
        F.sum("co2_emissions_million_tons").alias("total_emissions")
    )
    .orderBy("year")
)


In [0]:
# Calculate Correlation
correlation_value = population_emission_df.stat.corr(
    "total_population",
    "total_emissions"
)

print("Correlation between population growth and CO₂ emissions:", correlation_value)


Correlation between population growth and CO₂ emissions: 0.9526241532943515


In [0]:
# Save gold table
population_emission_df.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("co2_gold.population_emission_correlation")


# 5. Scenario-Based Environmental Analysis

In [0]:
# Aggregate by Scenario
scenario_impact_df = (
    silver_df
    .groupBy("scenario")
    .agg(
        F.round(
            F.sum("co2_emissions_million_tons"), 2
        ).alias("total_co2_emissions"),
        F.round(
            F.avg("co2_per_capita"), 6
        ).alias("avg_co2_per_capita")
    )
)


In [0]:
# Adding Simple Impact Level
scenario_impact_df = scenario_impact_df.withColumn(
    "impact_level",
    F.when(F.col("total_co2_emissions") < 8000, "Low")
     .when(F.col("total_co2_emissions") < 20000, "Medium")
     .otherwise("High")
)


In [0]:
# Save gold table
scenario_impact_df.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("co2_gold.scenario_environment_impact")


# 6. Sector-Wise Emissions


In [0]:
# Sector emissions: summed CO2 emissions per (year, sector) pair, unrounded.
# Uses F.sum from the `functions as F` import at the top of the notebook
# instead of `from pyspark.sql.functions import sum`, which would replace
# Python's built-in sum() for every cell executed afterwards.
sector_emissions_df = (
    silver_df
    .groupBy("year", "sector")
    .agg(
        F.sum("co2_emissions_million_tons").alias("total_sector_emissions")
    )
)

# Persist the sector totals as a Delta table, overwriting any existing contents.
sector_emissions_df.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable("co2_gold.sector_emissions_summary")


# 7. Economics & Emissions Correlation

In [0]:
# Economic view: per (year, country, region), the summed CO2 emissions plus
# the mean GDP and mean population over the matching Silver rows (unrounded).
# Uses F.sum / F.avg from the `functions as F` import at the top of the
# notebook instead of importing `sum` and `avg` by bare name, which would
# replace Python's built-in sum() for every cell executed afterwards.
economic_df = (
    silver_df
    .groupBy("year", "country", "region")
    .agg(
        F.sum("co2_emissions_million_tons").alias("total_emissions"),
        F.avg("gdp_billion_usd").alias("avg_gdp_billion"),
        F.avg("population").alias("avg_population")
    )
)

# Persist the economic summary as a Delta table, overwriting any existing contents.
economic_df.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable("co2_gold.economic_emissions_analysis")
