In [0]:
# Read every CSV file under the raw emissions volume; the first row of each
# file is treated as the header that supplies the column names.
df = spark.read.option("header", "true").csv("dbfs:/Volumes/methane/raw/emissions/")

# Render the loaded frame in the notebook output for a quick visual check.
df.display()


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, FloatType

# Step 1: read the raw emissions CSV files (header row supplies column names).
df = spark.read.option("header", "true").csv("dbfs:/Volumes/methane/raw/emissions/")

# Step 2: names of the per-year columns, "1990" through "2018" inclusive.
year_cols = list(map(str, range(1990, 2019)))

# Step 3: build one (Year, Emission) struct per year column. The emission
# value goes through try_cast so the float conversion is attempted safely.
exprs = [
    F.struct(
        F.lit(year_name).alias("Year"),
        F.expr(f"try_cast(`{year_name}` as float)").alias("Emission"),
    )
    for year_name in year_cols
]

# Step 4: melt the wide table into long format in a single chain:
#   - explode the array of structs into one row per (input row, year)
#   - pull the struct fields out into top-level Year (integer) / Emission columns
#   - replace null Emission values with 0
#   - keep only the columns needed downstream
df_long = (
    df.withColumn("Year_Emission", F.explode(F.array(*exprs)))
      .withColumn("Year", F.col("Year_Emission.Year").cast(IntegerType()))
      .withColumn("Emission", F.col("Year_Emission.Emission"))
      .drop("Year_Emission")
      .fillna({"Emission": 0})
      .select("Country", "Sector", "Year", "Emission")
)

# Step 5: print the first five rows of the long-format dataset.
df_long.show(5)

