In [0]:
# Load the raw emissions CSV data and preview it.
# `spark` is not defined in this cell; it must already exist in the notebook session.
emissions_path = "dbfs:/Volumes/methane/raw/emissions/"

# header=true: the first line of each file supplies the column names.
df = spark.read.format("csv").option("header", "true").load(emissions_path)

df.display()


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim, when
from pyspark.sql.types import DoubleType, IntegerType

# 1️⃣ Get (or create) the Spark session used by the rest of this cell
spark = (
    SparkSession.builder
    .appName("Emissions Preprocessing")
    .getOrCreate()
)

# 2️⃣ Read the raw emissions CSV data; the first line of each file is the header
raw_reader = spark.read.format("csv").option("header", "true")
df = raw_reader.load("dbfs:/Volumes/methane/raw/emissions/")

# Preview the untouched input before any cleaning is applied
df.display()

# 3️⃣ Strip leading/trailing whitespace from the text columns
string_cols = ["region", "country", "type", "segment", "reason"]
for column_name in string_cols:
    trimmed = trim(col(column_name))
    # Replace the column in place (same name) with its trimmed version
    df = df.withColumn(column_name, trimmed)

# 4️⃣ Convert 'emissions' to double safely
# Only well-formed, non-negative decimals ('12', '12.5', '12.', '.5') are cast;
# every other value becomes null.
# - The pattern allows at most one decimal point and requires at least one digit.
#   The previous character-class check (^[0-9.]+$) also let strings such as
#   '1.2.3' or '.' through to the cast, so the guard did not actually protect it.
# - The value is trimmed first so that whitespace-padded numbers are converted
#   instead of being silently nulled by the pattern check.
emissions_text = trim(col("emissions"))
df = df.withColumn(
    "emissions",
    when(
        emissions_text.rlike(r"^([0-9]+(\.[0-9]*)?|\.[0-9]+)$"),
        emissions_text.cast(DoubleType()),
    ).otherwise(None),
)

# 5️⃣ Convert 'baseYear' to integer
# - A single four-digit year is kept as that year.
# - An interval such as '2019-2021' becomes its midpoint, rounded DOWN
#   (e.g. '2019-2022' -> 2020).
# - Anything else becomes null.
from pyspark.sql.functions import floor, split

# Trim first so whitespace-padded values still match the patterns below.
base_year = trim(col("baseYear"))

# Split the interval once and reuse both halves.
year_bounds = split(base_year, "-")
first_year = year_bounds.getItem(0).cast(IntegerType())
last_year = year_bounds.getItem(1).cast(IntegerType())

df = df.withColumn(
    "baseYear",
    when(base_year.rlike("^[0-9]{4}$"), base_year.cast(IntegerType()))
    .when(
        base_year.rlike("^[0-9]{4}-[0-9]{4}$"),
        # floor() returns a bigint, which would widen the whole column to long;
        # cast back so both branches (and the column) are integer.
        floor((first_year + last_year) / 2).cast(IntegerType()),
    )
    .otherwise(None),
)

# 6️⃣ Remove columns that are not needed downstream
unneeded_cols = ("_c0", "notes")  # index column and notes column
df = df.drop(*unneeded_cols)

# 7️⃣ Print the first 10 preprocessed rows without truncating cell values
df.show(n=10, truncate=False)
