# In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, unix_timestamp, year, month, dayofyear, hour, day
from pyspark.sql.types import TimestampType
from astral.sun import sun
from astral import Observer, LocationInfo
import pytz
import pandas as pd

# The Spark session is needed to read and write the Delta tables in Unity Catalog.
# getActiveSession() returns None when no session is running, so fail here with a
# clear message instead of a confusing AttributeError at the first spark.sql call.
spark = SparkSession.getActiveSession()
if spark is None:
    raise RuntimeError(
        "No active SparkSession found; run this script inside a Spark/Databricks session"
    )

# Measurement site (Alpnach, Switzerland). The coordinates are defined once so that
# LocationInfo and Observer can never drift apart.
SITE_LATITUDE = 46.94
SITE_LONGITUDE = 8.28
SITE_ELEVATION = 450  # passed to astral's Observer, which documents elevation in metres

# LocationInfo is only used for its timezone name; Observer carries the elevation
# that astral uses for the sunrise/sunset calculation.
location = LocationInfo(
    name="Alpnach",
    region="Switzerland",
    timezone="Europe/Zurich",
    latitude=SITE_LATITUDE,
    longitude=SITE_LONGITUDE,
)
observer = Observer(latitude=SITE_LATITUDE, longitude=SITE_LONGITUDE, elevation=SITE_ELEVATION)
tz = pytz.timezone(location.timezone)


# Get the names of all bronze tables whose name starts with "month".
# The rows are collected through the DataFrame API instead of `.rdd`:
# the result is the same list of strings, but it does not depend on the RDD API,
# which is restricted on some Unity Catalog compute configurations.
bronze_tables = [
    row["tableName"]
    for row in spark.sql("SHOW TABLES IN model_workspace.bronze")
    .filter("tableName LIKE 'month%'")
    .select("tableName")
    .collect()
]


# Build the silver feature table for every bronze table that does not have one yet.
# An existing silver table means that data was already processed, so it is skipped.


def compute_daylength(date):
    """Return the time between sunrise and sunset on ``date`` in hours.

    The calculation uses the module-level ``observer`` and ``tz``.
    Hours are used instead of seconds to keep the feature values small.

    Returns ``None`` (after printing the reason) when astral cannot compute
    sunrise/sunset for that date.
    """
    try:
        s = sun(observer, date=date, tzinfo=tz)
    except ValueError as e:
        # astral raises ValueError when the sun does not cross the horizon that day
        print(f"could not compute day length for {date}: {e}")
        return None
    # total_seconds() keeps the days component; .seconds would silently wrap a
    # negative or >= 24 h difference into the 0..86399 range.
    return (s['sunset'] - s['sunrise']).total_seconds() / 3600


for table_name in bronze_tables:

    bronze_table = f"model_workspace.bronze.{table_name}"
    silver_table = f"model_workspace.silver.{table_name}_features"
    # public catalog API instead of the private JVM handle (spark._jsparkSession)
    if spark.catalog.tableExists(silver_table):
        print(f"skipping (already exists):{silver_table}")
        continue

    print(f"Processing:{bronze_table} to {silver_table}")

    try:
        df = spark.read.table(bronze_table)

        if "ParsedDateTime" not in df.columns:
            print(f"skipping {table_name}   missing ParsedDateTime column")
            continue

        # The uploaded data carries the timestamp as "ParsedDateTime";
        # expose it as a proper timestamp column named "DateTime".
        df = df.withColumn("DateTime", col("ParsedDateTime").cast(TimestampType())) \
               .drop("ParsedDateTime")

        # Rows without a timestamp or a target value are useless for the model.
        df = df.dropna(subset=["DateTime", "Irradiance"])

        # Time features derived from the timestamp.
        df = df.withColumn("UnixTime", unix_timestamp("DateTime")) \
               .withColumn("Month", month("DateTime")) \
               .withColumn("DayOfTheYear", dayofyear("DateTime")) \
               .withColumn("Day", day("DateTime")) \
               .withColumn("Hour", hour("DateTime"))

        # "Year" and "PictureName" are not part of the silver table. Dropping them in
        # Spark is a no-op when a column is absent (pandas would raise KeyError) and
        # avoids transferring them to the driver.
        df = df.drop("Year", "PictureName")

        # NOTE: toPandas() collects the whole table on the driver.
        pandas_df = df.toPandas()
        if pandas_df.empty:
            # createDataFrame cannot infer a schema from an empty pandas frame
            print(f"skipping {table_name}   no rows left after dropping nulls")
            continue

        pandas_df['DateTime'] = pd.to_datetime(pandas_df['DateTime'])
        pandas_df['Date'] = pandas_df['DateTime'].dt.date

        # Compute DayLength once per distinct day, not once per row.
        daylength_map = {d: compute_daylength(d) for d in pandas_df['Date'].unique()}
        pandas_df['DayLength'] = pandas_df['Date'].map(daylength_map)

        # "Date" was only a helper key for the day-length lookup.
        pandas_df = pandas_df.drop(columns=["Date"])

        # save to silver schema
        silver_df = spark.createDataFrame(pandas_df)
        silver_df.write.format("delta").mode("overwrite").saveAsTable(silver_table)

        print(f"wrote silver table:{silver_table}")

    except Exception as e:
        # Best effort per table: report the failure and continue with the next one.
        print(f"err {table_name}: {e}")

