In [0]:
from pyspark.sql.functions import col, explode, input_file_name, current_timestamp, lit, upper, trim, concat, year, month, monthname, datediff

In [0]:
# Enumerate the run folders under the bronze FEMA volume and read each run's
# run_metadata.json.
base_path = "/Volumes/climate-risk/bronze/fema_raw/"
runs = dbutils.fs.ls(base_path)
for run in runs:
    # `base_path` already ends with "/", and directory entries returned by
    # dbutils.fs.ls usually carry a trailing "/" as well (TODO confirm), so
    # strip the entry name to avoid building paths with doubled separators.
    run_dir = run.name.strip("/")
    meta_data_file = f"{base_path}{run_dir}/metadata/run_metadata.json"
    # NOTE(review): `metadata` is overwritten on every iteration, so only the
    # last listed run's metadata survives the loop, and nothing in the visible
    # notebook reads it. Each iteration still costs a Spark read; consider
    # dropping this loop if no other code depends on `metadata`.
    metadata = spark.read.option("multiline", "true").json(meta_data_file)


In [0]:
# Load every run's metadata in a single read by globbing over the run folders.
meta_df = (
    spark.read.option("multiline", "true")
    .json(f"{base_path}*/metadata/run_metadata.json")
)

# Keep only the successful run with the greatest ingest_end_ts.
latest_run = (
    meta_df.filter("status ='SUCCESS'")
    .orderBy("ingest_end_ts", ascending=False)
    .limit(1)
)

# `.first()` returns None on an empty result, which lets us fail with an
# actionable message instead of an opaque IndexError from `collect()[0]`.
latest_run_row = latest_run.select("run_id").first()
if latest_run_row is None:
    raise ValueError(
        f"No run with status 'SUCCESS' found under {base_path}; "
        "cannot determine which run to load."
    )
latest_run_id = latest_run_row["run_id"]

In [0]:
# Read the raw JSON files that belong to the selected run only.
latest_run_data_path = f"{base_path}run_id={latest_run_id}/data"
bronze_data = spark.read.json(latest_run_data_path)

In [0]:
# Flatten the bronze payload: one output row per element of
# DisasterDeclarationsSummaries, plus lineage columns.
df = (
    bronze_data
    .select(
        explode("DisasterDeclarationsSummaries").alias("d"),
        # Capture the hidden file-metadata field while the file-source relation
        # is still the direct input, and alias it here. Selecting
        # "_metadata.file_path" produces a column named "file_path", so the
        # previous withColumnRenamed("_metadata.file_path", ...) matched nothing
        # and "source_file" was never created.
        col("_metadata.file_path").alias("source_file"),
    )
    .select("d.*", "source_file")
    .withColumn("ingest_ts", current_timestamp())
    .withColumn("run_id", lit(latest_run_id))
)
# NOTE(review): if the target table was already populated with a `file_path`
# column by an earlier version of this cell, reconcile its schema before
# appending — confirm against the existing table.

In [0]:
from pyspark.sql.functions import col

# Cast the declaration/incident date fields to DATE, replacing each column in place.
date_cols = ["declarationDate", "incidentBeginDate", "incidentEndDate"]
df = df.withColumns({name: col(name).cast("date") for name in date_cols})

In [0]:
# Standardise identifying fields in one projection:
#   - fips_code: state FIPS code followed by county FIPS code
#   - state: upper-cased
#   - free-text fields: surrounding whitespace removed
trim_cols = ["designatedArea", "incidentType", "declarationType"]
df = df.withColumns(
    {
        "fips_code": concat(col("fipsStateCode"), col("fipsCountyCode")),
        "state": upper(col("state")),
        **{name: trim(col(name)) for name in trim_cols},
    }
)

In [0]:
# Drop incomplete rows first: a usable record needs BOTH a disaster number and
# a declaration date. The previous `|` kept rows missing either one, and
# filtering after the dedup could let an incomplete duplicate win the
# (arbitrary) dropDuplicates pick and then be discarded.
df = df.filter(
    col("disasterNumber").isNotNull() & col("declarationDate").isNotNull()
)

# Build the record key from the identifying fields and keep one row per key.
# NOTE(review): concat() yields NULL when any input is NULL, so rows with a
# missing state / designatedArea / fips_code share a NULL record_id and are
# collapsed together by dropDuplicates — confirm these fields are always set.
df = df.withColumn(
    "record_id",
    concat(
        col("disasterNumber"),
        col("state"),
        col("designatedArea"),
        col("fips_code"),
    ),
)
df = df.dropDuplicates(["record_id"])

# Derived reporting columns: declaration year, declaration month name, and the
# number of days between incident begin and end.
df = (
    df.withColumn("declarationYear", year(col("declarationDate")))
    .withColumn("declarationMonth", monthname(col("declarationDate")))
    .withColumn(
        "incident_duration",
        datediff(col("incidentEndDate"), col("incidentBeginDate")),
    )
)

In [0]:
# Persist the cleaned declarations to the silver Delta table.
target_table = "`climate-risk`.silver.fema_disaster_declarations"

# Always write in overwrite mode; when the table already exists the overwrite
# is scoped to this run's rows via replaceWhere. A first load of a run behaves
# like the previous append, while re-running the notebook for the same run
# replaces that run's rows instead of appending duplicates.
write_mode = "overwrite"
writer = df.write.format("delta").mode(write_mode)

if spark.catalog.tableExists(target_table):
    # Every row in `df` carries run_id == latest_run_id (set via lit()), so the
    # written data satisfies the replaceWhere predicate.
    # NOTE(review): assumes run_id values contain no single quotes — confirm.
    writer = writer.option("replaceWhere", f"run_id = '{latest_run_id}'")

writer.saveAsTable(target_table)