## Import libraries

In [0]:
from pyspark.sql.functions import col, when, udf
from pyspark.sql.types import StringType
import reverse_geocoder as rg
from datetime import date, timedelta


## Retrieve information from previous job tasks

In [0]:
# Pull the values published by the upstream "Bronze" and "Silver" job tasks
bronze_output = dbutils.jobs.taskValues.get(taskKey="Bronze", key="bronze_output")
silver_data = dbutils.jobs.taskValues.get(taskKey="Silver", key="silver_output")

# Unpack the individual settings; any key missing from bronze_output becomes ""
start_date, silver_adls, gold_adls = (
    bronze_output.get(setting, "")
    for setting in ("start_date", "silver_adls", "gold_adls")
)

print(f"Start Date: {start_date}, Gold ADLS: {gold_adls}")

## Filter data (save resources)

In [0]:
# Load the silver-layer Parquet data, keeping only rows whose "time" is after start_date
df = (
    spark.read
    .parquet(silver_data)
    .filter(col("time") > start_date)
)

In [0]:
# Limit the number of rows to at most 10.
# NOTE(review): df is reassigned here, so every later step (including the
# write to the gold path) only sees this capped subset — confirm this is intended.
df = df.limit(10)

## Creating new features

In [0]:
def get_country_code(lat, lon):
    """
    Retrieve the country code for a given latitude and longitude.

    The coordinates are converted to floats and passed to ``rg.search``;
    the 'cc' field of the first result is returned. Any failure (values
    that cannot be converted to float, or an error during the lookup) is
    reported via ``print`` and results in ``None`` rather than an exception,
    so a single bad row does not abort the caller.

    Parameters:
    lat (float or str): Latitude of the location.
    lon (float or str): Longitude of the location.

    Returns:
    str or None: Country code of the location, or None if the coordinates
    could not be processed.
    """
    # Bind coord up front so the error message below can always reference it,
    # even when the float() conversion itself is what fails.
    coord = (lat, lon)
    try:
        coord = (float(lat), float(lon))
        result = rg.search(coord)[0].get('cc')
        print(f"Processed coordinates: {coord} -> {result}")
        return result
    except Exception as e:
        # Deliberately best-effort: log and fall back to None
        print(f"Error processing coordinates: {coord} -> {e}")
        return None
        

In [0]:
# Wrap the Python function as a Spark UDF (returning a string) so it can be
# applied to DataFrame columns
get_country_code_udf = udf(f=get_country_code, returnType=StringType())

In [0]:
# Derive a "country_code" column from each row's latitude/longitude via the UDF
df_with_location = df.withColumn(
    "country_code",
    get_country_code_udf(col("latitude"), col("longitude")),
)

In [0]:
# Add a "sig_class" column that buckets each event's "sig" value into a severity label:
#   sig < 100          -> "minor"
#   100 <= sig < 500   -> "Moderate"
#   anything else      -> "High"
# NOTE(review): rows with a null sig match neither condition and therefore get
# "High"; "minor" is also lower-case unlike the other labels — confirm both are intended.
df_with_location_sig_class = df_with_location.withColumn(
    "sig_class",
    when(col("sig") < 100, "minor")
    .when((col("sig") >= 100) & (col("sig") < 500), "Moderate")
    .otherwise("High"),
)

In [0]:
# Peek at the first row to sanity-check the newly added columns
df_with_location_sig_class.head()

## Store data on Azure

In [0]:
# Gold layer path on Azure.
# The folder name is appended directly to gold_adls with no separator inserted,
# so this assumes gold_adls already ends with "/" — TODO confirm.
gold_output_path = f"{gold_adls}earthquake_events_gold/"

In [0]:
# Persist the enriched DataFrame as Parquet under the gold path.
# Append mode adds to whatever is already stored there instead of replacing it.
(
    df_with_location_sig_class.write
    .mode("append")
    .parquet(gold_output_path)
)