In [0]:
# Data wrangling
# - upstream: Generate_urls

In [0]:
# Declare every notebook parameter up front (same creation order as before),
# then read back the two values that later cells reference by variable name.
dbutils.widgets.text("db", "mycatalog", "Database")
dbutils.widgets.text("schema", "hp_prd_data", "Schema")
dbutils.widgets.text("Source_table", "generated_urls")
dbutils.widgets.text("Target_table", "raw_data")
dbutils.widgets.text("authority", "")

# NOTE: the variable is spelled `catelog` (sic); the spelling is kept because
# the config cell reads it under exactly this name.
catelog = dbutils.widgets.get("db")
schema = dbutils.widgets.get("schema")


In [0]:
# Single lookup table for the fully qualified table names used below:
# <db>.<schema>.<Source_table> is read, <db>.<schema>.<Target_table> is written.
config_dict = {"db": catelog, "schema": schema}

# The table names are taken directly from the widgets of the same name.
for table_widget in ("Target_table", "Source_table"):
    config_dict[table_widget] = dbutils.widgets.get(table_widget)

In [0]:
import requests
import json
from datetime import datetime
from pyspark.sql.types import *
from pyspark.sql.functions import current_timestamp, hour
from pyspark.sql import Row

In [0]:
# Base HTTP headers sent with every request. The values mimic a desktop
# Chromium-based browser; "authority" comes from the notebook widget and
# "referer" is an empty placeholder that the fetch loop overrides per URL.
authority_value = dbutils.widgets.get("authority")

headers = {
    "authority": authority_value,
    "accept": "*/*",
    "accept-language": "en-US,en;q=0.7",
    "referer": "",
    "sec-ch-ua": '"Not A(Brand";v="99", "Brave";v="121", "Chromium";v="121"',  # noqa
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "sec-gpc": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",  # noqa
}

In [0]:
# Read the URLs to fetch from the source table and pull them onto the driver
# as plain (zip, api_url, refereral_url) tuples for the sequential fetch loop.
source_table_name = f"{config_dict['db']}.{config_dict['schema']}.{config_dict['Source_table']}"
url_columns = ("zip", "api_url", "refereral_url")

urls_df = spark.table(source_table_name).select(*url_columns)

# collect() brings every row to the driver; each row becomes one tuple in
# the same column order as `url_columns`.
collected_rows = urls_df.collect()
urls_list = [tuple(row[column] for column in url_columns) for row in collected_rows]

In [0]:
# Accumulator for the fetch results: the fetch loop appends one Row per URL.
raw_records = list()

In [0]:
# Fetch every generated URL once and keep the raw response body together with
# its HTTP status, so parsing can happen downstream from the stored payload.

# Start from an empty accumulator so that re-running this cell does not append
# a second copy of every record on top of the previous run's results (which
# would then be written to the target table as duplicates).
raw_records = []

# Upper bound, in seconds, on how long a single request may wait on the server.
# requests applies no timeout by default, so one stalled connection would
# otherwise block the whole notebook run indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

for zipcode, api_url, ref_url in urls_list:
    print("Fetching data for url :", api_url)
    try:
        # Per-request copy of the shared headers with this URL's referer filled in.
        req_header = {**headers, "referer": ref_url}
        response = requests.get(
            api_url,
            headers=req_header,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        print("From server :", response.status_code, "for url -", api_url)
        # Non-2xx responses are stored as-is; the status code column lets
        # downstream consumers filter them.
        raw_records.append(Row(
            zipcode=zipcode,
            source_url=api_url,
            raw_json=response.text,
            http_status_code=response.status_code,
            # Naive datetime holding the current UTC wall-clock time.
            ingestion_timestamp=datetime.utcnow()
        ))

    except Exception as e:
        # Best effort: a failed request (connection error, timeout, ...) must not
        # abort the remaining URLs. The failure is recorded as a row instead.
        # NOTE: the 500 here is a placeholder chosen by this notebook, not a
        # status returned by the server; the error text is in raw_json.
        raw_records.append(Row(
            zipcode=zipcode,
            source_url=api_url,
            raw_json=json.dumps({"error": str(e)}),
            http_status_code=500,
            ingestion_timestamp=datetime.utcnow()
        ))

In [0]:
# Turn the collected Row objects into a Spark DataFrame; the schema is
# inferred from the Row fields.
# NOTE(review): with an empty `raw_records` there is nothing to infer the
# schema from — confirm the source table is never empty or add a guard.
fetched_df = spark.createDataFrame(raw_records)

# Stamp each record with the processing date and hour taken from Spark's
# current timestamp.
processing_ts = current_timestamp()
raw_df = (
    fetched_df
    .withColumn("processed_date", processing_ts.cast("date"))
    .withColumn("processed_hour", hour(processing_ts))
)

In [0]:
# Append this run's records to the Delta target table <db>.<schema>.<Target_table>.
target_table_name = f"{config_dict['db']}.{config_dict['schema']}.{config_dict['Target_table']}"

(
    raw_df.write
    .format("delta")
    .mode("append")
    .saveAsTable(target_table_name)
)