In [33]:
import pyspark
import os
import pyspark.sql.functions as F
from dotenv import load_dotenv
from delta import * 
from pyspark.sql.types import DecimalType

In [34]:
load_dotenv('.././.env')
access_key = os.getenv("API_KEY")
secret_key = os.getenv("SECRET_ACCESS_KEY")
print(access_key)

AKIAZQ3DTLU6MHKI5HLC


In [35]:
conf = (
    pyspark.conf.SparkConf()
    .setAppName("MY_APP")
    .set(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .set("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .set("spark.hadoop.fs.s3a.access.key", access_key)
    .set("spark.hadoop.fs.s3a.secret.key", secret_key)
    .set("spark.hadoop.fs.s3a.region", "ap-southeast-2")
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .set("spark.sql.shuffle.partitions", "4")
    .setMaster(
        "local[*]"
    )  # replace the * with your desired number of cores. * for use all.
)

extra_packages = [
    "org.apache.hadoop:hadoop-aws:3.3.4",
    "org.apache.hadoop:hadoop-common:3.3.4",
    "com.amazonaws:aws-java-sdk-bundle:1.12.262",
]

builder = pyspark.sql.SparkSession.builder.appName("MyApp").config(conf=conf)
spark = configure_spark_with_delta_pip(
    builder, extra_packages=extra_packages
).getOrCreate()

In [None]:
df = spark.read.format("delta").load("s3a://vdt2025/menu_bronze")

In [37]:
df.printSchema()

root
 |-- menu_id: integer (nullable = true)
 |-- menu_page_id: integer (nullable = true)
 |-- dish_id: integer (nullable = true)
 |-- id: integer (nullable = true)
 |-- price: double (nullable = true)
 |-- high_price: double (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- updated_at: timestamp (nullable = true)
 |-- xpos: double (nullable = true)
 |-- ypos: double (nullable = true)
 |-- dish_name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- menus_appeared: string (nullable = true)
 |-- times_appeared: string (nullable = true)
 |-- first_appeared: string (nullable = true)
 |-- last_appeared: string (nullable = true)
 |-- lowest_price: string (nullable = true)
 |-- highest_price: string (nullable = true)
 |-- page_number: integer (nullable = true)
 |-- image_id: string (nullable = true)
 |-- full_height: integer (nullable = true)
 |-- full_width: integer (nullable = true)
 |-- uuid: string (nullable = true)
 |-- name: string (nullable = t

In [38]:
df = df.withColumn('date',F.when(F.col('date').rlike(r"^\d{4}-\d{2}_\d{2}$"), F.col('date')).otherwise(None))

In [39]:
df = df.withColumn('date', F.to_date("date", "yyyy-MM-dd")) \
        .withColumn('dish_count', F.col('dish_count').cast("int")) \
        .withColumn('page_count', F.col('page_count').cast("int")) \
        .withColumn('last_appeared' , F.col('last_appeared').cast("int")) \
        .withColumn('first_appeared' , F.col('first_appeared').cast("int")) \
        .withColumn('times_appeared' , F.col('times_appeared').cast("int")) \
        .withColumn('highest_price' , F.col('highest_price').cast(DecimalType(10, 3))) \
        .withColumn('menus_appeared', F.col('menus_appeared').cast("int")) \
        .withColumn('uuid' , F.col('uuid').cast("string"))

In [None]:
df = df.dropna()

DataFrame[menu_id: int, menu_page_id: int, dish_id: int, id: int, price: double, high_price: double, created_at: timestamp, updated_at: timestamp, xpos: double, ypos: double, dish_name: string, description: string, menus_appeared: int, times_appeared: int, first_appeared: int, last_appeared: int, lowest_price: string, highest_price: decimal(10,3), page_number: int, image_id: string, full_height: int, full_width: int, uuid: string, name: string, sponsor: string, event: string, venue: string, place: string, physical_description: string, occasion: string, notes: string, call_number: string, keywords: string, language: string, date: date, location: string, location_type: string, currency: string, currency_symbol: string, status: string, page_count: int, dish_count: int]

In [41]:
df.printSchema()

root
 |-- menu_id: integer (nullable = true)
 |-- menu_page_id: integer (nullable = true)
 |-- dish_id: integer (nullable = true)
 |-- id: integer (nullable = true)
 |-- price: double (nullable = true)
 |-- high_price: double (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- updated_at: timestamp (nullable = true)
 |-- xpos: double (nullable = true)
 |-- ypos: double (nullable = true)
 |-- dish_name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- menus_appeared: integer (nullable = true)
 |-- times_appeared: integer (nullable = true)
 |-- first_appeared: integer (nullable = true)
 |-- last_appeared: integer (nullable = true)
 |-- lowest_price: string (nullable = true)
 |-- highest_price: decimal(10,3) (nullable = true)
 |-- page_number: integer (nullable = true)
 |-- image_id: string (nullable = true)
 |-- full_height: integer (nullable = true)
 |-- full_width: integer (nullable = true)
 |-- uuid: string (nullable = true)
 |-- name: string (n

In [42]:
df.show(1)



+-------+------------+-------+------+-----+----------+-------------------+-------------------+--------+--------+----------------+-----------+--------------+--------------+--------------+-------------+------------+-------------+-----------+----------+-----------+----------+--------------------+----+--------------------+-----+----------+--------------------+--------------------+--------+--------------------+-----------+--------+--------+----+--------------------+-------------+--------+---------------+------------+----------+----------+
|menu_id|menu_page_id|dish_id|    id|price|high_price|         created_at|         updated_at|    xpos|    ypos|       dish_name|description|menus_appeared|times_appeared|first_appeared|last_appeared|lowest_price|highest_price|page_number|  image_id|full_height|full_width|                uuid|name|             sponsor|event|     venue|               place|physical_description|occasion|               notes|call_number|keywords|language|date|            loca

                                                                                

In [None]:
# df.write.format("delta").save("s3a://vdt2025/menu_silver")