In [1]:
import pyspark
import os
import pyspark.sql.functions as F
from dotenv import load_dotenv
from delta import * 
from pyspark.sql.types import DecimalType

In [2]:
load_dotenv('.././.env')
access_key = os.getenv("API_KEY")
secret_key = os.getenv("SECRET_ACCESS_KEY")
print(access_key)

AKIAZQ3DTLU6MHKI5HLC


In [3]:
conf = (
    pyspark.conf.SparkConf()
    .setAppName("MY_APP")
    .set(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .set("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .set("spark.hadoop.fs.s3a.access.key", access_key)
    .set("spark.hadoop.fs.s3a.secret.key", secret_key)
    .set("spark.hadoop.fs.s3a.region", "ap-southeast-2")
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .set("spark.sql.parquet.datetimeRebaseModeInWrite", "LEGACY")
    .set("spark.sql.shuffle.partitions", "4")
    .setMaster(
        "local[*]"
    )  # replace the * with your desired number of cores. * for use all.
)

extra_packages = [
    "org.apache.hadoop:hadoop-aws:3.3.4",
    "org.apache.hadoop:hadoop-common:3.3.4",
    "com.amazonaws:aws-java-sdk-bundle:1.12.262",
]

builder = pyspark.sql.SparkSession.builder.appName("MyApp").config(conf=conf)
spark = configure_spark_with_delta_pip(
    builder, extra_packages=extra_packages
).getOrCreate()

25/06/18 11:20:21 WARN Utils: Your hostname, duc-Inspiron-15-5510 resolves to a loopback address: 127.0.1.1; using 192.168.1.9 instead (on interface wlp0s20f3)
25/06/18 11:20:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/duc/.ivy2/cache
The jars for the packages stored in: /home/duc/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
org.apache.hadoop#hadoop-common added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a39467df-2153-40a1-bd41-4f7dd60b3bba;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.3.1 in central
	found io.delta#delta-storage;3.3.1 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found org.apache.hadoop#hadoop-common;3.3.4 in central
	found org.apache.hadoop.thirdparty#hadoop-shaded-protobuf_3_7;1.1.1 in central
	found org.apache.hadoop#hadoop-annotations;3.3.4 in central
	found org.apache.hadoop.thirdparty#h

In [4]:
df = spark.read.format("delta").load("s3a://vdt2025/menu_bronze")

25/06/18 11:20:47 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


In [5]:
df.printSchema()

root
 |-- menu_id: integer (nullable = true)
 |-- menu_page_id: integer (nullable = true)
 |-- dish_id: integer (nullable = true)
 |-- id: integer (nullable = true)
 |-- price: double (nullable = true)
 |-- high_price: double (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- updated_at: timestamp (nullable = true)
 |-- xpos: double (nullable = true)
 |-- ypos: double (nullable = true)
 |-- dish_name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- menus_appeared: string (nullable = true)
 |-- times_appeared: string (nullable = true)
 |-- first_appeared: string (nullable = true)
 |-- last_appeared: string (nullable = true)
 |-- lowest_price: string (nullable = true)
 |-- highest_price: string (nullable = true)
 |-- page_number: integer (nullable = true)
 |-- image_id: string (nullable = true)
 |-- full_height: integer (nullable = true)
 |-- full_width: integer (nullable = true)
 |-- uuid: string (nullable = true)
 |-- name: string (nullable = t

In [6]:
df = df.withColumn('date', F.to_date("date", "yyyy-MM-dd")) \
        .withColumn('dish_count', F.col('dish_count').cast("int")) \
        .withColumn('page_count', F.col('page_count').cast("int")) \
        .withColumn('last_appeared' , F.col('last_appeared').cast("int")) \
        .withColumn('first_appeared' , F.col('first_appeared').cast("int")) \
        .withColumn('times_appeared' , F.col('times_appeared').cast("int")) \
        .withColumn('highest_price' , F.col('highest_price').cast(DecimalType(10, 3))) \
        .withColumn('menus_appeared', F.col('menus_appeared').cast("int")) \
        .withColumn('uuid' , F.col('uuid').cast("string"))

In [7]:
df = df.withColumn('date',F.when(F.col('date').rlike(r"^\d{4}-\d{2}-\d{2}$"), F.col('date')).otherwise(None))

In [8]:
df.count()

25/06/18 11:21:02 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

1329175

In [9]:
df = df.fillna(0, subset=['dish_count', 'page_count', 'times_appeared', 'menus_appeared', 'first_appeared', 'last_appeared'])

In [10]:
df = df.fillna('Unknown', subset=['dish_name', 'name', 'sponsor', 'event', 'venue', 'place', 'location', 'location_type', 'currency', 'currency_symbol', 'status', 'language'])
df = df.fillna('', subset=['description', 'physical_description', 'occasion', 'notes', 'call_number', 'keywords'])
df = df.fillna('N/A_UUID', subset=['uuid'])
df = df.fillna('N/A_ImageID', subset=['image_id'])


In [11]:
df.count()

                                                                                

1329175

In [12]:
df.write.format("delta").mode('overwrite').save("s3a://vdt2025/menu_silver")

25/06/18 11:21:45 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                