## Create table for OLTP "payments" data

In [0]:
from pyspark.sql.functions import spark_partition_id, col
from functools import reduce

In [0]:
# Load the bronze payments data (stored as Delta) into the raw input frame
raw_payments_df = spark.read.load("/lakehouse/bronze/payments", format="delta")

In [0]:
# Total number of rows in the raw payments frame
raw_row_count = raw_payments_df.count()
raw_row_count

1946607

In [0]:
# Checking data skew: number of rows held by each Spark partition.
# Note: the "Core" column holds the partition id returned by spark_partition_id().
rows_per_partition = raw_payments_df.groupBy(spark_partition_id().alias("Core")).count()
rows_per_partition.show()

+----+------+
|Core| count|
+----+------+
|   0|539255|
|   1|525206|
|   2|510264|
|   3|371882|
+----+------+



In [0]:
# Renaming columns by position: the i-th raw column receives the i-th name
# listed in ColumnNames.
ColumnNames = ["payment_id", "date", "amount", "rider_id"]
rawColumnNames = raw_payments_df.schema.names

payments_df = raw_payments_df
for position, new_name in enumerate(ColumnNames):
    payments_df = payments_df.withColumnRenamed(rawColumnNames[position], new_name)
payments_df.printSchema()

root
 |-- payment_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- rider_id: string (nullable = true)



In [0]:
# Retyping columns: cast each renamed column to its target type
target_types = {
    "payment_id": "Integer",
    "date": "Date",
    "amount": "Decimal(10,2)",
    "rider_id": "Integer",
}
for column_name, target_type in target_types.items():
    payments_df = payments_df.withColumn(column_name, col(column_name).cast(target_type))
payments_df.printSchema()

root
 |-- payment_id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- amount: decimal(10,2) (nullable = true)
 |-- rider_id: integer (nullable = true)



In [0]:
# Sanity check: the retyped frame must still hold as many rows as the raw frame
rows_before = raw_payments_df.count()
rows_after = payments_df.count()
rows_after == rows_before

True

In [0]:
# Register the typed payments frame as a temp view so it can be queried with SQL
payments_df.createOrReplaceTempView("oltp_payments")

In [0]:
# Checking for Null values in every column.
# payment_id is checked as well: with ANSI mode off, a cast that cannot convert
# a value yields NULL, so a broken key would otherwise go unnoticed here.
display(spark.sql('''
          SELECT 
            payment_id, date, amount, rider_id
          FROM 
            oltp_payments
          WHERE
            payment_id IS NULL
            OR
            date IS NULL
            OR
            amount IS NULL
            OR
            rider_id IS NULL

        '''))

payment_id,date,amount,rider_id


In [0]:
# Checking for duplicates of the payment_id key.
# count(*) is used rather than count(payment_id): count(<column>) skips NULLs,
# so rows sharing a NULL payment_id could never be reported as duplicates.
display(spark.sql(''' 
                  SELECT 
                    payment_id 
                  FROM 
                    oltp_payments 
                  GROUP BY payment_id
                  HAVING count(*) > 1  
                  '''
                ))

payment_id


In [0]:
# Earliest and latest payment date in the view
date_range_query = ''' 
                  SELECT 
                    min(date), max(date)
                  FROM 
                    oltp_payments 
                  '''
date_range_df = spark.sql(date_range_query)
display(date_range_df)

min(date),max(date)
2013-02-01,2022-02-01


In [0]:
# Smallest and largest payment amount in the view
amount_range_query = ''' 
                  SELECT 
                    min(amount), max(amount)
                  FROM 
                    oltp_payments 
                  '''
amount_range_df = spark.sql(amount_range_query)
display(amount_range_df)

min(amount),max(amount)
3.0,25.0


In [0]:
# Persist the typed payments to the silver path as Delta, overwriting what is there
silver_writer = payments_df.write.format("delta").mode("overwrite")
silver_writer.save("/lakehouse/silver/oltp_payments")


In [0]:
# Create a Delta table: read the silver files back and save them as table "silver_payments"
silver_payments_df = spark.read.format("delta").load("/lakehouse/silver/oltp_payments")
silver_payments_df.write.format("delta").mode("overwrite").saveAsTable("silver_payments")