In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import StructType,StructField,StringType,DoubleType,TimestampType, ArrayType,LongType,BinaryType
import json

In [0]:
def _as_json_text(value):
    """Return ``value`` as JSON text that ``F.from_json`` can parse.

    * ``bytes`` / ``bytearray`` are decoded as UTF-8 (``json.dumps`` would
      raise ``TypeError`` on them).
    * ``str`` is assumed to already be JSON text and is passed through;
      re-encoding it with ``json.dumps`` would wrap it in quotes, turning the
      document into a JSON string literal that cannot match a struct schema.
    * Anything else (dict, list, numbers, ...) is serialised with ``json.dumps``.
    """
    if isinstance(value, (bytes, bytearray)):
        return bytes(value).decode("utf-8")
    if isinstance(value, str):
        return value
    return json.dumps(value)


def parse_value_to_row(value,schema):
    """Parse one payload against ``schema`` and return its fields as a dict.

    Args:
        value: The payload to parse: a JSON-serialisable Python object, JSON
            text (``str``), or UTF-8 encoded JSON (``bytes`` / ``bytearray``).
            NOTE(review): the type of the feed's ``value`` column is not visible
            in this notebook -- confirm which of these the callers pass.
        schema: Schema handed to ``F.from_json``. The result is expanded with
            ``parsed.*``, so it has to describe a struct.

    Returns:
        dict: Column name -> parsed value for the single resulting row.

    Relies on the notebook-global ``spark`` session. Each call builds and
    collects a one-row DataFrame, i.e. one Spark job per payload.
    """
    parsed_df = (
        spark.createDataFrame([{"value": _as_json_text(value)}])
        .withColumn("parsed", F.from_json(F.col("value"), schema))
        .select("parsed.*")
    )
    # Exactly one input row was created above, so index 0 is the parsed payload.
    return parsed_df.collect()[0].asDict()

def process_feed():
    """Walk new bronze feed rows, parse known topics and quarantine the rest.

    Reads every row of ``bookstore.bronze.feeds`` whose ``insert_ts`` is greater
    than the latest ``insert_ts`` in ``bookstore.silver.books`` and handles the
    rows one at a time on the driver:

    * topic present in ``topic_map``: the payload is parsed with
      ``parse_value_to_row`` and merged with the remaining feed columns
      (feed columns win on a name clash). The append to the silver table is
      currently disabled, so nothing is written for these rows.
    * any other topic: the untouched row is appended to
      ``bookstore.bronze.quarantines``.

    Relies on the notebook globals ``spark``, ``schema_books``,
    ``schema_customers`` and ``schema_orders`` being defined before the call.

    Returns:
        None
    """
    # NOTE(review): the watermark comes from silver.books only, for every topic.
    # When that table is empty max(insert_ts) is NULL, the comparison is never
    # true, and no rows are selected -- confirm this is the intended behaviour.
    feed = spark.sql("""
                     select *
                     from bookstore.bronze.feeds
                     where insert_ts > (
                         select max(insert_ts)
                         from bookstore.silver.books
                         )
                         """)

    # Topic name -> (target silver table, schema used to parse the payload).
    topic_map = {
        'books': ('bookstore.silver.books', schema_books),
        'customers': ('bookstore.silver.customers', schema_customers),
        'orders': ('bookstore.silver.orders', schema_orders)
    }

    for row in feed.collect():
        row_dict = row.asDict()
        topic = row_dict['topic']
        if topic in topic_map:
            table, schema = topic_map[topic]
            value = row_dict['value']
            parsed = parse_value_to_row(value, schema)
            # The routing columns are consumed here; only the remaining feed
            # columns are carried over next to the parsed fields.
            row_dict.pop('topic', None)
            row_dict.pop('value', None)
            merged_dict = {**parsed, **row_dict}
            # TODO: the silver append is intentionally left disabled, as in the
            # original notebook. Re-enable once the target schema is settled:
            # (
            #     spark.createDataFrame([merged_dict])
            #     .write
            #     .format('delta')
            #     .mode('append')
            #     .saveAsTable(table)
            # )
        else:
            # Unknown topic: keep the full original row (topic and value
            # included) so it can be inspected later.
            (
                spark.createDataFrame([row_dict])
                .write
                .format('delta')
                .mode('append')
                .saveAsTable('bookstore.bronze.quarantines')
            )

In [0]:
import sys
# Put the project's transformations folder on the module search path so the
# import below can resolve.
# NOTE(review): this is an absolute, user-specific workspace path -- it only
# works for this workspace user; consider a configurable or relative location.
sys.path.append('/Workspace/Users/akshay.chidrawar@ltimindtree.com/bookstore/transformations')
# Schemas used when parsing the feed payloads. schema_feeds is not referenced
# in the cells visible here.
from raw_to_bronze_input import schema_feeds,schema_books,schema_customers,schema_orders

def drop_columns_from_schema(schema, columns_to_drop):
    """Build a copy of ``schema`` without the named columns.

    Args:
        schema: Object exposing a ``fields`` iterable whose items have a
            ``name`` attribute (e.g. a ``StructType``).
        columns_to_drop: Container of field names to leave out; membership is
            checked with ``in``.

    Returns:
        StructType: New struct holding the remaining fields in their original
        order. The input schema is not modified.
    """
    kept_fields = []
    for candidate in schema.fields:
        if candidate.name in columns_to_drop:
            continue
        kept_fields.append(candidate)
    return StructType(kept_fields)

# Topic name -> (target silver table, schema used to parse the payload).
topic_map = dict(
    books=('bookstore.silver.books', schema_books),
    customers=('bookstore.silver.customers', schema_customers),
    orders=('bookstore.silver.orders', schema_orders),
)

# Preview: parse up to 10 bronze feed rows and display each as a one-row
# DataFrame. Nothing is written to any table here.
feed = spark.sql("""
                    select *
                    from bookstore.bronze.feeds""").limit(10)
for row in feed.collect():
    row_dict = row.asDict()
    topic = row_dict['topic']
    if topic not in topic_map:
        # Rows with an unmapped topic are simply skipped in this preview.
        continue
    table, schema = topic_map[topic]
    value = row_dict['value']
    parsed = parse_value_to_row(value, schema)
    row_dict.pop('topic', None)
    row_dict.pop('value', None)
    # Schema for the displayed frame: the topic schema minus these two columns.
    schema = drop_columns_from_schema(schema, ['create_ts', 'value'])
    # Parsed fields first, then the remaining feed columns (which win on a
    # name clash).
    merged_dict = dict(parsed)
    merged_dict.update(row_dict)
    df = spark.createDataFrame([merged_dict], schema)
    display(df)


In [0]:
# Scratch check: parse one hand-written order payload with from_json.
# Note that this cell rebinds the notebook-level names `value`, `schema`,
# `df` and `parsed`.
value = {
    'order_id': '000000003996',
    'order_timestamp': '2021-12-29 14:03:00',
    'customer_id': 'C00159',
    'quantity': 2,
    'total': 73,
    'books': [
        {'book_id': 'B09', 'quantity': 1, 'subtotal': 24},
        {'book_id': 'B01', 'quantity': 1, 'subtotal': 49},
    ],
}
value_str = json.dumps(value)
display(value_str)

# Explicit schema for the payload above, including the nested `books` array.
schema = StructType([
    StructField('order_id', StringType()),
    StructField('order_timestamp', TimestampType()),
    StructField('customer_id', StringType()),
    StructField('quantity', LongType()),
    StructField('total', LongType()),
    StructField('books', ArrayType(StructType([
        StructField('book_id', StringType()),
        StructField('quantity', LongType()),
        StructField('subtotal', LongType()),
    ]))),
])

# One-row frame holding the raw JSON text in a `value` column.
df = spark.createDataFrame([{"value": value_str}])

parsed = df.select(
    F.from_json(F.col("value"), schema).alias("parsed")
).select("parsed.*")

# Display the frame built in this cell; `parsed_df` is not defined at notebook
# level, so referencing it here would raise NameError.
display(parsed)



In [0]:
from raw_to_bronze import *
from raw_to_bronze_input import *

In [0]:
from pyspark.sql import SparkSession
# Reuse the active Spark session, or create one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Build the raw feed and one DataFrame per topic.
# NOTE(review): raw_feed, topics_feed, path_feed and schema_feed are not defined
# in the visible cells; presumably they come from the star imports of
# raw_to_bronze / raw_to_bronze_input -- confirm. An earlier cell imports
# `schema_feeds` (plural) from raw_to_bronze_input, so verify that
# `schema_feed` is the intended name.
feed = raw_feed(spark=spark,path_=path_feed,schema_=schema_feed)
books = topics_feed(feed=feed,name_='books',schema_=schema_books)
customers = topics_feed(feed=feed,name_='customers',schema_=schema_customers)
orders = topics_feed(feed=feed,name_='orders',schema_=schema_orders)

In [0]:
# Render the raw feed followed by each per-topic DataFrame, in that order.
for frame in (feed, books, customers, orders):
    display(frame)