In [0]:
import pyspark
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession

In [0]:
# Reuse the active SparkSession if one already exists, otherwise create one
spark = SparkSession.builder.getOrCreate()

In [0]:
# Set Azure blob configuration
# NOTE(review): the storage account key is passed as a string literal. If the
# working copy of this notebook holds a real key, load it from a secret store
# or environment variable before sharing or committing.
spark.conf.set(
  "somecontainer@somestorage",
  "somekey")

# Read parquet and create dataframe
# Only the partition=T directory of output_dir is loaded
# (presumably the trade records -- confirm against the job that wrote it).
trade_common = spark.read.format('parquet').load("wasbs://somecontainer@somestorage.blob.core.windows.net/output_dir/partition=T")

In [0]:
# Project the raw trade dataframe down to the columns used downstream
trade = trade_common.select(
    "trade_dt",
    "rec_type",
    "symbol",
    "exchange",
    "event_tm",
    "event_seq_nb",
    "arrival_tm",
    "trade_pr",
)

In [0]:
# Preview the first 10 rows of the selected trade columns
trade.show(10)

In [0]:
def _latest_by_key(df, latest_col, value_cols):
    """Keep, for every record key, the row(s) that carry the latest ``event_tm``.

    Rows are grouped by (trade_dt, rec_type, symbol, arrival_tm, event_seq_nb),
    the maximum ``event_tm`` of each group is computed, and the source rows that
    belong to the same group AND carry that timestamp are joined back to pick
    up ``exchange`` and the value columns.

    Args:
        df: Spark DataFrame containing the key columns, ``event_tm``,
            ``exchange`` and every column named in ``value_cols``.
        latest_col: Name given to the max-``event_tm`` column in the result.
        value_cols: Record-type specific columns to carry through.

    Returns:
        Spark DataFrame with columns trade_dt, rec_type, stock_symbol,
        stock_exchange, <latest_col>, event_seq_nb, arrival_tm, *value_cols,
        ordered by trade_dt, stock_symbol, event_seq_nb.
    """
    # Function-scope import so the aggregate does not depend on the notebook's
    # star import having rebound the builtin ``max`` to the Spark function.
    from pyspark.sql import functions as F

    key_cols = ["trade_dt", "rec_type", "symbol", "arrival_tm", "event_seq_nb"]

    # Latest event time per record key
    latest_per_key = df.groupBy(*key_cols).agg(F.max("event_tm").alias(latest_col))

    # Alias both sides so the self-join can reference each side unambiguously
    latest = latest_per_key.alias("latest")
    detail = df.alias("detail")

    # Match on the full record key as well as the timestamp. Joining on the
    # timestamp alone would pair a group with rows of any other symbol or
    # sequence number that happen to share the same event_tm.
    # Keys are compared null-safely because groupBy treats null as a group value.
    same_key = [
        F.col("latest." + name).eqNullSafe(F.col("detail." + name))
        for name in key_cols
    ]
    same_time = F.col("detail.event_tm") == F.col("latest." + latest_col)
    joined = latest.join(detail, same_key + [same_time], "inner")

    # Retrieve relevant fields
    projected = joined.select(
        F.col("latest.trade_dt"),
        F.col("latest.rec_type"),
        F.col("latest.symbol").alias("stock_symbol"),
        F.col("detail.exchange").alias("stock_exchange"),
        F.col("latest." + latest_col),
        F.col("latest.event_seq_nb"),
        F.col("latest.arrival_tm"),
        *[F.col("detail." + name) for name in value_cols]
    )
    return projected.orderBy("trade_dt", "stock_symbol", "event_seq_nb")


def apply_latest(df):
    """Reduce a trade ("T") or quote ("Q") dataframe to its latest records.

    The record type is read from the first row of ``df``; the whole dataframe
    is assumed to hold a single record type.

    Args:
        df: Spark DataFrame with trade_dt, rec_type, symbol, exchange,
            event_tm, event_seq_nb, arrival_tm, plus ``trade_pr`` for trades
            or ``bid_pr``, ``bid_size``, ``ask_pr``, ``ask_size`` for quotes.

    Returns:
        Spark DataFrame with the latest event time exposed as ``latest_trade``
        (trades) or ``latest_quote`` (quotes), ``symbol``/``exchange`` renamed
        to ``stock_symbol``/``stock_exchange``, ordered by trade date, symbol
        and event sequence number.

    Raises:
        ValueError: If ``df`` has no rows, or its ``rec_type`` is neither
            "T" nor "Q".
    """
    # rec_type -> (name of the latest-event column, type-specific value columns)
    specs = {
        "T": ("latest_trade", ["trade_pr"]),
        "Q": ("latest_quote", ["bid_pr", "bid_size", "ask_pr", "ask_size"]),
    }

    # first() triggers a Spark job, so run it once and reuse the row
    first_row = df.first()
    if first_row is None:
        raise ValueError("apply_latest received an empty dataframe; cannot determine rec_type")

    rec_type = first_row["rec_type"]
    if rec_type not in specs:
        raise ValueError("apply_latest does not support rec_type {!r}; expected 'T' or 'Q'".format(rec_type))

    latest_col, value_cols = specs[rec_type]
    return _latest_by_key(df, latest_col, value_cols)

In [0]:
# Apply the latest-record logic defined in apply_latest to the trade dataframe
trade_corrected = apply_latest(trade)

In [0]:
# Preview the first 10 rows returned by apply_latest for trades
trade_corrected.show(10)

In [0]:
# Split the corrected trades into one dataframe per trade date
trade_corrected_05 = trade_corrected.filter(col("trade_dt") == "2020-08-05")
trade_corrected_06 = trade_corrected.filter(col("trade_dt") == "2020-08-06")

In [0]:
# Preview the 2020-08-05 trades (show() with no argument prints up to 20 rows)
trade_corrected_05.show()

In [0]:
# Write to Azure blob in parquet format
# Each trade date goes to its own directory: trade/trade_dt=<date>.
# NOTE(review): no save mode is set, so Spark's default applies and the write
# raises if the target path already exists -- re-running this cell against
# existing output will fail. Confirm whether .mode("overwrite") is wanted.
trade_corrected_05.write.parquet("wasbs://somecontainer@somestorage.blob.core.windows.net/trade/trade_dt={}".format('2020-08-05'))
trade_corrected_06.write.parquet("wasbs://somecontainer@somestorage.blob.core.windows.net/trade/trade_dt={}".format('2020-08-06'))

In [0]:
# Set Azure blob configuration
# Same key/value as the setting applied before the trade load earlier in the
# notebook; repeated here so this section can run on its own.
# NOTE(review): the storage account key is passed as a string literal. If the
# working copy of this notebook holds a real key, load it from a secret store
# or environment variable before sharing or committing.
spark.conf.set(
  "somecontainer@somestorage",
  "somekey")

# Read parquet and create dataframe
# Only the partition=Q directory of output_dir is loaded
# (presumably the quote records -- confirm against the job that wrote it).
quote_common = spark.read.format('parquet').load("wasbs://somecontainer@somestorage.blob.core.windows.net/output_dir/partition=Q")

In [0]:
# Project the raw quote dataframe down to the columns used downstream
quote = quote_common.select(
    "trade_dt",
    "rec_type",
    "symbol",
    "exchange",
    "event_tm",
    "event_seq_nb",
    "arrival_tm",
    "bid_pr",
    "bid_size",
    "ask_pr",
    "ask_size",
)

In [0]:
# Preview the first 10 rows of the selected quote columns
quote.show(10)

In [0]:
# Apply the latest-record logic defined in apply_latest to the quote dataframe
quote_corrected = apply_latest(quote)

In [0]:
# Preview the first 10 rows returned by apply_latest for quotes
quote_corrected.show(10)

In [0]:
# Split the corrected quotes into one dataframe per trade date
quote_corrected_05 = quote_corrected.filter(col("trade_dt") == "2020-08-05")
quote_corrected_06 = quote_corrected.filter(col("trade_dt") == "2020-08-06")

In [0]:
# Write to Azure blob in parquet format
# Each trade date goes to its own directory: quote/trade_dt=<date>.
# NOTE(review): no save mode is set, so Spark's default applies and the write
# raises if the target path already exists -- re-running this cell against
# existing output will fail. Confirm whether .mode("overwrite") is wanted.
quote_corrected_05.write.parquet("wasbs://somecontainer@somestorage.blob.core.windows.net/quote/trade_dt={}".format('2020-08-05'))
quote_corrected_06.write.parquet("wasbs://somecontainer@somestorage.blob.core.windows.net/quote/trade_dt={}".format('2020-08-06'))