In [None]:
from pyspark.sql import SparkSession

In [None]:
# Obtain the Spark session used by the rest of the notebook
spark = (
    SparkSession.builder
    .master('local')
    .appName('app')
    .getOrCreate()
)

# Columns kept from both the trade and the quote partition
selected_columns = ["trade_dt", "symbol", "exchange", "event_tm", "event_seq_nb", "file_tm", "bid_pr"]

# Load the trade ("T") partition, then the quote ("Q") partition
trade_common = spark.read.parquet("/output_dir/partition=T")
quote_common = spark.read.parquet("/output_dir/partition=Q")

# Narrow each partition down to the shared column subset
trade = trade_common.select(*selected_columns)
quote = quote_common.select(*selected_columns)

In [None]:
# Preview the selected trade columns
trade.show()

In [None]:
# Preview the selected quote columns
quote.show()

In [None]:
from pyspark.sql import functions as func

def applyLatest(df, key_cols=("trade_dt", "symbol", "exchange", "event_seq_nb"), order_col="event_tm"):
    """
    Keep only the most recent version of every record.

    A record is identified by the combination of the columns in ``key_cols``.
    The exchange may correct an error in a submitted record by sending a new
    record with the same identifying columns; such corrections carry a later
    ``order_col`` value. For each key, only the row with the greatest
    ``order_col`` is kept, and it is returned with all of its columns intact.

    Logic explanation:
    - Rows are partitioned by ``key_cols`` and numbered from newest to oldest
      by ``order_col``.
    - Only row number 1 (the newest) of every partition is kept.
    - If several rows of one key share the same greatest ``order_col``, exactly
      one of them is kept; which one is not defined.

    NOTE(review): the project description lists "event_tm" as part of the unique
    ID and also names it as the arrival time that distinguishes corrections.
    The defaults follow the second reading (order by "event_tm", identify by
    the remaining ID columns). If the arrival time actually lives in another
    column, pass it as ``order_col`` and add "event_tm" to ``key_cols`` -
    confirm against the real schema.

    :param df: Spark DataFrame containing ``key_cols`` and ``order_col``
    :param key_cols: name, or iterable of names, of the columns that identify a record
    :param order_col: name of the column whose greatest value marks the latest version
    :return: Spark DataFrame with the same columns as ``df`` and one row per key
    :raises ValueError: if ``key_cols`` is empty
    """
    # Local import: the notebook-level imports do not bring Window into scope.
    from pyspark.sql import Window

    # Accept a single column name as well as an iterable of names.
    if isinstance(key_cols, str):
        key_cols = [key_cols]
    key_cols = list(key_cols)
    if not key_cols:
        raise ValueError("key_cols must name at least one column")

    # Temporary helper column; dropped again before returning.
    rank_col = "__latest_rank"
    newest_first = Window.partitionBy(*key_cols).orderBy(func.col(order_col).desc())

    return (
        df.withColumn(rank_col, func.row_number().over(newest_first))
        .where(func.col(rank_col) == 1)
        .drop(rank_col)
    )


In [None]:
# Run applyLatest on the trade data first, then on the quote data
trade_corrected, quote_corrected = map(applyLatest, (trade, quote))

In [None]:
# Preview the result of applyLatest on the trade data
trade_corrected.show()

In [None]:
# Settings for mounting the Azure Blob Storage container.
# NOTE: the "####" values are redacted placeholders. Never commit the real
# account key to the notebook; load it from a secret store instead
# (on Databricks, e.g. dbutils.secrets.get(scope=..., key=...)).
container_name = "####"
storage_name = "####"
mount_dir = "/mnt/data"
key = "####"

# Mounting an already-mounted directory raises an error, so only mount when
# the mount point is not present yet. This keeps the cell safe to re-run.
already_mounted = any(mount.mountPoint == mount_dir for mount in dbutils.fs.mounts())

if not already_mounted:
    dbutils.fs.mount(
        source="wasbs://%s@%s.blob.core.windows.net" % (container_name, storage_name),
        mount_point=mount_dir,
        extra_configs={"fs.azure.account.key.%s.blob.core.windows.net" % (storage_name): key},
    )

# To detach the storage again, call dbutils.fs.unmount(mount_dir).

In [None]:
# Persist the corrected datasets as parquet files on the mounted Blob Storage,
# one output directory per dataset and date.

trade_date = "2020-07-29"
trade_output_path = "/mnt/data/cloud-storage-path/trade/trade_dt={}".format(trade_date)
trade_corrected.write.parquet(trade_output_path)

quote_date = "2020-07-29"
quote_output_path = "/mnt/data/cloud-storage-path/quote/quote_dt={}".format(quote_date)
quote_corrected.write.parquet(quote_output_path)

In [None]:
# END