In [0]:
# GUIDED CAPSTONE STEP 3: THIS NOTEBOOK TAKES THE PARQUET FILES IN THE DATABRICKS CLUSTER AND WRITES THEM BACK TO AZURE BLOB STORAGE

In [0]:
# packages used
import os
from pyspark.sql.functions import col, row_number, desc, concat, rank
from pyspark.sql.window import Window

In [0]:
# recall: examples of some of the parquet files
# follow one sample path down the output tree and print the entries found at each level
for sample_dir in (
  '/dbfs/output/',
  '/dbfs/output/nyse',
  '/dbfs/output/nyse/20200805',
  '/dbfs/output/nyse/20200805/partition=T',
):
  print(os.listdir(sample_dir))

In [0]:
# Azure credentials
storageAccountName = 'saderekguidedcapstone'
blobContainerName = 'container1'
# NOTE(review): this looks like a redacted placeholder rather than a real key -
# substitute the actual access key (ideally read from a secret store) before running
storageAccountAccessKey = '<STORAGE-ACCOUNT-ACCESS-KEY'

# hand the access key to Spark under the per-account setting name for this storage account
accountKeyConfName = 'fs.azure.account.key.{}.blob.core.windows.net'.format(storageAccountName)
spark.conf.set(accountKeyConfName, storageAccountAccessKey)

In [0]:
# data correction for trade/quote data
# it's possible for the exchange to resend data for the same trade_dt/symbol/exchange/event_tm/event_seq_nb in a different row but with a later arrival_tm
# if this is the case, use the record with the latest arrival_tm
# https://stackoverflow.com/questions/54921359/concat-multiple-columns-of-a-dataframe-using-pyspark
# https://hendra-herviawan.github.io/pyspark-groupby-and-aggregate-functions.html
# https://sparkbyexamples.com/pyspark/pyspark-window-functions/
def applyLatest(df):
  """Keep only the most recently arrived row for each logical event.

  Rows are grouped by (trade_dt, symbol, exchange, event_tm, event_seq_nb);
  within each group only the row with the greatest arrival_tm is returned.

  Parameters:
    df: Spark DataFrame containing the five key columns above plus arrival_tm.

  Returns:
    A DataFrame with the same columns as df and one row per distinct key.
    If several rows of a group share the latest arrival_tm, which of them is
    kept is not defined.
  """
  key_cols = ['trade_dt', 'symbol', 'exchange', 'event_tm', 'event_seq_nb']
  # helper column name chosen so it cannot collide with a real data column
  rank_col = '__arrival_rank'

  # Partition on the key columns themselves instead of a concatenated string:
  # concatenation without a separator can map different key tuples to the same
  # text, and concat() yields NULL when any input is NULL, which would lump
  # every row with a missing key value into one group and drop all but one.
  latest_first = Window.partitionBy(*key_cols).orderBy(col('arrival_tm').desc())

  return df.withColumn(rank_col, row_number().over(latest_first)) \
    .where(col(rank_col) == 1) \
    .drop(rank_col)

In [0]:
# test applyLatest
# both rows share the same trade_dt/symbol/exchange/event_tm/event_seq_nb and differ only in
# arrival_tm and the trade payload, so only the later arrival should survive
test_schema = ['trade_dt', 'symbol', 'exchange', 'event_tm', 'event_seq_nb', 'arrival_tm', 'trade_pr', 'trade_size']
test_data = [
  ("2020-08-06","SYMA","NASDAQ","2020-08-06 10:42:21.079",10,"2020-08-06 09:30:00.000",78.93245610745132,368),
  ("2020-08-06","SYMA","NASDAQ","2020-08-06 10:42:21.079",10,"2020-08-07 09:30:00.000",50.654654,500)
]
df = spark.createDataFrame(test_data, schema=test_schema)
df.show()
latest_df = applyLatest(df)
latest_df.show()

# verify the outcome instead of relying on eyeballing the printed tables,
# so a regression stops a "Run All" instead of passing silently
latest_rows = latest_df.collect()
assert len(latest_rows) == 1, 'expected exactly one row per unique key'
assert latest_rows[0]['arrival_tm'] == '2020-08-07 09:30:00.000', 'expected the row with the latest arrival_tm'
assert latest_rows[0]['trade_size'] == 500, 'expected the payload of the latest row'

In [0]:
def write_parquet_to_blob(exchange, partition, date, mode=None):
  """Copy one partition of the parsed output from DBFS to Azure Blob Storage.

  Reads /output/<exchange>/<date>/partition=<partition>, keeps the columns
  relevant to that record type, applies applyLatest to trades and quotes, and
  writes the result as parquet to
  data/output/<exchange>/<subfolder>/<subfolder>_dt=<date> in the container
  named by the notebook-level blobContainerName / storageAccountName.

  Parameters:
    exchange: e.g. 'nyse', 'nasdaq'
    partition: 'T' (trade), 'Q' (quote) or 'B' (bad data)
    date: e.g. '20200805', '20200806'
    mode: optional save mode passed to DataFrameWriter.parquet (for example
      'overwrite' to allow re-running). None keeps Spark's default behaviour.

  Returns:
    None. For an unrecognised partition a message is printed and nothing is
    read or written.
  """
  key_cols = ['trade_dt', 'symbol', 'exchange', 'event_tm', 'event_seq_nb', 'arrival_tm']
  # partition code -> (blob subfolder, columns to keep, apply latest correction?)
  layouts = {
    'T': ('trade', key_cols + ['trade_pr', 'trade_size'], True),
    'Q': ('quote', key_cols + ['bid_pr', 'bid_size', 'ask_pr', 'ask_size'], True),
    'B': ('bad_data', ['bad_data'], False),
  }

  # validate before touching storage: the read path is built from the partition
  # code, so checking afterwards would let a bad code fail inside the read
  # instead of reaching this message
  if not isinstance(partition, str) or partition not in layouts:
    print('Provide a valid value for parameter partition. Acceptable values include "T" (trade), "Q" (quote), or "B" (bad data).')
    return
  subfolder, keep_cols, needs_latest = layouts[partition]

  # read partitioned parquet and remove the columns not needed for this record type
  source_path = '/output/{}/{}/partition={}'.format(exchange, date, partition)
  selected = spark.read.parquet(source_path).select(*keep_cols)

  # apply latest correction (trade/quote only; bad data is written as-is)
  result = applyLatest(selected) if needs_latest else selected

  # write to Blob Storage
  target_path = 'wasbs://{}@{}.blob.core.windows.net/data/output/{}/{}/{}_dt={}'.format(blobContainerName, storageAccountName, exchange, subfolder, subfolder, date)
  result.write.parquet(target_path, mode=mode)
  return

In [0]:
# write all files to Blob Storage
# every exchange/partition/date combination, in the same order as nested loops
# (exchange outermost, date innermost); only trade and quote partitions are included
write_jobs = [
  (exchange, partition, date)
  for exchange in ('nyse', 'nasdaq')
  for partition in ('T', 'Q')
  for date in ('20200805', '20200806')
]
for exchange, partition, date in write_jobs:
  write_parquet_to_blob(exchange, partition, date)