In [0]:
# GUIDED CAPSTONE STEP 4: THIS NOTEBOOK TAKES THE PARQUET FILES IN BLOB STORAGE AND PROVIDES SOME METRICS (ANALYTICAL ETL)
# GOAL: TAKE THE QUOTES FOR TODAY, APPEND PREVIOUS DAY'S LAST TRADE PRICE AND LAST TRADE PRICE FROM TODAY

In [0]:
# packages used
import math
import os
from datetime import datetime, timedelta

from pyspark.sql.functions import col, row_number, desc, concat, rank
from pyspark.sql.window import Window

In [0]:
# Azure credentials
# The storage account key is a secret, so it is read from the environment instead of
# being pasted into the notebook. Set AZURE_STORAGE_KEY on the cluster before running
# (on Databricks, a secret scope read through dbutils.secrets.get is an alternative).
storageAccountName = 'saderekguidedcapstone'
storageAccountAccessKey = os.environ.get('AZURE_STORAGE_KEY')
if not storageAccountAccessKey:
  # fail here with a clear message rather than later with an opaque storage auth error
  raise RuntimeError('AZURE_STORAGE_KEY is not set; cannot authenticate to blob storage')
blobContainerName = 'container1'

# register the key with Spark so wasbs:// paths on this account can be accessed
spark.conf.set(
    f'fs.azure.account.key.{storageAccountName}.blob.core.windows.net',
    storageAccountAccessKey
)

In [0]:
# SKIP
def calculate_trade_moving_averages(exchange, date):
  """Stage a 30-minute moving average of trade prices for one exchange and day.

  Reads the trade parquet data for `exchange`/`date` from blob storage, exposes
  the needed columns as the temp view `tmp_trade_moving_avg`, computes per
  symbol the average `trade_pr` over the 30 minutes up to and including each
  row's `event_tm`, and saves the result as the table `temp_trade_moving_avg`.

  Args:
    exchange: exchange folder name, e.g. 'nyse' or 'nasdaq'.
    date: trade date formatted as 'YYYYMMDD', e.g. '20200805'.

  Returns:
    None. The result is the `temp_trade_moving_avg` table.
  """
  trades_path = 'wasbs://{}@{}.blob.core.windows.net/data/output/{}/trade/trade_dt={}'.format(
      blobContainerName, storageAccountName, exchange, date)
  trades = spark.read.parquet(trades_path)

  # expose only the columns the window query needs
  trades.select(
      col('symbol'), col('exchange'), col('event_tm'), col('event_seq_nb'), col('trade_pr')
  ).createOrReplaceTempView('tmp_trade_moving_avg')

  # 30-minute trailing average per symbol
  mov_avg_df = spark.sql('SELECT *, AVG(trade_pr) OVER(PARTITION BY symbol ORDER BY event_tm RANGE BETWEEN INTERVAL 30 MINUTES PRECEDING AND CURRENT ROW) AS mov_avg_trade_pr FROM tmp_trade_moving_avg')

  # overwrite so re-running (or running for another exchange/date) replaces the
  # staging table instead of failing because it already exists
  mov_avg_df.write.mode('overwrite').saveAsTable('temp_trade_moving_avg')

In [0]:
# sanity check: print what SHOW TABLES reports for the current session
visible_tables = spark.sql('SHOW TABLES')
visible_tables.show()

In [0]:
# 1/3
def create_table_todays_quotes(exchange, date):
  """Stage one day's quotes, tagged with a trade sequence number to join on.

  Reads the quote parquet data for `exchange`/`date` from blob storage and adds
  the column `event_seq_nb_tr`: `event_seq_nb` divided by 10, cast to int, and
  multiplied by 10 again (i.e. the last digit is dropped). `write_analytical`
  matches this column against the trades' `event_seq_nb`. The result is saved
  as the table `temp_todays_quotes`.

  Args:
    exchange: exchange folder name, e.g. 'nyse' or 'nasdaq'.
    date: quote date formatted as 'YYYYMMDD', e.g. '20200806'.

  Returns:
    None. The result is the `temp_todays_quotes` table.
  """
  quotes_path = 'wasbs://{}@{}.blob.core.windows.net/data/output/{}/quote/quote_dt={}'.format(
      blobContainerName, storageAccountName, exchange, date)
  quotes = spark.read.parquet(quotes_path)

  # sequence number of the related trade, computed in one expression so no
  # scratch column has to be added and dropped
  related_trade_seq_nb = (col('event_seq_nb') / 10).cast('int') * 10
  quotes_with_trade_seq = quotes.withColumn('event_seq_nb_tr', related_trade_seq_nb)

  # overwrite so re-running (or staging another exchange/date) replaces the
  # table instead of failing because it already exists
  quotes_with_trade_seq.write.mode('overwrite').saveAsTable('temp_todays_quotes')

# stage the NYSE quotes for 2020-08-06, then preview the staged rows
create_table_todays_quotes(exchange='nyse', date='20200806')
todays_quotes_preview = spark.sql('SELECT * FROM temp_todays_quotes')
todays_quotes_preview.show()

In [0]:
# 2/3
def create_table_todays_trades(exchange, date):
  """Stage one day's trades as the table `temp_todays_trades`.

  Reads the trade parquet data for `exchange`/`date` from blob storage and
  saves it unchanged.

  Args:
    exchange: exchange folder name, e.g. 'nyse' or 'nasdaq'.
    date: trade date formatted as 'YYYYMMDD', e.g. '20200806'.

  Returns:
    None. The result is the `temp_todays_trades` table.
  """
  trades_path = 'wasbs://{}@{}.blob.core.windows.net/data/output/{}/trade/trade_dt={}'.format(
      blobContainerName, storageAccountName, exchange, date)
  trades = spark.read.parquet(trades_path)

  # overwrite so re-running (or staging another exchange/date) replaces the
  # table instead of failing because it already exists
  trades.write.mode('overwrite').saveAsTable('temp_todays_trades')

# stage the NYSE trades for 2020-08-06, then preview the staged rows
create_table_todays_trades(exchange='nyse', date='20200806')
todays_trades_preview = spark.sql('SELECT * FROM temp_todays_trades')
todays_trades_preview.show()

In [0]:
# 3/3
def create_table_prev_day_last_trades(exchange, date):
  """Stage the last trade per symbol from the day before `date`.

  Reads the trade parquet data for `exchange` on the calendar day preceding
  `date`, keeps for each symbol the row with the highest `event_seq_nb`, and
  saves those rows as the table `temp_prev_day_last_trades`.

  Args:
    exchange: exchange folder name, e.g. 'nyse' or 'nasdaq'.
    date: the current day formatted as 'YYYYMMDD', e.g. '20200806'; the day
      before it is the one that gets read.

  Returns:
    None. The result is the `temp_prev_day_last_trades` table.

  Raises:
    ValueError: if `date` is not a valid 'YYYYMMDD' date.
  """
  # Step back with real date arithmetic: subtracting 1 from the YYYYMMDD integer
  # breaks on the first of a month (20200801 -> 20200800).
  # NOTE: this is the previous calendar day, not the previous trading day.
  today = datetime.strptime(str(date), '%Y%m%d')
  prev_date = (today - timedelta(days=1)).strftime('%Y%m%d')

  prev_trades_path = 'wasbs://{}@{}.blob.core.windows.net/data/output/{}/trade/trade_dt={}'.format(
      blobContainerName, storageAccountName, exchange, prev_date)
  prev_trades = spark.read.parquet(prev_trades_path)

  # expose the previous day's trades to SQL
  prev_trades.createOrReplaceTempView('tmp_previous_day_trades')

  # rank trades within each symbol by descending sequence number and keep the first
  prev_day_last_trades = spark.sql('SELECT trade_dt, symbol, exchange, event_tm, event_seq_nb, arrival_tm, trade_pr, trade_size FROM (SELECT *, ROW_NUMBER() OVER(PARTITION BY symbol ORDER BY event_seq_nb DESC) AS rn FROM tmp_previous_day_trades) WHERE rn=1')

  # overwrite so re-running (or staging another exchange/date) replaces the
  # table instead of failing because it already exists
  prev_day_last_trades.write.mode('overwrite').saveAsTable('temp_prev_day_last_trades')

# stage the last NYSE trade per symbol from the day before 2020-08-06, then preview it
create_table_prev_day_last_trades(exchange='nyse', date='20200806')
prev_day_last_trades_preview = spark.sql('SELECT * FROM temp_prev_day_last_trades')
prev_day_last_trades_preview.show()

In [0]:
def write_analytical(exchange, date):
  """Join the staged quotes with trade prices and write the result to blob storage.

  Reads the staging tables `temp_todays_quotes`, `temp_prev_day_last_trades`
  and `temp_todays_trades`. Each quote is inner-joined by symbol to the
  previous day's last trade, and left-joined to today's trade whose
  `event_seq_nb` equals the quote's `event_seq_nb_tr` for the same symbol.

  NOTE: `exchange` and `date` only select the output path. The rows written
  are whatever the staging tables currently hold, so those tables must have
  been built for the same exchange and date before calling this.

  Args:
    exchange: exchange name used in the output path, e.g. 'nyse' or 'nasdaq'.
    date: date used in the output path, formatted as 'YYYYMMDD'.

  Returns:
    None. Output is parquet under `analytical/<exchange>/date=<date>`.
  """
  analytical_df = spark.sql('''
  SELECT q.trade_dt, q.exchange, q.symbol, q.event_tm, q.event_seq_nb, q.bid_pr, q.bid_size, q.ask_pr, q.ask_size, tp.trade_pr AS prev_day_last_trade, t.trade_pr AS today_last_trade
  FROM temp_todays_quotes AS q
  INNER JOIN temp_prev_day_last_trades AS tp USING(symbol)
  LEFT JOIN temp_todays_trades AS t ON q.event_seq_nb_tr=t.event_seq_nb AND q.symbol=t.symbol
  ''')

  output_path = 'wasbs://{}@{}.blob.core.windows.net/analytical/{}/date={}'.format(
      blobContainerName, storageAccountName, exchange, date)

  # overwrite so re-running a date replaces its output instead of failing
  # because the path already exists
  analytical_df.write.mode('overwrite').parquet(output_path)

# write_analytical reads whatever the staging tables currently hold, and its
# arguments only pick the output path. Rebuild the staging tables for each
# exchange before writing; otherwise the nasdaq output is a copy of the NYSE data.
for exchange in ('nyse', 'nasdaq'):
  # drop first so the rebuild works even if the tables already exist
  for staging_table in ('temp_todays_quotes', 'temp_todays_trades', 'temp_prev_day_last_trades'):
    spark.sql(f'DROP TABLE IF EXISTS {staging_table}')

  create_table_todays_quotes(exchange, '20200806')
  create_table_todays_trades(exchange, '20200806')
  create_table_prev_day_last_trades(exchange, '20200806')

  write_analytical(exchange, '20200806')