In [0]:
import pyspark
import json
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from datetime import datetime

In [0]:
# Unified row layout shared by every parsed record, whether it is a trade ('T') or a quote ('Q').
# Every column is nullable: trade rows leave the bid/ask columns empty, quote rows leave
# trade_pr empty, and rejected rows carry nothing but the 'partition' value.
common_event = (
    StructType()
    .add('trade_dt', DateType(), True)
    .add('rec_type', StringType(), True)
    .add('symbol', StringType(), True)
    .add('exchange', StringType(), True)
    .add('event_tm', TimestampType(), True)
    .add('event_seq_nb', IntegerType(), True)
    .add('arrival_tm', TimestampType(), True)
    .add('trade_pr', FloatType(), True)
    .add('bid_pr', FloatType(), True)
    .add('bid_size', IntegerType(), True)
    .add('ask_pr', FloatType(), True)
    .add('ask_size', IntegerType(), True)
    .add('partition', StringType(), True)
)

In [0]:
def parse_csv(line):
    """Parse one comma-separated exchange record into a ``common_event`` row tuple.

    Field positions read from the line:
        0 trade date (``YYYY-MM-DD``), 1 arrival timestamp, 2 record type,
        3 symbol, 4 event timestamp, 5 event sequence number, 6 exchange,
        7 trade price for 'T' rows / bid price for 'Q' rows,
        8 bid size, 9 ask price, 10 ask size ('Q' rows only).
    Timestamps are parsed with the format ``%Y-%m-%d %H:%M:%S.%f``.

    Args:
        line: One raw text line of the CSV input.

    Returns:
        A 13-tuple ordered as (trade_dt, rec_type, symbol, exchange, event_tm,
        event_seq_nb, arrival_tm, trade_pr, bid_pr, bid_size, ask_pr, ask_size,
        partition). ``partition`` is the record type ('T' or 'Q') for a
        successfully parsed line. Any line that cannot be parsed -- too few
        fields, an unparsable date/number, or a record type other than
        'T'/'Q' -- yields twelve ``None`` values followed by 'B'.
    """
    record_type_pos = 2
    date_fmt = '%Y-%m-%d'
    ts_fmt = '%Y-%m-%d %H:%M:%S.%f'
    bad_record = (None,) * 12 + ('B',)

    try:
        record = line.split(",")
        rec_type = record[record_type_pos]

        # An unrecognised record type used to fall through and return None,
        # which is not a valid row; route it to the bad-record partition instead.
        if rec_type not in ('T', 'Q'):
            return bad_record

        # Columns that trades and quotes populate identically.
        shared = (
            datetime.strptime(record[0], date_fmt).date(),
            rec_type,
            record[3],
            record[6],
            datetime.strptime(record[4], ts_fmt),
            int(record[5]),
            datetime.strptime(record[1], ts_fmt),
        )

        if rec_type == 'T':
            # Trades carry only a trade price; the bid/ask columns stay empty.
            measures = (float(record[7]), None, None, None, None)
        else:
            # Quotes carry bid/ask price and size; the trade price stays empty.
            measures = (
                None,
                float(record[7]),
                int(record[8]),
                float(record[9]),
                int(record[10]),
            )

        return shared + measures + (rec_type,)
    except Exception:
        # Best-effort parsing: any malformed line becomes a 'B' row rather than
        # failing the job. Catching Exception (not a bare except) still lets
        # KeyboardInterrupt / SystemExit propagate.
        return bad_record

In [0]:
# Obtain the active SparkSession, creating one if none exists yet; every later cell uses `spark`.
spark = SparkSession.builder.getOrCreate()

In [0]:
# Register the storage-account access key on the session for the wasbs:// paths read below.
# NOTE(review): the key is inlined as a literal -- prefer a secret scope or an environment variable.
# NOTE(review): confirm this session-level setting is visible to sparkContext.textFile (RDD API);
# some environments require it to be set on the Hadoop configuration instead.
spark.conf.set("fs.azure.account.key.somestorage.blob.core.windows.net", "somekey")

# Stage 1: read the NYSE CSV drop for each trading day as an RDD of raw text lines.
raw_csv_05 = spark.sparkContext.textFile("wasbs://somecontainer@somestorage.blob.core.windows.net/csv/2020-08-05/NYSE")
raw_csv_06 = spark.sparkContext.textFile("wasbs://somecontainer@somestorage.blob.core.windows.net/csv/2020-08-06/NYSE")

# Stage 2: turn every line into a tuple laid out like common_event.
parsed_csv_05 = raw_csv_05.map(parse_csv)
parsed_csv_06 = raw_csv_06.map(parse_csv)

# Stage 3: wrap the parsed tuples in DataFrames that share the common_event schema.
csv_data_05 = spark.createDataFrame(parsed_csv_05, schema=common_event)
csv_data_06 = spark.createDataFrame(parsed_csv_06, schema=common_event)

# Both days stacked into a single DataFrame.
csv_data = csv_data_05.union(csv_data_06)

In [0]:
# Preview the parsed CSV rows. show() is an action, so this is where the lazy read and
# parse_csv mapping defined above actually execute.
csv_data.show()

In [0]:
def parse_json(line):
    """Parse one JSON-encoded exchange record into a ``common_event`` row tuple.

    Keys read from the decoded object: ``event_type``, ``trade_dt``, ``symbol``,
    ``exchange``, ``event_tm``, ``event_seq_nb``, ``file_tm``, plus ``price``
    for 'T' records or ``bid_pr``/``bid_size``/``ask_pr``/``ask_size`` for 'Q'
    records. ``trade_dt`` is parsed as ``%Y-%m-%d``; ``event_tm`` and
    ``file_tm`` as ``%Y-%m-%d %H:%M:%S.%f``.

    Args:
        line: One raw text line holding a single JSON object.

    Returns:
        A 13-tuple ordered as (trade_dt, rec_type, symbol, exchange, event_tm,
        event_seq_nb, arrival_tm, trade_pr, bid_pr, bid_size, ask_pr, ask_size,
        partition), where ``arrival_tm`` comes from ``file_tm`` and
        ``partition`` equals the event type ('T' or 'Q'). Any line that cannot
        be parsed -- invalid JSON, a missing key, an unparsable date/number, or
        an event type other than 'T'/'Q' -- yields twelve ``None`` values
        followed by 'B'.
    """
    date_fmt = '%Y-%m-%d'
    ts_fmt = '%Y-%m-%d %H:%M:%S.%f'
    bad_record = (None,) * 12 + ('B',)

    try:
        # Decoding and the event_type lookup happen inside the try so that a
        # corrupt line is routed to the bad partition instead of failing the job.
        event = json.loads(line)
        record_type = event['event_type']

        # An unrecognised event type used to fall through and return None,
        # which is not a valid row; treat it as a bad record.
        if record_type not in ('T', 'Q'):
            return bad_record

        # Columns that trades and quotes populate identically.
        shared = (
            datetime.strptime(event['trade_dt'], date_fmt).date(),
            record_type,
            event['symbol'],
            event['exchange'],
            datetime.strptime(event['event_tm'], ts_fmt),
            int(event['event_seq_nb']),
            datetime.strptime(event['file_tm'], ts_fmt),
        )

        # JSON numbers may decode as either int or float; coerce explicitly so
        # prices are always float and sizes always int, matching parse_csv.
        if record_type == 'T':
            measures = (float(event['price']), None, None, None, None)
        else:
            measures = (
                None,
                float(event['bid_pr']),
                int(event['bid_size']),
                float(event['ask_pr']),
                int(event['ask_size']),
            )

        return shared + measures + (record_type,)
    except Exception:
        # Best-effort parsing: any malformed line becomes a 'B' row. Catching
        # Exception (not a bare except) lets KeyboardInterrupt / SystemExit propagate.
        return bad_record

In [0]:
# Stage 1: read the NASDAQ JSON drop for each trading day as an RDD of raw text lines.
raw_json_05 = spark.sparkContext.textFile("wasbs://somecontainer@somestorage.blob.core.windows.net/json/2020-08-05/NASDAQ")
raw_json_06 = spark.sparkContext.textFile("wasbs://somecontainer@somestorage.blob.core.windows.net/json/2020-08-06/NASDAQ")

# Stage 2: turn every line into a tuple laid out like common_event.
parsed_json_05 = raw_json_05.map(parse_json)
parsed_json_06 = raw_json_06.map(parse_json)

# Stage 3: wrap the parsed tuples in DataFrames that share the common_event schema.
json_data_05 = spark.createDataFrame(parsed_json_05, schema=common_event)
json_data_06 = spark.createDataFrame(parsed_json_06, schema=common_event)

# Both days stacked into a single DataFrame.
json_data = json_data_05.union(json_data_06)

In [0]:
# Preview the parsed JSON rows. show() is an action, so this is where the lazy read and
# parse_json mapping defined above actually execute.
json_data.show()

In [0]:
# Stack the CSV-sourced and JSON-sourced events into one DataFrame. union() matches columns by
# position, which lines up here because both sides were built with the common_event schema.
data = csv_data.union(json_data)

In [0]:
# Print the combined DataFrame's column names and types as a sanity check before writing.
data.printSchema()

In [0]:
# Persist the combined events as Parquet, split by the value of the 'partition' column
# (the parsers emit 'T', 'Q', or 'B' for rejected lines). "overwrite" replaces any output
# already present at the target path.
(
    data.write
    .partitionBy("partition")
    .mode("overwrite")
    .parquet("wasbs://somecontainer@somestorage.blob.core.windows.net/output_dir")
)