In [None]:
import os
import json
import pandas as pd
from pathlib import Path

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DecimalType
import decimal
import logging


In [None]:
# Azure Blob Storage connection settings.
# Each value is read from an environment variable so that real credentials never
# have to be written into the notebook; when a variable is not set, the original
# "####" placeholder is used unchanged.
container_name = os.environ.get("AZURE_CONTAINER_NAME", "####")
storage_name = os.environ.get("AZURE_STORAGE_ACCOUNT", "####")
key = os.environ.get("AZURE_STORAGE_KEY", "####")

In [None]:
# A set of functions

def parse_csv(line):
    """
    Parse one comma-separated line of a ".txt" extract into a common event row.

    The third field (index 2) selects the layout: "T" (trade) rows are read from
    the first nine fields, "Q" (quote) rows from the first eleven. The result is
    always a 12-element list ordered as: trade_dt, file_tm, event_type, symbol,
    event_tm, event_seq_nb, exchange, bid_pr, bid_size, ask_pr, ask_size,
    partition.

    :param line: A line of ".txt" file in CSV format.
    :return: Common event list. Trades have ask_pr/ask_size set to None and
             partition "T"; quotes have partition "Q". A line that cannot be
             parsed, or whose record type is neither "T" nor "Q", yields eleven
             None values followed by partition "B" (bad record).
    """
    record_type_pos = 2  # position of the "T" / "Q" discriminator
    # Eleven empty columns followed by the "B" partition marker.
    bad_record = [None] * 11 + ["B"]
    try:
        record = line.split(",")
        record_type = record[record_type_pos]
        if record_type == "T":
            # Trade: no ask side, so ask_pr / ask_size stay None.
            return [record[0],                   # trade_dt
                    record[1],                   # file_tm
                    record[2],                   # event_type
                    record[3],                   # symbol
                    record[4],                   # event_tm
                    int(record[5]),              # event_seq_nb
                    record[6],                   # exchange
                    decimal.Decimal(record[7]),  # bid_pr
                    int(record[8]),              # bid_size
                    None, None,
                    "T"]
        if record_type == "Q":
            return [record[0],                   # trade_dt
                    record[1],                   # file_tm
                    record[2],                   # event_type
                    record[3],                   # symbol
                    record[4],                   # event_tm
                    int(record[5]),              # event_seq_nb
                    record[6],                   # exchange
                    decimal.Decimal(record[7]),  # bid_pr
                    int(record[8]),              # bid_size
                    decimal.Decimal(record[9]),  # ask_pr
                    int(record[10]),             # ask_size
                    "Q"]
        # Unknown record type: emit a bad record instead of falling through and
        # returning None, which would not fit the 12-column row layout.
        logging.error("Bad record: unknown record type %r", record_type)
        return bad_record
    except Exception as e:
        # Any split / index / conversion failure marks the whole line as bad.
        logging.error("Bad record: %s", e)
        return bad_record
    
def parse_json(line):
    """
    Parse one line of a JSON-formatted ".txt" extract into a common event row.

    The "event_type" key selects the layout: "T" (trade) or "Q" (quote). The
    result is always a 12-element list ordered as: trade_dt, file_tm,
    event_type, symbol, event_tm, event_seq_nb, exchange, bid_pr, bid_size,
    ask_pr, ask_size, partition.

    :param line: Each line of ".txt" file in JSON format.
    :return: Common event list. Trades have ask_pr/ask_size set to None and
             partition "T"; quotes have partition "Q". A line that cannot be
             parsed, or whose event_type is neither "T" nor "Q", yields eleven
             None values followed by partition "B" (bad record).
    """
    # Eleven empty columns followed by the "B" partition marker.
    bad_record = [None] * 11 + ["B"]
    try:
        # parse_float keeps prices exact: going through a binary float first
        # would turn 79.19 into 79.19000000000000483... once wrapped in Decimal.
        record = json.loads(line, parse_float=decimal.Decimal)
        record_type = record["event_type"]
        if record_type == "T":
            # Trade: no ask side, so ask_pr / ask_size stay None.
            return [record["trade_dt"],
                    record["file_tm"],
                    record["event_type"],
                    record["symbol"],
                    record["event_tm"],
                    int(record["event_seq_nb"]),
                    record["exchange"],
                    decimal.Decimal(record["bid_pr"]),
                    int(record["bid_size"]),
                    None, None,
                    "T"]
        if record_type == "Q":
            return [record["trade_dt"],
                    record["file_tm"],
                    record["event_type"],
                    record["symbol"],
                    record["event_tm"],
                    int(record["event_seq_nb"]),
                    record["exchange"],
                    decimal.Decimal(record["bid_pr"]),
                    int(record["bid_size"]),
                    decimal.Decimal(record["ask_pr"]),
                    int(record["ask_size"]),
                    "Q"]
        # Unknown event type: emit a bad record instead of falling through and
        # returning None, which would not fit the 12-column row layout.
        logging.error("Bad record: unknown record type %r", record_type)
        return bad_record
    except Exception as e:
        # Invalid JSON, a missing key, or a failed conversion marks the line as bad.
        logging.error("Bad record: %s", e)
        return bad_record

In [None]:
# Initialize schema

# Price columns need an explicit precision/scale: a bare DecimalType() is
# DecimalType(10, 0), whose scale of 0 cannot hold the fractional part of a price.
# NOTE(review): 18 digits with 6 decimal places is assumed to be wide enough for
# this feed -- confirm against the source data.
price_type = DecimalType(18, 6)

# Column order mirrors the 12-element rows returned by parse_csv / parse_json:
# "record_type" receives the event_type value and "partition" the "T"/"Q"/"B" marker.
commonEventSchema = StructType([
            StructField("trade_dt", StringType(), True),
            StructField("file_tm", StringType(), True),
            StructField("record_type", StringType(), True),
            StructField("symbol", StringType(), True),
            StructField("event_tm", StringType(), True),
            StructField("event_seq_nb", IntegerType(), True),
            StructField("exchange", StringType(), True),
            StructField("bid_pr", price_type, True),
            StructField("bid_size", IntegerType(), True),
            StructField("ask_pr", price_type, True),
            StructField("ask_size", IntegerType(), True),
            StructField("partition", StringType(), True)

])

In [None]:
# Blob-relative locations of the raw extracts (two CSV files, two JSON files)
csv_dir_1 = "/data/csv/2020-08-05/NYSE/part-00000-5e4ced0a-66e2-442a-b020-347d0df4df8f-c000.txt"
csv_dir_2 = "/data/csv/2020-08-06/NYSE/part-00000-214fff0a-f408-466c-bb15-095cd8b648dc-c000.txt"

json_dir_1 = "/data/json/2020-08-05/NASDAQ/part-00000-c6c48831-3d45-4887-ba5f-82060885fc6c-c000.txt"
json_dir_2 = "/data/json/2020-08-06/NASDAQ/part-00000-092ec1db-39ab-4079-9580-f7c7b516a283-c000.txt"


# Spark session running on a local master
spark = SparkSession.builder.master('local').appName('app').getOrCreate()

# Register the storage account key on the session configuration
account_key_option = "fs.azure.account.key.%s.blob.core.windows.net" % (storage_name)
spark.conf.set(account_key_option, key)

# Spark context, used for the RDD-based text reads below
sc = spark.sparkContext

# Common wasbs:// prefix; each *_dir_* path above is appended to it.
# NOTE(review): confirm the key set on spark.conf is visible to sc.textFile reads.
blob_root = "wasbs://%s@%s.blob.core.windows.net" % (container_name, storage_name)

# Raw text files, one RDD of lines per extract
raw_csv_1 = sc.textFile(blob_root + csv_dir_1)
raw_csv_2 = sc.textFile(blob_root + csv_dir_2)
raw_json_1 = sc.textFile(blob_root + json_dir_1)
raw_json_2 = sc.textFile(blob_root + json_dir_2)

# Parsed files: every line becomes a 12-element common event row
parsed_csv1 = raw_csv_1.map(parse_csv)
parsed_csv2 = raw_csv_2.map(parse_csv)

parsed_json1 = raw_json_1.map(parse_json)
parsed_json2 = raw_json_2.map(parse_json)

# Data frames sharing the common event schema, in the same order as above
spark_df1, spark_df2, spark_df3, spark_df4 = (
    spark.createDataFrame(parsed_rows, commonEventSchema)
    for parsed_rows in (parsed_csv1, parsed_csv2, parsed_json1, parsed_json2)
)


In [None]:
# Preview the first ten rows parsed from the first CSV extract
csv_preview = spark_df1.limit(10)
csv_preview.show()

In [None]:
# Preview the first ten rows parsed from the second JSON extract
json_preview = spark_df4.limit(10)
json_preview.show()

In [None]:
# List the distinct partition markers present in spark_df1
csv_partitions = spark_df1.select("partition").distinct()
csv_partitions.show()

In [None]:
# Total number of rows in the spark_df2 data frame
csv2_row_total = spark_df2.count()
csv2_row_total

In [None]:
# Stack the four data frames (same schema) into one, in their original order
union_df = spark_df1
for next_frame in (spark_df2, spark_df3, spark_df4):
    union_df = union_df.union(next_frame)

In [None]:
# Check the total rows of union data frame
# (evaluating the bare data frame only displays its repr, not a row count)
union_df.count()

In [None]:
# List the distinct partition markers present after the union
union_partitions = union_df.select("partition").distinct()
union_partitions.show()

In [None]:
# Persist the common events as parquet, one sub-directory per "partition" value.
# Any existing content under "output_dir" is overwritten.
partitioned_writer = union_df.write.partitionBy("partition").mode("overwrite")
partitioned_writer.parquet("output_dir")

In [None]:
# Display the first rows of the combined data frame (defaults spelled out)
union_df.show(n=20, truncate=True)