In [0]:
%run "../jobs/reader"

In [0]:
%run "../jobs/data_quality"

In [0]:
%run "../jobs/transformer"

In [0]:
%run "../jobs/aggregator"

In [0]:
%run "../jobs/utility"

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType

# Obtain the Spark session shared by every pipeline helper below.
spark = SparkSession.builder.getOrCreate()
# Pipeline helper objects, each constructed with the shared session; main()
# reads them as module-level globals.
# NOTE(review): Reader, Transformer, DataQuality, Aggregator and Utility are not
# imported in this cell; presumably they are defined by the notebooks pulled in
# through the %run cells at the top of this notebook -- confirm.
reader = Reader(spark)
transformer = Transformer(spark)
dq = DataQuality(spark)
aggregator = Aggregator(spark)
utility = Utility(spark)


def main(
    order_path="dbfs:/FileStore/shared_uploads/abhi687303@gmail.com/Test/Orders.json",
    customer_data="dbfs:/FileStore/shared_uploads/abhi687303@gmail.com/Test/Customer.xlsx",
    product_data="dbfs:/FileStore/shared_uploads/abhi687303@gmail.com/Test/Products.csv",
):
    """Run the e-commerce pipeline end to end.

    Steps, in order: read the three input datasets with explicit schemas,
    standardize their column names, apply data-quality filters and
    de-duplication, build and persist the enriched customer/product tables,
    build and persist the enriched orders table ("enriched"), persist the
    profit aggregate ("aggregated"), and finally print several profit
    summaries computed with SQL over the "enriched" table.

    Relies on the module-level ``spark``, ``reader``, ``dq``, ``transformer``,
    ``aggregator`` and ``utility`` objects.

    Args:
        order_path: Location of the orders JSON file.
        customer_data: Location of the customer Excel file.
        product_data: Location of the products CSV file.
    """
    # Imported here so the function does not depend on ``F`` leaking into the
    # notebook namespace from one of the ``%run`` notebooks.
    from pyspark.sql import functions as F

    print("starting e-commerce-pipeline")

    # ---------------- Schema definitions ----------------
    customer_schema = StructType([
        StructField("Customer ID", StringType(), True),
        StructField("Customer Name", StringType(), True),
        StructField("email", StringType(), True),
        StructField("phone", StringType(), True),
        StructField("address", StringType(), True),
        StructField("Segment", StringType(), True),
        StructField("Country", StringType(), True),
        StructField("City", StringType(), True),
        StructField("State", StringType(), True),
        StructField("Postal Code", StringType(), True),
        StructField("Region", StringType(), True),
    ])

    product_schema = StructType([
        StructField("Product ID", StringType(), True),
        StructField("Category", StringType(), True),
        StructField("Sub-Category", StringType(), True),
        StructField("Product Name", StringType(), True),
        StructField("State", StringType(), True),
        StructField("Price per product", DoubleType(), True),
    ])

    # Order/ship dates are read as strings here.
    order_schema = StructType([
        StructField("Row ID", IntegerType(), True),
        StructField("Order ID", StringType(), True),
        StructField("Order Date", StringType(), True),
        StructField("Ship Date", StringType(), True),
        StructField("Ship Mode", StringType(), True),
        StructField("Customer ID", StringType(), True),
        StructField("Product ID", StringType(), True),
        StructField("Quantity", IntegerType(), True),
        StructField("Price", DoubleType(), True),
        StructField("Discount", DoubleType(), True),
        StructField("Profit", DoubleType(), True),
    ])

    # ---------------- Read input data ----------------
    product_df = reader.read_csv(product_data, True, product_schema)
    customer_df = reader.read_excel(customer_data, True, customer_schema)
    order_df = reader.read_json(order_path, True, order_schema)

    # Standardize column names; the snake_case names used below
    # (e.g. "customer_id") are the result of this step.
    clean_customer_df = dq.standardize_column_names(customer_df)
    clean_order_df = dq.standardize_column_names(order_df)
    clean_product_df = dq.standardize_column_names(product_df)

    # ================== Task 1: raw table for each dataset (disabled) ==================
    # NOTE: when re-enabling, write each table from its own DataFrame.
    # utility.create_delta_table(clean_customer_df, "raw_customer")
    # utility.create_delta_table(clean_order_df, "raw_order")
    # utility.create_delta_table(clean_product_df, "raw_product")

    # ---------------- Data quality: customers ----------------
    valid_cust_df = dq.filter_not_null(clean_customer_df, ["customer_id", "country"])
    valid_cust_df = dq.clean_name_column(valid_cust_df, "customer_name")
    valid_cust_df = valid_cust_df.dropDuplicates(["customer_id"])
    valid_cust_df.show(2)

    # ---------------- Data quality: orders ----------------
    required_order_columns = [
        "order_id", "order_date", "ship_date", "customer_id", "product_id",
        "quantity", "price", "discount", "profit",
    ]
    valid_order_df = dq.filter_not_null(clean_order_df, required_order_columns)
    valid_order_df = dq.filter_positive_values(valid_order_df, ["quantity", "price"])
    valid_order_df = dq.filter_valid_orders(valid_order_df, "order_date", "ship_date")
    valid_order_df = valid_order_df.dropDuplicates(["order_id"])
    valid_order_df.show(2)

    # ---------------- Data quality: products ----------------
    required_product_columns = [
        "product_id", "category", "sub_category", "product_name", "price_per_product",
    ]
    valid_product_df = dq.filter_not_null(clean_product_df, required_product_columns)
    valid_product_df = dq.filter_positive_values(valid_product_df, ["price_per_product"])
    valid_product_df = valid_product_df.dropDuplicates(["product_id"])
    valid_product_df.show(2)

    enrich_customers_df = transformer.create_enriched_customer_table(
        valid_cust_df, valid_order_df, valid_product_df
    )
    enrich_customers_df.show(2)

    enrich_products_df = transformer.create_enriched_product_table(
        valid_cust_df, valid_order_df, valid_product_df
    )
    print("enrich product table")
    enrich_products_df.show(2)

    # ================== Task 2: enriched tables for customer and product ==================
    utility.create_delta_table(enrich_customers_df, "enriched_customer")
    utility.create_delta_table(enrich_products_df, "enriched_product")

    valid_order_df = transformer.transform_orders(valid_order_df)
    # Trim customer_id on both sides before enrich_orders receives them.
    # NOTE(review): trimming happens after de-duplication and after the enriched
    # customer/product tables were built from untrimmed ids -- confirm that this
    # ordering is intended.
    valid_cust_df = valid_cust_df.withColumn("customer_id", F.trim(F.col("customer_id")))
    valid_order_df = valid_order_df.withColumn("customer_id", F.trim(F.col("customer_id")))
    enrich_orders = transformer.enrich_orders(valid_order_df, valid_cust_df, valid_product_df)

    # ================== Task 3: final enriched table with all details ==================
    utility.create_delta_table(enrich_orders, "enriched")
    spark.sql("select * from enriched limit 5").show()

    # ================== Task 4: aggregated table ==================
    agg_profit_cat = aggregator.aggregate_profit(enrich_orders)
    utility.create_delta_table(agg_profit_cat, "aggregated")
    print("aggregated profit by categories like year, product category, sub category and customer :")
    spark.sql("select * from aggregated limit 5").show()

    # ================== Task 5: aggregated results using SQL ==================
    print("----------Profit by Year------------------------------")
    year_query = "SELECT year,round(SUM(profit),2) AS total_profit FROM enriched GROUP BY year ORDER BY year"
    aggregator.aggregate_with_query(year_query).show(5)

    print("----------Profit by Year + Product Category------------")
    query = "SELECT year,category,round(SUM(profit),2) AS total_profit FROM enriched GROUP BY year, category ORDER BY year"
    aggregator.aggregate_with_query(query).show(5)

    print("--------------Profit by Customer-------------------")
    query = "SELECT customer_id,customer_name, round(SUM(profit),2) AS total_profit FROM enriched GROUP BY customer_id, customer_name"
    aggregator.aggregate_with_query(query).show(5)

    print("--------------Profit by Customer + Year-------------")
    # Assembled from pieces so the query text stays exactly as it was when it
    # was a multi-line literal (including the embedded newline and spaces).
    query = (
        "SELECT year, customer_id,customer_name,round(SUM(profit),2) AS total_profit "
        "FROM enriched GROUP BY year, customer_id, \n"
        "        customer_name ORDER BY year "
    )
    aggregator.aggregate_with_query(query).show(5)


# Run the pipeline only when this file is executed as the main program.
if __name__ == "__main__":
    main()

starting e-commerce-pipeline
+-----------+-------------+--------------------+-----------------+--------------------+--------+-------------+----------+-------+-----------+-------+
|customer_id|customer_name|               email|            phone|             address| segment|      country|      city|  state|postal_code| region|
+-----------+-------------+--------------------+-----------------+--------------------+--------+-------------+----------+-------+-----------+-------+
|   AA-10315|   Alex Avila|josephrice131@gma...|     680-261-2092|91773 Miller Shoa...|Consumer|United States|Round Rock|  Texas|      78664|Central|
|   AA-10375| Allen Armold|garymoore386@gmai...|221.945.4191x8872|6450 John Lodge\n...|Consumer|United States|   Atlanta|Georgia|      30318|  South|
+-----------+-------------+--------------------+-----------------+--------------------+--------+-------------+----------+-------+-----------+-------+
only showing top 2 rows

+------+--------------+----------+---------+--

In [0]:
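# Manual cleanup of table directories (kept commented out; uncomment to run).
# NOTE(review): these directory names do not match the tables written by main()
# ("enriched_customer", "enriched_product", "enriched", "aggregated") -- confirm
# the paths before running.
# dbutils.fs.rm("dbfs:/user/hive/warehouse/product", recurse=True)
# dbutils.fs.rm("dbfs:/user/hive/warehouse/customer", recurse=True)
# dbutils.fs.rm("dbfs:/user/hive/warehouse/order", recurse=True)
# dbutils.fs.rm("dbfs:/user/hive/warehouse/enriched_orders", recurse=True)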