In [0]:
from pyspark.sql import SparkSession
# Obtain the SparkSession for this notebook, registered under the app name "goldlayer".
spark = (
    SparkSession.builder
    .appName("goldlayer")
    .getOrCreate()
)

If you are using a different workspace, you will need to mount the silver layer again. Since this notebook is in the same workspace as the Silver_notebook, the silverlayer container is already mounted.

In [0]:
# List the contents of the mounted silver container to confirm the mount is available.
display(dbutils.fs.ls("dbfs:/mnt/silver/"))

#### Load

Reading the data from Silverlayer container

In [0]:
# Load the silver-layer Parquet dataset into a DataFrame
df = spark.read.parquet("dbfs:/mnt/silver/online_retail_parquet/")

In [0]:
# Preview the first 10 rows of the loaded data
df.show(n=10)

#### Transform

Performing Aggregations:
Since each item of an invoice is listed as a separate row, we need to group the rows by InvoiceNo

In [0]:
from pyspark.sql.functions import first

# Collapse the line-item rows into one row per InvoiceNo, keeping the first value
# encountered in each group for every carried-over column.
df_grouped = df.groupBy("InvoiceNo").agg(
    *(
        first(column_name).alias(column_name)
        for column_name in ("CustomerID", "Country", "customer_count", "Flag_reorder")
    )
)

df_grouped.show()

In [0]:
# Build one row per CustomerID, keeping the first value encountered in each group
# for every carried-over column. Relies on `first` imported in an earlier cell.
df_customer_only = df.groupBy("CustomerID").agg(
    *(
        first(column_name).alias(column_name)
        for column_name in ("Country", "customer_count", "Flag_reorder")
    )
)

df_customer_only.show()

Finding out the customer retention

In [0]:
from pyspark.sql.functions import col

# Count the customer rows whose Flag_reorder is 0 (only one order placed)
customers_one = df_customer_only.where(col("Flag_reorder") == 0).count()
print(f"Number of customers with only 1 order placed: {customers_one}")

# Count the customer rows whose Flag_reorder is 1 (multiple orders placed)
customers_multi = df_customer_only.where(col("Flag_reorder") == 1).count()
print(f"Number of customers with multiple orders placed: {customers_multi}")

In [0]:
# Retention rate: returning customers as a percentage of all counted customers
total_customers = customers_one + customers_multi
percentage = (customers_multi / total_customers) * 100
print(f"Customer Retention rate : {percentage:.2f}%")

Calculating the Churn Rate

Customer churn rate, also known as customer attrition rate, is a metric that measures the percentage of customers who stop using a company's product or service over a given period of time. It is commonly used by businesses to evaluate customer retention and satisfaction.

The formula for calculating the churn rate is:

Churn Rate = (Number of customers lost / Total number of customers) * 100

In [0]:
# Churn rate: one-time customers as a percentage of all counted customers
customer_total = customers_one + customers_multi
churn_rate = (customers_one / customer_total) * 100
print(f"The Churn Rate for the store is : {churn_rate:.2f}%")

Splitting and storing the data in 2 separate DataFrames based on whether the customer is a single-time customer or a returning customer

In [0]:
# Split the line-item rows by reorder flag: 0 -> single-time customers, 1 -> returning customers.
# Relies on `col` imported in an earlier cell.
reorder_flag = col("Flag_reorder")
df_single = df.where(reorder_flag == 0)
df_returning = df.where(reorder_flag == 1)

#### Load

Saving the processed data into the goldlayer container

In [0]:
# Mount the gold-layer blob container at /mnt/gold so the results can be saved there.
gold_mount_point = "/mnt/gold"
storage_account_host = "anshstorageacc1.blob.core.windows.net"

# Mounting an already-mounted path raises an error, so only mount when /mnt/gold is
# absent. This keeps the cell safe to re-run.
already_mounted = any(
    mount.mountPoint == gold_mount_point for mount in dbutils.fs.mounts()
)

if not already_mounted:
    # Never paste the storage account key into the notebook: read it from a
    # Databricks secret scope instead.
    # TODO: replace the scope and key placeholders with the names configured for
    # this workspace.
    storage_account_key = dbutils.secrets.get(
        scope="<SECRET SCOPE>", key="<SECRET KEY NAME>"
    )
    dbutils.fs.mount(
        source=f"wasbs://goldlayer@{storage_account_host}/",
        mount_point=gold_mount_point,
        extra_configs={
            f"fs.azure.account.key.{storage_account_host}": storage_account_key
        },
    )

In [0]:
# Persist each result DataFrame to the gold container as Parquet, replacing any
# previous output at the same path.
# NOTE(review): the directory names spell "paraquet"; they are kept as-is because
# anything reading these paths would break if they were renamed here.
gold_outputs = [
    (df, "/mnt/gold/all_data_paraquet"),
    (df_grouped, "/mnt/gold/invoices_grouped_paraquet"),
    (df_customer_only, "/mnt/gold/customer_only_paraquet"),
]
for frame, target_path in gold_outputs:
    frame.write.mode("overwrite").parquet(target_path)