## Imports

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *

## Configuring and mounting the data from azure blob to dbfs.

In [0]:
# Unmount only when the mount point currently exists. Calling unmount on a
# path that is not mounted raises, which would stop a fresh "Run All".
mount_point_to_reset = "/mnt/shopping_data_dbfs"
if any(m.mountPoint == mount_point_to_reset for m in dbutils.fs.mounts()):
    dbutils.fs.unmount(mount_point_to_reset)

/mnt/shopping_data_dbfs has been unmounted.


True

In [0]:
# Read the storage-account key from a Databricks secret scope instead of
# embedding it in the notebook. The key that was previously committed in this
# cell must be treated as leaked and rotated in the Azure portal.
# TODO: replace the scope/key names with the ones configured in this workspace.
storage_account_key = dbutils.secrets.get(
  scope = "shopping-circle",
  key = "shoppingcircle-storage-account-key"
)

configs = {
  "fs.azure.account.key.shoppingcircle.blob.core.windows.net": storage_account_key
}

# Mount the blob container on DBFS so it can be addressed through /mnt paths.
dbutils.fs.mount(
  source = "wasbs://shopping-circle-data@shoppingcircle.blob.core.windows.net",
  mount_point = "/mnt/shopping_data_dbfs",
  extra_configs = configs
)

True

In [0]:
# List the current DBFS mounts to confirm /mnt/shopping_data_dbfs is present.
display(dbutils.fs.mounts())




## Reading datasets

In [0]:
# Bronze-layer location of the raw inventory CSV.
file_path = "dbfs:/mnt/shopping_data_dbfs/Bronze-layer/BG-Thushara/capstone-azure-training/raw/main/Data/New_data/sample_inventory_data.csv"

# First look at the file: the first row is used as the header and no schema
# is supplied to the reader.
inventory = spark.read.option("header", True).csv(file_path)
inventory.display()



In [0]:
# Bronze-layer location of the raw sales CSV.
file_path1 = "dbfs:/mnt/shopping_data_dbfs/Bronze-layer/BG-Thushara/capstone-azure-training/raw/main/Data/New_data/sample_sales_data.csv"

# First look at the file: the first row is used as the header and no schema
# is supplied to the reader.
sales = spark.read.option("header", True).csv(file_path1)
sales.display()



## Exploring the data

In [0]:
# Print the schema of each frame under its own label.
for schema_label, frame in (('inventory data :', inventory), ('sales data :', sales)):
    print(schema_label)
    frame.printSchema()



##### correcting the data types with new defined schema


In [0]:
# inventory
schema_inventory = StructType([
    StructField('Date',DateType(),False),
    StructField('Product_ID',StringType(),True),
    StructField('Inventory',IntegerType(),True)
    ]
)

schema_sales = StructType([
    StructField('sale_id',StringType(),False),
    StructField('store_id',StringType(),False),
    StructField('customer_id',StringType(),False),
    StructField('product_id',StringType(),False),
    StructField('sale_date',DateType(),True),
    StructField('sale_amount',IntegerType(),True)
    ]
)



In [0]:
# Re-read both CSVs, this time applying the explicit schemas defined above
# so that dates and integers get their intended types.
inventory = spark.read.schema(schema_inventory).option('header', True).csv(file_path)
sales = spark.read.schema(schema_sales).option('header', True).csv(file_path1)



In [0]:
# Print the schemas again to confirm the explicit types were applied.
for schema_label, frame in (('inventory data :', inventory), ('sales data :', sales)):
    print(schema_label)
    frame.printSchema()



##### To improve the performance of the process (notebook execution), we will cache the DataFrames.
Caching helps optimize performance by storing frequently accessed data in memory, reducing the need to recompute or reload the data, thereby improving query performance and reducing latency.

In [0]:
# Mark both DataFrames for caching so later cells can reuse them.
# NOTE(review): cache() is lazy in Spark, so presumably the data is only
# materialised by the first action in a later cell — confirm if timing matters.
inventory.cache()
sales.cache()



In [0]:
# Show the typed inventory frame.
display(inventory)



In [0]:
# Show the typed sales frame.
display(sales)



In [0]:
# Gather (row count, column count) for each frame first, then report them.
inventory_shape = (inventory.count(), len(inventory.columns))
sales_shape = (sales.count(), len(sales.columns))

# inventory
print('*****inventory data*****')
print('\nNo. of rows:', inventory_shape[0], '\nNo. of columns :', inventory_shape[1], '\n')

print('======================================')

# sales
print('\n*******sales data*******')
print('\nNo. of rows:', sales_shape[0], '\nNo. of columns :', sales_shape[1])



In [0]:
# Summary statistics for the inventory frame.
display(inventory.summary())



#### Outcomes
- The Inventory value is normally distributed. The data quality seems to be good.
- The range of inventory value is 50 - 1000.
- No missing values.

In [0]:
# Summary statistics for the sales frame.
display(sales.summary())



#### Outcomes
- The sales value is normally distributed. The data quality seems to be good.
- The range of sales value is 10 - 1499.
- The average sales value is 747.
- There are 37 missing values in sales amount column.

In [0]:
# Most frequent product_id across all sales rows.
sales.agg(mode('product_id')).show()



- This shows that, for categorical (string) columns, the summary's min and max are ordered by ASCII value; they do not give the most frequent (mode) or least frequent entries.

## Data cleaning

In [0]:
# Record counts before dropping duplicates.
for count_label, frame in (('inevntory data :', inventory), ('sales data :', sales)):
    print(count_label, frame.count())




In [0]:
# Remove fully duplicated rows from both frames.
inventory, sales = inventory.dropDuplicates(), sales.dropDuplicates()



In [0]:
# Record counts after dropping duplicates.
for count_label, frame in (('inevntory data :', inventory), ('sales data :', sales)):
    print(count_label, frame.count())



- We notice that there are no duplicate entries in inventory data.
- There are 33 duplicate records present in sales data.

- As noticed earlier we have missing values in sales data.
- Let us check what percent of data has missing values.

In [0]:
# Percentage of null values per column of the sales frame.
total_records = sales.count()

# Count the nulls of every column in a single aggregation instead of
# launching one Spark job per column.
null_counts = sales.select(
    [F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in sales.columns]
).first()

for column_name in sales.columns:
    print(column_name)
    # Guard against an empty frame, which would otherwise divide by zero.
    print((null_counts[column_name] * 100) / total_records if total_records else 0.0)



- This explains that the sales amount field has about 0.37% missing values.
- That is a very minimal number.
- We can simply drop those records.
- But here for exercising the data let's impute the data manually.

In [0]:
# Inspect the rows whose sale_amount is missing.
display(sales.where(col('sale_amount').isNull()))



- From the data, the granularity is at sale_id, store_id, customer_id and product_id.
- The sale_id will always be distinct.
- So, to extract the trend of the data, let us put the average sale amount in place of null values, at customer, store and product level.
- Explaining in general, we are considering a scenario in which a customer regularly purchases a uniform quantity of that particular product from that respective store.

In [0]:
# Rows that already have a sale_amount.
non_null_df = sales.where(col('sale_amount').isNotNull())



In [0]:
# Rows whose sale_amount is missing and needs to be imputed.
nulls_df = sales.where(col('sale_amount').isNull())



In [0]:
# Average sale_amount per (store, product, customer) over the rows that have a value.
# F.avg is used rather than the star-imported avg because a later cell rebinds
# the name `avg` to a number, which would break this cell when it is re-run.
non_null_agg = (
    non_null_df
    .groupBy('store_id', 'product_id', 'customer_id')
    .agg(F.avg('sale_amount').alias('sale_amount'))
)



In [0]:
# Show the per-group averages.
display(non_null_agg)



In [0]:
# Preview the null rows joined to their per-(store, product, customer) average.
# The joined result is only displayed here; it is not assigned to a variable.
display(
    nulls_df.drop('sale_amount').join(
        non_null_agg,
        on=['store_id', 'product_id', 'customer_id'],
        how='left',
    )
)



In [0]:
# Fill the missing sale_amount values with the matching
# (store, product, customer) average and recombine with the complete rows.
# Previously the un-imputed nulls_df was unioned back, so the group-level
# imputation never reached `sales`.
imputed_df = (
    nulls_df.drop('sale_amount')
    .join(non_null_agg, ['store_id', 'product_id', 'customer_id'], 'left')
    # The average is fractional; round and cast so the column keeps the
    # IntegerType declared in schema_sales after the union.
    .withColumn('sale_amount', F.round(F.col('sale_amount')).cast(IntegerType()))
)

# Groups with no non-null sale stay null here and are handled by the
# overall-average fallback further down.
sales = imputed_df.unionByName(non_null_df)



- Since a few combinations are not covered by this method,
- let's replace the rest with the entire data frame's average.

In [0]:
# Overall average sale_amount, used as the fallback for the remaining nulls.
# F.avg is used rather than the star-imported avg so this cell still works
# if the name `avg` has been rebound to a number elsewhere in the notebook.
sales.agg(F.avg('sale_amount')).show()



In [0]:
avg = 746.95



In [0]:
sales = sales.fillna({'sale_amount':avg})



In [0]:
# Any row still missing sale_amount would be listed here.
sales.where(col('sale_amount').isNull()).show()



- Hence all the null values are treated

## Now, it's time to save the data in the silver layer for further processing.
## Need to save in the delta format.

In [0]:
# Write the cleaned inventory frame to the silver layer as a single output file.
# NOTE(review): the section heading asks for delta format, but this call
# writes parquet — confirm which format is intended.
inventory_silver_path = 'dbfs:/mnt/shopping_data_dbfs/Silver-layer/BG-Thushara/capstone-azure-training/raw/main/Data/New_data/inventory_data.parquet'
inventory.coalesce(1).write.parquet(inventory_silver_path, mode='overwrite')



In [0]:
# Write the cleaned sales frame to the silver layer as a single output file.
# NOTE(review): the section heading asks for delta format, but this call
# writes parquet — confirm which format is intended.
sales_silver_path = 'dbfs:/mnt/shopping_data_dbfs/Silver-layer/BG-Thushara/capstone-azure-training/raw/main/Data/New_data/sales_data.parquet'
sales.coalesce(1).write.parquet(sales_silver_path, mode='overwrite')

