## Data Profiling Flipkart Products Dataset

In [1]:
# Install the Delta Lake Python bindings into the kernel's environment.
# NOTE(review): the version is not pinned; consider pinning it to the release
# that matches the cluster's Spark version — TODO confirm the right version.
%pip install delta_spark

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from pathlib import Path

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, BooleanType, IntegerType, FloatType
from pyspark.sql import functions as F
from delta.pip_utils import configure_spark_with_delta_pip

In [4]:
# Comma-separated list of every *.jar found in ../jars (relative to the
# notebook's working directory); an empty string when there are none.
JAR_PACKAGES = ",".join(map(str, Path("../jars").glob("*.jar")))

In [5]:
# Object-store credentials: taken from the environment when set, otherwise the
# previous local-development values are used so the notebook keeps working
# unchanged. Avoid committing real secrets to the notebook.
S3A_ACCESS_KEY = os.environ.get("S3A_ACCESS_KEY", "datalake")
S3A_SECRET_KEY = os.environ.get("S3A_SECRET_KEY", "datalake")

# Session builder for the "delta" application on the standalone Spark master,
# wired to the Hive metastore, the local jars and the S3A endpoint.
builder = (
    SparkSession.builder.appName("delta")
    .master("spark://spark:7077")
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.jars", JAR_PACKAGES)
    # Delta Lake SQL extension and catalog (each was previously set twice).
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # S3A access to the object store behind http://minio:9000.
    .config("spark.hadoop.fs.s3a.access.key", S3A_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", S3A_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
)

In [6]:
# Pass the builder through delta-spark's pip helper, enable Hive support, then
# create the session (or reuse one that is already running).
spark = configure_spark_with_delta_pip(builder).enableHiveSupport().getOrCreate()

In [7]:
# Explicit schema for the Flipkart CSV. Only the two price columns are read as
# floats; every other column, including the ratings, is read as a string.
# uniq_id is declared non-nullable here, but the printed schema of the loaded
# DataFrame still reports it as nullable.
ecommerce_schema = (
    StructType()
    .add("uniq_id", StringType(), nullable=False)
    .add("crawl_timestamp", StringType())
    .add("product_url", StringType())
    .add("product_name", StringType())
    .add("product_category_tree", StringType())
    .add("pid", StringType())
    .add("retail_price", FloatType())
    .add("discounted_price", FloatType())
    .add("image", StringType())
    .add("is_FK_Advantage_product", StringType())
    .add("description", StringType())
    .add("product_rating", StringType())
    .add("overall_rating", StringType())
    .add("brand", StringType())
    .add("product_specifications", StringType())
)

# Shape of one entry in the product specification list: a key/value pair.
dict_specs_schema = (
    StructType()
    .add("key", StringType())
    .add("value", StringType())
)

In [8]:
bronze_container_path = "s3a://bronze"

# Reader options: a double quote escapes quotes inside quoted fields, quoted
# fields may span several lines, and the first line holds the column names.
csv_read_options = {
    "escape": '"',
    "multiLine": True,
    "header": True,
}
source_csv_path = f"{bronze_container_path}/flipkart_ecommerce.csv"

# Load the raw file with the explicit schema instead of inferring one.
df = (
    spark.read.format("csv")
    .options(**csv_read_options)
    .schema(ecommerce_schema)
    .load(source_csv_path)
)

In [9]:
# Show the schema Spark actually applied. Note in the output that uniq_id is
# reported as nullable even though the declared schema marks it non-nullable.
df.printSchema()

root
 |-- uniq_id: string (nullable = true)
 |-- crawl_timestamp: string (nullable = true)
 |-- product_url: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_category_tree: string (nullable = true)
 |-- pid: string (nullable = true)
 |-- retail_price: float (nullable = true)
 |-- discounted_price: float (nullable = true)
 |-- image: string (nullable = true)
 |-- is_FK_Advantage_product: string (nullable = true)
 |-- description: string (nullable = true)
 |-- product_rating: string (nullable = true)
 |-- overall_rating: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- product_specifications: string (nullable = true)



### Calculating DataFrame Statistics

In [10]:
# Summary statistics (count, mean and the other describe() rows) for every
# column, printed vertically so the 15-column table stays readable.
df.describe().show(vertical=True)

-RECORD 0---------------------------------------
 summary                 | count                
 uniq_id                 | 20000                
 crawl_timestamp         | 20000                
 product_url             | 20000                
 product_name            | 20000                
 product_category_tree   | 20000                
 pid                     | 20000                
 retail_price            | 19922                
 discounted_price        | 19922                
 image                   | 19997                
 is_FK_Advantage_product | 20000                
 description             | 19998                
 product_rating          | 20000                
 overall_rating          | 20000                
 brand                   | 14139                
 product_specifications  | 19986                
-RECORD 1---------------------------------------
 summary                 | mean                 
 uniq_id                 | null                 
 crawl_timestamp    

### Getting Each Field's Cardinality

In [11]:
# Columns to profile, listed in the order they should appear in the result.
cardinality_columns = [
    "uniq_id",
    "crawl_timestamp",
    "product_url",
    "product_name",
    "pid",
    "retail_price",
    "discounted_price",
    "is_FK_Advantage_product",
    "description",
    "product_rating",
    "overall_rating",
    "brand",
    "product_category_tree",
    "image",
    "product_specifications",
]

# One distinct-value count per column (NULLs are not counted as a value),
# each aliased back to the column's own name.
df.select(
    [F.countDistinct(F.col(name)).alias(name) for name in cardinality_columns]
).show()

+-------+---------------+-----------+------------+-----+------------+----------------+-----------------------+-----------+--------------+--------------+-----+---------------------+-----+----------------------+
|uniq_id|crawl_timestamp|product_url|product_name|  pid|retail_price|discounted_price|is_FK_Advantage_product|description|product_rating|overall_rating|brand|product_category_tree|image|product_specifications|
+-------+---------------+-----------+------------+-----+------------+----------------+-----------------------+-----------+--------------+--------------+-----+---------------------+-----+----------------------+
|  20000|            371|      20000|       12676|19998|        2247|            2448|                      2|      17539|            36|            36| 3500|                 6466|18589|                 18825|
+-------+---------------+-----------+------------+-----+------------+----------------+-----------------------+-----------+--------------+--------------+-----+--

#### Observation
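- `uniq_id`, `product_url`, and `pid` are the highest-cardinality fields: `uniq_id` and `product_url` are unique in all 20,000 rows, while `pid` has 19,998 distinct values
- `crawl_timestamp` has 371 distinct values, which is "just right" for partitioning the data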

### Counting Null Values

In [12]:
# Columns to check, listed in the order they should appear in the result.
null_check_columns = [
    "uniq_id",
    "crawl_timestamp",
    "product_url",
    "product_name",
    "pid",
    "retail_price",
    "discounted_price",
    "is_FK_Advantage_product",
    "description",
    "product_rating",
    "overall_rating",
    "brand",
    "product_category_tree",
    "image",
    "product_specifications",
]

# Number of NULL rows per column: when() yields 1 only for NULL rows and
# count() ignores the NULLs produced for every other row.
# The aggregate already returns exactly one row, so no distinct() is needed.
df.select(
    [F.count(F.when(F.col(name).isNull(), 1)).alias(name) for name in null_check_columns]
).show()

+-------+---------------+-----------+------------+---+------------+----------------+-----------------------+-----------+--------------+--------------+-----+---------------------+-----+----------------------+
|uniq_id|crawl_timestamp|product_url|product_name|pid|retail_price|discounted_price|is_FK_Advantage_product|description|product_rating|overall_rating|brand|product_category_tree|image|product_specifications|
+-------+---------------+-----------+------------+---+------------+----------------+-----------------------+-----------+--------------+--------------+-----+---------------------+-----+----------------------+
|      0|              0|          0|           0|  0|          78|              78|                      0|          2|             0|             0| 5861|                    0|    3|                    14|
+-------+---------------+-----------+------------+---+------------+----------------+-----------------------+-----------+--------------+--------------+-----+------------

#### Observation
- 6 out of 15 fields contain null values
- A few products (78) do not have a price
- The majority of the null values (5,861) come from the `brand` field

### Similarity of Ratings

In [13]:
# Condition: both rating columns hold the same value. A row where either
# rating is NULL does not satisfy it and is therefore not counted.
ratings_are_equal = F.col("product_rating") == F.col("overall_rating")

# Count the rows that satisfy the condition.
equal_ratings_count = F.count(F.when(ratings_are_equal, 1)).alias("similar_ratings")

df.select(equal_ratings_count).show()

+---------------+
|similar_ratings|
+---------------+
|          20000|
+---------------+



#### Observation
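- `product_rating` and `overall_rating` are identical in all 20,000 rows, so the two fields are redundant in this dataset. This could mean either that each seller is only selling a single product, or that sellers have consistent customer service and product quality across their products. Keep in mind that most rows hold the value `No rating available` in both fields.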


### Inspecting Each Field's Value

In [14]:
# Print up to 5 rows from a roughly 0.25% random sample, vertically and
# without truncation; a fixed seed is passed so the sample can be reproduced.
df.sample(fraction=0.0025, seed=99).show(5, vertical=True, truncate=False)

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 uniq_id                 | c497e4250ea07a8a87efa8fda01ae6e0                                                                                                                                                                                                                                                                                                                       

#### Observation

- uniq_id - is a uuid type field which makes a good candidate for this table's primary key
- crawl_timestamp - is a UTC timestamp field which spans from December 2015 to June 2016 (fact_product_review)
- product_url - the webscrape url which is a metadata or a housekeeping field. (dim_web_metadata)
- product_name - the name of the product, (dim_product)
- pid - the product ID, (dim_product)
- retail_price - the selling retail price, (fact_product_review)
- discounted_price - the discounted price, (fact_product_review)
- is_FK_Advantage_product - whether or not the product has a flipkart fast and safe delivery guarantee (fact_product_performance).
- description - the product's description which is a text field containing escape characters like \n, \t, and \r (dim_product)
- product_rating - the product rating specific to the product page. Most products do not have a rating and instead hold the value `No rating available` (fact_product_review)
- overall_rating - the product rating specific to the seller. Most products do not have a rating and instead hold the value `No rating available` (fact_product_review)
- brand - the product's brand or manufacturer (dim_brand)
- product_category_tree - is an array that contains a single string value holding the category hierarchy to which the product belongs; it can be sliced into 3 different hierarchy levels (dim_main_category, dim_category, dim_sub_category)
- image - is an array of image urls associated to the product (dim_product)
- product_specifications - is a JSON-like field which consists of features or metadata pertinent to the product, e.g. Type, Color, Width, Height, Weight, etc. (dim_specification)


### Inspecting Values of Complex Data Typed Fields

- As observed, there are 3 fields `(product_category_tree, image, product_specifications)` that have a complex data type.
- I decided not to deep-dive into the image field for now; it would be worth investigating if use cases for machine learning modeling come up.

#### Inspecting The product_specifications Field

In [15]:
# Element type of the parsed specification list: an array of key/value structs.
spec_list_type = ArrayType(elementType=dict_specs_schema)

# 1. Replace every "=>" separator with ":" so the text can be read as JSON.
specs_json_text = F.regexp_replace(F.col("product_specifications"), "=>", ":")
# 2. Extract the list stored under the "product_specification" key.
spec_list_text = F.get_json_object(specs_json_text, "$.product_specification")
# 3. Parse that list into an array of key/value structs.
spec_list = F.from_json(spec_list_text, spec_list_type)

# Flatten to one row per specification entry, then split each entry into
# separate key and value columns. explode() emits no row for a NULL or empty
# array, so products without a parsable specification list drop out here.
product_specs_df = (
    df.withColumn("product_specifications", F.explode(spec_list))
    .withColumn("product_specifications_key", F.col("product_specifications").getField("key"))
    .withColumn("product_specifications_value", F.col("product_specifications").getField("value"))
    .drop("product_specifications")
)

# Expose the flattened result to Spark SQL.
product_specs_df.createOrReplaceTempView("product_specs")

In [16]:
# Preview the flattened specification key/value pairs from the product_specs
# view; truncate=False keeps long values fully visible.
spark.sql("""
    SELECT product_specifications_key
          ,product_specifications_value 
    FROM product_specs;
"""
).show(truncate=False)

+-----------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|product_specifications_key         |product_specifications_value                                                                                                                                                                 |
+-----------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Number of Contents in Sales Package|Pack of 3                                                                                                                                                                                    |
|Fabric                             |Cotton Lycra                                       

In [17]:
# Profile the flattened keys: total number of rows, number of distinct keys,
# and number of rows whose key is NULL.
spark.sql("""
    SELECT COUNT(1) AS total_keys
          ,COUNT(DISTINCT product_specifications_key) AS distinct_keys
          ,COUNT(IF(product_specifications_key IS NULL, 1, NULL)) AS null_keys 
    FROM product_specs;
"""
).show()

+----------+-------------+---------+
|total_keys|distinct_keys|null_keys|
+----------+-------------+---------+
|    237411|         2149|    11920|
+----------+-------------+---------+



In [18]:
# Frequency of each specification key, most frequent first.
# NOTE(review): COUNT(product_specifications_key) skips NULLs, so the group of
# NULL keys is reported with cnt = 0; use COUNT(*) if the NULL group should be
# ranked by its real size.
spark.sql("""
    SELECT COUNT(product_specifications_key) AS cnt
          ,product_specifications_key
    FROM product_specs
    GROUP BY product_specifications_key
    ORDER BY cnt DESC;
"""
).show(truncate=False)

+-----+-----------------------------------+
|cnt  |product_specifications_key         |
+-----+-----------------------------------+
|13629|Type                               |
|13138|Ideal For                          |
|9954 |Occasion                           |
|9746 |Color                              |
|9281 |Brand                              |
|7284 |Number of Contents in Sales Package|
|6733 |Sales Package                      |
|6374 |Model Number                       |
|6254 |Pattern                            |
|6249 |Fabric                             |
|4945 |Pack of                            |
|4788 |Model Name                         |
|4417 |Material                           |
|4382 |Style Code                         |
|4242 |Weight                             |
|3829 |Sleeve                             |
|3523 |Precious/Artificial Jewellery      |
|3389 |Base Material                      |
|3250 |Width                              |
|3005 |Height                   

#### Observation
- After flattening, there are 237,411 key/value pairs in the product_specifications field, of which about 5% (11,920) have a null key
- There are 2,149 distinct keys; I will narrow these down to the top 6 specifications by frequency, namely: `(Type, Ideal For, Occasion, Color, Brand, Number of Contents in Sales Package)`

#### Inspecting The product_category_tree Field

In [19]:
# The raw field is a JSON array of strings; take its first element, which
# holds the whole category path.
category_path = F.from_json(
    F.col("product_category_tree"), ArrayType(elementType=StringType())
).getItem(0)

# Split the path on " >> " into an array with one entry per category level.
category_levels = F.split(category_path, " >> ")

# Replace the raw field with the level array and record how many levels it has.
product_category_df = (
    df.withColumn("product_category_tree", category_levels)
    .withColumn("product_category_count", F.size(F.col("product_category_tree")))
)

# Expose the result to Spark SQL.
product_category_df.createOrReplaceTempView("product_category")

In [20]:
# Average number of category levels per product, i.e. the mean of
# product_category_count over the product_category view.
spark.sql("""
    SELECT AVG(product_category_count)
    FROM product_category
"""
).show()

+---------------------------+
|avg(product_category_count)|
+---------------------------+
|                     4.3492|
+---------------------------+



In [21]:
# Keep at most the first 5 category levels (F.slice positions are 1-based).
# Re-running this cell is safe: withColumn overwrites the column if it exists.
product_category_df = product_category_df.withColumn("product_category_tree_first_5", F.slice(F.col("product_category_tree"), start=1, length=5))

#### Observation
- The average depth of the arrays in product_category_tree is about 4.3, which means keeping 4 to 5 category levels is a good starting point