In [17]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, coalesce, try_to_timestamp, lit, trim, upper, regexp_replace

In [2]:
spark = SparkSession.builder \
.appName('E-Commerce Website') \
.getOrCreate()

#PHASE 1 – Ingestion

1. Read clickstream.csv with all elds as StringType.


In [3]:
df_raw = spark.read \
.option("header", True) \
.option("inferSchema", True) \
.csv("/content/clickstream.csv")

2. Show schema and row count.


In [4]:
df_raw.printSchema()
print("Row count:", df_raw.count())

root
 |-- session_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- page: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- price: string (nullable = true)
 |-- event_time: string (nullable = true)
 |-- device: string (nullable = true)
 |-- city: string (nullable = true)
 |-- status: string (nullable = true)

Row count: 200000


3. Display sample records.


In [5]:
df_raw.show(10, False)

+----------+-------+--------+----------------+----------+-----+-------------------+-------+---------+---------+
|session_id|user_id|page    |event_type      |product_id|price|event_time         |device |city     |status   |
+----------+-------+--------+----------------+----------+-----+-------------------+-------+---------+---------+
|S92190    |U6115  |Cart    |VIEW            |P884      |NULL |01/01/2026 10:00:00|Desktop|Mumbai   |FAILED   |
|S54070    |U7007  |Product |PAYMENT         |P227      |2007 |2026-01-01 10:00:02|Mobile |Pune     |ABANDONED|
|S71940    |U5946  |Cart    |REMOVE_FROM_CART|P122      |NULL |2026-01-01 10:00:04|Desktop|Bangalore|FAILED   |
|S83079    |U2525  |Product |REMOVE_FROM_CART|P290      |NULL |2026-01-01 10:00:06|Desktop|Chennai  |ABANDONED|
|S49645    |U4633  |Cart    |ADD_TO_CART     |P143      |NULL |2026-01-01 10:00:08|Mobile |Bangalore|SUCCESS  |
|S31144    |U8525  |Home    |CANCEL          |P391      |NULL |2026-01-01 10:00:10|Desktop|Chennai  |SUC

4. Explain why schema inference is dangerous here

In [6]:
#price sometimes is empty or malformed so wrong numeric inference
#event_time has multiple formats so inference failure
#inconsistent strings(view, VIEW, View)
#Production logs often contain corrupt rows
#So it is better to ingest raw logs as string and clean explicitly.

#PHASE 2 – Cleaning


1. Trim all string columns.



In [8]:
for c in df_raw.columns:
  df_raw = df_raw.withColumn(c, trim(col(c)))
df_raw.show(10, False)

+----------+-------+--------+----------------+----------+-----+-------------------+-------+---------+---------+
|session_id|user_id|page    |event_type      |product_id|price|event_time         |device |city     |status   |
+----------+-------+--------+----------------+----------+-----+-------------------+-------+---------+---------+
|S92190    |U6115  |Cart    |VIEW            |P884      |NULL |01/01/2026 10:00:00|Desktop|Mumbai   |FAILED   |
|S54070    |U7007  |Product |PAYMENT         |P227      |2007 |2026-01-01 10:00:02|Mobile |Pune     |ABANDONED|
|S71940    |U5946  |Cart    |REMOVE_FROM_CART|P122      |NULL |2026-01-01 10:00:04|Desktop|Bangalore|FAILED   |
|S83079    |U2525  |Product |REMOVE_FROM_CART|P290      |NULL |2026-01-01 10:00:06|Desktop|Chennai  |ABANDONED|
|S49645    |U4633  |Cart    |ADD_TO_CART     |P143      |NULL |2026-01-01 10:00:08|Mobile |Bangalore|SUCCESS  |
|S31144    |U8525  |Home    |CANCEL          |P391      |NULL |2026-01-01 10:00:10|Desktop|Chennai  |SUC

2. Standardize:
page
event_type
status


In [9]:
df_clean = df_raw \
.withColumn("page", upper(col("page"))) \
.withColumn("event_type", upper(col("event_type"))) \
.withColumn("status", upper(col("status")))
df_clean.show(10, False)

+----------+-------+--------+----------------+----------+-----+-------------------+-------+---------+---------+
|session_id|user_id|page    |event_type      |product_id|price|event_time         |device |city     |status   |
+----------+-------+--------+----------------+----------+-----+-------------------+-------+---------+---------+
|S92190    |U6115  |CART    |VIEW            |P884      |NULL |01/01/2026 10:00:00|Desktop|Mumbai   |FAILED   |
|S54070    |U7007  |PRODUCT |PAYMENT         |P227      |2007 |2026-01-01 10:00:02|Mobile |Pune     |ABANDONED|
|S71940    |U5946  |CART    |REMOVE_FROM_CART|P122      |NULL |2026-01-01 10:00:04|Desktop|Bangalore|FAILED   |
|S83079    |U2525  |PRODUCT |REMOVE_FROM_CART|P290      |NULL |2026-01-01 10:00:06|Desktop|Chennai  |ABANDONED|
|S49645    |U4633  |CART    |ADD_TO_CART     |P143      |NULL |2026-01-01 10:00:08|Mobile |Bangalore|SUCCESS  |
|S31144    |U8525  |HOME    |CANCEL          |P391      |NULL |2026-01-01 10:00:10|Desktop|Chennai  |SUC

3. Clean price:

Remove commas

Convert to DoubleType

Replace invalid values with null


In [24]:
df_clean = df_clean.withColumn("price", regexp_replace(col("price"), "[^0-9.]", "").try_cast("double"))
df_clean.show(10, False)

+----------+-------+--------+----------------+----------+------+-------------------+-------+---------+---------+-------------------+
|session_id|user_id|page    |event_type      |product_id|price |event_time         |device |city     |status   |event_time_clean   |
+----------+-------+--------+----------------+----------+------+-------------------+-------+---------+---------+-------------------+
|S92190    |U6115  |CART    |VIEW            |P884      |NULL  |01/01/2026 10:00:00|Desktop|Mumbai   |FAILED   |2026-01-01 10:00:00|
|S54070    |U7007  |PRODUCT |PAYMENT         |P227      |2007.0|2026-01-01 10:00:02|Mobile |Pune     |ABANDONED|2026-01-01 10:00:02|
|S71940    |U5946  |CART    |REMOVE_FROM_CART|P122      |NULL  |2026-01-01 10:00:04|Desktop|Bangalore|FAILED   |2026-01-01 10:00:04|
|S83079    |U2525  |PRODUCT |REMOVE_FROM_CART|P290      |NULL  |2026-01-01 10:00:06|Desktop|Chennai  |ABANDONED|2026-01-01 10:00:06|
|S49645    |U4633  |CART    |ADD_TO_CART     |P143      |NULL  |2026-

4. Parse event_time into:
event_time_clean (TimestampType)
Supporting:
yyyy-MM-dd HH:mm:ss
dd/MM/yyyy HH:mm:ss
yyyy/MM/dd HH:mm:ss

In [18]:
df_clean = df_clean.withColumn(
    "event_time_clean",
    coalesce(
    try_to_timestamp(col("event_time"), lit("yyyy-MM-dd HH:mm:ss")),
    try_to_timestamp(col("event_time"), lit("dd/MM/yyyy HH:mm:ss")),
    try_to_timestamp(col("event_time"), lit("yyyy/MM/dd HH:mm:ss"))
))
df_clean.show(10, False)

+----------+-------+--------+----------------+----------+------+-------------------+-------+---------+---------+-------------------+
|session_id|user_id|page    |event_type      |product_id|price |event_time         |device |city     |status   |event_time_clean   |
+----------+-------+--------+----------------+----------+------+-------------------+-------+---------+---------+-------------------+
|S92190    |U6115  |CART    |VIEW            |P884      |NULL  |01/01/2026 10:00:00|Desktop|Mumbai   |FAILED   |2026-01-01 10:00:00|
|S54070    |U7007  |PRODUCT |PAYMENT         |P227      |2007.0|2026-01-01 10:00:02|Mobile |Pune     |ABANDONED|2026-01-01 10:00:02|
|S71940    |U5946  |CART    |REMOVE_FROM_CART|P122      |NULL  |2026-01-01 10:00:04|Desktop|Bangalore|FAILED   |2026-01-01 10:00:04|
|S83079    |U2525  |PRODUCT |REMOVE_FROM_CART|P290      |NULL  |2026-01-01 10:00:06|Desktop|Chennai  |ABANDONED|2026-01-01 10:00:06|
|S49645    |U4633  |CART    |ADD_TO_CART     |P143      |NULL  |2026-

#PHASE 3 – Session Validation



1. Identify incomplete sessions.


In [19]:
incomplete_sessions = df_clean.filter(col("event_type") != "PAYMENT")

print("Number of incomplete sessions:", incomplete_sessions.select("session_id").distinct().count())
incomplete_sessions.show(5, False)

Number of incomplete sessions: 75879
+----------+-------+-------+----------------+----------+-----+-------------------+-------+---------+---------+-------------------+
|session_id|user_id|page   |event_type      |product_id|price|event_time         |device |city     |status   |event_time_clean   |
+----------+-------+-------+----------------+----------+-----+-------------------+-------+---------+---------+-------------------+
|S92190    |U6115  |CART   |VIEW            |P884      |NULL |01/01/2026 10:00:00|Desktop|Mumbai   |FAILED   |2026-01-01 10:00:00|
|S71940    |U5946  |CART   |REMOVE_FROM_CART|P122      |NULL |2026-01-01 10:00:04|Desktop|Bangalore|FAILED   |2026-01-01 10:00:04|
|S83079    |U2525  |PRODUCT|REMOVE_FROM_CART|P290      |NULL |2026-01-01 10:00:06|Desktop|Chennai  |ABANDONED|2026-01-01 10:00:06|
|S49645    |U4633  |CART   |ADD_TO_CART     |P143      |NULL |2026-01-01 10:00:08|Mobile |Bangalore|SUCCESS  |2026-01-01 10:00:08|
|S31144    |U8525  |HOME   |CANCEL          |P

2. Identify sessions that reached:

VIEW

ADD_TO_CART

CHECKOUT

PAYMENT

In [28]:
sessions_reached_view = df_clean.filter(col("event_type") == "VIEW").select("session_id").distinct()
sessions_reached_add_to_cart = df_clean.filter(col("event_type") == "ADD_TO_CART").select("session_id").distinct()
sessions_reached_checkout = df_clean.filter(col("event_type") == "CHECKOUT").select("session_id").distinct()
sessions_reached_payment = df_clean.filter(col("event_type") == "PAYMENT").select("session_id").distinct()

print("Number of sessions that reached VIEW:", sessions_reached_view.count())
print("Number of sessions that reached ADD_TO_CART:", sessions_reached_add_to_cart.count())
print("Number of sessions that reached CHECKOUT:", sessions_reached_checkout.count())
print("Number of sessions that reached PAYMENT:", sessions_reached_payment.count())


Number of sessions that reached VIEW: 28058
Number of sessions that reached ADD_TO_CART: 28121
Number of sessions that reached CHECKOUT: 27640
Number of sessions that reached PAYMENT: 27947


2. Remove duplicate events inside a session.


In [35]:
df_dedup = df_clean.dropDuplicates(["session_id", "event_type", "event_time_clean", "product_id"])
print("Original row count (before duplicate removal):", df_clean.count())
print("Number of rows after removing duplicates:", df_dedup.count())

Original row count (before duplicate removal): 200000
Number of rows after removing duplicates: 200000


3. Filter only meaningful sessions.

In [36]:
meaningful_sessions_ids = df_clean.filter(
    (col("event_type") == "ADD_TO_CART") | (col("event_type") == "PAYMENT")
).select("session_id").distinct()

df_meaningful = df_dedup.join(meaningful_sessions_ids, on="session_id", how="inner")
print("Number of meaningful sessions:", meaningful_sessions_ids.count())
print("Number of rows in meaningful sessions (after removing duplicates):", df_meaningful.count())

df_meaningful.show(5, False)

Number of meaningful sessions: 47263
Number of rows in meaningful sessions (after removing duplicates): 136816
+----------+-------+-------+-----------+----------+------+-------------------+------+---------+---------+-------------------+
|session_id|user_id|page   |event_type |product_id|price |event_time         |device|city     |status   |event_time_clean   |
+----------+-------+-------+-----------+----------+------+-------------------+------+---------+---------+-------------------+
|S10002    |U2912  |PRODUCT|ADD_TO_CART|P576      |NULL  |2026-01-04 14:34:16|Tablet|Mumbai   |ABANDONED|2026-01-04 14:34:16|
|S10002    |U2405  |HOME   |CHECKOUT   |P366      |NULL  |2026-01-06 00:42:36|Tablet|Mumbai   |ABANDONED|2026-01-06 00:42:36|
|S10002    |U5637  |SEARCH |PAYMENT    |P291      |2101.0|2026-01-02 21:49:58|Tablet|Bangalore|FAILED   |2026-01-02 21:49:58|
|S10003    |U7583  |CART   |CANCEL     |P113      |NULL  |2026-01-04 15:28:54|Mobile|Mumbai   |SUCCESS  |2026-01-04 15:28:54|
|S10003

#PHASE 4 – Conversion


1. Total sessions


In [37]:
total_sessions = df_meaningful.select("session_id").distinct().count()
print("Total sessions:", total_sessions)

Total sessions: 47263


2. Sessions with VIEW


In [38]:
sessions_with_view = df_meaningful.filter(col("event_type") == "VIEW").select("session_id").distinct().count()
print("Sessions with VIEW:", sessions_with_view)

Sessions with VIEW: 14616


3. Sessions with ADD_TO_CART


In [39]:
sessions_with_add_to_cart = df_meaningful.filter(col("event_type") == "ADD_TO_CART").select("session_id").distinct().count()
print("Sessions with ADD_TO_CART:", sessions_with_add_to_cart)

Sessions with ADD_TO_CART: 28121


4. Sessions with CHECKOUT


In [40]:
sessions_with_checkout = df_meaningful.filter(col("event_type") == "CHECKOUT").select("session_id").distinct().count()
print("Sessions with CHECKOUT:", sessions_with_checkout)

Sessions with CHECKOUT: 14524


5. Sessions with PAYMENT
Calculate:
conversion_rate = PAYMENT / VIEW
Deliverable: Funnel table

In [45]:
sessions_with_payment = df_meaningful.filter(col("event_type") == "PAYMENT").select("session_id").distinct().count()
print("Sessions with PAYMENT:", sessions_with_payment)

Sessions with PAYMENT: 27947


In [47]:
conversion_rate = (sessions_with_payment / sessions_with_view) * 100 if sessions_with_view > 0 else 0
print("Conversion Rate (PAYMENT / VIEW): {:.2f}%".format(conversion_rate))

funnel_data = [
    ("Total sessions", total_sessions),
    ("Sessions with VIEW", sessions_with_view),
    ("Sessions with ADD_TO_CART", sessions_with_add_to_cart),
    ("Sessions with CHECKOUT", sessions_with_checkout),
    ("Sessions with PAYMENT", sessions_with_payment)
]

funnel_df = spark.createDataFrame(funnel_data, ["Stage", "Count"])
funnel_df.show()

Conversion Rate (PAYMENT / VIEW): 191.21%
+--------------------+-----+
|               Stage|Count|
+--------------------+-----+
|      Total sessions|47263|
|  Sessions with VIEW|14616|
|Sessions with ADD...|28121|
|Sessions with CHE...|14524|
|Sessions with PAY...|27947|
+--------------------+-----+



#PHASE 5 – Revenue Analysis


1. Total revenue from PAYMENT events.


In [43]:
total_revenue = df_meaningful.filter(col("event_type") == "PAYMENT").select(sum("price")).collect()[0][0]
print("Total revenue from PAYMENT events:", total_revenue)

Total revenue from PAYMENT events: 68865078.0


2. Revenue per city.


In [46]:
from pyspark.sql.functions import sum

revenue_per_city = df_meaningful.filter(col("event_type") == "PAYMENT") \
                                .groupBy("city") \
                                .agg(sum("price").alias("total_revenue"))

print("Revenue per city:")
revenue_per_city.show()

Revenue per city:
+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|    9631096.0|
|  Chennai|    9822381.0|
|   Mumbai|  1.0267458E7|
|  Kolkata|    9804316.0|
|     Pune|    9678465.0|
|    Delhi|    9771651.0|
|Hyderabad|    9889711.0|
+---------+-------------+



3. Revenue per device.


In [48]:
revenue_per_device = df_meaningful.filter(col("event_type") == "PAYMENT") \
                                  .groupBy("device") \
                                  .agg(sum("price").alias("total_revenue"))

print("Revenue per device:")
revenue_per_device.show()

Revenue per device:
+-------+-------------+
| device|total_revenue|
+-------+-------------+
| Mobile|  2.3013943E7|
| Tablet|  2.2894107E7|
|Desktop|  2.2957028E7|
+-------+-------------+



4. Average order value.


In [49]:
from pyspark.sql.functions import avg

average_order_value = df_meaningful.filter(col("event_type") == "PAYMENT") \
                                   .agg(avg("price").alias("average_order_value"))

print("Average order value:")
average_order_value.show()

Average order value:
+-------------------+
|average_order_value|
+-------------------+
|  4144.503972075108|
+-------------------+



4. Top 10 products by revenue

In [50]:
from pyspark.sql.window import Window
from pyspark.sql.functions import sum, desc

top_10_products_by_revenue = df_meaningful.filter(col("event_type") == "PAYMENT") \
                                        .groupBy("product_id") \
                                        .agg(sum("price").alias("total_revenue")) \
                                        .orderBy(desc("total_revenue")) \
                                        .limit(10)

print("Top 10 products by revenue:")
top_10_products_by_revenue.show()

Top 10 products by revenue:
+----------+-------------+
|product_id|total_revenue|
+----------+-------------+
|      P351|     159808.0|
|      P920|     146194.0|
|      P406|     144192.0|
|      P802|     140183.0|
|      P213|     131318.0|
|      P558|     130222.0|
|      P991|     130174.0|
|      P604|     128546.0|
|      P691|     128446.0|
|      P975|     128056.0|
+----------+-------------+



#PHASE 6 – Window Functions



1. Rank products by revenue per city.


In [55]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, desc, col, sum

window_spec_city = Window.partitionBy("city").orderBy(desc("total_revenue"))

revenue_per_product_per_city = df_meaningful.filter(col("event_type") == "PAYMENT") \
                                            .groupBy("city", "product_id") \
                                            .agg(sum("price").alias("total_revenue"))

ranked_products_by_revenue_per_city = revenue_per_product_per_city.withColumn("rank", rank().over(window_spec_city))

print("Rank of products by revenue per city:")
ranked_products_by_revenue_per_city.orderBy("city", "rank").show(10)

Rank of products by revenue per city:
+---------+----------+-------------+----+
|     city|product_id|total_revenue|rank|
+---------+----------+-------------+----+
|Bangalore|      P474|      45921.0|   1|
|Bangalore|      P920|      44444.0|   2|
|Bangalore|      P207|      39496.0|   3|
|Bangalore|      P655|      37851.0|   4|
|Bangalore|      P416|      36446.0|   5|
|Bangalore|      P806|      35214.0|   6|
|Bangalore|      P711|      33642.0|   7|
|Bangalore|      P415|      33539.0|   8|
|Bangalore|      P583|      33466.0|   9|
|Bangalore|      P275|      33193.0|  10|
+---------+----------+-------------+----+
only showing top 10 rows


2. Rank users by spending.


In [56]:
from pyspark.sql.functions import sum, desc
from pyspark.sql.window import Window

total_spending_per_user = df_meaningful.filter(col("event_type") == "PAYMENT") \
                                       .groupBy("user_id") \
                                       .agg(sum("price").alias("total_spent"))

window_spec_user = Window.orderBy(desc("total_spent"))
ranked_users_by_spending = total_spending_per_user.withColumn("rank", rank().over(window_spec_user))

print("Rank of users by spending:")
ranked_users_by_spending.show(10)

Rank of users by spending:
+-------+-----------+----+
|user_id|total_spent|rank|
+-------+-----------+----+
|  U8595|    50576.0|   1|
|  U5108|    47995.0|   2|
|  U1718|    44469.0|   3|
|  U4136|    43978.0|   4|
|  U9586|    41612.0|   5|
|  U7115|    39554.0|   6|
|  U7485|    38995.0|   7|
|  U6867|    38774.0|   8|
|  U2531|    38443.0|   9|
|  U3179|    37318.0|  10|
+-------+-----------+----+
only showing top 10 rows


3. Find top 3 products per city.


In [58]:
from pyspark.sql.functions import rank, desc
from pyspark.sql.window import Window

window_spec_city_rank = Window.partitionBy("city").orderBy(desc("total_revenue"))
top_3_products_per_city = revenue_per_product_per_city \
                                .withColumn("rank", rank().over(window_spec_city_rank)) \
                                .filter(col("rank") <= 3)

print("Top 3 products by revenue per city:")
top_3_products_per_city.orderBy("city", "rank").show()

Top 3 products by revenue per city:
+---------+----------+-------------+----+
|     city|product_id|total_revenue|rank|
+---------+----------+-------------+----+
|Bangalore|      P474|      45921.0|   1|
|Bangalore|      P920|      44444.0|   2|
|Bangalore|      P207|      39496.0|   3|
|  Chennai|      P147|      41393.0|   1|
|  Chennai|      P288|      37741.0|   2|
|  Chennai|      P991|      37466.0|   3|
|    Delhi|      P335|      51187.0|   1|
|    Delhi|      P510|      43706.0|   2|
|    Delhi|      P645|      37947.0|   3|
|Hyderabad|      P974|      41474.0|   1|
|Hyderabad|      P916|      40200.0|   2|
|Hyderabad|      P545|      39121.0|   3|
|  Kolkata|      P315|      49000.0|   1|
|  Kolkata|      P199|      37541.0|   2|
|  Kolkata|      P331|      37277.0|   3|
|   Mumbai|      P868|      41679.0|   1|
|   Mumbai|      P920|      41320.0|   2|
|   Mumbai|      P351|      40993.0|   3|
|     Pune|      P597|      36208.0|   1|
|     Pune|      P368|      34496.0|   2

4. Identify power users

In [59]:
from pyspark.sql.functions import sum, count, col, desc
from pyspark.sql.window import Window
user_activity = df_meaningful.filter(col("event_type") == "PAYMENT") \
                             .groupBy("user_id") \
                             .agg(
                                 sum("price").alias("total_spent"),
                                 count("session_id").alias("total_purchases")
                             )

window_spec_spending = Window.orderBy(desc("total_spent"))
window_spec_purchases = Window.orderBy(desc("total_purchases"))

ranked_user_activity = user_activity \
    .withColumn("spending_rank", rank().over(window_spec_spending)) \
    .withColumn("purchase_rank", rank().over(window_spec_purchases))

num_users = user_activity.count()
top_10_percent_threshold = num_users // 10

power_users = ranked_user_activity.filter(
    (col("spending_rank") <= top_10_percent_threshold) |
    (col("purchase_rank") <= top_10_percent_threshold)
)

print("Number of power users (top 10% by spending or purchases):", power_users.count())
print("Sample of power users:")
power_users.orderBy("spending_rank").show(10)

Number of power users (top 10% by spending or purchases): 1939
Sample of power users:
+-------+-----------+---------------+-------------+-------------+
|user_id|total_spent|total_purchases|spending_rank|purchase_rank|
+-------+-----------+---------------+-------------+-------------+
|  U8595|    50576.0|              9|            1|           37|
|  U5108|    47995.0|              9|            2|           37|
|  U1718|    44469.0|             11|            3|            3|
|  U4136|    43978.0|             11|            4|            3|
|  U9586|    41612.0|              7|            5|          297|
|  U7115|    39554.0|             10|            6|           16|
|  U7485|    38995.0|              8|            7|          119|
|  U6867|    38774.0|             11|            8|            3|
|  U2531|    38443.0|              8|            9|          119|
|  U3179|    37318.0|              8|           10|          119|
+-------+-----------+---------------+-------------+-----

#PHASE 7 – Abandonment Analysis


1. Sessions that reached CHECKOUT but no PAYMENT.


In [60]:
checkout_sessions = df_meaningful.filter(col("event_type") == "CHECKOUT").select("session_id").distinct()
payment_sessions = df_meaningful.filter(col("event_type") == "PAYMENT").select("session_id").distinct()

abandoned_checkout_sessions = checkout_sessions.join(
    payment_sessions,
    checkout_sessions.session_id == payment_sessions.session_id,
    "left_anti"
)

print("Number of sessions that reached CHECKOUT but not PAYMENT:", abandoned_checkout_sessions.count())
abandoned_checkout_sessions.show(5, False)

Number of sessions that reached CHECKOUT but not PAYMENT: 5979
+----------+
|session_id|
+----------+
|S10006    |
|S10043    |
|S10044    |
|S10051    |
|S10081    |
+----------+
only showing top 5 rows


2. Group abandonment by:
device
city


In [61]:
from pyspark.sql.functions import count
abandonment_by_device = df_meaningful.join(abandoned_checkout_sessions, on="session_id", how="inner") \
                                     .groupBy("device") \
                                     .agg(count("session_id").alias("abandoned_sessions_count"))

print("Abandonment by device:")
abandonment_by_device.show()
abandonment_by_city = df_meaningful.join(abandoned_checkout_sessions, on="session_id", how="inner") \
                                   .groupBy("city") \
                                   .agg(count("session_id").alias("abandoned_sessions_count"))

print("Abandonment by city:")
abandonment_by_city.show()

Abandonment by device:
+-------+------------------------+
| device|abandoned_sessions_count|
+-------+------------------------+
| Mobile|                    6873|
| Tablet|                    7150|
|Desktop|                    6987|
+-------+------------------------+

Abandonment by city:
+---------+------------------------+
|     city|abandoned_sessions_count|
+---------+------------------------+
|Bangalore|                    2967|
|  Chennai|                    2931|
|   Mumbai|                    3035|
|  Kolkata|                    2928|
|     Pune|                    2996|
|    Delhi|                    3112|
|Hyderabad|                    3041|
+---------+------------------------+



3. Find where abandonment is highest.

In [63]:
from pyspark.sql.functions import col, desc

highest_abandonment_device = abandonment_by_device.orderBy(col("abandoned_sessions_count").desc()).limit(1)
print("Device with highest abandonment:")
highest_abandonment_device.show()
highest_abandonment_city = abandonment_by_city.orderBy(col("abandoned_sessions_count").desc()).limit(1)
print("City with highest abandonment:")
highest_abandonment_city.show()

Device with highest abandonment:
+------+------------------------+
|device|abandoned_sessions_count|
+------+------------------------+
|Tablet|                    7150|
+------+------------------------+

City with highest abandonment:
+-----+------------------------+
| city|abandoned_sessions_count|
+-----+------------------------+
|Delhi|                    3112|
+-----+------------------------+

