In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, Window as W
from pyspark.sql.types import StructType, StructField, StringType, DateType, IntegerType
from datetime import datetime

In [2]:
spark = SparkSession.builder.appName("DailyCodingProblem-27-08-2025").getOrCreate()

In [3]:
df = spark.read.format("csv") \
    .options(header=True, inferSchema=True) \
    .load("/home/jupyter/work/data/sources/csv/Day-09/user_sessions.csv")

## **Problem: Advanced User Session Analytics**

A user login website tracks detailed user activities across sessions. For each user per day, you need to calculate:

1. **Number of Logins** – Count of login events per day.
2. **Session Times** – Duration of each session (between login and logout) per day.
3. **Total Usage Time** – Sum of all session durations per day.
4. **Average Usage Time** – Average session duration per day.
5. **Most Used Device** – Device used most frequently per day.
6. **Top Location** – Location from where the user accessed most events per day.
7. **Unique Browsers** – Count of distinct browsers used per day.
8. **Event Counts** – Count of each event type per day (stored separately).

### **Additional Notes**

* Events belong to different sessions identified by `session_id`.
* A session starts at `login` and ends at `logout`. If no `logout`, assume session ends at **23:59:59** for that day.
* Event types include: `login`, `logout`, `click`, `view`, `purchase`.

---

## **Sample Input from CSV file (Simplified)**

| user\_id | event\_time         | event\_type | device  | location | browser | session\_id |
| -------- | ------------------- | ----------- | ------- | -------- | ------- | ----------- |
| U1       | 2025-08-27 09:00:00 | login       | mobile  | India    | Chrome  | S101        |
| U1       | 2025-08-27 09:15:00 | click       | mobile  | India    | Chrome  | S101        |
| U1       | 2025-08-27 09:45:00 | logout      | mobile  | India    | Chrome  | S101        |
| U2       | 2025-08-27 10:00:00 | login       | desktop | USA      | Firefox | S202        |
| U2       | 2025-08-27 10:30:00 | view        | desktop | USA      | Firefox | S202        |
| U2       | 2025-08-27 11:00:00 | logout      | desktop | USA      | Firefox | S202        |

---

## **Expected Output 1: Main Metrics**

| user\_id | date       | num\_logins | session\_times(mins) | total\_usage(mins) | avg\_usage(mins) | most\_device | top\_location | unique\_browsers |
| -------- | ---------- | ----------- | -------------------- | ------------------ | ---------------- | ------------ | ------------- | ---------------- |
| U1       | 2025-08-27 | 1           | \[45]                | 45                 | 45.0             | mobile       | India         | 1                |
| U2       | 2025-08-27 | 1           | \[60]                | 60                 | 60.0             | desktop      | USA           | 1                |

---

## **Expected Output 2: Event Counts (Separate Table)**

| user\_id | date       | login\_count | logout\_count | click\_count | view\_count | purchase\_count |
| -------- | ---------- | ------------ | ------------- | ------------ | ----------- | --------------- |
| U1       | 2025-08-27 | 1            | 1             | 1            | 0           | 0               |
| U2       | 2025-08-27 | 1            | 1             | 0            | 1           | 0               |

---

In [4]:
df = df.withColumn("event_time", F.col("event_time").cast("timestamp"))
df = df.withColumn("date", F.to_date("event_time"))

In [26]:
df.show()

+-------+-------------------+----------+-------+---------+-------+----------+----------+
|user_id|         event_time|event_type| device| location|browser|session_id|      date|
+-------+-------------------+----------+-------+---------+-------+----------+----------+
|    U21|2025-08-12 16:17:06|  purchase|desktop|Australia| Chrome|     S9440|2025-08-12|
|     U2|2025-08-12 04:31:21|  purchase| tablet|Australia|Firefox|     S9876|2025-08-12|
|    U18|2025-08-17 14:03:12|  purchase|desktop|    India|Firefox|     S7075|2025-08-17|
|    U45|2025-08-20 01:56:42|      view|desktop|Australia| Safari|     S7144|2025-08-20|
|    U21|2025-08-25 19:31:51|     login| mobile|   Canada| Safari|     S3814|2025-08-25|
|     U3|2025-08-13 12:16:10|     click|desktop|Australia|Firefox|     S6097|2025-08-13|
|     U5|2025-08-02 05:58:30|    logout| tablet|    India|  Opera|     S2232|2025-08-02|
|    U19|2025-08-21 04:21:33|  purchase| tablet|      USA| Safari|     S7053|2025-08-21|
|     U5|2025-08-21 1

In [5]:
event_counts = (
    df.groupBy("user_id", "date")
      .pivot("event_type", ["login","logout","click","view","purchase"])
      .count()
      .na.fill(0)
)

In [25]:
event_counts.show()

+-------+----------+-----+------+-----+----+--------+
|user_id|      date|login|logout|click|view|purchase|
+-------+----------+-----+------+-----+----+--------+
|    U43|2025-08-06|    2|     3|    0|   0|       0|
|    U31|2025-08-16|    1|     0|    0|   1|       1|
|    U17|2025-08-14|    0|     0|    0|   0|       1|
|     U8|2025-08-25|    0|     1|    0|   0|       1|
|     U9|2025-08-22|    1|     0|    0|   0|       0|
|    U39|2025-08-28|    1|     0|    0|   0|       0|
|    U29|2025-08-04|    0|     0|    0|   0|       2|
|    U38|2025-08-03|    0|     1|    0|   0|       0|
|    U49|2025-08-03|    1|     0|    0|   0|       0|
|    U19|2025-08-03|    0|     1|    1|   1|       0|
|    U19|2025-08-27|    1|     0|    0|   1|       1|
|     U6|2025-08-25|    1|     0|    0|   0|       0|
|    U25|2025-08-12|    2|     1|    1|   0|       0|
|    U39|2025-08-13|    0|     1|    0|   1|       0|
|    U35|2025-08-11|    0|     0|    0|   0|       1|
|    U16|2025-08-16|    0|  

In [6]:
login_logout = df.filter(F.col("event_type").isin("login","logout"))

In [27]:
login_logout.show()

+-------+-------------------+----------+-------+---------+-------+----------+----------+
|user_id|         event_time|event_type| device| location|browser|session_id|      date|
+-------+-------------------+----------+-------+---------+-------+----------+----------+
|    U21|2025-08-25 19:31:51|     login| mobile|   Canada| Safari|     S3814|2025-08-25|
|     U5|2025-08-02 05:58:30|    logout| tablet|    India|  Opera|     S2232|2025-08-02|
|     U5|2025-08-21 19:18:40|     login| tablet|Australia|  Opera|     S8274|2025-08-21|
|     U6|2025-08-23 04:20:13|     login| mobile|   Canada|  Opera|     S6102|2025-08-23|
|     U2|2025-08-16 00:27:15|    logout| mobile|Australia| Safari|     S9693|2025-08-16|
|    U33|2025-08-12 10:19:44|    logout| tablet|    India|   Edge|     S4664|2025-08-12|
|    U39|2025-08-10 01:58:28|     login| tablet|Australia|   Edge|     S5201|2025-08-10|
|    U47|2025-08-16 04:48:14|     login| tablet|       UK|Firefox|     S6267|2025-08-16|
|    U46|2025-08-17 1

In [7]:
datetime.now()

datetime.datetime(2025, 9, 1, 12, 56, 59, 298674)

In [8]:
session_times = (
    login_logout.groupBy("user_id","date","session_id")
    .agg(
        F.min(F.when(F.col("event_type")=="login", F.col("event_time"))).alias("login_time"),
        F.max(F.when(F.col("event_type")=="logout", F.col("event_time"))).alias("logout_time")
    )
    .withColumn(
        "end_time",
        F.when(
            F.col("logout_time").isNull(),
            F.expr("to_timestamp(concat(date, ' 23:59:59'))")
        ).otherwise(F.col("logout_time"))
    )
    .withColumn(
        "session_minutes",
        F.round((F.unix_timestamp("end_time") - F.unix_timestamp("login_time"))/60, 2)
    )
    .filter(F.col("login_time").isNotNull())
)


In [28]:
session_times.show()

+-------+----------+----------+-------------------+-----------+-------------------+---------------+
|user_id|      date|session_id|         login_time|logout_time|           end_time|session_minutes|
+-------+----------+----------+-------------------+-----------+-------------------+---------------+
|    U47|2025-08-08|     S7637|2025-08-08 23:03:50|       NULL|2025-08-08 23:59:59|          56.15|
|     U7|2025-08-22|     S5034|2025-08-22 06:01:27|       NULL|2025-08-22 23:59:59|        1078.53|
|     U3|2025-08-23|     S3106|2025-08-23 17:16:14|       NULL|2025-08-23 23:59:59|         403.75|
|    U50|2025-08-22|     S4743|2025-08-22 23:12:00|       NULL|2025-08-22 23:59:59|          47.98|
|     U8|2025-08-26|     S1498|2025-08-26 13:58:10|       NULL|2025-08-26 23:59:59|         601.82|
|    U45|2025-08-12|     S6767|2025-08-12 14:57:46|       NULL|2025-08-12 23:59:59|         542.22|
|    U50|2025-08-17|     S5022|2025-08-17 15:21:38|       NULL|2025-08-17 23:59:59|         518.35|


In [31]:
session_times.filter((F.col('user_id') == 'U8') & (F.col('date') == '2025-08-13')).show()

+-------+----------+----------+-------------------+-----------+-------------------+---------------+
|user_id|      date|session_id|         login_time|logout_time|           end_time|session_minutes|
+-------+----------+----------+-------------------+-----------+-------------------+---------------+
|     U8|2025-08-13|     S3271|2025-08-13 20:15:43|       NULL|2025-08-13 23:59:59|         224.27|
|     U8|2025-08-13|     S6573|2025-08-13 12:07:57|       NULL|2025-08-13 23:59:59|         712.03|
|     U8|2025-08-13|     S8042|2025-08-13 20:25:39|       NULL|2025-08-13 23:59:59|         214.33|
+-------+----------+----------+-------------------+-----------+-------------------+---------------+



In [11]:
session_metrics = (
    session_times.groupBy("user_id","date")
    .agg(
        F.collect_list("session_minutes").alias("session_times"),
        F.sum("session_minutes").alias("total_usage"),
        F.avg("session_minutes").alias("avg_usage"),
        F.count("login_time").alias("num_logins")
    )
)

In [29]:
session_metrics.show()

+-------+----------+----------------+-----------+---------+----------+
|user_id|      date|   session_times|total_usage|avg_usage|num_logins|
+-------+----------+----------------+-----------+---------+----------+
|     U1|2025-08-02|         [304.4]|      304.4|    304.4|         1|
|     U1|2025-08-08|        [492.97]|     492.97|   492.97|         1|
|     U1|2025-08-22|        [240.72]|     240.72|   240.72|         1|
|     U1|2025-08-25|        [235.48]|     235.48|   235.48|         1|
|     U1|2025-08-26|        [325.35]|     325.35|   325.35|         1|
|    U10|2025-08-07|        [817.87]|     817.87|   817.87|         1|
|    U10|2025-08-10|        [338.92]|     338.92|   338.92|         1|
|    U10|2025-08-11|       [1193.62]|    1193.62|  1193.62|         1|
|    U10|2025-08-19|        [846.25]|     846.25|   846.25|         1|
|    U11|2025-08-28|        [1282.0]|     1282.0|   1282.0|         1|
|    U12|2025-08-11|[821.87, 331.23]|     1153.1|   576.55|         2|
|    U

In [32]:
device_counts = (
    df.groupBy("user_id","date","device")
      .count()
)
device_counts.show()

+-------+----------+-------+-----+
|user_id|      date| device|count|
+-------+----------+-------+-----+
|    U15|2025-08-16|desktop|    1|
|    U38|2025-08-05| tablet|    1|
|     U1|2025-08-26| tablet|    1|
|     U3|2025-08-08| mobile|    1|
|     U5|2025-08-13| mobile|    1|
|    U12|2025-08-06| tablet|    1|
|     U8|2025-08-25| tablet|    1|
|    U47|2025-08-13|desktop|    1|
|    U31|2025-08-16| mobile|    2|
|    U34|2025-08-20| mobile|    1|
|    U37|2025-08-14| mobile|    1|
|    U11|2025-08-01| tablet|    1|
|    U19|2025-08-18|desktop|    1|
|    U41|2025-08-12| tablet|    1|
|    U40|2025-08-18| tablet|    1|
|     U8|2025-08-24| tablet|    1|
|    U32|2025-08-09|desktop|    1|
|    U40|2025-08-10|desktop|    1|
|    U13|2025-08-06|desktop|    1|
|    U28|2025-08-28|desktop|    1|
+-------+----------+-------+-----+
only showing top 20 rows



In [14]:
w = W.partitionBy("user_id","date").orderBy(F.col("count").desc())

In [34]:
most_device_df = (
    device_counts.withColumn("rn", F.row_number().over(w))
                 .filter(F.col("rn")==1)
                 .select("user_id","date", F.col("device").alias("most_device"), 'count')
)
most_device_df.show()

+-------+----------+-----------+-----+
|user_id|      date|most_device|count|
+-------+----------+-----------+-----+
|     U1|2025-08-01|     mobile|    1|
|     U1|2025-08-02|     mobile|    2|
|     U1|2025-08-04|     tablet|    1|
|     U1|2025-08-05|     tablet|    1|
|     U1|2025-08-07|     tablet|    1|
|     U1|2025-08-08|    desktop|    1|
|     U1|2025-08-10|     mobile|    1|
|     U1|2025-08-11|     mobile|    1|
|     U1|2025-08-12|    desktop|    1|
|     U1|2025-08-15|     mobile|    1|
|     U1|2025-08-22|    desktop|    1|
|     U1|2025-08-24|     mobile|    1|
|     U1|2025-08-25|    desktop|    1|
|     U1|2025-08-26|     tablet|    1|
|    U10|2025-08-02|     tablet|    1|
|    U10|2025-08-03|     mobile|    2|
|    U10|2025-08-04|     mobile|    1|
|    U10|2025-08-06|     mobile|    1|
|    U10|2025-08-07|    desktop|    3|
|    U10|2025-08-09|    desktop|    1|
+-------+----------+-----------+-----+
only showing top 20 rows



In [35]:
location_counts = (
    df.groupBy("user_id","date","location")
      .count()
)

location_counts.show()

+-------+----------+---------+-----+
|user_id|      date| location|count|
+-------+----------+---------+-----+
|    U19|2025-08-21|      USA|    2|
|    U15|2025-08-23|       UK|    1|
|    U16|2025-08-20|   Canada|    1|
|    U25|2025-08-26|       UK|    1|
|    U23|2025-08-09|Australia|    1|
|    U14|2025-08-18|Australia|    1|
|    U15|2025-08-04|      USA|    1|
|    U43|2025-08-06|       UK|    3|
|    U35|2025-08-03|      USA|    1|
|    U41|2025-08-05|    India|    1|
|    U37|2025-08-14|Australia|    1|
|    U34|2025-08-11|      USA|    1|
|    U43|2025-08-19|    India|    1|
|    U27|2025-08-03|   Canada|    1|
|     U8|2025-08-16|   Canada|    1|
|    U49|2025-08-08|      USA|    1|
|    U14|2025-08-23|Australia|    1|
|    U20|2025-08-21|    India|    1|
|    U30|2025-08-15|   Canada|    1|
|    U20|2025-08-24|      USA|    1|
+-------+----------+---------+-----+
only showing top 20 rows



In [17]:
w2 = W.partitionBy("user_id","date").orderBy(F.col("count").desc())

In [36]:
top_location_df = (
    location_counts.withColumn("rn", F.row_number().over(w2))
                   .filter(F.col("rn")==1)
                   .select("user_id","date", F.col("location").alias("top_location"), 'count')
)

top_location_df.show()

+-------+----------+------------+-----+
|user_id|      date|top_location|count|
+-------+----------+------------+-----+
|     U1|2025-08-01|          UK|    2|
|     U1|2025-08-02|   Australia|    2|
|     U1|2025-08-04|   Australia|    1|
|     U1|2025-08-05|         USA|    1|
|     U1|2025-08-07|         USA|    1|
|     U1|2025-08-08|   Australia|    1|
|     U1|2025-08-10|         USA|    1|
|     U1|2025-08-11|          UK|    1|
|     U1|2025-08-12|       India|    2|
|     U1|2025-08-15|         USA|    1|
|     U1|2025-08-22|   Australia|    1|
|     U1|2025-08-24|         USA|    1|
|     U1|2025-08-25|          UK|    1|
|     U1|2025-08-26|   Australia|    1|
|    U10|2025-08-02|       India|    1|
|    U10|2025-08-03|       India|    1|
|    U10|2025-08-04|       India|    1|
|    U10|2025-08-06|       India|    1|
|    U10|2025-08-07|          UK|    1|
|    U10|2025-08-09|          UK|    1|
+-------+----------+------------+-----+
only showing top 20 rows



In [37]:
unique_browsers_df = (
    df.groupBy("user_id","date")
      .agg(F.countDistinct("browser").alias("unique_browsers"))
)

unique_browsers_df.show()

+-------+----------+---------------+
|user_id|      date|unique_browsers|
+-------+----------+---------------+
|    U31|2025-08-16|              3|
|     U8|2025-08-25|              2|
|    U43|2025-08-06|              3|
|     U9|2025-08-22|              1|
|    U17|2025-08-14|              1|
|    U29|2025-08-04|              2|
|    U39|2025-08-28|              1|
|    U19|2025-08-03|              3|
|    U49|2025-08-03|              1|
|    U38|2025-08-03|              1|
|    U35|2025-08-11|              1|
|    U25|2025-08-12|              3|
|    U19|2025-08-27|              3|
|    U39|2025-08-13|              1|
|     U6|2025-08-25|              1|
|    U12|2025-08-24|              1|
|    U16|2025-08-16|              2|
|    U32|2025-08-21|              2|
|    U29|2025-08-12|              1|
|     U7|2025-08-06|              1|
+-------+----------+---------------+
only showing top 20 rows



In [20]:
other_metrics = (
    most_device_df.join(top_location_df, ["user_id","date"], "outer")
                  .join(unique_browsers_df, ["user_id","date"], "outer")
)

In [21]:
final_output = (
    session_metrics.join(other_metrics, ["user_id","date"], "outer")
                   .join(event_counts, ["user_id","date"], "outer")
                   .na.fill(0)
)

In [22]:
final_output.show(truncate=False)  

+-------+----------+-------------+-----------+---------+----------+-----------+------------+---------------+-----+------+-----+----+--------+
|user_id|date      |session_times|total_usage|avg_usage|num_logins|most_device|top_location|unique_browsers|login|logout|click|view|purchase|
+-------+----------+-------------+-----------+---------+----------+-----------+------------+---------------+-----+------+-----+----+--------+
|U1     |2025-08-01|NULL         |0.0        |0.0      |0         |mobile     |UK          |2              |0    |0     |1    |0   |1       |
|U1     |2025-08-02|[304.4]      |304.4      |304.4    |1         |mobile     |Australia   |2              |1    |2     |1    |0   |0       |
|U1     |2025-08-04|NULL         |0.0        |0.0      |0         |tablet     |Australia   |1              |0    |0     |0    |0   |1       |
|U1     |2025-08-05|NULL         |0.0        |0.0      |0         |tablet     |USA         |1              |0    |0     |0    |1   |0       |
|U1   

In [24]:
df.filter((F.col("user_id")=="U1") & (F.to_date("event_time")=="2025-08-08")).orderBy("event_time").show(truncate=False)

+-------+-------------------+----------+-------+---------+-------+----------+----------+
|user_id|event_time         |event_type|device |location |browser|session_id|date      |
+-------+-------------------+----------+-------+---------+-------+----------+----------+
|U1     |2025-08-08 15:47:01|login     |desktop|Australia|Safari |S9952     |2025-08-08|
+-------+-------------------+----------+-------+---------+-------+----------+----------+

