In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# 1. Создаем сессию Spark
spark = SparkSession.builder \
    .appName("UserSessions") \
    .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.5.0") \
    .getOrCreate()

In [3]:
# Загрузка данных из файла Avro
df = spark.read.format("avro").load("activity_log.avro")

In [4]:
# Выведите схему и первые 10 строк
df.printSchema()
df.show(10, truncate=False)

root
 |-- event_id: string (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- session_id: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- page_url: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- amount: double (nullable = true)

+--------+-------+----------+------------+-----------------+--------------------+----------+------+
|event_id|user_id|session_id|event_type  |timestamp        |page_url            |product_id|amount|
+--------+-------+----------+------------+-----------------+--------------------+----------+------+
|E001    |101    |S101-A    |login       |01-Jan-2025 10:00|/login              |NULL      |NULL  |
|E002    |101    |S101-A    |view_page   |01-Jan-2025 10:05|/products/category_A|NULL      |NULL  |
|E003    |101    |S101-A    |view_product|01-Jan-2025 10:08|/product/P005       |P005      |NULL  |
|E004    |101    |S101-A    |add_to_cart |01-Jan-2025 10:10|/cart/add

In [5]:
# Преобразование колонки timestamp в TimestampType
df = df.withColumn("event_time", F.to_timestamp("timestamp", "dd-MMM-yyyy HH:mm"))
df.show(5)

+--------+-------+----------+------------+-----------------+--------------------+----------+------+-------------------+
|event_id|user_id|session_id|  event_type|        timestamp|            page_url|product_id|amount|         event_time|
+--------+-------+----------+------------+-----------------+--------------------+----------+------+-------------------+
|    E001|    101|    S101-A|       login|01-Jan-2025 10:00|              /login|      NULL|  NULL|2025-01-01 10:00:00|
|    E002|    101|    S101-A|   view_page|01-Jan-2025 10:05|/products/category_A|      NULL|  NULL|2025-01-01 10:05:00|
|    E003|    101|    S101-A|view_product|01-Jan-2025 10:08|       /product/P005|      P005|  NULL|2025-01-01 10:08:00|
|    E004|    101|    S101-A| add_to_cart|01-Jan-2025 10:10|           /cart/add|      P005|  NULL|2025-01-01 10:10:00|
|    E005|    101|    S101-A|    purchase|01-Jan-2025 10:15|           /checkout|      P005| 120.5|2025-01-01 10:15:00|
+--------+-------+----------+-----------

In [6]:
# Посчитайте количество уникальных событий для каждого дня
(df.groupBy(F.col("event_time").cast("date").alias("date"))
        .agg(F.count("event_id").alias("total_events"))
        .orderBy("date").show()
)

+----------+------------+
|      date|total_events|
+----------+------------+
|2025-01-01|           6|
|2025-01-02|           3|
|2025-01-03|           3|
|2025-01-04|           2|
|2025-01-05|           6|
|2025-01-06|           5|
|2025-01-07|           5|
|2025-01-08|           4|
|2025-01-09|           6|
+----------+------------+



In [7]:
#  Найдите количество уникальных пользователей
print("Количество уникальных пользователей")
df.select("user_id").distinct().orderBy("user_id").show()

Количество уникальных пользователей
+-------+
|user_id|
+-------+
|    101|
|    102|
|    103|
|    104|
|    105|
|    106|
|    107|
+-------+



In [8]:
# Найдите общее количество уникальных сессий
print("Количество уникальных сессий")
print(df.select("session_id").distinct().count())

Количество уникальных сессий
9


In [9]:
# Количество уникальных сессий на пользователя
(df.groupBy("user_id").agg(
    F.countDistinct("session_id").alias("unique_sessions_count")
    ).orderBy("user_id").show())

+-------+---------------------+
|user_id|unique_sessions_count|
+-------+---------------------+
|    101|                    2|
|    102|                    2|
|    103|                    1|
|    104|                    1|
|    105|                    1|
|    106|                    1|
|    107|                    1|
+-------+---------------------+



In [10]:
# Количество событий в каждой сессии
(df.groupBy("user_id", "session_id")
    .agg(F.count("*").alias("count_sessions"))
    .orderBy("count_sessions")
    .show()
)

+-------+----------+--------------+
|user_id|session_id|count_sessions|
+-------+----------+--------------+
|    104|    S104-D|             2|
|    102|    S102-B|             3|
|    103|    S103-C|             3|
|    106|    S106-F|             4|
|    102|    S102-C|             5|
|    101|    S101-B|             5|
|    101|    S101-A|             6|
|    105|    S105-E|             6|
|    107|    S107-G|             6|
+-------+----------+--------------+



In [11]:
# Посчитайте общее количество покупок
(df.filter(F.col("event_type") == "purchase")
    .agg(F.count(F.col("amount")).alias("count_purchase_amount"))
    .show()
)

+---------------------+
|count_purchase_amount|
+---------------------+
|                    4|
+---------------------+



In [12]:
# Найдите общую сумму всех покупок
(df.filter(F.col("event_type") == "purchase")
    .agg(F.sum(F.col("amount")).alias("sum_purchase_amount"))
    .show()
)

+-------------------+
|sum_purchase_amount|
+-------------------+
|              905.5|
+-------------------+



In [13]:
# Посчитайте среднюю сумму покупки
(df.filter(F.col("event_type") == "purchase")
    .agg(F.round(F.avg(F.col("amount")), 2).alias("average_purchase_amount"))
    .show()
)

+-----------------------+
|average_purchase_amount|
+-----------------------+
|                 226.38|
+-----------------------+



In [14]:
# Расчет продолжительности каждой сессии
df_session_durations = df.groupBy("user_id", "session_id").agg(
    F.min("event_time").alias("min_time"),
    F.max("event_time").alias("max_time")
).withColumn(
    "session_duration_seconds",
    (F.unix_timestamp(F.col("max_time")) - F.unix_timestamp(F.col("min_time")))
)
df_session_durations.orderBy("user_id", "session_id").show(truncate=False)

+-------+----------+-------------------+-------------------+------------------------+
|user_id|session_id|min_time           |max_time           |session_duration_seconds|
+-------+----------+-------------------+-------------------+------------------------+
|101    |S101-A    |2025-01-01 10:00:00|2025-01-01 10:20:00|1200                    |
|101    |S101-B    |2025-01-06 09:30:00|2025-01-06 09:40:00|600                     |
|102    |S102-B    |2025-01-02 11:30:00|2025-01-02 11:38:00|480                     |
|102    |S102-C    |2025-01-07 15:00:00|2025-01-07 15:18:00|1080                    |
|103    |S103-C    |2025-01-03 09:00:00|2025-01-03 09:10:00|600                     |
|104    |S104-D    |2025-01-04 14:00:00|2025-01-04 14:03:00|180                     |
|105    |S105-E    |2025-01-05 16:45:00|2025-01-05 17:02:00|1020                    |
|106    |S106-F    |2025-01-08 10:00:00|2025-01-08 10:10:00|600                     |
|107    |S107-G    |2025-01-09 11:00:00|2025-01-09 11:

In [15]:
# Средняя продолжительность сессии
(df_session_durations
    .agg(F.round(F.avg(F.col("session_duration_seconds")),2).alias("average_session_duration_seconds"))
    .show()
)

+--------------------------------+
|average_session_duration_seconds|
+--------------------------------+
|                           740.0|
+--------------------------------+

