In [16]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import col, coalesce, lit
from datetime import date

In [2]:
# 1. Создаем сессию Spark
spark = SparkSession.builder \
    .appName("UsersActivity") \
    .getOrCreate()

In [4]:
data = [
    (101, date(2025, 1, 1), {"mobile": 3, "desktop": 1}, ["/home", "/products", "/cart"], 4.5),
    (102, date(2025, 1, 1), {"desktop": 2}, ["/home", "/about"], 3.0),
    (101, date(2025, 1, 2), {"mobile": 2}, ["/products", "/checkout"], None),
    (103, date(2025, 1, 2), {"tablet": 1, "mobile": 1}, ["/blog", "/contact"], 5.0), 
    (104, date(2025, 1, 3), {"desktop": 4}, ["/dashboard"], 3.5),
    (101, date(2025, 1, 3), {"mobile": 1, "desktop": 1}, ["/home", "/products"], 4.0),
    (105, date(2025, 1, 4), {"mobile": 5}, ["/faq"], None),
    (102, date(2025, 1, 4), {"desktop": 1, "mobile": 1}, ["/settings"], 3.8),
    (103, date(2025, 1, 5), {"tablet": 2}, ["/products"], 4.2), 
    (106, date(2025, 1, 5), {"desktop": 3, "mobile": 2}, ["/login", "/profile", "/home"], 4.7),
    (101, date(2025, 1, 6), {"mobile": 1}, ["/cart", "/checkout"], 4.0),
    (104, date(2025, 1, 6), {"desktop": 2, "tablet": 1}, ["/contact"], None),
    (105, date(2025, 1, 7), {"mobile": 3, "desktop": 1}, ["/pricing"], 4.1),
    (106, date(2025, 1, 7), {"desktop": 1}, ["/home", "/about"], 3.9),
    (107, date(2025, 1, 8), {"mobile": 4, "tablet": 2}, ["/products", "/blog"], 4.9) 
]

In [5]:
# 2. Создаем DataFrame
df = spark.createDataFrame(data, ["user_id", "activity_date", "sessions_by_device", "visited_pages", "usability_rating"])

In [6]:
# 3. Выводим схему DataFrame
df.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- activity_date: date (nullable = true)
 |-- sessions_by_device: map (nullable = true)
 |    |-- key: string
 |    |-- value: long (valueContainsNull = true)
 |-- visited_pages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- usability_rating: double (nullable = true)



In [7]:
# 4. Выводим первые 5 строк DataFrame
df.show(5, truncate=False)

+-------+-------------+---------------------------+-------------------------+----------------+
|user_id|activity_date|sessions_by_device         |visited_pages            |usability_rating|
+-------+-------------+---------------------------+-------------------------+----------------+
|101    |2025-01-01   |{mobile -> 3, desktop -> 1}|[/home, /products, /cart]|4.5             |
|102    |2025-01-01   |{desktop -> 2}             |[/home, /about]          |3.0             |
|101    |2025-01-02   |{mobile -> 2}              |[/products, /checkout]   |NULL            |
|103    |2025-01-02   |{mobile -> 1, tablet -> 1} |[/blog, /contact]        |5.0             |
|104    |2025-01-03   |{desktop -> 4}             |[/dashboard]             |3.5             |
+-------+-------------+---------------------------+-------------------------+----------------+
only showing top 5 rows


In [11]:
# 5. Рассчитайте total_sessions_count для каждой записи (строки): 
# Создайте новую колонку total_sessions_count. 
# Значение этой колонки должно быть суммой количества сессий по всем устройствам (mobile, desktop, tablet) для каждой записи. 

df_with_total_sessions = df.withColumn(
    "total_sessions_count",
    coalesce(col("sessions_by_device.mobile"), lit(0)) +
    coalesce(col("sessions_by_device.desktop"), lit(0)) +
    coalesce(col("sessions_by_device.tablet"), lit(0))
)
print("Total sessions count added:")
df_with_total_sessions.show(5, truncate=False)

Total sessions count added:
+-------+-------------+---------------------------+-------------------------+----------------+--------------------+
|user_id|activity_date|sessions_by_device         |visited_pages            |usability_rating|total_sessions_count|
+-------+-------------+---------------------------+-------------------------+----------------+--------------------+
|101    |2025-01-01   |{mobile -> 3, desktop -> 1}|[/home, /products, /cart]|4.5             |4                   |
|102    |2025-01-01   |{desktop -> 2}             |[/home, /about]          |3.0             |2                   |
|101    |2025-01-02   |{mobile -> 2}              |[/products, /checkout]   |NULL            |2                   |
|103    |2025-01-02   |{mobile -> 1, tablet -> 1} |[/blog, /contact]        |5.0             |2                   |
|104    |2025-01-03   |{desktop -> 4}             |[/dashboard]             |3.5             |4                   |
+-------+-------------+---------------------

In [12]:
# 6. Извлеките количество мобильных сессий: 
# Создайте новую колонку mobile_sessions, которая будет содержать количество сессий, проведенных с "mobile" устройства из sessions_by_device. 
# Если данных о "mobile" устройстве нет в данных для данной записи, значение должно быть 0.
df_with_mobile_sessions = df_with_total_sessions.withColumn(
    "mobile_sessions",
    coalesce(col("sessions_by_device.mobile"), lit(0))
)
print("Mobile sessions added:")
df_with_mobile_sessions.show(5, truncate=False)

Mobile sessions added:
+-------+-------------+---------------------------+-------------------------+----------------+--------------------+---------------+
|user_id|activity_date|sessions_by_device         |visited_pages            |usability_rating|total_sessions_count|mobile_sessions|
+-------+-------------+---------------------------+-------------------------+----------------+--------------------+---------------+
|101    |2025-01-01   |{mobile -> 3, desktop -> 1}|[/home, /products, /cart]|4.5             |4                   |3              |
|102    |2025-01-01   |{desktop -> 2}             |[/home, /about]          |3.0             |2                   |0              |
|101    |2025-01-02   |{mobile -> 2}              |[/products, /checkout]   |NULL            |2                   |2              |
|103    |2025-01-02   |{mobile -> 1, tablet -> 1} |[/blog, /contact]        |5.0             |2                   |1              |
|104    |2025-01-03   |{desktop -> 4}             |[/

In [13]:
# 7. Для каждого пользователя рассчитайте total_sessions_all_time: Это общее количество всех сессий пользователя за весь период наблюдений. Отсортируйте результат по total_sessions_all_time в порядке убывания.
total_sessions_all_time = df_with_mobile_sessions.groupBy("user_id") \
    .agg({"total_sessions_count": "sum"}) \
    .withColumnRenamed("sum(total_sessions_count)", "total_sessions_all_time") \
    .orderBy(col("total_sessions_all_time").desc())
print("Total sessions all time for each user:")
total_sessions_all_time.show(truncate=False)

Total sessions all time for each user:
+-------+-----------------------+
|user_id|total_sessions_all_time|
+-------+-----------------------+
|101    |9                      |
|105    |9                      |
|104    |7                      |
|106    |6                      |
|107    |6                      |
|102    |4                      |
|103    |4                      |
+-------+-----------------------+



In [17]:
# 8. Для каждого пользователя получите unique_visited_pages_all_time: Это список всех уникальных посещенных страниц пользователя за весь период.
unique_visited_pages_all_time = df_with_mobile_sessions \
    .withColumn("page", F.explode_outer("visited_pages")) \
    .groupBy("user_id") \
    .agg(F.collect_set("page").alias("unique_visited_pages_all_time"))

print("Unique visited pages all time for each user:")
unique_visited_pages_all_time.show(truncate=False)

Unique visited pages all time for each user:
+-------+------------------------------------+
|user_id|unique_visited_pages_all_time       |
+-------+------------------------------------+
|101    |[/products, /checkout, /home, /cart]|
|102    |[/about, /settings, /home]          |
|103    |[/products, /contact, /blog]        |
|104    |[/dashboard, /contact]              |
|105    |[/pricing, /faq]                    |
|106    |[/profile, /about, /login, /home]   |
|107    |[/products, /blog]                  |
+-------+------------------------------------+



In [18]:
# 9. Отфильтруйте DataFrame df_user_activity, чтобы показать только те записи, где usability_rating выше 3.5. Отсортируйте результат по usability_rating в порядке убывания.
filtered_user_activity = df_with_mobile_sessions \
    .filter(col("usability_rating") > 3.5) \
    .orderBy(col("usability_rating").desc())
print("Filtered user activity with usability_rating > 3.5:")
filtered_user_activity.show(truncate=False)

Filtered user activity with usability_rating > 3.5:
+-------+-------------+---------------------------+-------------------------+----------------+--------------------+---------------+
|user_id|activity_date|sessions_by_device         |visited_pages            |usability_rating|total_sessions_count|mobile_sessions|
+-------+-------------+---------------------------+-------------------------+----------------+--------------------+---------------+
|103    |2025-01-02   |{mobile -> 1, tablet -> 1} |[/blog, /contact]        |5.0             |2                   |1              |
|107    |2025-01-08   |{mobile -> 4, tablet -> 2} |[/products, /blog]       |4.9             |6                   |4              |
|106    |2025-01-05   |{mobile -> 2, desktop -> 3}|[/login, /profile, /home]|4.7             |5                   |2              |
|101    |2025-01-01   |{mobile -> 3, desktop -> 1}|[/home, /products, /cart]|4.5             |4                   |3              |
|103    |2025-01-05   |{