In [1]:
import findspark
findspark.init()  # called before `import pyspark`, as in the original cell order

import pyspark
from pyspark.sql import SparkSession

# Build the session through the builder so that re-running this cell in a live
# kernel reuses the existing session/context instead of raising on a second
# SparkContext. `sc` is kept because later cells call sc.stop().
se = SparkSession.builder.appName("jupyter").getOrCreate()
sc = se.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2023-10-09 02:30:29,444 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [2]:
! hadoop fs -copyFromLocal clickstream.csv

copyFromLocal: `clickstream.csv': File exists


In [3]:
# Read the tab-separated clickstream file (first line is the header) and expose
# it to Spark SQL under the name "clickstream".
# NOTE(review): no schema or inferSchema option is set, so columns (including
# `timestamp`) are presumably loaded as strings — confirm that string ordering
# of `timestamp` is chronological for this data set.

clickstream = (
    se.read
    .option("header", True)
    .option("delimiter", "\t")
    .csv("clickstream.csv")
)
# createOrReplaceTempView replaces the deprecated registerTempTable.
clickstream.createOrReplaceTempView("clickstream")

                                                                                

In [4]:
# Top-30 most frequent page routes per (user_id, session_id).
#
# Steps, one CTE each:
#   pages              - page-view events only
#   errors             - first error timestamp of every session that had one
#   pages_before_error - page views strictly before the session's first error,
#                        each annotated with the previous page in the session
#   routes             - consecutive duplicate pages dropped, remaining pages
#                        joined with "-" in timestamp order
#
# collect_list gives no ordering guarantee after the group-by shuffle, so the
# pages are collected together with their timestamp and sorted explicitly with
# sort_array before being joined. Ties on cnt are broken by route so the
# top-30 cut-off is deterministic.
# NOTE(review): timestamps are compared exactly as stored in the table; if they
# are strings, confirm that their lexicographic order is chronological.

result = se.sql('''
with pages as (
    select user_id, session_id, event_page, timestamp
    from clickstream
    where event_type = 'page'
),
errors as (
    select user_id, session_id, min(timestamp) as firsttimeerror
    from clickstream
    where event_type like '%error%'
    group by user_id, session_id
),
pages_before_error as (
    select
        p.user_id,
        p.session_id,
        p.event_page,
        p.timestamp,
        lag(p.event_page) over (
            partition by p.user_id, p.session_id
            order by p.timestamp
        ) as prev_page
    from pages p
    left join errors e
        on p.user_id = e.user_id and p.session_id = e.session_id
    where e.firsttimeerror is null or p.timestamp < e.firsttimeerror
),
routes as (
    select
        user_id,
        session_id,
        concat_ws(
            '-',
            sort_array(
                collect_list(named_struct('ts', timestamp, 'page', event_page))
            ).page
        ) as route
    from pages_before_error
    where prev_page is null or event_page != prev_page
    group by user_id, session_id
)
select route, count(route) as cnt
from routes
group by route
order by cnt desc, route
limit 30''').cache()  # cached: show() and toPandas() below would otherwise run the query twice

result.show(30)
result.toPandas().to_csv('sql_output.csv', header=False, index=False, sep='\t')

                                                                                

+--------------------+----+
|               route| cnt|
+--------------------+----+
|                main|8184|
|        main-archive|1113|
|         main-rabota|1047|
|       main-internet| 897|
|          main-bonus| 870|
|           main-news| 769|
|        main-tariffs| 677|
|         main-online| 587|
|          main-vklad| 518|
| main-rabota-archive| 170|
| main-archive-rabota| 167|
|  main-bonus-archive| 143|
|   main-rabota-bonus| 139|
|   main-bonus-rabota| 135|
|    main-news-rabota| 135|
|main-archive-inte...| 132|
|    main-rabota-news| 130|
|main-internet-rabota| 129|
|   main-archive-news| 126|
|main-rabota-internet| 124|
|main-internet-arc...| 123|
|  main-archive-bonus| 117|
| main-internet-bonus| 115|
|main-tariffs-inte...| 114|
|   main-news-archive| 113|
|  main-news-internet| 109|
|main-archive-tariffs| 104|
|  main-internet-news| 103|
|main-tariffs-archive| 103|
|    main-rabota-main|  94|
+--------------------+----+



                                                                                

In [5]:
# Stop the SparkContext created in the first cell now that the results are written.
sc.stop()