In [1]:
# findspark.init() has to run before `import pyspark` so that the import
# below can be resolved.
import findspark
findspark.init()

import pyspark

# getOrCreate() returns the context already running in this kernel instead of
# constructing a second one, so re-executing this cell no longer fails with
# "Cannot run multiple SparkContexts at once". On a fresh kernel it creates a
# new context with the same application name as before.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName('jupyter'))

from itertools import islice
from typing import List, Tuple

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2023-10-10 19:52:11,258 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [2]:
! hadoop fs -copyFromLocal clickstream.csv

copyFromLocal: `clickstream.csv': File exists


In [3]:
# Build the initial pair RDD from the tab-separated clickstream file.
# The first line of partition 0 is skipped (presumably the CSV header row).
# Each remaining row becomes ("<col0>-<col1>", (col2, col3, col4)); judging by
# the sample output these are (event type, page, timestamp) keyed by session.

csv_rdd = sc.textFile('clickstream.csv').mapPartitionsWithIndex(
    lambda part_no, rows: rows if part_no != 0 else islice(rows, 1, None)
)
table_rdd = (
    csv_rdd
    .map(lambda line: line.split('\t'))
    .map(lambda cols: ('-'.join((cols[0], cols[1])), (cols[2], cols[3], cols[4])))
)
table_rdd.take(5)

                                                                                

[('562-507', ('page', 'main', '1695584127')),
 ('562-507', ('event', 'main', '1695584134')),
 ('562-507', ('event', 'main', '1695584144')),
 ('562-507', ('event', 'main', '1695584147')),
 ('562-507', ('wNaxLlerrorU', 'main', '1695584154'))]

In [4]:
# Timestamp of the first error in each session.
# A row counts as an error when its event type contains the substring 'error'.
# Timestamps are still strings at this point, so the earliest one is chosen
# with key=int: a plain string comparison orders lexicographically and would
# rank '999' after '1000'. The stored value stays the original string.

error_rdd = (
    table_rdd
    .filter(lambda row: 'error' in row[1][0])
    .mapValues(lambda event: event[2])
    .reduceByKey(lambda a, b: min(a, b, key=int))
)
error_rdd.take(5)

                                                                                

[('3122-116', '1695592355'),
 ('455-220', '1695614507'),
 ('3916-644', '1695614983'),
 ('4908-135', '1695628535'),
 ('2976-922', '1695631611')]

In [5]:
# Attach the session's first-error timestamp to every page view.
# leftOuterJoin keeps the page views of sessions that have no entry in
# error_rdd; for those rows the joined error timestamp is None.

left_joined_rdd = (
    table_rdd
    .filter(lambda row: 'page' == row[1][0])
    .leftOuterJoin(error_rdd)
)
left_joined_rdd.take(10)

                                                                                

[('1409-29', (('page', 'main', '1697373595'), '1697378675')),
 ('1409-29', (('page', 'vklad', '1697374001'), '1697378675')),
 ('1409-29', (('page', 'news', '1697375826'), '1697378675')),
 ('1409-29', (('page', 'rabota', '1697376818'), '1697378675')),
 ('1409-29', (('page', 'main', '1698205762'), '1697378675')),
 ('1409-29', (('page', 'rabota', '1698207334'), '1697378675')),
 ('1409-29', (('page', 'internet', '1698207499'), '1697378675')),
 ('1409-29', (('page', 'bonus', '1698208287'), '1697378675')),
 ('1409-29', (('page', 'rabota', '1698208483'), '1697378675')),
 ('1409-29', (('page', 'bonus', '1698209278'), '1697378675'))]

In [6]:
# Keep only the page views that happened strictly before the session's first
# error (sessions without an error keep every page view), then reduce each
# record to (session, (page, timestamp)).
# Both timestamps are strings, so they are converted with int() before being
# compared; comparing the raw strings would be lexicographic, not numeric.

faultless_rdd = (
    left_joined_rdd
    .filter(lambda row: row[1][1] is None or int(row[1][0][2]) < int(row[1][1]))
    .mapValues(lambda joined: (joined[0][1], joined[0][2]))
)
faultless_rdd.take(5)

[('833-954', ('main', '1695584670')),
 ('833-954', ('bonus', '1695584688')),
 ('833-954', ('main', '1695584695')),
 ('150-322', ('main', '1695584850')),
 ('4746-892', ('main', '1695591194'))]

In [7]:
# Collect every session's page views and order them by visit time.
# The timestamp is a string, so the sort key converts it to int to get a
# chronological order rather than a lexicographic one.

sorted_rdd = faultless_rdd.groupByKey().mapValues(
    lambda visits: sorted(visits, key=lambda visit: int(visit[1]))
)
sorted_rdd.take(5)

                                                                                

[('1409-29',
  [('main', '1697373595'),
   ('vklad', '1697374001'),
   ('news', '1697375826'),
   ('rabota', '1697376818')]),
 ('1998-988', [('main', '1698201714')]),
 ('2671-557',
  [('main', '1698234623'),
   ('internet', '1698234757'),
   ('news', '1698235491'),
   ('vklad', '1698235604'),
   ('archive', '1698236236')]),
 ('716-290', [('main', '1698176129'), ('main', '1698242389')]),
 ('3374-716',
  [('main', '1698232677'),
   ('tariffs', '1698233674'),
   ('vklad', '1698234593'),
   ('news', '1698235149'),
   ('rabota', '1698242103'),
   ('online', '1698242203'),
   ('rabota', '1698245982')])]

In [8]:
def make_route(pages: List[Tuple[str, str]]) -> str:
    '''Build a dash-separated route from an ordered list of page visits.

    Only the first element of every item (the page name) is used. A page
    that is identical to the one visited immediately before it is dropped,
    so runs of the same page collapse into a single step; a page may still
    reappear later in the route. An empty input yields an empty string.

    Example:

    Transforms: [('main', '1695614475'), ('archive', '1695614499')]
    Returns: 'main-archive'
    '''
    names = [visit[0] for visit in pages]
    # Keep a name when it opens the route or differs from its predecessor.
    steps = [
        name
        for position, name in enumerate(names)
        if position == 0 or name != names[position - 1]
    ]
    return '-'.join(steps)

In [9]:
# Turn each session's ordered page list into a route string with consecutive
# repetitions removed. The session key is dropped: only the routes remain.

route_rdd = sorted_rdd.values().map(make_route)
route_rdd.take(5)

['main-vklad-news-rabota',
 'main',
 'main-internet-news-vklad-archive',
 'main',
 'main-tariffs-vklad-news-rabota-online-rabota']

In [10]:
# Count how many sessions followed each route and keep the 30 most frequent.
# takeOrdered picks the top rows without sorting the whole RDD, and the
# secondary key on the route string makes the order of routes with equal
# counts reproducible between runs.
# NOTE: despite its name, sum_rdd is a plain Python list of (route, count).

sum_rdd = (
    route_rdd
    .map(lambda route: (route, 1))
    .reduceByKey(lambda a, b: a + b)
    .takeOrdered(30, key=lambda pair: (-pair[1], pair[0]))
)
sum_rdd

                                                                                

[('main', 8184),
 ('main-archive', 1113),
 ('main-rabota', 1047),
 ('main-internet', 897),
 ('main-bonus', 870),
 ('main-news', 769),
 ('main-tariffs', 677),
 ('main-online', 587),
 ('main-vklad', 518),
 ('main-rabota-archive', 170),
 ('main-archive-rabota', 167),
 ('main-bonus-archive', 143),
 ('main-rabota-bonus', 139),
 ('main-bonus-rabota', 135),
 ('main-news-rabota', 135),
 ('main-archive-internet', 132),
 ('main-rabota-news', 130),
 ('main-internet-rabota', 129),
 ('main-archive-news', 126),
 ('main-rabota-internet', 124),
 ('main-internet-archive', 123),
 ('main-archive-bonus', 117),
 ('main-internet-bonus', 115),
 ('main-tariffs-internet', 114),
 ('main-news-archive', 113),
 ('main-news-internet', 109),
 ('main-archive-tariffs', 104),
 ('main-internet-news', 103),
 ('main-tariffs-archive', 103),
 ('main-rabota-main', 94)]

In [11]:
# Write the collected (route, count) pairs to a local tab-separated file,
# one "route<TAB>count" row per line.

with open('rrd_output.csv', 'w', encoding='utf-8') as f:
    # Unpack into dedicated names: a loop variable called `tuple` would stay
    # in the notebook namespace and shadow the built-in for every later cell.
    for route, count in sum_rdd:
        print(f"{route}\t{count}", file=f)

In [12]:
# Stop the SparkContext created in the first cell.
sc.stop()