In [56]:
from pyspark import SparkContext

# Initialize SparkContext
sc = SparkContext("local", "LoadCSVtoRDD")

# Path to the CSV file
file_path = "ws-logs_names_email_ip.csv"

# Load the CSV file into an RDD
rdd = sc.textFile(file_path)

# Split lines by delimiter (e.g., comma)
header = rdd.first()  # Get the header

data_rdd = rdd.filter(lambda row: row != header).map(lambda row: row.split(','))

header_s = header.split(',')
dict_rdd = data_rdd.map(lambda row: dict(zip(header_s, row)))

print(dict_rdd.first())

                                                                                

{'Time': '17/11/24; 00:00:55', 'UserId': 'user152', 'Event context': 'Course: Winter School 2024 on Data Systems', 'Component': 'System', 'Event name': 'User profile viewed', 'Description': "The user with id '2141' viewed the profile for the user with id '2141' in the course with id '3'.", 'Origin': 'web', 'IP address': '137.135.10.168'}


In [57]:
from datetime import datetime

# Function to convert time to epoch
def convert_to_epoch(record):
    # Parse the datetime from the 'Time' field
    time_format = "%d/%m/%y; %H:%M:%S"
    epoch_time = int(datetime.strptime(record['Time'], time_format).timestamp())
    record['Time'] = epoch_time
    return record

# Convert 'Time' to epoch time
epoch_rdd = dict_rdd.map(convert_to_epoch)

# Example: print a record with time in epoch format
print(epoch_rdd.first())


{'Time': 1731781855, 'UserId': 'user152', 'Event context': 'Course: Winter School 2024 on Data Systems', 'Component': 'System', 'Event name': 'User profile viewed', 'Description': "The user with id '2141' viewed the profile for the user with id '2141' in the course with id '3'.", 'Origin': 'web', 'IP address': '137.135.10.168'}


In [58]:
# Find total number of unique users
unique_users = dict_rdd.map(lambda record: record['UserId']).distinct().count()

print(f"Total number of unique users: {unique_users}")


Total number of unique users: 174


In [59]:
# Group the data by UserId
grouped_data = epoch_rdd.map(lambda record: (record['UserId'], record)).groupByKey()

# Convert grouped data to a more readable format (list of records per user)
grouped_data_readable = grouped_data.mapValues(list)

# Example: print grouped data for the first user
print(grouped_data_readable.first())


[Stage 5:>                                                          (0 + 1) / 1]

('user152', [{'Time': 1731781855, 'UserId': 'user152', 'Event context': 'Course: Winter School 2024 on Data Systems', 'Component': 'System', 'Event name': 'User profile viewed', 'Description': "The user with id '2141' viewed the profile for the user with id '2141' in the course with id '3'.", 'Origin': 'web', 'IP address': '137.135.10.168'}, {'Time': 1731781867, 'UserId': 'user152', 'Event context': 'User: Christopher Lee', 'Component': 'System', 'Event name': 'User profile viewed', 'Description': "The user with id '2141' viewed the profile for the user with id '2141'.", 'Origin': 'web', 'IP address': '137.135.10.168'}, {'Time': 1731781878, 'UserId': 'user152', 'Event context': 'System', 'Component': 'System', 'Event name': 'Blog entries viewed', 'Description': "The user with id '2141' viewed blog entries.", 'Origin': 'web', 'IP address': '137.135.10.168'}, {'Time': 1731781884, 'UserId': 'user152', 'Event context': 'User: Christopher Lee', 'Component': 'Forum', 'Event name': 'User repo

                                                                                

In [62]:
sorted_users=grouped_data.mapValues(lambda records: sorted(records, key=lambda x: x['Time']))

print(sorted_users.mapValues(list).first())

('user152', [{'Time': 1731781855, 'UserId': 'user152', 'Event context': 'Course: Winter School 2024 on Data Systems', 'Component': 'System', 'Event name': 'User profile viewed', 'Description': "The user with id '2141' viewed the profile for the user with id '2141' in the course with id '3'.", 'Origin': 'web', 'IP address': '137.135.10.168'}, {'Time': 1731781867, 'UserId': 'user152', 'Event context': 'User: Christopher Lee', 'Component': 'System', 'Event name': 'User profile viewed', 'Description': "The user with id '2141' viewed the profile for the user with id '2141'.", 'Origin': 'web', 'IP address': '137.135.10.168'}, {'Time': 1731781878, 'UserId': 'user152', 'Event context': 'System', 'Component': 'System', 'Event name': 'Blog entries viewed', 'Description': "The user with id '2141' viewed blog entries.", 'Origin': 'web', 'IP address': '137.135.10.168'}, {'Time': 1731781884, 'UserId': 'user152', 'Event context': 'User: Christopher Lee', 'Component': 'Forum', 'Event name': 'User repo

In [64]:
active_time = sorted_users.mapValues(
    lambda records: int(records[-1]['Time']) - int(records[0]['Time']) if len(records) > 1 else 0
)

# Example: print the total active time for each user
print(active_time.collect())

[('user152', 244275), ('user083', 171277), ('user166', 39789), ('user104', 248155), ('user161', 38566), ('user108', 41650), ('user057', 120292), ('user025', 214238), ('user142', 242411), ('user079', 212412), ('user151', 48), ('user096', 242027), ('user035', 36560), ('user066', 228112), ('user081', 33143), ('user173', 32457), ('user139', 32765), ('user126', 223170), ('user153', 213800), ('user110', 193854), ('user068', 17934), ('user089', 17413), ('user008', 11021), ('user072', 103367), ('user040', 17251), ('user117', 211164), ('user011', 16598), ('user137', 201528), ('user067', 214464), ('user171', 9786), ('user141', 8180), ('user125', 201325), ('user013', 215423), ('user164', 204111), ('user065', 184041), ('user123', 214141), ('user071', 177823), ('user009', 6209), ('user146', 169901), ('user080', 6808), ('user124', 214709), ('user047', 92718), ('user156', 214189), ('user077', 6341), ('user070', 3543), ('user010', 4759), ('user100', 179098), ('user005', 2954), ('user022', 127666), ('u

In [68]:
# Calculate StayTime for each entry
def calculate_stay_time(records):
    for i in range(len(records)):
        if i < len(records) - 1:
            records[i]['StayTime'] = records[i + 1]['Time'] - records[i]['Time']
        else:
            records[i]['StayTime'] = 0  # Small value for the last entry
    return records

stay_time_rdd = sorted_users.mapValues(calculate_stay_time)

# Example: print StayTime for the first user's entries
print(stay_time_rdd.mapValues(list).first())


('user152', [{'Time': 1731781855, 'UserId': 'user152', 'Event context': 'Course: Winter School 2024 on Data Systems', 'Component': 'System', 'Event name': 'User profile viewed', 'Description': "The user with id '2141' viewed the profile for the user with id '2141' in the course with id '3'.", 'Origin': 'web', 'IP address': '137.135.10.168', 'StayTime': 12}, {'Time': 1731781867, 'UserId': 'user152', 'Event context': 'User: Christopher Lee', 'Component': 'System', 'Event name': 'User profile viewed', 'Description': "The user with id '2141' viewed the profile for the user with id '2141'.", 'Origin': 'web', 'IP address': '137.135.10.168', 'StayTime': 11}, {'Time': 1731781878, 'UserId': 'user152', 'Event context': 'System', 'Component': 'System', 'Event name': 'Blog entries viewed', 'Description': "The user with id '2141' viewed blog entries.", 'Origin': 'web', 'IP address': '137.135.10.168', 'StayTime': 6}, {'Time': 1731781884, 'UserId': 'user152', 'Event context': 'User: Christopher Lee',

In [None]:
# Filter records to keep only those with StayTime >= 1 second
filtered_rdd = stay_time_rdd.mapValues(
    lambda records: [record for record in records if record['StayTime'] >= 1000]
)

# Example: print the filtered records for the first user
print(filtered_rdd.first())

('user152', [{'Time': 1731782201, 'UserId': 'user152', 'Event context': 'Virtual programming lab: Sample Code', 'Component': 'Virtual programming lab', 'Event name': 'mod_vpl: vpl description viewed', 'Description': 'The user with id 2141 viewed description of VPL activity with id 5', 'Origin': 'web', 'IP address': '137.135.10.168', 'StayTime': 29622}, {'Time': 1731812004, 'UserId': 'user152', 'Event context': 'Virtual programming lab: Sample Code', 'Component': 'Virtual programming lab', 'Event name': 'mod_vpl: vpl description viewed', 'Description': 'The user with id 2141 viewed description of VPL activity with id 5', 'Origin': 'web', 'IP address': '137.135.10.168', 'StayTime': 1155}, {'Time': 1731813182, 'UserId': 'user152', 'Event context': 'Course: Winter School 2024 on Data Systems', 'Component': 'System', 'Event name': 'User list viewed', 'Description': "The user with id '2141' viewed the list of users in the course with id '3'.", 'Origin': 'web', 'IP address': '137.135.10.168',

24/12/02 01:13:32 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 965278 ms exceeds timeout 120000 ms
24/12/02 01:13:32 WARN SparkContext: Killing executors is not supported by current scheduler.
24/12/02 01:30:13 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:80)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:642)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1223)
	at o

In [69]:
# Prepare RDD with (UserId, (StayTime, record)) for reduceByKey
user_staytime_rdd = stay_time_rdd.flatMapValues(lambda records: [(record['StayTime'], record) for record in records])

# Reduce to find the max and min StayTime for each user
max_staytime_rdd = user_staytime_rdd.reduceByKey(lambda a, b: a if a[0] > b[0] else b)
min_staytime_rdd = user_staytime_rdd.reduceByKey(lambda a, b: a if a[0] < b[0] else b)

# Collect and print the results
max_staytime_results = max_staytime_rdd.collect()
min_staytime_results = min_staytime_rdd.collect()

print("Max StayTime for each user:")
for user, (staytime, record) in max_staytime_results:
    print(f"User: {user}, Max StayTime: {staytime}, Record: {record}")

print("\nMin StayTime for each user:")
for user, (staytime, record) in min_staytime_results:
    print(f"User: {user}, Min StayTime: {staytime}, Record: {record}")


Max StayTime for each user:
User: user152, Max StayTime: 72673, Record: {'Time': 1731859428, 'UserId': 'user152', 'Event context': 'Forum: Announcements', 'Component': 'Forum', 'Event name': 'Discussion viewed', 'Description': "The user with id '2141' has viewed the discussion with id '3' in the forum with course module id '3'.", 'Origin': 'web', 'IP address': '137.135.10.168', 'StayTime': 72673}
User: user083, Max StayTime: 131554, Record: {'Time': 1731821522, 'UserId': 'user083', 'Event context': 'Forum: Announcements', 'Component': 'Forum', 'Event name': 'Course module viewed', 'Description': "The user with id '2439' viewed the 'forum' activity with course module id '3'.", 'Origin': 'web', 'IP address': '136.234.159.27', 'StayTime': 131554}
User: user166, Max StayTime: 33863, Record: {'Time': 1731783561, 'UserId': 'user166', 'Event context': 'Virtual programming lab: Sample Code', 'Component': 'Virtual programming lab', 'Event name': 'mod_vpl: submission evaluated', 'Description': '

In [18]:
sc.stop()

In [90]:
from pyspark import SparkContext

# Initialize SparkContext
sc = SparkContext("local", "ImmutableData")

# Create a simple RDD
rdd = sc.parallelize([("user1", 100), ("user2", 200), ("user1", 300), ("user3", 400)])

# Perform a series of transformations
mapped_rdd = rdd.map(lambda x: (x[0], x[1] * 2))
grouped_rdd = mapped_rdd.groupByKey()

# Inspect the lineage of the RDD
print(f"Lineage: {grouped_rdd.toDebugString()}")

sc.stop()


Lineage: b'(1) PythonRDD[5] at RDD at PythonRDD.scala:53 []\n |  MapPartitionsRDD[4] at mapPartitions at PythonRDD.scala:160 []\n |  ShuffledRDD[3] at partitionBy at NativeMethodAccessorImpl.java:0 []\n +-(1) PairwiseRDD[2] at groupByKey at /var/folders/58/64zqz_d92v58h90mtvz109r80000gn/T/ipykernel_68977/3735903502.py:11 []\n    |  PythonRDD[1] at groupByKey at /var/folders/58/64zqz_d92v58h90mtvz109r80000gn/T/ipykernel_68977/3735903502.py:11 []\n    |  ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:289 []'


In [8]:
# Create an initial RDD
rdd = sc.parallelize([1, 2, 3, 4, 5])

# Perform some transformations



In [10]:
rdd1 = rdd.map(lambda x: x * 2)  # Multiply each element by 2


In [11]:
rdd2 = rdd1.filter(lambda x: x > 5)  # Keep only elements greater than 5


In [12]:
rdd3 = rdd2.map(lambda x: x - 1)  # Subtract 1 from each element



In [13]:
# Here rdd3 is the final RDD
rdd3.collect()

                                                                                

[5, 7, 9]

In [20]:
from pyspark import SparkContext

sc = SparkContext("local", "LineageExample")

In [None]:
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)

import time
def fun(x):
    print(x)
    time.sleep(2)
    return x * x

# Map: square each number
squared = rdd.map(lambda x: fun(x)).collect()
print("Squared:", squared)

1
2Stage 1:>                                                          (0 + 1) / 1]
3
4
ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

5
Exception in thread "serve RDD 1" java.net.SocketTimeoutException: Accept timed out
	at java.base/sun.nio.ch.NioSocketImpl.timedAccept(NioSocketImpl.java:699)
	at java.base/sun.nio.ch.NioSocketImpl.accept(NioSocketImpl.java:743)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:690)
	at java.base/java.net.ServerSocket.platformImplAccept(ServerSocket.java:655)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:631)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:588)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:546)
	at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:65)
Exception in thread "serve RDD 3" java.net.SocketTimeoutException: Accept timed out
	at java.base/sun.nio.ch.NioSocketImpl.timedAccept(NioSocketImpl.java:699)
	at java.base/sun.nio.ch.NioSocketImpl.accept(NioSocketImpl.java:743)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:690)
	at java.base/java.net.ServerS

In [23]:
import pandas

file='ws-logs_names_email_ip.csv'

df = pandas.read_csv(file)

df.columns

Index(['Time', 'UserId', 'Event context', 'Component', 'Event name',
       'Description', 'Origin', 'IP address'],
      dtype='object')

In [24]:
#remove the last 2 columns

df = df.iloc[:, :-2]

In [25]:
df.columns

Index(['Time', 'UserId', 'Event context', 'Component', 'Event name',
       'Description'],
      dtype='object')

In [26]:
df.head()

Unnamed: 0,Time,UserId,Event context,Component,Event name,Description
0,17/11/24; 00:00:55,user152,Course: Winter School 2024 on Data Systems,System,User profile viewed,The user with id '2141' viewed the profile for...
1,17/11/24; 00:01:07,user152,User: Christopher Lee,System,User profile viewed,The user with id '2141' viewed the profile for...
2,17/11/24; 00:01:10,user083,System,System,User login failed,Login failed for user 'imejia@example.net'. Mo...
3,17/11/24; 00:01:12,user083,Course: Winter School 2024 on Data Systems,System,Course viewed,The user with id '2439' viewed the course with...
4,17/11/24; 00:01:12,user083,System,System,User has logged in,The user with id '2439' has logged in.


In [None]:
df.to_csv('ws-logs_filtered.csv', index=False)

24/12/03 06:08:36 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 213312 ms exceeds timeout 120000 ms
24/12/03 06:08:36 WARN SparkContext: Killing executors is not supported by current scheduler.
24/12/03 06:08:45 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$