In [1]:
!pip install -q pyspark

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [3]:
from collections import Counter
from operator import add

# Assuming logs is an RDD where each element is a tuple (Uid, Errors)
logs = sc.parallelize([
    (1, ['a', 'b', 'a', 'j', 'e', 'c', 'e', 'k' ,'j']),
    (2, ['a', 'd']),
    (1, ['a', 'b', 'e']),
    (3, ['a', 'a','a','a','a', 'c', 'b'])
])

# Map phase: emit (Uid, (error, 1)) for each error
mapped = logs.flatMap(lambda x: [(x[0], i) for i in x[1]])

# Reduce phase: sum up the counts for each (Uid, error)
reduced = mapped.map(lambda x: ((x[0], x[1]), 1)).reduceByKey(add)

# For each Uid, find the top 2 most frequent errors
result = reduced.map(lambda x: (x[0][0], (x[0][1], x[1]))).groupByKey().mapValues(lambda x: [i[0] for i in Counter(dict(x)).most_common(2)])

result.collect()


[(1, ['a', 'e']), (2, ['a', 'd']), (3, ['a', 'c'])]

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

spark = SparkSession.builder.getOrCreate()

# Assuming df is a DataFrame where each row is (Uid, Errors)
data = [(1, ['a', 'b', 'a', 'j', 'e', 'c', 'e', 'k' ,'j']),
        (2, ['a', 'd']),
        (1, ['a', 'b', 'e']),
        (3, ['a', 'a','a','a','a', 'c', 'b'])]

df = spark.createDataFrame(data, ["Uid", "Errors"])

# Explode the Errors array into multiple rows
df = df.select("Uid", explode(col("Errors")).alias("Error"))

# Count the occurrences of each error for each Uid
df = df.groupBy("Uid", "Error").count()

# For each Uid, find the top 2 most frequent errors
window = Window.partitionBy("Uid").orderBy(col("count").desc())
result = df.withColumn("rank", rank().over(window)).filter(col("rank") <= 2)

result.show()


+---+-----+-----+----+
|Uid|Error|count|rank|
+---+-----+-----+----+
|  1|    a|    3|   1|
|  1|    e|    3|   1|
|  2|    d|    1|   1|
|  2|    a|    1|   1|
|  3|    a|    5|   1|
|  3|    c|    1|   2|
|  3|    b|    1|   2|
+---+-----+-----+----+



In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split
from pyspark.sql.functions import coalesce, lit



# Assuming data is a DataFrame where each row is a string "A -> B, C, D"
data = [("A", ["B", "C", "D"]), ("B", ["A", "D"]), ("C", ["A"]), ("D", ["B", "C"])]
df = spark.createDataFrame(data, ["sender", "receivers"])

# Explode the receivers array into multiple rows
df = df.select("sender", explode(col("receivers")).alias("receiver"))

# Count the number of messages sent and received by each user
sent = df.groupBy("sender").count().withColumnRenamed("count", "sent")
received = df.groupBy("receiver").count().withColumnRenamed("count", "received")

# Join the sent and received DataFrames and compute the difference
result = sent.join(received, sent.sender == received.receiver, "outer") \
             .select(coalesce(col("sender"), col("receiver")).alias("user"),
                     coalesce(col("sent"), lit(0)).alias("sent"),
                     coalesce(col("received"), lit(0)).alias("received")) \
             .withColumn("difference", col("sent") - col("received"))

# The number of users who sent a message but didn't receive a reply is the number of users with a positive difference
num_users = result.filter(col("difference") > 0).count()

num_users


1

In [4]:
spark.stop()

NameError: name 'spark' is not defined