Number of Calls and Total Duration

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Obtain the Spark entry point for this notebook; getOrCreate() hands back an
# already-running session when there is one instead of starting another.
spark = (
    SparkSession.builder
    .appName("CallAggregation")
    .getOrCreate()
)

In [10]:
# Column names applied, in order, to every tuple in `data`.
columns = ["from_id", "to_id", "duration"]

# Sample call log: one (from_id, to_id, duration) tuple per call.
# Calls between the same two ids appear in both directions on purpose.
data = [
    (10, 20, 58),
    (20, 10, 12),
    (10, 30, 20),
    (30, 40, 100),
    (30, 40, 200),
    (30, 40, 200),
    (40, 30, 500),
]

calls_df = spark.createDataFrame(data, schema=columns)

In [11]:
# Preview the raw call records.
calls_df.show()

+-------+-----+--------+
|from_id|to_id|duration|
+-------+-----+--------+
|     10|   20|      58|
|     20|   10|      12|
|     10|   30|      20|
|     30|   40|     100|
|     30|   40|     200|
|     30|   40|     200|
|     40|   30|     500|
+-------+-----+--------+



In [12]:
# Expose the DataFrame to Spark SQL under the name "calls" (replacing any
# existing temp view with that name) so the query can select FROM calls.
calls_df.createOrReplaceTempView("calls")

In [6]:
# Count calls and total duration per pair of people, regardless of who called whom.
#
# The CTE rewrites every row so the smaller id is Person1 and the larger id is
# Person2; a call 20 -> 10 therefore groups with a call 10 -> 20.
# LEAST/GREATEST are used instead of hand-written CASE expressions: they state
# the intent directly and, because Spark's LEAST/GREATEST skip NULL arguments,
# a row with one missing id is normalized the same way whichever side is missing.
query = """
WITH StandardizedCalls AS (
    SELECT
        LEAST(from_id, to_id) AS Person1,
        GREATEST(from_id, to_id) AS Person2,
        duration
    FROM
        calls
)
SELECT
    Person1,
    Person2,
    COUNT(*) AS call_count,
    SUM(duration) AS total_duration
FROM
    StandardizedCalls
GROUP BY
    Person1,
    Person2
ORDER BY
    Person1,
    Person2
"""

In [7]:
# Run the SQL query against the "calls" temp view; `result` is a DataFrame
# with one row per (Person1, Person2) pair.
result = spark.sql(query)

# Print the aggregated rows (call_count and total_duration per pair).
result.show()

+-------+-------+----------+--------------+
|Person1|Person2|call_count|total_duration|
+-------+-------+----------+--------------+
|     10|     20|         2|            70|
|     10|     30|         1|            20|
|     30|     40|         4|          1000|
+-------+-------+----------+--------------+



Using the PySpark DataFrame API

In [None]:
from pyspark.sql.window import *

In [13]:
# Normalize each call so the smaller id is Person1 and the larger id is Person2;
# A -> B and B -> A then share the same (Person1, Person2) key.
# least/greatest (from the pyspark.sql.functions wildcard import) replace the
# paired when/otherwise expressions and skip nulls, so a row with one missing id
# is normalized the same way whichever side is missing.
swap_df = calls_df.select(
    least(col("from_id"), col("to_id")).alias("Person1"),
    greatest(col("from_id"), col("to_id")).alias("Person2"),
    col("duration"),
)

In [14]:
# Inspect the normalized pairs: both call directions now share one key.
swap_df.show()

+-------+-------+--------+
|Person1|Person2|duration|
+-------+-------+--------+
|     10|     20|      58|
|     10|     20|      12|
|     10|     30|      20|
|     30|     40|     100|
|     30|     40|     200|
|     30|     40|     200|
|     30|     40|     500|
+-------+-------+--------+



In [17]:
# Aggregate per normalized pair.
# NOTE: `count` and `sum` are the pyspark.sql.functions versions pulled in by
# the wildcard import, not the Python builtins.
final_df = (
    swap_df.groupBy("Person1", "Person2")
    .agg(
        # count("*") counts every row in the group, matching COUNT(*) in the
        # SQL version; counting a column would silently skip null values.
        count("*").alias("call_count"),
        sum("duration").alias("total_duration"),
    )
    # groupBy gives no ordering guarantee, so sort explicitly to mirror the
    # SQL version's ORDER BY Person1, Person2.
    .orderBy("Person1", "Person2")
)

In [18]:
# Display the per-pair call counts and total durations.
final_df.show()

+-------+-------+----------+--------------+
|Person1|Person2|call_count|total_duration|
+-------+-------+----------+--------------+
|     10|     20|         2|            70|
|     10|     30|         1|            20|
|     30|     40|         4|          1000|
+-------+-------+----------+--------------+

