In [1]:
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from pyspark.sql.window import Window
# Start (or reuse) the Spark session shared by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("SubscriberTable")
    .getOrCreate()
)

# Layout of the Subscriber table: one row per (date, sender, receiver) with
# the number of SMS sent in that direction. Every column is nullable.
subscriber_schema = StructType([
    StructField(name="sms_date", dataType=DateType(), nullable=True),
    StructField(name="sender", dataType=StringType(), nullable=True),
    StructField(name="receiver", dataType=StringType(), nullable=True),
    StructField(name="sms_no", dataType=IntegerType(), nullable=True),
])

# Sample rows for the Subscriber table. The date is written as text and
# parsed in a single place so each record stays short and easy to scan.
subscriber_data = [
    (datetime.strptime(sent_on, '%Y-%m-%d'), sender_name, receiver_name, sms_count)
    for sent_on, sender_name, receiver_name, sms_count in (
        ('2020-04-01', 'Avinash', 'Vibhor', 10),
        ('2020-04-01', 'Vibhor', 'Avinash', 20),
        ('2020-04-01', 'Avinash', 'Pawan', 30),
        ('2020-04-01', 'Pawan', 'Avinash', 20),
        ('2020-04-01', 'Vibhor', 'Pawan', 5),
        ('2020-04-01', 'Pawan', 'Vibhor', 8),
        ('2020-04-01', 'Vibhor', 'Deepak', 50),
    )
]

# Build the DataFrame from the sample rows using the explicit schema.
subscriber_df = spark.createDataFrame(subscriber_data, schema=subscriber_schema)

# Print the table, then register it as the "Subscriber" temp view so it can
# be queried through spark.sql.
subscriber_df.show()
subscriber_df.createOrReplaceTempView("Subscriber")


+----------+-------+--------+------+
|  sms_date| sender|receiver|sms_no|
+----------+-------+--------+------+
|2020-04-01|Avinash|  Vibhor|    10|
|2020-04-01| Vibhor| Avinash|    20|
|2020-04-01|Avinash|   Pawan|    30|
|2020-04-01|  Pawan| Avinash|    20|
|2020-04-01| Vibhor|   Pawan|     5|
|2020-04-01|  Pawan|  Vibhor|     8|
|2020-04-01| Vibhor|  Deepak|    50|
+----------+-------+--------+------+



In [10]:
# Total SMS exchanged between each pair of subscribers per day, counting both
# directions together (A -> B and B -> A land in the same group).
#
# The CTE adds two columns to every Subscriber row:
#   p1 - sender when sender < receiver, otherwise receiver
#   p2 - sender when sender > receiver, otherwise receiver
# so that a pair always appears as (p1, p2) in the same order whichever side
# sent the messages. The outer query then sums sms_no per (sms_date, p1, p2).
# The aggregate is not aliased, so the result column is named "sum(sms_no)".
spark.sql(
"""
    with cte as (select *,
        Case when sender < receiver then sender else receiver end as p1,
        Case when sender > receiver then sender else receiver end as p2
    from Subscriber)
    
    select sms_date, p1, p2, sum(sms_no) from cte group by sms_date, p1, p2
""").show()

+----------+-------+------+-----------+
|  sms_date|     p1|    p2|sum(sms_no)|
+----------+-------+------+-----------+
|2020-04-01|Avinash|Vibhor|         30|
|2020-04-01|Avinash| Pawan|         50|
|2020-04-01|  Pawan|Vibhor|         13|
|2020-04-01| Deepak|Vibhor|         50|
+----------+-------+------+-----------+



In [12]:
# DataFrame-API version of the per-day, per-pair SMS total.
#
# Spark functions are referenced through the explicit `F` namespace. In
# particular the aggregate must be Spark's column-level sum; a bare `sum`
# only resolves to it when a wildcard import has shadowed Python's builtin
# `sum`, which is fragile and hard to read.

# Normalise each (sender, receiver) row into an ordered pair so that both
# directions of a conversation share the same key:
#   p1 - sender when sender < receiver, otherwise receiver
#   p2 - sender when sender > receiver, otherwise receiver
df_cte = subscriber_df.withColumn(
    "p1",
    F.when(F.col("sender") < F.col("receiver"), F.col("sender")).otherwise(F.col("receiver")),
).withColumn(
    "p2",
    F.when(F.col("sender") > F.col("receiver"), F.col("sender")).otherwise(F.col("receiver")),
)

# Group by sms_date, p1, p2 and sum sms_no into total_sms
df_result = df_cte.groupBy("sms_date", "p1", "p2").agg(F.sum("sms_no").alias("total_sms"))

# Show results
df_result.show()

+----------+-------+------+---------+
|  sms_date|     p1|    p2|total_sms|
+----------+-------+------+---------+
|2020-04-01|Avinash|Vibhor|       30|
|2020-04-01|Avinash| Pawan|       50|
|2020-04-01|  Pawan|Vibhor|       13|
|2020-04-01| Deepak|Vibhor|       50|
+----------+-------+------+---------+

