In [1]:
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
!pip install graphframes
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

openjdk-8-jdk-headless is already the newest version (8u422-b05-1~22.04).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    udf,
    row_number,
    countDistinct,
    collect_list,
    struct,
    count,
    sum,
    avg,
    expr,
    percentile_approx,
    max as spark_max
)
from pyspark.sql.types import StringType, IntegerType, BinaryType, DoubleType, ArrayType, StructType, StructField
from pyspark.sql import Window
from datetime import datetime
from graphframes import GraphFrame
from scipy.sparse import csr_matrix, vstack, hstack
from pyspark.sql.functions import least, greatest, col
import numpy as np
import pickle
import base64

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, least, greatest, udf, countDistinct, row_number, collect_list, struct
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, StringType
from pyspark.sql import Window
from graphframes import GraphFrame
from datetime import datetime

# Initialize Spark session
spark = SparkSession.builder \
    .appName("PhoneCallsCommunityDetection") \
    .master("local[*]") \
    .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.1-s_2.12") \
    .config("spark.executor.memory", "20G") \
    .config("spark.driver.memory", "50G") \
    .config("spark.executor.memoryOverhead", "1G") \
    .config("spark.default.parallelism", "100") \
    .config("spark.sql.shuffle.partitions", "10") \
    .config("spark.driver.maxResultSize", "2G") \
    .getOrCreate()

# Optional: Set logging level to reduce verbosity
spark.sparkContext.setLogLevel("WARN")

# Set a checkpoint directory for Spark
spark.sparkContext.setCheckpointDir("/tmp/spark-checkpoints")

file_path = '/content/toy_dataset1.csv'  # Adjust this to your file path
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Convert YYMMDDHHMM to a proper datetime object
def convert_to_datetime(yyMMddHHMM):
    return datetime.strptime(str(yyMMddHHMM), '%y%m%d%H%M')

# Define UDF for calculating duration in minutes
def calculate_duration_minutes(start_time, end_time):
    start_dt = convert_to_datetime(start_time)
    end_dt = convert_to_datetime(end_time)
    duration = end_dt - start_dt
    return duration.total_seconds() / 60

# Register the UDF for duration in minutes
calculate_duration_minutes_udf = udf(calculate_duration_minutes, DoubleType())

# Add column for duration in minutes
df = df.withColumn('duration_minutes', calculate_duration_minutes_udf(col('Start_Time'), col('End_Time')))

# Adjust Client1 and Client2 to ensure Client1 is the smaller value and Client2 the larger
df = df.withColumn("Client1_min", least(col("Client1"), col("Client2"))) \
       .withColumn("Client2_max", greatest(col("Client1"), col("Client2"))) \
       .drop("Client1", "Client2") \
       .withColumnRenamed("Client1_min", "Client1") \
       .withColumnRenamed("Client2_max", "Client2")

# Aggregate total duration for each unique pair (Client1, Client2)
df_aggregated = df.groupBy("Client1", "Client2") \
    .agg(F.sum("duration_minutes").alias("total_duration_minutes"))

# Join the aggregated total duration back to the original DataFrame
df = df.drop("duration_minutes") \
       .join(df_aggregated, on=["Client1", "Client2"], how="left")

# Create Graph using GraphFrames for community detection
vertices = df.selectExpr("Client1 as id").union(df.selectExpr("Client2 as id")).distinct()
edges = df.selectExpr("Client1 as src", "Client2 as dst", "total_duration_minutes as weight")

# Cache vertices and edges
vertices.cache()
edges.cache()

# Create a GraphFrame
g = GraphFrame(vertices, edges)

# Find connected components (communities) using GraphFrames
result = g.connectedComponents()

# Create a mapping from original community IDs to sequential ones
community_mapping = result.select("component").distinct() \
    .orderBy("component") \
    .withColumn("new_id", row_number().over(Window.orderBy("component"))) \
    .cache()

# Join the result (community IDs) with the original DataFrame and map to new sequential IDs
df_with_communities = df.join(result, df['Client1'] == result['id'], 'inner') \
    .join(community_mapping, result['component'] == community_mapping['component'], 'inner') \
    .drop(result['id']) \
    .drop(community_mapping['component']) \
    .withColumnRenamed('new_id', 'community_id')

# Calculate the number of unique clients (community size) per community
community_sizes = df_with_communities.select("community_id", "Client1").union(df_with_communities.select("community_id", "Client2")) \
    .distinct() \
    .groupBy("community_id").agg(countDistinct("Client1").alias("community_size"))

# Merge the community sizes into the main DataFrame
df_final = df_with_communities.join(community_sizes, 'community_id')

# Create community_members with unique tuples for each community
community_members = df_final.select("community_id", "Client1", "Client2", "total_duration_minutes") \
    .distinct() \
    .groupBy("community_id") \
    .agg(F.collect_list(F.struct(
        F.col("Client1"),
        F.col("Client2"),
        F.col("total_duration_minutes")
    )).alias("members")) \
    .orderBy("community_id")

# Show the final DataFrame with community IDs, duration, and community sizes
print("\nFinal DataFrame with Sequential Community IDs:")
df_final.select(
    'Client1',
    'Client2',
    'Start_Time',
    'End_Time',
    'total_duration_minutes',
    'community_id',
    'community_size'
).orderBy("community_id").show()




Final DataFrame with Sequential Community IDs:
+-------+-------+----------+----------+----------------------+------------+--------------+
|Client1|Client2|Start_Time|  End_Time|total_duration_minutes|community_id|community_size|
+-------+-------+----------+----------+----------------------+------------+--------------+
|      1|      2|2408040000|2408040500|                 420.0|           1|             2|
|      1|      2|2408060000|2408060200|                 420.0|           1|             2|
|      4|      5|2408090000|2408091500|                1260.0|           2|             2|
|      4|      5|2408020000|2408020600|                1260.0|           2|             2|
|      6|      7|2408070000|2408070800|                 480.0|           3|             2|
|      8|      9|2408070000|2408070500|                 480.0|           4|             2|
|      8|      9|2408090000|2408090300|                 480.0|           4|             2|
|     10|     11|2408010000|2408010400|   

In [10]:
# Define community_members to get a list of unique (Client1, Client2, total_duration_minutes) tuples for each community
community_members = df_final.select("community_id", "Client1", "Client2", "total_duration_minutes") \
    .distinct() \
    .groupBy("community_id") \
    .agg(F.collect_list(F.struct(
        F.col("Client1"),
        F.col("Client2"),
        F.col("total_duration_minutes")
    )).alias("members")) \
    .orderBy("community_id")

# Show the list of community members as tuples
print("\nCommunity Members with Sequential IDs:")
community_members.show(truncate=False)



Community Members with Sequential IDs:
+------------+---------------------------------------------------+
|community_id|members                                            |
+------------+---------------------------------------------------+
|1           |[{1, 2, 420.0}]                                    |
|2           |[{4, 5, 1260.0}]                                   |
|3           |[{6, 7, 480.0}]                                    |
|4           |[{8, 9, 480.0}]                                    |
|5           |[{10, 11, 240.0}]                                  |
|6           |[{12, 14, 480.0}, {12, 13, 120.0}, {13, 14, 300.0}]|
+------------+---------------------------------------------------+

