In [1]:
# Environment bootstrap: install the Python packages and the JDK used by this notebook.
# pyspark and graphframes are imported by the import cell; PyDrive is installed
# here but is not imported in the visible cells.
!pip install pyspark
!pip install -U -q PyDrive
# Headless OpenJDK 8 (-qq keeps apt output minimal).
!apt install openjdk-8-jdk-headless -qq
!pip install graphframes
import os
# Export JAVA_HOME for processes started from this kernel.
# NOTE(review): presumably the directory where the apt package above puts JDK 8 — confirm.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

The following additional packages will be installed:
  libxtst6 openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra fonts-nanum fonts-ipafont-gothic
  fonts-ipafont-mincho fonts-wqy-microhei fonts-wqy-zenhei fonts-indic
The following NEW packages will be installed:
  libxtst6 openjdk-8-jdk-headless openjdk-8-jre-headless
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 39.6 MB of archives.
After this operation, 144 MB of additional disk space will be used.
Selecting previously unselected package libxtst6:amd64.
(Reading database ... 123622 files and directories currently installed.)
Preparing to unpack .../libxtst6_2%3a1.2.3-1build4_amd64.deb ...
Unpacking libxtst6:amd64 (2:1.2.3-1build4) ...
Selecting previously unselected package openjdk-8-jre-headless:amd64.
Preparing to unpack .../openjdk-8-jre-headless_8u422-b05-1~22.04_amd64.deb ...
Unpacking openjdk-8-jre-headless:amd64 (8u422-b05-1~22.04) ...
Sel

In [42]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, collect_list, countDistinct, row_number, lit, struct, max as spark_max
from pyspark.sql.types import StringType, IntegerType, BinaryType, DoubleType, ArrayType, StructType, StructField
from pyspark.sql import Window
from datetime import datetime
from graphframes import GraphFrame
from scipy.sparse import csr_matrix, vstack, hstack
import numpy as np
import pickle
import base64

In [48]:
# Build (or reuse) the Spark session; the graphframes package is requested via
# spark.jars.packages, and memory / parallelism settings are fixed here.
spark = (
    SparkSession.builder
    .appName("PhoneCallsCommunityDetection")
    .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.1-s_2.12")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.default.parallelism", "8")
    .config("spark.sql.shuffle.partitions", "8")
    .getOrCreate()
)

# Checkpoint directory (set before connectedComponents() is called further down).
spark.sparkContext.setCheckpointDir("/tmp/spark-checkpoints")

# Load the call records; column types are inferred from the CSV contents.
file_path = 'adjusted_phone_calls.csv'
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Convert YYMMDDHHMM to a proper datetime object
def convert_to_datetime(yyMMddHHMM):
    """Parse a YYMMDDHHMM timestamp into a ``datetime``.

    Parameters
    ----------
    yyMMddHHMM : int or str
        Ten-digit timestamp (two-digit year, month, day, hour, minute).
        Integer inputs are accepted: a value whose leading zero was lost
        when the column was read as a number (years 00-09, e.g.
        ``501011200``) is left-padded back to ten digits before parsing.

    Returns
    -------
    datetime
        The naive datetime described by the timestamp.

    Raises
    ------
    ValueError
        If the (padded) text does not match ``%y%m%d%H%M``.
    """
    # Without the padding a 9-digit value would be split at the wrong field
    # boundaries by strptime and silently yield a wrong date.
    text = str(yyMMddHHMM).zfill(10)
    return datetime.strptime(text, '%y%m%d%H%M')

# Define UDF for calculating duration in minutes
def calculate_duration_minutes(start_time, end_time):
    """Return the time between two YYMMDDHHMM timestamps in minutes.

    Both arguments are parsed with ``convert_to_datetime``; the result is
    ``(end - start)`` expressed as a float number of minutes (negative when
    the end precedes the start).
    """
    began = convert_to_datetime(start_time)
    ended = convert_to_datetime(end_time)
    return (ended - began).total_seconds() / 60

# Define UDF for calculating duration in DDHHMM format
def calculate_duration_string(start_time, end_time):
    """Return the time between two YYMMDDHHMM timestamps as a DDHHMM string.

    Both arguments are parsed with ``convert_to_datetime``.  The elapsed
    time is broken into whole days, hours and minutes, each rendered with
    at least two digits and concatenated without separators.  Seconds are
    discarded.
    """
    began = convert_to_datetime(start_time)
    ended = convert_to_datetime(end_time)
    elapsed = ended - began

    days = elapsed.days
    # Whole minutes within the day part, then split into hours and minutes.
    hours, minutes = divmod(elapsed.seconds // 60, 60)
    return f'{days:02d}{hours:02d}{minutes:02d}'

# Wrap the two duration helpers as Spark UDFs with their return types.
calculate_duration_minutes_udf = udf(calculate_duration_minutes, DoubleType())
calculate_duration_string_udf = udf(calculate_duration_string, StringType())

# Derive both duration columns (minutes as a double, DDHHMM as a string)
# from the Start_Time / End_Time columns.
df = (
    df
    .withColumn('duration_minutes', calculate_duration_minutes_udf(col('Start_Time'), col('End_Time')))
    .withColumn('duration_DDHHMM', calculate_duration_string_udf(col('Start_Time'), col('End_Time')))
)

# Vertices: every distinct client id seen on either side of a call.
caller_ids = df.selectExpr("Client1 as id")
callee_ids = df.selectExpr("Client2 as id")
vertices = caller_ids.union(callee_ids).distinct()

# Edges: one per call record, carrying the call duration in minutes as weight.
edges = df.selectExpr(
    "Client1 as src",
    "Client2 as dst",
    "duration_minutes as weight",
)

# Mark both frames for caching before they are handed to GraphFrames.
vertices.cache()
edges.cache()

# Build the graph and label each vertex with its connected component,
# which this notebook uses as the community id.
g = GraphFrame(vertices, edges)
result = g.connectedComponents()

# Attach the component of Client1 to every call record and expose it as community_id.
df_with_communities = (
    df
    .join(result, df['Client1'] == result['id'], 'inner')
    .withColumnRenamed('component', 'community_id')
)

# Community size = number of distinct clients appearing as either party.
# The union keeps the first frame's column names, hence "Client1" below.
community_sizes = (
    df_with_communities.select("community_id", "Client1")
    .union(df_with_communities.select("community_id", "Client2"))
    .distinct()
    .groupBy("community_id")
    .agg(countDistinct("Client1").alias("community_size"))
)

# Bring the size back onto every call record.
df_final = df_with_communities.join(community_sizes, on='community_id')
# struct, collect_list and col are already provided by the notebook's import
# cell, so they are not re-imported in the middle of this cell.

# For each community, gather its distinct call records as a list of
# (Client1, Client2, duration_DDHHMM, duration_minutes) structs.
community_members = (
    df_final
    .select("community_id", "Client1", "Client2", "duration_DDHHMM", "duration_minutes")
    .distinct()
    .groupBy("community_id")
    .agg(
        collect_list(
            struct(col("Client1"), col("Client2"), col("duration_DDHHMM"), col("duration_minutes"))
        ).alias("members")
    )
)

# Show the final DataFrame with community IDs, durations, and community sizes
df_final.select('Client1', 'Client2', 'duration_DDHHMM', 'duration_minutes', 'community_id', 'community_size').show()

# Show the per-community member lists without truncating the cells
community_members.show(truncate=False)



+-------+-------+---------------+----------------+------------+--------------+
|Client1|Client2|duration_DDHHMM|duration_minutes|community_id|community_size|
+-------+-------+---------------+----------------+------------+--------------+
|      4|      5|         000058|            58.0|           4|             4|
|      5|      6|         000257|           177.0|           4|             4|
|      6|      7|         000200|           120.0|           4|             4|
|      8|      9|         000233|           153.0|           8|             2|
|     11|     12|         000215|           135.0|          10|             3|
|     10|     11|         000029|            29.0|          10|             3|
|      2|      3|         000035|            35.0|           1|             3|
|      1|      2|         010200|          1560.0|           1|             3|
|     14|     15|         000108|            68.0|          13|             3|
|     13|     14|         000132|            92.0|  

In [49]:
# Create CSR adjacency matrices for each community and serialize them
def create_csr_matrix(members, use_weights=False):
    """Build a community's adjacency matrix and return it serialized as text.

    Parameters
    ----------
    members : iterable of mappings (dicts or Rows), or None
        Call records.  Each must expose ``'Client1'`` and ``'Client2'`` and,
        when ``use_weights`` is true, ``'duration_minutes'``.  ``None`` is
        treated as an empty community.
    use_weights : bool, default False
        * ``False`` - binary adjacency: an entry is 1 if at least one call
          exists from Client1 to Client2, however many records repeat the pair.
        * ``True``  - entry is the total ``duration_minutes`` over all records
          for the pair (repeated pairs are summed by the sparse constructor).

    Returns
    -------
    str
        base64 text of the pickled ``scipy.sparse.csr_matrix`` of shape
        ``(n_clients, n_clients)``.  Row/column ``i`` corresponds to the
        ``i``-th client id in sorted order, so the layout does not depend on
        record order or on set/hash iteration order.
    """
    members = list(members) if members is not None else []

    unique_clients = {m['Client1'] for m in members} | {m['Client2'] for m in members}
    try:
        clients = sorted(unique_clients)
    except TypeError:
        # Ids that cannot be compared with each other (e.g. None mixed with
        # strings): fall back to a type-then-text ordering, still deterministic.
        clients = sorted(unique_clients, key=lambda c: (type(c).__name__, str(c)))
    client_index = {client: idx for idx, client in enumerate(clients)}

    row_indices = []
    col_indices = []
    data = []
    seen_pairs = set()  # only used for the unweighted (binary) matrix

    for member in members:
        row = client_index[member['Client1']]
        column = client_index[member['Client2']]
        if use_weights:
            # Duration in minutes is the edge weight; duplicates add up.
            row_indices.append(row)
            col_indices.append(column)
            data.append(float(member['duration_minutes']))
        elif (row, column) not in seen_pairs:
            # The sparse constructor sums repeated coordinates, so emit each
            # pair once to keep the unweighted matrix strictly 0/1.
            seen_pairs.add((row, column))
            row_indices.append(row)
            col_indices.append(column)
            data.append(1)

    num_clients = len(clients)
    csr = csr_matrix((data, (row_indices, col_indices)), shape=(num_clients, num_clients))

    # Serialize so the matrix can travel through a Spark string column.
    serialized_csr = base64.b64encode(pickle.dumps(csr)).decode('utf-8')
    return serialized_csr

In [59]:
# Declared layout of one call record inside the "members_dict" array column.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("Client1", StringType()),
        ("Client2", StringType()),
        ("duration_DDHHMM", StringType()),
        ("duration_minutes", DoubleType()),
    )
])

# UDF turning each collected Row into a dict, returned under the schema above.
convert_members_udf = udf(lambda rows: [row.asDict() for row in rows], ArrayType(schema))

# UDFs producing the serialized CSR matrix of a community (unweighted / weighted).
create_csr_unweighted_udf = udf(lambda records: create_csr_matrix(records, use_weights=False), StringType())
create_csr_weighted_udf = udf(lambda records: create_csr_matrix(records, use_weights=True), StringType())

# Add the converted member list and both CSR representations to each community.
community_members = (
    community_members
    .withColumn("members_dict", convert_members_udf(col("members")))
    .withColumn("csr_matrix_unweighted", create_csr_unweighted_udf(col("members_dict")))
    .withColumn("csr_matrix_weighted", create_csr_weighted_udf(col("members_dict")))
)

community_members.show(truncate=False)


+------------+--------------------------------------------------------------------+--------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [51]:
def compare_weighted_structural_similarity(csr_matrix_1, csr_matrix_2):
    """Cosine similarity between two serialized sparse matrices.

    Each argument is a base64 string holding a pickled scipy sparse matrix.
    The smaller matrix is zero-padded (extra rows at the bottom, extra
    columns on the right) so both share the larger shape; the result is the
    sum of element-wise products divided by the product of the two
    Frobenius norms.  Returns ``0.0`` when either matrix is all zeros.

    NOTE: the inputs are unpickled, so they must come from a trusted source.
    """
    def _decode(blob):
        return pickle.loads(base64.b64decode(blob))

    def _grow(matrix, n_rows, n_cols):
        # Append empty rows first, then empty columns, only where needed.
        missing_rows = n_rows - matrix.shape[0]
        if missing_rows > 0:
            matrix = vstack([matrix, csr_matrix((missing_rows, matrix.shape[1]))])
        missing_cols = n_cols - matrix.shape[1]
        if missing_cols > 0:
            matrix = hstack([matrix, csr_matrix((matrix.shape[0], missing_cols))])
        return matrix

    first = _decode(csr_matrix_1)
    second = _decode(csr_matrix_2)

    # Common target shape: the larger extent along each axis.
    target_rows = max(first.shape[0], second.shape[0])
    target_cols = max(first.shape[1], second.shape[1])
    first = _grow(first, target_rows, target_cols)
    second = _grow(second, target_rows, target_cols)

    overlap = first.multiply(second).sum()
    magnitude_1 = np.sqrt(first.multiply(first).sum())
    magnitude_2 = np.sqrt(second.multiply(second).sum())

    # An all-zero matrix has no direction; report zero similarity.
    if magnitude_1 == 0 or magnitude_2 == 0:
        return float(0)
    return float(overlap / (magnitude_1 * magnitude_2))

In [58]:
# Two UDF handles over the same cosine-similarity function: one is applied to
# the unweighted matrices, the other to the weighted ones.
compare_structural_similarity_udf = udf(
    lambda left, right: compare_weighted_structural_similarity(left, right),
    DoubleType(),
)
compare_weighted_similarity_udf = udf(
    lambda left, right: compare_weighted_structural_similarity(left, right),
    DoubleType(),
)

# Pair every community with every other one exactly once (a.id < b.id) and
# score each pair on both matrix variants.
cross_joined = (
    community_members.alias("a")
    .crossJoin(community_members.alias("b"))
    .filter(col("a.community_id") < col("b.community_id"))
    .withColumn(
        "unweighted_similarity_score",
        compare_structural_similarity_udf(col("a.csr_matrix_unweighted"), col("b.csr_matrix_unweighted")),
    )
    .withColumn(
        "weighted_similarity_score",
        compare_weighted_similarity_udf(col("a.csr_matrix_weighted"), col("b.csr_matrix_weighted")),
    )
)

# Combined score: equal (50/50) blend of the two similarities.
cross_joined = cross_joined.withColumn(
    "combined_similarity_score",
    0.5 * col("unweighted_similarity_score") + 0.5 * col("weighted_similarity_score"),
)

# Display the pairwise scores.
cross_joined.select(
    col("a.community_id").alias("community_id_1"),
    col("b.community_id").alias("community_id_2"),
    "unweighted_similarity_score",
    "weighted_similarity_score",
    "combined_similarity_score",
).show(truncate=False)


+--------------+--------------+---------------------------+-------------------------+-------------------------+
|community_id_1|community_id_2|unweighted_similarity_score|weighted_similarity_score|combined_similarity_score|
+--------------+--------------+---------------------------+-------------------------+-------------------------+
|4             |13            |0.8164965809277259         |0.6853357570839352       |0.7509161690058306       |
|4             |10            |0.8164965809277259         |0.8360072103535584       |0.8262518956406422       |
|4             |8             |0.5773502691896258         |0.26176899616530314      |0.4195596326774645       |
|10            |13            |0.9999999999999998         |0.7500299536866752       |0.8750149768433375       |
|1             |13            |0.9999999999999998         |0.8173061607351331       |0.9086530803675664       |
|1             |4             |0.8164965809277259         |0.27962147262753995      |0.548059026777633  