In [72]:
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
!pip install graphframes
!pip install
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

openjdk-8-jdk-headless is already the newest version (8u422-b05-1~22.04).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
[31mERROR: You must give at least one requirement to install (see "pip help install")[0m[31m
[0m

In [73]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import (
    col,
    udf,
    row_number,
    countDistinct,
    collect_list,
    struct,
    count,
    sum,
    avg,
    expr,
    lit,
    percentile_approx,
    max as spark_max,
    explode,
    least,
    greatest
)
from pyspark.sql.types import StringType, IntegerType, BinaryType, DoubleType, ArrayType, StructType, StructField
from pyspark.sql import Window
from datetime import datetime
from graphframes import GraphFrame
from scipy.sparse import csr_matrix, vstack, hstack
import numpy as np
import pickle
import base64
import pandas as pd


In [74]:

# Build (or reuse) the Spark session shared by every cell below.
# It runs in local mode on all cores and pulls the GraphFrames package at start-up.
spark = (
    SparkSession.builder
    .appName("PhoneCallsCommunityDetection")
    .master("local[*]")
    .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.1-s_2.12")
    .config("spark.executor.memory", "20G")
    .config("spark.driver.memory", "50G")
    .config("spark.executor.memoryOverhead", "1G")
    .config("spark.default.parallelism", "100")
    .config("spark.sql.shuffle.partitions", "10")
    .config("spark.driver.maxResultSize", "2G")
    .getOrCreate()
)




In [75]:
# Keep Spark's console output down to warnings and errors
spark.sparkContext.setLogLevel("WARN")

# Directory Spark writes checkpoints to
spark.sparkContext.setCheckpointDir("/tmp/spark-checkpoints")

# Read the call records: the first row is the header, column types are inferred
file_path = '/content/toy_dataset_Copy.csv'  # Adjust this to your file path
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(file_path)
)

# Convert YYMMDDHHMM to a proper datetime object
def convert_to_datetime(yyMMddHHMM):
    """Parse a ``YYMMDDHHMM`` timestamp into a ``datetime``.

    Args:
        yyMMddHHMM: The timestamp, either as an integer or as a digit string.

    Returns:
        datetime: The parsed moment (two-digit year interpreted by ``%y``).

    Raises:
        ValueError: If the value cannot be parsed with ``%y%m%d%H%M``.
    """
    # A timestamp held as a number has lost its leading zeros (e.g. year "09"
    # becomes a nine-digit value), and strptime would then silently split the
    # digits differently. Padding back to ten digits restores the fixed layout.
    digits = str(yyMMddHHMM).zfill(10)
    return datetime.strptime(digits, '%y%m%d%H%M')

# Define UDF for calculating duration in minutes
def calculate_duration_minutes(start_time, end_time):
    """Return the length of a call in minutes.

    Args:
        start_time: Call start as a ``YYMMDDHHMM`` value, or None.
        end_time: Call end as a ``YYMMDDHHMM`` value, or None.

    Returns:
        float | None: ``end_time - start_time`` in minutes, or None when either
        timestamp is missing.
    """
    # Spark passes SQL NULL to a Python UDF as None; propagate the null rather
    # than letting strptime fail on the string "None" and abort the whole job.
    if start_time is None or end_time is None:
        return None
    elapsed = convert_to_datetime(end_time) - convert_to_datetime(start_time)
    return elapsed.total_seconds() / 60

# Wrap the Python function as a Spark UDF that yields a double
calculate_duration_minutes_udf = udf(calculate_duration_minutes, DoubleType())

# Per-call duration, in minutes
df = df.withColumn(
    'duration_minutes',
    calculate_duration_minutes_udf(col('Start_Time'), col('End_Time')),
)

# Canonicalise every pair so the smaller id is Client1 and the larger is Client2;
# (a, b) and (b, a) then refer to the same undirected pair
df = (
    df.withColumn("Client1_min", least(col("Client1"), col("Client2")))
    .withColumn("Client2_max", greatest(col("Client1"), col("Client2")))
    .drop("Client1", "Client2")
    .withColumnRenamed("Client1_min", "Client1")
    .withColumnRenamed("Client2_max", "Client2")
)

# Total talk time per unique (Client1, Client2) pair
df_aggregated = df.groupBy("Client1", "Client2").agg(
    F.sum("duration_minutes").alias("total_duration_minutes")
)

# Replace the per-call duration with the pair total on every call row
df = df.drop("duration_minutes").join(
    df_aggregated, on=["Client1", "Client2"], how="left"
)

# Graph inputs: one vertex per distinct client, one edge per call row
vertices = (
    df.selectExpr("Client1 as id")
    .union(df.selectExpr("Client2 as id"))
    .distinct()
)
edges = df.selectExpr(
    "Client1 as src",
    "Client2 as dst",
    "total_duration_minutes as weight",
)

# Both frames are reused below, so keep them cached
vertices.cache()
edges.cache()

# Build the graph and label every vertex with its connected component
g = GraphFrame(vertices, edges)
result = g.connectedComponents()

# Map the raw component ids onto 1, 2, 3, ... in ascending component order
community_mapping = (
    result.select("component")
    .distinct()
    .orderBy("component")
    .withColumn("new_id", row_number().over(Window.orderBy("component")))
    .cache()
)

# Attach the component of Client1 to every call row, then swap it for the sequential id
df_with_communities = (
    df.join(result, df['Client1'] == result['id'], 'inner')
    .join(community_mapping, result['component'] == community_mapping['component'], 'inner')
    .drop(result['id'])
    .drop(community_mapping['component'])
    .withColumnRenamed('new_id', 'community_id')
)

# Community size = number of distinct clients appearing on either side of a call.
# union() is positional, so the Client2 values land in the column named "Client1".
community_sizes = (
    df_with_communities.select("community_id", "Client1")
    .union(df_with_communities.select("community_id", "Client2"))
    .distinct()
    .groupBy("community_id")
    .agg(countDistinct("Client1").alias("community_size"))
)

# Add the size to every call row
df_final = df_with_communities.join(community_sizes, 'community_id')

# One row per community with the list of its unique (Client1, Client2, total) edges
community_members = (
    df_final.select("community_id", "Client1", "Client2", "total_duration_minutes")
    .distinct()
    .groupBy("community_id")
    .agg(
        collect_list(
            struct("Client1", "Client2", "total_duration_minutes")
        ).alias("members")
    )
    .orderBy("community_id")
)

# Display the call rows together with their community id and size
print("\nFinal DataFrame with Sequential Community IDs:")
(
    df_final.select(
        'Client1',
        'Client2',
        'Start_Time',
        'End_Time',
        'total_duration_minutes',
        'community_id',
        'community_size',
    )
    .orderBy("community_id")
    .show()
)




Final DataFrame with Sequential Community IDs:
+-------+-------+----------+----------+----------------------+------------+--------------+
|Client1|Client2|Start_Time|  End_Time|total_duration_minutes|community_id|community_size|
+-------+-------+----------+----------+----------------------+------------+--------------+
|      1|      2|2408060000|2408060200|                 420.0|           1|             3|
|      2|      3|2408040000|2408040500|                 300.0|           1|             3|
|      1|      2|2408040000|2408040500|                 420.0|           1|             3|
|      4|      5|2408020000|2408020600|                 360.0|           2|             4|
|      5|      6|2408090000|2408091500|                 900.0|           2|             4|
|      6|      7|2408070000|2408070800|                 480.0|           2|             4|
|      8|      9|2408020000|2408020600|                 360.0|           3|             4|
|      9|     10|2408090000|2408091500|   

In [76]:
# One row per community holding its unique (Client1, Client2, total_duration_minutes) edges.
#
# collect_list() gives no ordering guarantee once the rows have been shuffled by groupBy(),
# so an orderBy() placed before the aggregation cannot be relied on. sort_array() orders the
# collected structs field by field (Client1, then Client2, then total_duration_minutes),
# which makes the displayed list deterministic.
community_members = (
    df_final.select("community_id", "Client1", "Client2", "total_duration_minutes")
    .distinct()
    .groupBy("community_id")
    .agg(
        F.sort_array(
            F.collect_list(
                F.struct(
                    F.col("Client1"),
                    F.col("Client2"),
                    F.col("total_duration_minutes"),
                )
            )
        ).alias("members")
    )
    .orderBy("community_id")
)

# Show the list of community members as tuples
print("\nCommunity Members with Sequential IDs:")
community_members.show(truncate=False)


Community Members with Sequential IDs:
+------------+---------------------------------------------------+
|community_id|members                                            |
+------------+---------------------------------------------------+
|1           |[{1, 2, 420.0}, {2, 3, 300.0}]                     |
|2           |[{4, 5, 360.0}, {5, 6, 900.0}, {6, 7, 480.0}]      |
|3           |[{8, 9, 360.0}, {9, 10, 900.0}, {10, 11, 480.0}]   |
|4           |[{12, 13, 120.0}, {12, 14, 480.0}, {13, 14, 300.0}]|
+------------+---------------------------------------------------+



Similarity computation

In [99]:
from pyspark.sql.functions import pandas_udf, PandasUDFType, col, explode, struct
from pyspark.sql.types import BinaryType, StructType, StructField, IntegerType
from scipy.sparse import csr_matrix
import pandas as pd
import pickle
import numpy as np
# Each community's edge list is converted into a sparse adjacency matrix by a grouped-map
# function: Spark hands the function one pandas DataFrame per community_id and expects a
# pandas DataFrame matching `schema` back. A SciPy matrix cannot be stored in a Spark column
# directly, so it is pickled into a binary field.

# Output schema of the grouped-map function: one row per community
schema = StructType([
    StructField("community_id", IntegerType(), True),
    StructField("csr_matrix", BinaryType(), True)
])


def create_csr_matrix_from_edges(members_df, *, weighted=True):
    """
    Build a pickled CSR adjacency matrix for a single community.

    Args:
        members_df: pandas DataFrame holding the rows of one community, with
            'community_id' and 'members' columns. Each 'members' cell is a list of
            records exposing 'Client1', 'Client2' and 'total_duration_minutes'.
        weighted: Keyword-only. When True the matrix entries are the
            total_duration_minutes of each edge; when False every edge is stored as 1.
            It is keyword-only so the function keeps exactly one positional parameter
            (applyInPandas treats a two-parameter function as ``(key, pdf)``).

    Returns:
        One-row pandas DataFrame with 'community_id' and 'csr_matrix' (the pickled
        scipy.sparse.csr_matrix). Row/column i of the matrix is the i-th smallest
        client id of the community.
    """
    # All rows of the group belong to the same community, so the first id is the id.
    community_id = members_df['community_id'].iloc[0]

    # Flatten the list column: one record per edge.
    member_records = members_df.explode("members").dropna().reset_index(drop=True)['members']
    edge_df = pd.DataFrame({
        'Client1': member_records.apply(lambda rec: rec['Client1']),
        'Client2': member_records.apply(lambda rec: rec['Client2']),
        'total_duration_minutes': member_records.apply(lambda rec: rec['total_duration_minutes'])
    })

    # Map client ids onto 0..n-1 in ascending id order.
    unique_clients = sorted(pd.concat([edge_df['Client1'], edge_df['Client2']]).unique())
    client_to_index = {client: i for i, client in enumerate(unique_clients)}
    num_clients = len(unique_clients)

    rows = edge_df['Client1'].map(client_to_index).values
    cols = edge_df['Client2'].map(client_to_index).values
    if weighted:
        data = edge_df['total_duration_minutes'].values
    else:
        data = [1] * len(rows)

    # Each edge is stored once, at (index of Client1, index of Client2).
    csr = csr_matrix((data, (rows, cols)), shape=(num_clients, num_clients))

    return pd.DataFrame({"community_id": [community_id], "csr_matrix": [pickle.dumps(csr)]})


def _csr_builder(weighted):
    """Return a one-argument grouped-map function with the weighting choice bound in."""
    def build(members_df):
        return create_csr_matrix_from_edges(members_df, weighted=weighted)
    return build


# The weighting is bound explicitly for each result instead of being read from a mutable
# module-level flag: Spark evaluates lazily, so a global flipped between two calls makes the
# outcome depend on when the function happens to be serialized.
result_true = community_members.groupBy("community_id").applyInPandas(_csr_builder(True), schema=schema)
result_false = community_members.groupBy("community_id").applyInPandas(_csr_builder(False), schema=schema)

In [100]:
def pretty_print_csr_matrix(csr_matrix_result, weight=True):
    """Print the non-zero entries of a sparse matrix as a Row/Col/Value table.

    Args:
        csr_matrix_result: SciPy sparse matrix to display.
        weight: Unused; kept so existing calls that pass it keep working.
    """
    # Take coordinates and values from the same COO view and apply one mask to all
    # three. Pairing nonzero() with .data breaks when the matrix stores an explicit
    # zero: nonzero() skips that entry, .data keeps it, and the column lengths differ.
    coo = csr_matrix_result.tocoo()
    nonzero_mask = coo.data != 0

    table = pd.DataFrame({
        'Row': coo.row[nonzero_mask],
        'Col': coo.col[nonzero_mask],
        'Value': coo.data[nonzero_mask]
    })

    print(table)

# Bring the weighted matrices to the driver, unpickle each one and print it
for row in result_true.collect():
    community_id, serialized_csr = row['community_id'], row['csr_matrix']

    # Binary column -> SciPy sparse matrix
    csr_matrix_result = pickle.loads(serialized_csr)

    print(f"Community ID: {community_id}")
    pretty_print_csr_matrix(csr_matrix_result)
    print("-" * 40)

Community ID: 1
   Row  Col  Value
0    0    1  420.0
1    1    2  300.0
----------------------------------------
Community ID: 2
   Row  Col  Value
0    0    1  360.0
1    1    2  900.0
2    2    3  480.0
----------------------------------------
Community ID: 3
   Row  Col  Value
0    0    1  360.0
1    1    2  900.0
2    2    3  480.0
----------------------------------------
Community ID: 4
   Row  Col  Value
0    0    1  120.0
1    0    2  480.0
2    1    2  300.0
----------------------------------------


In [101]:
# Zero-pad a sparse matrix so that matrices of different communities share one shape
def pad_csr_matrix(csr, max_shape):
    """Pad ``csr`` with empty rows and columns up to ``max_shape``.

    Args:
        csr: SciPy sparse matrix to pad.
        max_shape: ``(rows, cols)`` target shape. A dimension that is already at
            least as large as the target is left untouched.

    Returns:
        The padded matrix in CSR format, or ``csr`` itself when no padding is needed.
        Existing entries keep their positions; new rows go below, new columns to the right.
    """
    current_rows, current_cols = csr.shape
    max_rows, max_cols = max_shape
    if current_rows < max_rows:
        extra_rows = csr_matrix((max_rows - current_rows, current_cols))
        # format="csr": without it the stacked result's format is chosen by SciPy and
        # is not guaranteed to expose the indices/indptr arrays of CSR.
        csr = vstack([csr, extra_rows], format="csr")
    if current_cols < max_cols:
        extra_cols = csr_matrix((csr.shape[0], max_cols - current_cols))
        csr = hstack([csr, extra_cols], format="csr")
    return csr

In [102]:
def process_csr_matrices(df, max_size):
    """Pad every pickled matrix of ``df`` to ``max_size``.

    Args:
        df: Spark DataFrame with 'community_id' and pickled 'csr_matrix' columns.
        max_size: ``(rows, cols)`` shape every matrix is padded to.

    Returns:
        A new Spark DataFrame with columns 'community_id' and 'csr_matrix', the
        latter holding the re-pickled padded matrices.
    """
    output_columns = ["community_id", "csr_matrix"]

    def pad_and_calculate(row):
        # unpickle -> pad -> pickle again, keeping the community id alongside
        padded = pad_csr_matrix(pickle.loads(row['csr_matrix']), max_size)
        return (row['community_id'], pickle.dumps(padded))

    padded_rows = df.rdd.map(pad_and_calculate)
    return padded_rows.toDF(output_columns)
# `max_size` is only assigned in a later cell of this notebook, so on a fresh
# top-to-bottom run it would not exist at this point. Derive it here: the largest row
# count and the largest column count over all community matrices.
max_size = (
    result_true.rdd
    .map(lambda row: pickle.loads(row['csr_matrix']).shape)
    .reduce(lambda x, y: (max(x[0], y[0]), max(x[1], y[1])))
)
padded_result_true = process_csr_matrices(result_true, max_size)
padded_result_false = process_csr_matrices(result_false, max_size)


In [103]:
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType
import pickle
from scipy.sparse import csr_matrix
import numpy as np

def normalize_matrix(matrix):
    """
    Min-max normalize the stored values of a sparse matrix to the range [0, 1].

    Only the explicitly stored entries are rescaled; the sparsity pattern is kept.

    Parameters:
    matrix : csr_matrix
        Sparse matrix to normalize. Formats other than CSR/CSC are converted to CSR.

    Returns:
    csr_matrix
        Normalized sparse matrix of the same shape (the input object itself when it
        stores no entries).
    """
    # The result is rebuilt from (data, indices, indptr), which only the compressed
    # formats provide.
    if matrix.format not in ("csr", "csc"):
        matrix = matrix.tocsr()

    data = matrix.data
    if len(data) == 0:  # Handle empty matrices
        return matrix

    min_val = np.min(data)
    max_val = np.max(data)
    if max_val > min_val:
        # NOTE(review): the smallest stored value maps to 0, i.e. that entry becomes
        # an explicit zero - confirm this is intended for edge weights.
        normalized_data = (data - min_val) / (max_val - min_val)
    elif max_val != 0:
        # All stored values are equal, so min-max scaling is undefined. Returning the
        # raw values would leave the documented [0, 1] range; treat every entry as
        # the maximum instead.
        normalized_data = np.ones(len(data), dtype=float)
    else:
        # All stored values are zero: already inside the range.
        normalized_data = data.astype(float)
    return matrix.__class__((normalized_data, matrix.indices, matrix.indptr), shape=matrix.shape)

def frobenius_norm(csr_1, csr_2):
    """
    Frobenius distance between two sparse matrices after normalization.

    Both inputs are first passed through normalize_matrix; the distance is the
    square root of the sum of squared element-wise differences.

    Parameters:
    csr_1, csr_2 : csr_matrix
        Sparse adjacency matrices of the graphs; they must share one shape.

    Returns:
    float
        Frobenius norm of the difference of the normalized matrices.
    """
    normalized_1 = normalize_matrix(csr_1)
    normalized_2 = normalize_matrix(csr_2)
    assert normalized_1.shape == normalized_2.shape, "Adjacency matrices must have the same dimensions."
    squared_difference = (normalized_1 - normalized_2).power(2)
    return np.sqrt(squared_difference.sum())

from pyspark.sql import DataFrame
from pyspark.sql.functions import col

def frobenius_sim(csr_1, csr_2):
    """
    Turn the Frobenius distance between two matrices into a similarity score.

    Parameters:
    csr_1, csr_2 : sparse matrices of the same shape, as accepted by frobenius_norm.

    Returns:
    float: 1 / (1 + distance), so identical (normalized) matrices score 1.0 and the
    score decreases towards 0 as the distance grows.
    """
    distance = frobenius_norm(csr_1, csr_2)
    return 1 / (1 + distance)

In [83]:
from scipy.sparse.linalg import inv
from scipy.sparse import identity
def deltacon_similarity(csr_1, csr_2, epsilon=0.5):
    """DeltaCon-style similarity between two graphs given as sparse adjacency matrices.

    For each matrix A the affinity matrix S = inv(I + epsilon**2 * D - epsilon * A) is
    computed, where D is the diagonal matrix of the row sums of A. The score is
    1 / (1 + ||S1 - S2||_F).

    NOTE(review): D uses row sums only; if each undirected edge is stored once in A,
    these are not full node degrees - confirm this is intended.

    Args:
        csr_1, csr_2: Sparse adjacency matrices of identical shape.
        epsilon: Scaling constant used in the affinity matrix.

    Returns:
        float: Similarity in (0, 1]; 1.0 when both affinity matrices are equal.

    Raises:
        AssertionError: If the two matrices differ in shape.
    """
    # Ensure both matrices are of the same size
    assert csr_1.shape == csr_2.shape, "Adjacency matrices must be of the same size for comparison."
    n = csr_1.shape[0]
    eye = identity(n, format="csc")
    diagonal = np.arange(n)

    def _affinity(adjacency):
        # np.asarray(...).ravel() flattens the row sums without relying on the
        # np.matrix-only `.A` attribute.
        row_sums = np.asarray(adjacency.sum(axis=1)).ravel()
        degree = csr_matrix((row_sums, (diagonal, diagonal)), shape=(n, n))
        # Invert in CSC: the sparse solver works on CSC and would otherwise convert
        # the system and emit SparseEfficiencyWarning on every call.
        system = (eye + epsilon**2 * degree - epsilon * adjacency).tocsc()
        return inv(system)

    affinity_gap = _affinity(csr_1) - _affinity(csr_2)
    distance = np.sqrt((affinity_gap.power(2)).sum())
    return 1 / (1 + distance)
# Largest row count and largest column count over all community matrices.
# The pickled matrices live in result_true; `result` is the connectedComponents() output
# and has no 'csr_matrix' column.
max_size = (
    result_true.rdd
    .map(lambda row: pickle.loads(row['csr_matrix']).shape)
    .reduce(lambda x, y: (max(x[0], y[0]), max(x[1], y[1])))
)

In [97]:
from sklearn.metrics.pairwise import cosine_similarity
# Thin wrapper around scikit-learn's pairwise cosine similarity
def cosine_sim(csr_1, csr_2):
    """Return ``cosine_similarity(csr_1, csr_2)`` unchanged."""
    return cosine_similarity(csr_1, csr_2)

In [118]:
# Second copy of the padded weighted matrices with "_2" column names, so that both
# sides of the self cross join can be told apart
padded_result_true_renamed = padded_result_true.select(
    col("community_id").alias("community_id_2"),
    col("csr_matrix").alias("csr_matrix_2"),
)

# Every unordered pair of distinct communities exactly once (id on the left < id on the right)
cross_joined_df = (
    padded_result_true.alias("df1")
    .crossJoin(padded_result_true_renamed.alias("df2"))
    .filter(col("df1.community_id") < col("df2.community_id_2"))
)

# Shape of the rows returned by the per-pair pandas function
similarity_schema = StructType([
    StructField("community_id_1", IntegerType(), True),
    StructField("community_id_2", IntegerType(), True),
    StructField("frobenius_similarity", DoubleType(), True),
])

# Per-pair function: Frobenius similarity of the two pickled matrices
def calculate_similarity(grouped_df):
    """
    Computes the Frobenius similarity between the csr_matrices of two communities.

    grouped_df is the pandas DataFrame of one (community_id, community_id_2) group;
    only its first row is read. Returns a one-row pandas DataFrame with
    community_id_1, community_id_2 and frobenius_similarity.
    """
    pair = grouped_df.iloc[0]

    csr_1 = pickle.loads(pair['csr_matrix'])
    csr_2 = pickle.loads(pair['csr_matrix_2'])

    record = {
        "community_id_1": pair['community_id'],
        "community_id_2": pair['community_id_2'],
        "frobenius_similarity": frobenius_sim(csr_1, csr_2),
    }
    return pd.DataFrame([record])

# Group by the pair of ids (one row per group) and score each pair in pandas
similarity_df = (
    cross_joined_df
    .select("df1.community_id", "df2.community_id_2", "df1.csr_matrix", "df2.csr_matrix_2")
    .groupBy("community_id", "community_id_2")
    .applyInPandas(calculate_similarity, schema=similarity_schema)
)

# Display the pairwise scores
similarity_df.show(truncate=False)


+--------------+--------------+--------------------+
|community_id_1|community_id_2|frobenius_similarity|
+--------------+--------------+--------------------+
|1             |2             |0.4112575122414839  |
|1             |3             |0.4112575122414839  |
|1             |4             |0.4                 |
|2             |3             |1.0                 |
|2             |4             |0.4673105310958499  |
|3             |4             |0.4673105310958499  |
+--------------+--------------+--------------------+



In [122]:
# Second copy of the padded unweighted matrices with "_2" column names, so that both
# sides of the self cross join can be told apart
padded_result_false_renamed = padded_result_false.select(
    col("community_id").alias("community_id_2"),
    col("csr_matrix").alias("csr_matrix_2"),
)

# Every unordered pair of distinct communities exactly once (id on the left < id on the right)
cross_joined_df = (
    padded_result_false.alias("df1")
    .crossJoin(padded_result_false_renamed.alias("df2"))
    .filter(col("df1.community_id") < col("df2.community_id_2"))
)

# Shape of the rows returned by the per-pair pandas function
similarity_schema = StructType([
    StructField("community_id_1", IntegerType(), True),
    StructField("community_id_2", IntegerType(), True),
    StructField("deltacon", DoubleType(), True),
])

# Per-pair function: DeltaCon similarity of the two pickled matrices
def calculate_similarity(grouped_df):
    """
    Computes the DeltaCon similarity between the csr_matrices of two communities.

    grouped_df is the pandas DataFrame of one (community_id, community_id_2) group;
    only its first row is read. Returns a one-row pandas DataFrame with
    community_id_1, community_id_2 and deltacon.
    """
    pair = grouped_df.iloc[0]

    csr_1 = pickle.loads(pair['csr_matrix'])
    csr_2 = pickle.loads(pair['csr_matrix_2'])

    record = {
        "community_id_1": pair['community_id'],
        "community_id_2": pair['community_id_2'],
        "deltacon": deltacon_similarity(csr_1, csr_2),
    }
    return pd.DataFrame([record])

# Group by the pair of ids (one row per group) and score each pair in pandas
similarity_df = (
    cross_joined_df
    .select("df1.community_id", "df2.community_id_2", "df1.csr_matrix", "df2.csr_matrix_2")
    .groupBy("community_id", "community_id_2")
    .applyInPandas(calculate_similarity, schema=similarity_schema)
)

# Display the pairwise scores
similarity_df.show(truncate=False)

+--------------+--------------+------------------+
|community_id_1|community_id_2|deltacon          |
+--------------+--------------+------------------+
|1             |2             |0.6725177100028019|
|1             |3             |0.6725177100028019|
|1             |4             |0.7470353885783044|
|2             |3             |1.0               |
|2             |4             |0.6213353259709683|
|3             |4             |0.6213353259709683|
+--------------+--------------+------------------+



In [128]:
from pyspark.sql.functions import col, expr
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType
import pandas as pd
import pickle

# Step 1: Compute Frobenius Similarity (using padded_result_true)

# Right-hand side of the self cross join: same weighted matrices, "_2" column names
padded_result_true_renamed = padded_result_true.select(
    col("community_id").alias("community_id_2"),
    col("csr_matrix").alias("csr_matrix_2"),
)

# Every unordered pair of distinct communities exactly once
cross_joined_df_frobenius = (
    padded_result_true.alias("df1")
    .crossJoin(padded_result_true_renamed.alias("df2"))
    .filter(col("df1.community_id") < col("df2.community_id_2"))
)

# Row layout produced by calculate_frobenius_similarity
frobenius_similarity_schema = StructType([
    StructField("community_id_1", IntegerType(), True),
    StructField("community_id_2", IntegerType(), True),
    StructField("frobenius_similarity", DoubleType(), True),
])

# Per-pair function for the Frobenius score
def calculate_frobenius_similarity(grouped_df):
    """Score one community pair with frobenius_sim.

    grouped_df is the pandas DataFrame of one (community_id, community_id_2) group;
    only its first row is read. Returns a one-row pandas DataFrame with
    community_id_1, community_id_2 and frobenius_similarity.
    """
    pair = grouped_df.iloc[0]
    csr_1 = pickle.loads(pair['csr_matrix'])
    csr_2 = pickle.loads(pair['csr_matrix_2'])
    record = {
        "community_id_1": pair['community_id'],
        "community_id_2": pair['community_id_2'],
        "frobenius_similarity": frobenius_sim(csr_1, csr_2),
    }
    return pd.DataFrame([record])

# Score every pair: group by the two ids (one row per group) and evaluate in pandas
frobenius_similarity_df = (
    cross_joined_df_frobenius
    .select("df1.community_id", "df2.community_id_2", "df1.csr_matrix", "df2.csr_matrix_2")
    .groupBy("community_id", "community_id_2")
    .applyInPandas(calculate_frobenius_similarity, schema=frobenius_similarity_schema)
)


# Step 2: Compute DeltaCon Similarity (using padded_result_false)

# Right-hand side of the self cross join: same unweighted matrices, "_2" column names
padded_result_false_renamed = padded_result_false.select(
    col("community_id").alias("community_id_2"),
    col("csr_matrix").alias("csr_matrix_2"),
)

# Every unordered pair of distinct communities exactly once
cross_joined_df_deltacon = (
    padded_result_false.alias("df1")
    .crossJoin(padded_result_false_renamed.alias("df2"))
    .filter(col("df1.community_id") < col("df2.community_id_2"))
)

# Row layout produced by calculate_deltacon_similarity
deltacon_similarity_schema = StructType([
    StructField("community_id_1", IntegerType(), True),
    StructField("community_id_2", IntegerType(), True),
    StructField("deltacon", DoubleType(), True),
])

# Per-pair function for the DeltaCon score
def calculate_deltacon_similarity(grouped_df):
    """Score one community pair with deltacon_similarity.

    grouped_df is the pandas DataFrame of one (community_id, community_id_2) group;
    only its first row is read. Returns a one-row pandas DataFrame with
    community_id_1, community_id_2 and deltacon.
    """
    pair = grouped_df.iloc[0]
    csr_1 = pickle.loads(pair['csr_matrix'])
    csr_2 = pickle.loads(pair['csr_matrix_2'])
    record = {
        "community_id_1": pair['community_id'],
        "community_id_2": pair['community_id_2'],
        "deltacon": deltacon_similarity(csr_1, csr_2),
    }
    return pd.DataFrame([record])

# Score every pair: group by the two ids (one row per group) and evaluate in pandas
deltacon_similarity_df = (
    cross_joined_df_deltacon
    .select("df1.community_id", "df2.community_id_2", "df1.csr_matrix", "df2.csr_matrix_2")
    .groupBy("community_id", "community_id_2")
    .applyInPandas(calculate_deltacon_similarity, schema=deltacon_similarity_schema)
)


# Step 3: Join Results and Calculate Final Similarity Score

# One row per community pair carrying both scores
combined_similarity_df = frobenius_similarity_df.join(
    deltacon_similarity_df,
    on=["community_id_1", "community_id_2"],
    how="inner",
)

# Final score: equally weighted mean of the Frobenius and DeltaCon similarities
final_similarity_df = combined_similarity_df.withColumn(
    "final_similarity",
    expr("0.5 * frobenius_similarity + 0.5 * deltacon"),
)

# Display the combined table
final_similarity_df.show(truncate=False)


+--------------+--------------+--------------------+------------------+------------------+
|community_id_1|community_id_2|frobenius_similarity|deltacon          |final_similarity  |
+--------------+--------------+--------------------+------------------+------------------+
|1             |2             |0.4112575122414839  |0.6725177100028019|0.5418876111221429|
|1             |3             |0.4112575122414839  |0.6725177100028019|0.5418876111221429|
|1             |4             |0.4                 |0.7470353885783044|0.5735176942891522|
|2             |3             |1.0                 |1.0               |1.0               |
|2             |4             |0.4673105310958499  |0.6213353259709683|0.5443229285334091|
|3             |4             |0.4673105310958499  |0.6213353259709683|0.5443229285334091|
+--------------+--------------+--------------------+------------------+------------------+



In [130]:
# Set the overall similarity threshold
similarity_threshold = 0.55

# Pairs whose combined similarity reaches the threshold become the edges of the graph
similar_pairs = final_similarity_df.filter(F.col("final_similarity") >= similarity_threshold)

# Vertices: every community that took part in a comparison, not only those with a
# similar partner. Building the vertices from `similar_pairs` would silently drop any
# community that has no pair above the threshold; here it stays as a group of its own.
# (With a single community there are no pairs at all, hence no vertices.)
vertices = (
    final_similarity_df.select(F.col("community_id_1").alias("id"))
    .union(final_similarity_df.select(F.col("community_id_2").alias("id")))
    .distinct()
)

edges = similar_pairs.select(
    F.col("community_id_1").alias("src"),
    F.col("community_id_2").alias("dst")
)

# Build the GraphFrame for community grouping
g = GraphFrame(vertices, edges)

# Find connected components (clusters of communities)
connected_components = g.connectedComponents()

# Group communities by connected component (cluster); sort_array() makes the member
# order deterministic, which collect_list() alone does not guarantee after a shuffle
grouped_communities = connected_components.groupBy("component").agg(
    F.sort_array(F.collect_list("id")).alias("community_group")
)

# Show the clustered communities based on the similarity threshold
print("\nGrouped Communities Based on Similarity Threshold:")
grouped_communities.show(truncate=False)




Grouped Communities Based on Similarity Threshold:
+---------+---------------+
|component|community_group|
+---------+---------------+
|2        |[2, 3]         |
|1        |[1, 4]         |
+---------+---------------+

