<a href="https://colab.research.google.com/github/BarGinger/DIS-Assignment/blob/main/Src/dis_notebook_25_10_24.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
!pip install graphframes
!pip install
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

The following additional packages will be installed:
  libxtst6 openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra fonts-nanum fonts-ipafont-gothic
  fonts-ipafont-mincho fonts-wqy-microhei fonts-wqy-zenhei fonts-indic
The following NEW packages will be installed:
  libxtst6 openjdk-8-jdk-headless openjdk-8-jre-headless
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 39.6 MB of archives.
After this operation, 144 MB of additional disk space will be used.
Selecting previously unselected package libxtst6:amd64.
(Reading database ... 123622 files and directories currently installed.)
Preparing to unpack .../libxtst6_2%3a1.2.3-1build4_amd64.deb ...
Unpacking libxtst6:amd64 (2:1.2.3-1build4) ...
Selecting previously unselected package openjdk-8-jre-headless:amd64.
Preparing to unpack .../openjdk-8-jre-headless_8u422-b05-1~22.04_amd64.deb ...
Unpacking openjdk-8-jre-headless:amd64 (8u422-b05-1~22.04) ...
Sel

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    udf,
    row_number,
    countDistinct,
    collect_list,
    struct,
    count,
    sum,
    avg,
    expr,
    percentile_approx,
    max as spark_max,
    explode
)
from pyspark.sql.types import StringType, IntegerType, BinaryType, DoubleType, ArrayType, StructType, StructField
from pyspark.sql import Window
from datetime import datetime
from graphframes import GraphFrame
from scipy.sparse import csr_matrix, vstack, hstack
import numpy as np
import pickle
import base64

In [45]:
# Re-initialize Spark session if needed
spark = SparkSession.builder \
    .appName("PhoneCallsCommunityDetection") \
    .master("local[*]") \
    .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.1-s_2.12") \
    .config("spark.executor.memory", "20G") \
    .config("spark.driver.memory", "50G") \
    .config("spark.executor.memoryOverhead", "1G") \
    .config("spark.default.parallelism", "100") \
    .config("spark.sql.shuffle.partitions", "10") \
    .config("spark.driver.maxResultSize", "2G") \
    .getOrCreate()
# Optional: Set logging level to reduce verbosity
spark.sparkContext.setLogLevel("WARN")

# Set a checkpoint directory for Spark
spark.sparkContext.setCheckpointDir("/tmp/spark-checkpoints")

file_path = 'toy_dataset.csv' #'adjusted_phone_calls.csv'
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Convert YYMMDDHHMM to a proper datetime object
def convert_to_datetime(yyMMddHHMM):
    return datetime.strptime(str(yyMMddHHMM), '%y%m%d%H%M')

# Define UDF for calculating duration in minutes
def calculate_duration_minutes(start_time, end_time):
    start_dt = convert_to_datetime(start_time)
    end_dt = convert_to_datetime(end_time)
    duration = end_dt - start_dt
    return duration.total_seconds() / 60

# Define UDF for calculating duration in DDHHMM format
def calculate_duration_string(start_time, end_time):
    start_dt = convert_to_datetime(start_time)
    end_dt = convert_to_datetime(end_time)
    duration = end_dt - start_dt

    days = duration.days
    hours, remainder = divmod(duration.seconds, 3600)
    minutes = remainder // 60
    return f'{days:02d}{hours:02d}{minutes:02d}'

# Register the UDFs in Spark
calculate_duration_minutes_udf = udf(calculate_duration_minutes, DoubleType())
calculate_duration_string_udf = udf(calculate_duration_string, StringType())

# Add columns for duration in minutes and DDHHMM format
df = df.withColumn('duration_minutes', calculate_duration_minutes_udf(col('Start_Time'), col('End_Time')))
df = df.withColumn('duration_DDHHMM', calculate_duration_string_udf(col('Start_Time'), col('End_Time')))
df.show()

+-------+-------+----------+----------+----------------+---------------+
|Client1|Client2|Start_Time|  End_Time|duration_minutes|duration_DDHHMM|
+-------+-------+----------+----------+----------------+---------------+
|      1|      2|2408060000|2408060200|           120.0|         000200|
|      2|      3|2408040000|2408040500|           300.0|         000500|
|      4|      5|2408020000|2408020600|           360.0|         000600|
|      5|      6|2408090000|2408091500|           900.0|         001500|
|      6|      7|2408070000|2408070800|           480.0|         000800|
|      8|      9|2408090000|2408090300|           180.0|         000300|
|      9|     10|2408070000|2408070500|           300.0|         000500|
|     10|     11|2408010000|2408010400|           240.0|         000400|
|     12|     13|2408010000|2408010200|           120.0|         000200|
|     13|     14|2408030000|2408030500|           300.0|         000500|
|     12|     14|2408020000|2408020800|           4

In [None]:
# Create Graph using GraphFrames for community detection
vertices = df.selectExpr("Client1 as id").union(df.selectExpr("Client2 as id")).distinct()
edges = df.selectExpr("Client1 as src", "Client2 as dst", "duration_minutes as weight")

# Cache vertices and edges
vertices.cache()
edges.cache()

# Create a GraphFrame
g = GraphFrame(vertices, edges)

# Find connected components (communities) using GraphFrames
result = g.connectedComponents()

# Create a mapping from original community IDs to sequential ones
community_mapping = result.select("component").distinct() \
    .orderBy("component") \
    .withColumn("new_id", row_number().over(Window.orderBy("component"))) \
    .cache()

# Join the result (community IDs) with the original dataframe and map to new sequential IDs
df_with_communities = df.join(result, df['Client1'] == result['id'], 'inner') \
    .join(community_mapping, result['component'] == community_mapping['component'], 'inner') \
    .drop(result['id']) \
    .drop(community_mapping['component']) \
    .withColumnRenamed('new_id', 'community_id')

# Calculate the number of unique clients (community size) per community
community_sizes = df_with_communities.select("community_id", "Client1").union(df_with_communities.select("community_id", "Client2")) \
    .distinct() \
    .groupBy("community_id").agg(countDistinct("Client1").alias("community_size"))

# Merge the community sizes into the main DataFrame
df_final = df_with_communities.join(community_sizes, 'community_id')

# Get list of tuples for each community member by considering both Client1 and Client2
community_members = df_final.select("community_id", "Client1", "Client2", "duration_DDHHMM", "duration_minutes") \
    .distinct() \
    .groupBy("community_id") \
    .agg(collect_list(struct(col("Client1"),
                           col("Client2"),
                           col("duration_DDHHMM"),
                           col("duration_minutes"))).alias("members")) \
    .orderBy("community_id")

# Show the final DataFrame with community IDs, duration, and community sizes
print("\nFinal DataFrame with Sequential Community IDs:")
df_final.select('Client1',
                'Client2',
                'duration_DDHHMM',
                'duration_minutes',
                'community_id',
                'community_size') \
    .orderBy("community_id") \
    .show()

# Show the list of community members as tuples
print("\nCommunity Members with Sequential IDs:")
community_members.show(truncate=False)

# Save results to CSV files
# Save the main analysis results
df_final.select('Client1',
                'Client2',
                'duration_DDHHMM',
                'duration_minutes',
                'community_id',
                'community_size') \
    .orderBy("community_id") \
    .write.mode("overwrite").csv("community_analysis_results")

# Save community members in a flattened format
df_final.select('community_id',
                'Client1',
                'Client2',
                'duration_DDHHMM',
                'duration_minutes') \
    .distinct() \
    .orderBy("community_id") \
    .write.mode("overwrite").csv("community_members_results")

# Optionally, if you want to save additional community statistics
community_stats = df_final.groupBy('community_id') \
    .agg(
        countDistinct('Client1', 'Client2').alias('unique_members'),
        count('*').alias('total_calls'),
        sum('duration_minutes').alias('total_duration_minutes'),
        avg('duration_minutes').alias('avg_call_duration'),
        percentile_approx('duration_minutes', 0.25).alias('duration_25th_percentile'),
        percentile_approx('duration_minutes', 0.5).alias('median_call_duration'),
        percentile_approx('duration_minutes', 0.75).alias('duration_75th_percentile')
    ) \
    .orderBy('community_id')

community_stats.write.mode("overwrite").csv("community_statistics_results")

In [37]:
from pyspark.sql.functions import pandas_udf, PandasUDFType, col, explode, struct
from pyspark.sql.types import BinaryType, StructType, StructField, IntegerType
from scipy.sparse import csr_matrix
import pandas as pd
import pickle
'''Decorator and Function Definition:
The @pandas_udf decorator marks this function as a Pandas UDF (User Defined Function) that will be applied on grouped data.
GROUPED_MAP tells Spark that the function will receive a DataFrame for each group (grouped by community_id).
The schema defines the expected output structure of the function, which is a DataFrame with community_id
and a binary field containing the serialized matrix.
The function converts the connections (edges) between clients into a CSR matrix and serializes it for storage.'''

# Define the schema for the Pandas UDF output
schema = StructType([
    StructField("community_id", IntegerType(), True),
    StructField("csr_matrix", BinaryType(), True)
])

@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def create_csr_matrix_from_edges(members_df):
    """
    Creates a serialized CSR matrix from a Spark DataFrame for each community.

    Args:
        members_df: Spark DataFrame with 'community_id' and 'members' columns.

    Returns:
        DataFrame with 'community_id' and a serialized CSR matrix as binary data.
    """

    # Extract the community ID (assuming it's consistent within the group)
    community_id = members_df['community_id'].iloc[0]
    '''Since each members_df contains data for a single community (due to groupBy operation),
    the function retrieves the community_id from the first row.
    This ID will be included in the output so that each serialized CSR matrix can be linked back
    to its respective community.'''
    # Explode the members array to get each connection in separate rows
    exploded_df = members_df.explode("members").dropna().reset_index(drop=True)
    exploded_df = pd.DataFrame({
        'Client1': exploded_df['members'].apply(lambda x: x['Client1']),
        'Client2': exploded_df['members'].apply(lambda x: x['Client2']),
        'duration_minutes': exploded_df['members'].apply(lambda x: x['duration_minutes'])
    })
    '''Flattening and Extracting Connection Data:
    The members_df contains a column with a list of connections (pairs of clients and call durations).
    The function uses explode to convert this list into individual rows, making it easier to work with each connection.
    It then creates a new DataFrame, exploded_df, with separate columns for Client1, Client2, and duration_minutes
    extracted from the connection data.
    This simplifies further processing by ensuring each row represents a single call between two clients.'''
    # Get unique clients and create a mapping to indices
    unique_clients = pd.concat([exploded_df['Client1'], exploded_df['Client2']]).unique()
    client_to_index = {client: i for i, client in enumerate(unique_clients)}
    num_clients = len(unique_clients)

    # Extract data for CSR matrix
    rows = exploded_df['Client1'].map(client_to_index).values
    cols = exploded_df['Client2'].map(client_to_index).values
    data = exploded_df['duration_minutes'].values #if weight else [1] * len(rows)


    # Create CSR matrix
    csr = csr_matrix((data, (rows, cols)), shape=(num_clients, num_clients))
    '''Serializing the CSR Matrix: The function uses Python’s pickle module to serialize the CSR matrix.
    This converts the matrix into a binary format, allowing it to be stored or transferred efficiently.
    Serialization is necessary because Spark DataFrames cannot directly store complex Python objects like CSR matrices.'''
    # Serialize CSR matrix to binary format
    serialized_csr = pickle.dumps(csr)

    # Return as DataFrame
    return pd.DataFrame({"community_id": [community_id], "csr_matrix": [serialized_csr]})

# Use the function to generate a serialized CSR matrix for each community and show the results
result = community_members.groupBy("community_id").apply(create_csr_matrix_from_edges)
result.show(truncate=False)


+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [33]:
import pandas as pd
import pickle
from scipy.sparse import csr_matrix

def pretty_print_csr_matrix(csr_matrix_result, weight=True):
    """Prints a CSR matrix in a readable format."""
    rows, cols = csr_matrix_result.nonzero()
    data = csr_matrix_result.data

    df = pd.DataFrame({
        'Row': rows,
        'Col': cols,
        'Value': data
    })

    print(df)

# Deserialize and print CSR matrices for each community
for row in result.collect():
    community_id = row['community_id']
    serialized_csr = row['csr_matrix']

    # Deserialize the binary data back to a CSR matrix
    csr_matrix_result = pickle.loads(serialized_csr)

    print(f"Community ID: {community_id}")
    pretty_print_csr_matrix(csr_matrix_result)
    print("-" * 40)


Community ID: 1
   Row  Col  Value
0    0    2  300.0
1    1    0  120.0
----------------------------------------
Community ID: 2
   Row  Col  Value
0    0    3  480.0
1    1    0  900.0
2    2    1  360.0
----------------------------------------
Community ID: 3
   Row  Col  Value
0    0    2  300.0
1    1    0  180.0
2    2    3  240.0
----------------------------------------
Community ID: 4
   Row  Col  Value
0    0    2  300.0
1    1    0  120.0
2    1    2  480.0
----------------------------------------


In [10]:
# Use the function to generate a serialized CSR matrix for each community
result = community_members.groupBy("community_id").apply(create_csr_matrix_from_edges)

# Padding and calculating DeltaCon similarity
def pad_csr_matrix(csr, max_shape):
    current_rows, current_cols = csr.shape
    max_rows, max_cols = max_shape
    if current_rows < max_rows:
        additional_rows = csr_matrix((max_rows - current_rows, current_cols))
        csr = vstack([csr, additional_rows])
    if current_cols < max_cols:
        additional_cols = csr_matrix((csr.shape[0], max_cols - current_cols))
        csr = hstack([csr, additional_cols])
    return csr




Method #1: Deltacon Similarity

In [18]:
from scipy.sparse.linalg import inv
from scipy.sparse import identity
def deltacon_similarity(csr_1, csr_2, epsilon=0.5):
    # Ensure both matrices are of the same size
    assert csr_1.shape == csr_2.shape, "Adjacency matrices must be of the same size for comparison."
    I = identity(csr_1.shape[0])
    D1 = csr_1.sum(axis=1).A.flatten()
    D1 = csr_matrix((D1, (range(csr_1.shape[0]), range(csr_1.shape[0]))))
    D2 = csr_2.sum(axis=1).A.flatten()
    D2 = csr_matrix((D2, (range(csr_2.shape[0]), range(csr_2.shape[0]))))

    S1 = inv(I + epsilon**2 * D1 - epsilon * csr_1)
    S2 = inv(I + epsilon**2 * D2 - epsilon * csr_2)
    frobenius_norm = np.sqrt(((S1 - S2).power(2)).sum())
    return 1 / (1 + frobenius_norm)
max_size = result.rdd.map(lambda row: pickle.loads(row['csr_matrix']).shape).reduce(lambda x, y: (max(x[0], y[0]), max(x[1], y[1])))

In [12]:
# Pad CSR matrices and calculate DeltaCon similarity using Spark DataFrame operations
def process_csr_matrices(df, max_size):
    def pad_and_calculate(row):
        csr_matrix_padded = pad_csr_matrix(pickle.loads(row['csr_matrix']), max_size)
        serialized_csr = pickle.dumps(csr_matrix_padded)
        return (row['community_id'], serialized_csr)

    return df.rdd.map(pad_and_calculate).toDF(["community_id", "csr_matrix"])

In [19]:
padded_result = process_csr_matrices(result, max_size)

# Cross join the DataFrame with itself to calculate DeltaCon similarity for all pairs of communities
cross_joined = padded_result.alias("df1").crossJoin(padded_result.alias("df2")) \
    .filter(col("df1.community_id") < col("df2.community_id"))

# Define a Pandas UDF to calculate similarity for each pair
schema_similarity = StructType([
    StructField("community_id_1", IntegerType(), True),
    StructField("community_id_2", IntegerType(), True),
    StructField("similarity", DoubleType(), True)
])

@pandas_udf(schema_similarity, PandasUDFType.GROUPED_MAP)
def calculate_similarity(df):
    csr_1 = pickle.loads(df['csr_matrix_1'].iloc[0])
    csr_2 = pickle.loads(df['csr_matrix_2'].iloc[0])
    similarity = deltacon_similarity(csr_1, csr_2)
    return pd.DataFrame({"community_id_1": [df['community_id_1'].iloc[0]], "community_id_2": [df['community_id_2'].iloc[0]], "similarity": [similarity]})

cross_joined = cross_joined.select(
    col("df1.community_id").alias("community_id_1"),
    col("df2.community_id").alias("community_id_2"),
    col("df1.csr_matrix").alias("csr_matrix_1"),
    col("df2.csr_matrix").alias("csr_matrix_2")
)

similarities = cross_joined.groupBy("community_id_1", "community_id_2").apply(calculate_similarity)

similarities.show(truncate=False)




+--------------+--------------+-------------------+
|community_id_1|community_id_2|similarity         |
+--------------+--------------+-------------------+
|1             |2             |0.09089789700695985|
|1             |3             |0.09282921066953156|
|1             |4             |0.40870486948985857|
|2             |3             |0.12274808949972305|
|2             |4             |0.09485010059678281|
|3             |4             |0.09696780258452274|
+--------------+--------------+-------------------+



In [27]:
from pyspark.sql import functions as F
# Define the similarity threshold
similarity_threshold = 0.1  # Adjust this threshold as needed

# Filter pairs with similarity above the threshold
similar_pairs = similarities.filter(F.col("similarity") >= similarity_threshold)
similar_pairs.show(truncate=False)
# Create vertices (unique community IDs) and edges (pairs above threshold)
 # Each community is treated as a node, and high similarity as an edge between nodes
vertices = similar_pairs.select("community_id_1").union(similar_pairs.select("community_id_2")).distinct() \
     .withColumnRenamed("community_id_1", "id")
# vertices.show(truncate=False)
edges = similar_pairs.select(F.col("community_id_1").alias("src"), F.col("community_id_2").alias("dst"))

# # Build the GraphFrame for community grouping
g = GraphFrame(vertices, edges)

# # Find connected components (clusters of communities)
connected_components = g.connectedComponents()

# # Group communities by connected component (cluster)
grouped_communities = connected_components.groupBy("component").agg(F.collect_list("id").alias("community_group"))

# # Show the clustered communities
# print("\nGrouped Communities Based on Similarity Threshold:")
grouped_communities.show(truncate=False)


+--------------+--------------+-------------------+
|community_id_1|community_id_2|similarity         |
+--------------+--------------+-------------------+
|1             |4             |0.40870486948985857|
|2             |3             |0.12274808949972305|
+--------------+--------------+-------------------+

+---+
|id |
+---+
|1  |
|2  |
|3  |
|4  |
+---+





+---------+---------------+
|component|community_group|
+---------+---------------+
|2        |[2, 3]         |
|1        |[1, 4]         |
+---------+---------------+



Method #2: 50% structural (with cosine similarity) + 50% weighted (with correlation distance)

Correlation similarity measures the similarity in the patterns of values (weights) between two vectors. When applied to adjacency matrices (or flattened versions representing edges), it captures how the weights on edges change in proportion across two networks, rather than focusing on their exact values. This can be especially useful if you want to see if the networks have similar patterns in edge weights, regardless of their absolute differences.



In [42]:
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pickle

# Define schema for similarity results
schema_similarity = StructType([
    StructField("community_id_1", IntegerType(), True),
    StructField("community_id_2", IntegerType(), True),
    StructField("cosine_similarity", DoubleType(), True),
    StructField("correlation_distance", DoubleType(), True),
    StructField("overall_similarity", DoubleType(), True)
])

# Function to calculate correlation distance
def correlation_distance(vec1, vec2):
    if np.std(vec1) == 0 or np.std(vec2) == 0:
        return 1.0  # Max distance if there's no variance
    return 1 - np.corrcoef(vec1, vec2)[0, 1]

# Comparison function for structural and weight-based similarities
def compare_structural_and_weight_only(csr_1, csr_2):
    # Convert CSR matrices to dense vectors
    vec1 = csr_1.toarray().flatten()
    vec2 = csr_2.toarray().flatten()
    binary_1 = (csr_1.toarray() > 0).astype(int).flatten()  # Convert all non-zero entries to 1
    binary_2 = (csr_2.toarray() > 0).astype(int).flatten()

    # Cosine similarity for structural similarity
    cosine_sim = cosine_similarity([binary_1], [binary_2])[0, 0]

    # Weight-only distances
    correlation = correlation_distance(vec1, vec2)

    return cosine_sim, correlation

# Define the Pandas UDF for similarity calculations
@pandas_udf(schema_similarity, PandasUDFType.GROUPED_MAP)
def compute_similarity(df):
    csr_1 = pickle.loads(df['csr_matrix_1'].iloc[0])
    csr_2 = pickle.loads(df['csr_matrix_2'].iloc[0])

    # Calculate cosine similarity and correlation distance
    cosine_sim, correlation = compare_structural_and_weight_only(csr_1, csr_2)

    # Calculate overall similarity directly without additional adjustment
    overall_similarity = 0.5 * cosine_sim + 0.5 * correlation  # Combine for overall similarity

    return pd.DataFrame({
        "community_id_1": [df['community_id_1'].iloc[0]],
        "community_id_2": [df['community_id_2'].iloc[0]],
        "cosine_similarity": [cosine_sim],
        "correlation_distance": [correlation],
        "overall_similarity": [overall_similarity]
    })

# Cross join to get all pairs of communities for comparison
cross_joined = padded_result.alias("df1").crossJoin(padded_result.alias("df2")) \
    .filter(F.col("df1.community_id") < F.col("df2.community_id"))

# Select relevant columns for the UDF
cross_joined = cross_joined.select(
    F.col("df1.community_id").alias("community_id_1"),
    F.col("df2.community_id").alias("community_id_2"),
    F.col("df1.csr_matrix").alias("csr_matrix_1"),
    F.col("df2.csr_matrix").alias("csr_matrix_2")
)

# Apply the similarity calculations
similarities = cross_joined.groupBy("community_id_1", "community_id_2").apply(compute_similarity)

# Show the similarity results including overall similarity
similarities.select("community_id_1", "community_id_2", "cosine_similarity", "correlation_distance", "overall_similarity").show(truncate=False)


+--------------+--------------+------------------+--------------------+------------------+
|community_id_1|community_id_2|cosine_similarity |correlation_distance|overall_similarity|
+--------------+--------------+------------------+--------------------+------------------+
|1             |2             |0.408248290463863 |0.7940497676773866  |0.6011490290706247|
|1             |3             |0.816496580927726 |0.2103743489338583  |0.5134354649307922|
|1             |4             |0.816496580927726 |0.5041289831766382  |0.6603127820521821|
|2             |3             |0.3333333333333334|0.7800127120441522  |0.5566730226887429|
|2             |4             |0.3333333333333334|0.9808213778032979  |0.6570773555683157|
|3             |4             |0.6666666666666669|0.6528369560371887  |0.6597518113519278|
+--------------+--------------+------------------+--------------------+------------------+



In [44]:
from pyspark.sql import functions as F
from graphframes import GraphFrame

# Set the overall similarity threshold
similarity_threshold = 0.66

# Filter pairs with an overall similarity above the threshold
similar_pairs = similarities.filter(F.col("overall_similarity") >= similarity_threshold)

# Create vertices (unique community IDs) and edges (pairs with similarity above threshold)
vertices = similar_pairs.select("community_id_1").union(similar_pairs.select("community_id_2")).distinct() \
    .withColumnRenamed("community_id_1", "id")

edges = similar_pairs.select(
    F.col("community_id_1").alias("src"),
    F.col("community_id_2").alias("dst")
)

# Build the GraphFrame for community grouping
g = GraphFrame(vertices, edges)

# Find connected components (clusters of communities)
connected_components = g.connectedComponents()

# Group communities by connected component (cluster)
grouped_communities = connected_components.groupBy("component").agg(F.collect_list("id").alias("community_group"))

# Show the clustered communities based on the similarity threshold
print("\nGrouped Communities Based on Similarity Threshold:")
grouped_communities.show(truncate=False)



Grouped Communities Based on Similarity Threshold:
+---------+---------------+
|component|community_group|
+---------+---------------+
|1        |[1, 4]         |
+---------+---------------+



In [7]:
community_members.show(truncate=False)

+------------+---------------------------------------------------------------------------+
|community_id|members                                                                    |
+------------+---------------------------------------------------------------------------+
|1           |[{2, 3, 000500, 300.0}, {1, 2, 000200, 120.0}]                             |
|2           |[{6, 7, 000800, 480.0}, {5, 6, 001500, 900.0}, {4, 5, 000600, 360.0}]      |
|3           |[{9, 10, 000500, 300.0}, {8, 9, 000300, 180.0}, {10, 11, 000400, 240.0}]   |
|4           |[{13, 14, 000500, 300.0}, {12, 13, 000200, 120.0}, {12, 14, 000800, 480.0}]|
+------------+---------------------------------------------------------------------------+

