<a href="https://colab.research.google.com/github/BarGinger/DIS-Assignment/blob/main/Src/dis_notebook_02_11_24.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
!pip install graphframes
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    udf,
    row_number,
    countDistinct,
    collect_list,
    struct,
    count,
    sum,
    avg,
    expr,
    percentile_approx,
    max as spark_max,
    explode,
    round
)
from pyspark.sql.types import StringType, IntegerType, BinaryType, DoubleType, ArrayType, StructType, StructField
from pyspark.sql import Window
from datetime import datetime
from graphframes import GraphFrame
from scipy.sparse import csr_matrix, vstack, hstack
import numpy as np
import pandas as pd
import pickle
import base64
from sparkmeasure import StageMetrics # for resources monitoring
from functools import wraps

In [None]:
# Monitor CPU, Memory and running time
def track_stage(stage_name):
    """Build a decorator that reports sparkMeasure stage metrics for a function.

    Each call of the decorated function prints a start banner, brackets the
    call with ``stagemetrics.begin()`` / ``stagemetrics.end()``, shows the
    collected metrics DataFrame and prints a completion banner.

    ``stagemetrics`` is resolved as a module-level global when the decorated
    function is *called*, so it only has to exist by then.

    Args:
        stage_name: Label printed in the start/completion banners.

    Returns:
        A decorator that preserves the wrapped function's metadata and
        return value.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            print(f"Starting {stage_name}")
            stagemetrics.begin()  # Begin collecting metrics for this stage
            try:
                result = func(*args, **kwargs)  # Run the actual function
            finally:
                # Close the measurement window even when the stage fails, so
                # a later stage does not start from a dangling begin().
                stagemetrics.end()

            # The Python StageMetrics wrapper names this method
            # create_stagemetrics_DF (createStageMetricsDF is the JVM name).
            metrics = stagemetrics.create_stagemetrics_DF()
            metrics.show(truncate=False)
            print(f"Completed {stage_name}\n")
            return result
        return wrapper
    return decorator

In [None]:
# Utils functions

# Convert YYMMDDHHMM to a proper datetime object
def convert_to_datetime(yyMMddHHMM):
    """Parse a ``YYMMDDHHMM`` timestamp into a naive ``datetime``.

    Args:
        yyMMddHHMM: The timestamp as a string or an integer. Integers are
            accepted because a numeric column drops leading zeros.

    Returns:
        The corresponding ``datetime`` (two-digit year handled by ``%y``).

    Raises:
        ValueError: If the value does not match the ``%y%m%d%H%M`` layout.
    """
    # Restore the fixed 10-digit width first: without it a value such as
    # 501011200 (2005-01-01 12:00) is silently parsed as a different date.
    return datetime.strptime(str(yyMMddHHMM).zfill(10), '%y%m%d%H%M')

# Define UDF for calculating duration in minutes
def calculate_duration_minutes(start_time, end_time):
    """Return the time elapsed between two ``YYMMDDHHMM`` stamps, in minutes.

    Both arguments are parsed with ``convert_to_datetime``. The result is a
    float and is negative when ``end_time`` precedes ``start_time``.
    """
    began = convert_to_datetime(start_time)
    elapsed = convert_to_datetime(end_time) - began
    return elapsed.total_seconds() / 60

# Define UDF for calculating duration in DDHHMM format
def calculate_duration_string(start_time, end_time):
    """Format the time between two ``YYMMDDHHMM`` stamps as ``DDHHMM``.

    Both arguments are parsed with ``convert_to_datetime``. Days, hours and
    minutes are each zero-padded to at least two characters; seconds are
    discarded.
    """
    began = convert_to_datetime(start_time)
    elapsed = convert_to_datetime(end_time) - began

    # timedelta keeps whole days separately; .seconds is the in-day remainder.
    whole_minutes = elapsed.seconds // 60
    hours, minutes = divmod(whole_minutes, 60)
    return f'{elapsed.days:02d}{hours:02d}{minutes:02d}'

# Print a CSR matrix as a (row, column, value) table
def pretty_print_csr_matrix(csr_matrix_result):
  """Print the non-zero entries of a sparse matrix as a Row/Col/Value table.

  Args:
      csr_matrix_result: A SciPy sparse matrix (anything offering ``tocoo()``).
  """
  # Take rows, columns and values from the same COO view so the three arrays
  # stay aligned. Mixing ``nonzero()`` with ``.data`` breaks as soon as the
  # matrix stores an explicit zero (e.g. a 0-minute call), because
  # ``nonzero()`` drops it while ``.data`` keeps it.
  coo = csr_matrix_result.tocoo()
  keep = coo.data != 0

  df = pd.DataFrame({
      'Row': coo.row[keep],
      'Col': coo.col[keep],
      'Value': coo.data[keep]
  })

  print(df)

def create_csr_matrix_from_edges_with_spark(members_df):
    """
    Creates a single CSR matrix covering every edge in a Spark DataFrame.

    All edges are collected to the driver, so this is meant for diagnostics on
    data that fits in driver memory.

    Args:
        members_df: Spark DataFrame with a 'members' array column whose
            elements expose 'Client1', 'Client2' and 'duration_minutes'.

    Returns:
        A square CSR matrix with one row/column per distinct client; entry
        (i, j) holds the duration of the call from client i to client j
        (durations of repeated pairs are summed by the CSR constructor).
    """

    # Collect each edge exactly once, as one row. Collecting the three columns
    # in separate jobs relies on Spark returning them in the same order each
    # time, which is not guaranteed, and recomputes the frame for every job.
    edges = (
        members_df
        .select(explode("members").alias("member"))
        .select(
            col("member.Client1").alias("Client1"),
            col("member.Client2").alias("Client2"),
            col("member.duration_minutes").alias("duration_minutes"),
        )
        .collect()
    )

    # Assign matrix indices in first-seen order while walking the edges.
    client_to_index = {}
    rows, cols, data = [], [], []
    for client1, client2, duration in edges:
        rows.append(client_to_index.setdefault(client1, len(client_to_index)))
        cols.append(client_to_index.setdefault(client2, len(client_to_index)))
        data.append(duration)

    num_clients = len(client_to_index)
    return csr_matrix((data, (rows, cols)), shape=(num_clients, num_clients))

# create csr matrix from given members list
def create_csr_matrix(members, use_weights=False):
    """Build and serialize the adjacency matrix of one community.

    Args:
        members: Sequence of edge records supporting ``record['Client1']``,
            ``record['Client2']`` and ``record['duration_minutes']``.
        use_weights: When true, each edge is weighted by its duration in
            minutes; otherwise each edge contributes 1.

    Returns:
        The pickled CSR matrix, base64-encoded as a UTF-8 string. Entries for
        repeated (Client1, Client2) pairs are summed by the CSR constructor.
    """
    # Index clients in a sorted, reproducible order. Iterating a plain set
    # depends on string hashing, which differs between Python processes, so
    # the same community could yield differently permuted matrices (and
    # therefore different similarity scores) from run to run.
    # key=str keeps this working if an id is missing or not a string.
    clients = sorted(
        {member[side] for member in members for side in ('Client1', 'Client2')},
        key=str,
    )
    client_index = {client: idx for idx, client in enumerate(clients)}

    row_indices = [client_index[member['Client1']] for member in members]
    col_indices = [client_index[member['Client2']] for member in members]
    if use_weights:
        # Use duration in minutes as the weight of the edge
        data = [float(member['duration_minutes']) for member in members]
    else:
        # Use 1 for unweighted similarity
        data = [1] * len(row_indices)

    num_clients = len(clients)
    csr = csr_matrix((data, (row_indices, col_indices)), shape=(num_clients, num_clients))

    # Serialize the CSR matrix so it can travel through a Spark string column
    return base64.b64encode(pickle.dumps(csr)).decode('utf-8')

# compare given two csr matrices (each relating to a community) to get similarity score
def compare_weighted_structural_similarity(csr_matrix_1, csr_matrix_2):
    """Score two serialized community matrices with cosine similarity.

    Args:
        csr_matrix_1: Base64-encoded pickle of the first sparse matrix.
        csr_matrix_2: Base64-encoded pickle of the second sparse matrix.

    Returns:
        The sum of the element-wise product divided by the product of the two
        matrix norms, as a float; 0.0 when either matrix has zero norm. The
        smaller matrix is zero-padded at the bottom/right before comparing.
    """
    def _decode(blob):
        return pickle.loads(base64.b64decode(blob))

    def _grow(matrix, n_rows, n_cols):
        # Append empty rows first, then empty columns, up to the target shape.
        extra_rows = n_rows - matrix.shape[0]
        if extra_rows > 0:
            matrix = vstack([matrix, csr_matrix((extra_rows, matrix.shape[1]))])
        extra_cols = n_cols - matrix.shape[1]
        if extra_cols > 0:
            matrix = hstack([matrix, csr_matrix((matrix.shape[0], extra_cols))])
        return matrix

    left = _decode(csr_matrix_1)
    right = _decode(csr_matrix_2)

    # Both operands are aligned to the larger extent along each axis.
    target_rows = max(left.shape[0], right.shape[0])
    target_cols = max(left.shape[1], right.shape[1])
    left = _grow(left, target_rows, target_cols)
    right = _grow(right, target_rows, target_cols)

    dot_product = left.multiply(right).sum()
    left_norm = np.sqrt(left.multiply(left).sum())
    right_norm = np.sqrt(right.multiply(right).sum())

    if left_norm == 0 or right_norm == 0:
        return float(0)
    return float(dot_product / (left_norm * right_norm))

In [None]:
# Pipeline stages: read the calls dataset, detect communities, build CSR matrices, compare communities
@track_stage("Stage 1: Reading the calls dataset")
def read_csv_to_dataframe(file_path='toy_dataset.csv'):
  """
  Load the calls CSV into a Spark DataFrame and add call-duration columns.

  The file is read with a header row and schema inference. Two columns are
  derived from 'Start_Time' and 'End_Time' through Python UDFs:
  'duration_minutes' (double) and 'duration_DDHHMM' (string).

  Parameters:
    -----------
    file_path : str
        Path of the CSV file to read.

    Returns:
    --------
    DataFrame
        The calls read from the file plus the two duration columns.
  """
  raw_calls = spark.read.csv(file_path, header=True, inferSchema=True)

  # Wrap the plain-Python duration helpers so Spark can apply them per row
  minutes_udf = udf(calculate_duration_minutes, DoubleType())
  ddhhmm_udf = udf(calculate_duration_string, StringType())
  start_col, end_col = col('Start_Time'), col('End_Time')

  calls_with_duration = (
      raw_calls
      .withColumn('duration_minutes', minutes_udf(start_col, end_col))
      .withColumn('duration_DDHHMM', ddhhmm_udf(start_col, end_col))
  )

  print("The following dataframe has been read from the CSV file:")
  calls_with_duration.show()
  return calls_with_duration

@track_stage("Stage 2: Preprocessing and creating the graph")
def create_graph_from_dataframe(df_dataset):
  """
  Detect communities in the calls graph and summarise them.

  Clients become vertices and calls become edges of a GraphFrame; every
  connected component is treated as one community and given a sequential
  'community_id'. Results are shown and written as CSV to
  'community_analysis_results', 'community_members_results' and
  'community_statistics_results' (overwriting previous output).

  Parameters:
    -----------
    df_dataset : DataFrame
        Calls with 'Client1', 'Client2', 'duration_minutes' and
        'duration_DDHHMM' columns.

    Returns:
    --------
    tuple of (DataFrame, DataFrame, DataFrame)
        df_final : the calls with 'community_id' and 'community_size' added.
        community_members : one row per community with a 'members' array of
            (Client1, Client2, duration_DDHHMM, duration_minutes) structs.
        community_stats : per-community call counts and duration statistics.
  """
  # Per-call columns reused by the displays and exports below
  call_columns = ['Client1', 'Client2', 'duration_DDHHMM', 'duration_minutes']

  # Create Graph using GraphFrames for community detection
  vertices = df_dataset.selectExpr("Client1 as id").union(df_dataset.selectExpr("Client2 as id")).distinct()
  edges = df_dataset.selectExpr("Client1 as src", "Client2 as dst", "duration_minutes as weight")

  # Cache vertices and edges
  vertices.cache()
  edges.cache()

  # Create a GraphFrame
  g = GraphFrame(vertices, edges)

  # Find connected components (communities) using GraphFrames
  connected_components_result = g.connectedComponents()

  # Create a mapping from original community IDs to sequential ones
  community_mapping = connected_components_result.select("component").distinct() \
      .orderBy("component") \
      .withColumn("new_id", row_number().over(Window.orderBy("component"))) \
      .cache()

  # Join the result (community IDs) with the original dataframe and map to new sequential IDs.
  # The first join must use connected_components_result (the previous code
  # referenced an undefined name here and raised NameError).
  df_with_communities = df_dataset.join(connected_components_result, df_dataset['Client1'] == connected_components_result['id'], 'inner') \
      .join(community_mapping, connected_components_result['component'] == community_mapping['component'], 'inner') \
      .drop(connected_components_result['id']) \
      .drop(community_mapping['component']) \
      .withColumnRenamed('new_id', 'community_id')

  # Calculate the number of unique clients (community size) per community.
  # union() matches columns by position, so both sides end up under 'Client1'.
  community_sizes = df_with_communities.select("community_id", "Client1").union(df_with_communities.select("community_id", "Client2")) \
      .distinct() \
      .groupBy("community_id").agg(countDistinct("Client1").alias("community_size"))

  # Merge the community sizes into the main DataFrame
  df_final = df_with_communities.join(community_sizes, 'community_id')

  # Get list of tuples for each community member by considering both Client1 and Client2
  community_members = df_final.select("community_id", *call_columns) \
      .distinct() \
      .groupBy("community_id") \
      .agg(collect_list(struct(*[col(name) for name in call_columns])).alias("members")) \
      .orderBy("community_id")

  # Calls annotated with their community, ordered for display and export
  calls_by_community = df_final.select(*call_columns, 'community_id', 'community_size') \
      .orderBy("community_id")

  # Show the final DataFrame with community IDs, duration, and community sizes
  print("\nFinal DataFrame with Sequential Community IDs:")
  calls_by_community.show()

  # Show the list of community members as tuples
  print("\nCommunity Members with Sequential IDs:")
  community_members.show(truncate=False)

  # Save results to CSV files
  # Save the main analysis results
  calls_by_community.write.mode("overwrite").csv("community_analysis_results")

  # Save community members in a flattened format
  df_final.select('community_id', *call_columns) \
      .distinct() \
      .orderBy("community_id") \
      .write.mode("overwrite").csv("community_members_results")

  # Additional community statistics.
  # NOTE(review): countDistinct over two columns counts distinct
  # (Client1, Client2) pairs, so 'unique_members' is really the number of
  # distinct caller/callee pairs - confirm whether 'community_size' was meant.
  community_stats = df_final.groupBy('community_id') \
      .agg(
          countDistinct('Client1', 'Client2').alias('unique_members'),
          count('*').alias('total_calls'),
          sum('duration_minutes').alias('total_duration_minutes'),
          avg('duration_minutes').alias('avg_call_duration'),
          percentile_approx('duration_minutes', 0.25).alias('duration_25th_percentile'),
          percentile_approx('duration_minutes', 0.5).alias('median_call_duration'),
          percentile_approx('duration_minutes', 0.75).alias('duration_75th_percentile')
      ) \
      .orderBy('community_id')

  community_stats.write.mode("overwrite").csv("community_statistics_results")

  print("This is the community stats:")
  community_stats.show(truncate=False)
  return df_final, community_members, community_stats

# Create CSR adjacency matrices for each community and serialize them
@track_stage("Stage 3: Creating CSR matrices")
def format_members_to_csr_matrix(community_members):
  """
  Create CSR adjacency matrices for each community and serialize them.

  Parameters:
    community_members: DataFrame
        One row per community with 'community_id' and a 'members' array of
        (Client1, Client2, duration_DDHHMM, duration_minutes) structs.

  Returns:
    DataFrame
        The input plus 'members_dict', 'csr_matrix_unweighted' and
        'csr_matrix_weighted' (base64-encoded pickled CSR matrices).
  """
  # Convert the collected list of Row objects to a list of dictionaries before passing to UDF
  schema = StructType([
      StructField("Client1", StringType(), True),
      StructField("Client2", StringType(), True),
      StructField("duration_DDHHMM", StringType(), True),
      StructField("duration_minutes", DoubleType(), True)
  ])
  convert_members_udf = udf(lambda members: [member.asDict() for member in members], ArrayType(schema))
  with_matrices = community_members.withColumn("members_dict", convert_members_udf(col("members")))

  # Register UDF to create and serialize CSR matrices (both unweighted and weighted)
  create_csr_unweighted_udf = udf(lambda members: create_csr_matrix(members, use_weights=False), StringType())
  create_csr_weighted_udf = udf(lambda members: create_csr_matrix(members, use_weights=True), StringType())

  # Add CSR matrix representations (unweighted and weighted) to each community
  with_matrices = with_matrices.withColumn("csr_matrix_unweighted", create_csr_unweighted_udf(col("members_dict")))
  with_matrices = with_matrices.withColumn("csr_matrix_weighted", create_csr_weighted_udf(col("members_dict")))

  with_matrices.show(truncate=False)

  # Build one driver-side matrix over all edges purely for the diagnostics
  # below. Previously these prints read a name that is not defined inside this
  # function, so the stage failed before returning.
  csr_matrix_result = create_csr_matrix_from_edges_with_spark(community_members)

  # Print some information about the matrix
  print(f"CSR Matrix shape: {csr_matrix_result.shape}")
  print(f"Number of non-zero elements: {csr_matrix_result.nnz}")
  pretty_print_csr_matrix(csr_matrix_result)

  return with_matrices

@track_stage("Stage 4: Calculate similarities between communities")
def calculate_similarities(community_members):
  """
  Compare the CSR matrices of every pair of communities.

  Each unordered pair (a.community_id < b.community_id) gets an unweighted
  score, a weighted score and their 50/50 average. The scores are shown
  rounded to two decimals and written, unrounded, as CSV to 'groups_found'.

  Parameters:
    community_members: DataFrame
        One row per community with 'community_id', 'csr_matrix_unweighted'
        and 'csr_matrix_weighted' columns.

  Returns:
    DataFrame
        The cross-joined frame (all columns of both sides) plus the three
        score columns.
  """
  score_columns = [
      "unweighted_similarity_score",
      "weighted_similarity_score",
      "combined_similarity_score",
  ]

  # One UDF serves both variants: only the matrices passed in differ
  similarity_udf = udf(compare_weighted_structural_similarity, DoubleType())

  # Cross join to compare each pair of communities and calculate both similarities
  cross_joined = community_members.alias("a").crossJoin(community_members.alias("b")) \
      .filter(col("a.community_id") < col("b.community_id")) \
      .withColumn("unweighted_similarity_score", similarity_udf(col("a.csr_matrix_unweighted"), col("b.csr_matrix_unweighted"))) \
      .withColumn("weighted_similarity_score", similarity_udf(col("a.csr_matrix_weighted"), col("b.csr_matrix_weighted")))

  # Add combined similarity score (50/50 importance)
  cross_joined = cross_joined.withColumn("combined_similarity_score",
                                        0.5 * col("unweighted_similarity_score") + 0.5 * col("weighted_similarity_score"))

  # Flat pair/score table: unambiguous column names and scalar types only
  scores = cross_joined.select(col("a.community_id").alias("community_id_1"),
                               col("b.community_id").alias("community_id_2"),
                               *[col(name) for name in score_columns])

  # Show the similarity scores between communities
  scores.select("community_id_1",
                "community_id_2",
                *[round(col(name), 2).alias(name) for name in score_columns]) \
        .orderBy(["community_id_1", "community_id_2"]) \
        .show(truncate=False)

  # Write the flat table rather than cross_joined: the CSV sink cannot store
  # the array<struct> 'members' columns and rejects the duplicated column
  # names that the self cross join produces.
  scores.write.mode("overwrite").csv("groups_found")

  return cross_joined

In [None]:
# Initialize Spark session
# spark.jars.packages is a comma-separated list of Maven coordinates. Besides
# GraphFrames it must include the sparkMeasure JVM package, because the Python
# StageMetrics class created below is only a thin wrapper around it.
spark = SparkSession.builder \
    .appName("PhoneCallsCommunityDetection") \
    .master("local[*]") \
    .config("spark.jars.packages",
            "graphframes:graphframes:0.8.2-spark3.1-s_2.12,"
            "ch.cern.sparkmeasure:spark-measure_2.12:0.24") \
    .config("spark.executor.memory", "20G") \
    .config("spark.driver.memory", "50G") \
    .config("spark.executor.memoryOverhead", "1G") \
    .config("spark.default.parallelism", "100") \
    .config("spark.sql.shuffle.partitions", "10") \
    .config("spark.driver.maxResultSize", "2G") \
    .getOrCreate()

# Initialize StageMetrics (used by the track_stage decorator)
stagemetrics = StageMetrics(spark)

# Optional: Set logging level to reduce verbosity
spark.sparkContext.setLogLevel("WARN")

# Set a checkpoint directory for Spark
spark.sparkContext.setCheckpointDir("/tmp/spark-checkpoints")

In [None]:
# Step 1 - read the dataset
dataset_file_path = 'toy_dataset.csv' # set this variable to the desired dataset
df_dataset = read_csv_to_dataframe(dataset_file_path)

# Step 2 - preprocess (create the graph and find the communities)
df_final, community_members, community_stats = create_graph_from_dataframe(df_dataset)

# Step 3 - create the CSR matrices for each community
csr_matrix_result = format_members_to_csr_matrix(community_members)

# Step 4 - calculate the similarities between communities
cross_joined = calculate_similarities(csr_matrix_result)