<a href="https://colab.research.google.com/github/BarGinger/DIS-Assignment/blob/main/Src/DIS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [64]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored on Drive can be opened through ordinary paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [65]:
!pip install neo4j
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
!pip install graphframes
import csv
from neo4j import GraphDatabase
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, collect_list, countDistinct, struct
from pyspark.sql.types import StringType
from datetime import datetime
from graphframes import GraphFrame
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

openjdk-8-jdk-headless is already the newest version (8u422-b05-1~22.04).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [77]:
# Build (or reuse) the Spark session for the phone-call community analysis.
# The settings are kept in one mapping so they can be read and tuned at a glance;
# they are applied in the order listed.
spark_settings = {
    "spark.jars.packages": "graphframes:graphframes:0.8.2-spark3.1-s_2.12",
    "spark.executor.memory": "4g",
    "spark.driver.memory": "4g",
    "spark.default.parallelism": "16",
    "spark.sql.shuffle.partitions": "16",
}
session_builder = SparkSession.builder.appName("PhoneCallsCommunityDetection")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

# Directory where Spark writes checkpoints
# (presumably required by the connected-components step further down -- confirm).
spark.sparkContext.setCheckpointDir("/tmp/spark-checkpoints")

In [78]:
# Call records on the mounted Drive
# (repository-relative alternative: '../../Data/adjusted_phone_calls.csv').
file_path = '/content/drive/MyDrive/DIS/adjusted_phone_calls.csv'

# The first row holds the column names; column types are inferred from the data.
csv_options = {"header": True, "inferSchema": True}
df = spark.read.csv(file_path, **csv_options)

# Preview up to 20 rows, truncating cell values to 30 characters.
df.show(20, 30)

+-------+-------+----------+----------+
|Client1|Client2|Start_Time|  End_Time|
+-------+-------+----------+----------+
|      1|      2|2408191800|2408202000|
|      2|      3|2411171605|2411171640|
|      4|      5|2406291350|2406291448|
|      5|      6|2411070022|2411070319|
|      6|      7|2408201805|2408202005|
|      8|      9|2404230735|2404231008|
|     10|     11|2402220756|2402220825|
|     11|     12|2410032109|2410032324|
|     13|     14|2402172235|2402180007|
|     14|     15|2412161605|2412161713|
+-------+-------+----------+----------+



In [79]:
def convert_to_datetime(yyMMddHHMM):
    """Parse a ``yyMMddHHMM`` timestamp into a ``datetime``.

    Args:
        yyMMddHHMM: The timestamp as an int or str, e.g. ``2408191800`` for
            2024-08-19 18:00.

    Returns:
        datetime: The parsed (naive) date and time.

    Raises:
        ValueError: If the value does not match the ``%y%m%d%H%M`` format.
    """
    # The CSV is loaded with inferSchema=True, so timestamps arrive as integers
    # and any leading zero (years 2000-2009) is lost. strptime accepts
    # non-padded fields, so a 9-digit value would be split at the wrong places
    # and parse to a wrong date without raising. Restore the fixed 10-digit width.
    stamp = str(yyMMddHHMM).zfill(10)
    return datetime.strptime(stamp, '%y%m%d%H%M')

In [80]:
def convert_duration_to_DDHHMM(start_time, end_time):
    """Format the time elapsed between two ``yyMMddHHMM`` stamps as ``DDHHMM``.

    Args:
        start_time: Start of the call, in the form accepted by
            ``convert_to_datetime``.
        end_time: End of the call, in the same form.

    Returns:
        str: Days, hours and minutes of the difference, each zero-padded to at
        least two digits and concatenated (e.g. ``'010200'``).
    """
    began = convert_to_datetime(start_time)
    finished = convert_to_datetime(end_time)
    elapsed = finished - began

    # timedelta keeps whole days separately; .seconds is the remainder within a day.
    whole_hours = elapsed.seconds // 3600
    whole_minutes = (elapsed.seconds % 3600) // 60
    return '{:02d}{:02d}{:02d}'.format(elapsed.days, whole_hours, whole_minutes)

In [81]:
# Wrap the Python duration helper as a Spark UDF that returns a string.
convert_duration_udf = udf(convert_duration_to_DDHHMM, returnType=StringType())

# Attach the call duration (DDHHMM) to every call record.
call_duration = convert_duration_udf(df['Start_Time'], df['End_Time'])
df = df.withColumn('duration_DDHHMM', call_duration)

# Graph inputs for GraphFrames: one vertex per distinct client (caller or callee),
# and one edge per call record.
callers = df.select(col("Client1").alias("id"))
callees = df.select(col("Client2").alias("id"))
vertices = callers.union(callees).distinct()
edges = df.select(col("Client1").alias("src"), col("Client2").alias("dst"))

# Mark both inputs for caching; they are reused when the graph is built.
vertices.cache()
edges.cache()

DataFrame[src: int, dst: int]

In [82]:
# Assemble the call graph from the vertex and edge DataFrames.
g = GraphFrame(v=vertices, e=edges)

# Label every vertex with its connected component; each component is
# treated as one community in the cells that follow.
result = g.connectedComponents()

In [83]:
# Attach a community (connected-component) id to every call record by looking
# up the component of the caller.
df_with_communities = (
    df.join(result, df['Client1'] == result['id'], 'inner')
    .withColumnRenamed('component', 'community_id')
)

# Every distinct (community, client) pair, counting callers and callees alike.
# Built once and reused for both the size and the member-list aggregations.
# union() matches columns by position, so the combined column keeps the name
# "Client1" even though it also holds Client2 values.
community_clients = (
    df_with_communities.select("community_id", "Client1")
    .union(df_with_communities.select("community_id", "Client2"))
    .distinct()
)

# Number of unique clients per community.
community_sizes = community_clients.groupBy("community_id").agg(
    countDistinct("Client1").alias("community_size")
)

# Merge the community sizes into the main DataFrame.
df_final = df_with_communities.join(community_sizes, 'community_id')

# List of member clients per community.
community_members = community_clients.groupBy("community_id").agg(
    collect_list("Client1").alias("members")
)

# Show the final DataFrame with community IDs, duration, and community sizes.
df_final.select('Client1', 'Client2', 'duration_DDHHMM', 'community_id', 'community_size').show()

# Show the member list of each community (first 20 rows, 30 characters per cell).
community_members.show(n=20, truncate=30)

# Collect the calls of each community to the driver.
# Each call is gathered as ONE struct: collect_list skips nulls, so collecting
# the three columns independently could yield lists of different lengths and
# make the zip() below pair values from different calls. Projecting the struct
# fields afterwards keeps the three lists aligned and the row layout
# (community_id, Client1, Client2, durations) unchanged.
grouped_data = (
    df_final.groupBy("community_id")
    .agg(collect_list(struct("Client1", "Client2", "duration_DDHHMM")).alias("calls"))
    .select(
        "community_id",
        col("calls.Client1").alias("Client1"),
        col("calls.Client2").alias("Client2"),
        col("calls.duration_DDHHMM").alias("durations"),
    )
    .collect()
)

# One list per community, holding a (client1, client2, duration) tuple per call.
communities = [
    list(zip(row['Client1'], row['Client2'], row['durations']))
    for row in grouped_data
]

# Print the formatted output for verification. The printed number is a running
# index over the collected groups, not the community_id.
for idx, community in enumerate(communities, start=1):
    print(f"Community {idx}: {community}")

+-------+-------+---------------+------------+--------------+
|Client1|Client2|duration_DDHHMM|community_id|community_size|
+-------+-------+---------------+------------+--------------+
|      4|      5|         000058|           4|             4|
|      6|      7|         000200|           4|             4|
|      5|      6|         000257|           4|             4|
|     14|     15|         000108|          13|             3|
|     13|     14|         000132|          13|             3|
|      8|      9|         000233|           8|             2|
|     11|     12|         000215|          10|             3|
|     10|     11|         000029|          10|             3|
|      2|      3|         000035|           1|             3|
|      1|      2|         010200|           1|             3|
+-------+-------+---------------+------------+--------------+

+------------+------------+
|community_id|     members|
+------------+------------+
|           4|[5, 4, 6, 7]|
|          13|[14, 