<a href="https://colab.research.google.com/github/BarGinger/DIS-Assignment/blob/main/Src/DIS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
# Mount Google Drive at /content/drive; the input CSV path used later in the
# notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
!pip install neo4j
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
!pip install graphframes
import csv
from neo4j import GraphDatabase
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, collect_list, countDistinct, struct
from pyspark.sql.types import StringType
from datetime import datetime
from graphframes import GraphFrame
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

Collecting neo4j
  Downloading neo4j-5.25.0-py3-none-any.whl.metadata (5.7 kB)
Downloading neo4j-5.25.0-py3-none-any.whl (296 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.6/296.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neo4j
Successfully installed neo4j-5.25.0
The following additional packages will be installed:
  libxtst6 openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra fonts-nanum fonts-ipafont-gothic
  fonts-ipafont-mincho fonts-wqy-microhei fonts-wqy-zenhei fonts-indic
The following NEW packages will be installed:
  libxtst6 openjdk-8-jdk-headless openjdk-8-jre-headless
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 39.6 MB of archives.
After this operation, 144 MB of additional disk space will be used.
Selecting previously unselected package libxtst6:amd64.
(Reading database ... 123622 files and directories currently installed.

In [2]:
# Report the installed version of the GraphFrames Python distribution.
# importlib.metadata is the standard-library replacement for the deprecated
# pkg_resources API, so no setuptools import is needed here.
from importlib.metadata import version as _distribution_version

version = _distribution_version("graphframes")
print(f"GraphFrames version: {version}")

GraphFrames version: 0.6


In [3]:
# Neo4j Aura connection settings.
# Secrets must not be stored in the notebook: the password is read from the
# NEO4J_PASSWORD environment variable, falling back to an interactive prompt.
# NOTE: a password previously committed in this cell should be rotated.
from getpass import getpass

uri = os.environ.get("NEO4J_URI", "neo4j+s://1c8010b1.databases.neo4j.io")
user = os.environ.get("NEO4J_USER", "neo4j")  # "neo4j" is the default username for Aura
password = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")
driver = GraphDatabase.driver(uri, auth=(user, password))

In [4]:
# Build (or reuse) the Spark session, pulling in the GraphFrames JVM package
# and fixing memory / parallelism settings.
spark = (
    SparkSession.builder
    .appName("PhoneCallsCommunityDetection")
    .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.1-s_2.12")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.default.parallelism", "8")
    .config("spark.sql.shuffle.partitions", "8")
    .getOrCreate()
)

# Checkpoint directory for Spark.
# NOTE(review): presumably needed by the connectedComponents() call made later
# in the notebook — confirm against the GraphFrames version in use.
spark.sparkContext.setCheckpointDir("/tmp/spark-checkpoints")

In [5]:
def clear_database():
    """Delete every node in the Neo4j database together with its relationships.

    Runs a single ``MATCH (n) DETACH DELETE n`` statement through a session
    opened on the module-level ``driver``.
    """
    with driver.session() as db_session:
        wipe_query = """
        MATCH (n)
        DETACH DELETE n
        """
        db_session.run(wipe_query)

In [6]:
# Function to load CSV data into Neo4j
def load_csv_to_neo4j(csv_file_path):
    """Load phone-call rows from a CSV file into Neo4j.

    Every CSV row is written in its own write transaction through
    ``create_nodes_and_relationships``.

    Args:
        csv_file_path: Path of a CSV file whose header row contains the
            columns ``Client1``, ``Client2``, ``Start_Time`` and ``End_Time``.

    Raises:
        KeyError: If a row lacks one of the expected columns.
    """
    # newline='' lets the csv module do its own newline handling, as its
    # documentation requires for file objects handed to a reader.
    with driver.session() as session, \
            open(csv_file_path, 'r', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            session.execute_write(
                create_nodes_and_relationships,
                row['Client1'],
                row['Client2'],
                row['Start_Time'],
                row['End_Time']
            )

In [7]:
def create_nodes_and_relationships(tx, client1, client2, start_time, end_time):
    """Merge two Client nodes and the CALL relationship between them.

    Args:
        tx: Transaction object exposing ``run(query, **parameters)``.
        client1: Id of the client at the start of the CALL relationship.
        client2: Id of the client at the end of the CALL relationship.
        start_time: Value stored as the relationship's ``start_time``.
        end_time: Value stored as the relationship's ``end_time``.
    """
    parameters = dict(
        client1=client1,
        client2=client2,
        start_time=start_time,
        end_time=end_time,
    )
    # MERGE keeps the write idempotent: existing nodes / identical calls are reused.
    query = """
    MERGE (c1:Client {id: $client1})
    MERGE (c2:Client {id: $client2})
    MERGE (c1)-[r:CALL {
        start_time: $start_time,
        end_time: $end_time
    }]->(c2)
    """
    tx.run(query, **parameters)

In [8]:
def load_data_from_neo4j():
    """Read every CALL relationship from Neo4j into a Spark DataFrame.

    Returns:
        A Spark DataFrame built from one dict per call, with the keys
        ``Client1``, ``Client2``, ``Start_Time`` and ``End_Time``.
    """
    query = """
    MATCH (c1:Client)-[r:CALL]->(c2:Client)
    RETURN c1.id AS Client1, c2.id AS Client2, r.start_time AS Start_Time, r.end_time AS End_Time
    """
    call_rows = []
    with driver.session() as session:
        # Materialise the records while the session is still open.
        for record in session.run(query):
            call_rows.append(record.data())

    # Convert to Spark DataFrame
    return spark.createDataFrame(call_rows)

In [9]:
def convert_to_datetime(yyMMddHHMM):
    """Parse a ``yyMMddHHMM`` timestamp into a ``datetime``.

    Args:
        yyMMddHHMM: Timestamp in year/month/day/hour/minute form (two digits
            each); it is converted with ``str()`` before parsing.

    Returns:
        The corresponding ``datetime`` object.

    Raises:
        ValueError: If the text does not match the ``%y%m%d%H%M`` format.
    """
    timestamp_text = str(yyMMddHHMM)
    return datetime.strptime(timestamp_text, '%y%m%d%H%M')

In [10]:
def convert_duration_to_DDHHMM(start_time, end_time):
    """Return the time between two ``yyMMddHHMM`` timestamps as ``DDHHMM`` text.

    Args:
        start_time: Start timestamp, parsed with ``convert_to_datetime``.
        end_time: End timestamp, parsed with ``convert_to_datetime``.

    Returns:
        A string made of the day, hour and minute components of
        ``end - start``, each formatted with ``02d``.
    """
    began = convert_to_datetime(start_time)
    finished = convert_to_datetime(end_time)
    elapsed = finished - began

    # timedelta keeps whole days separately from the leftover seconds.
    whole_hours = elapsed.seconds // 3600
    leftover_minutes = (elapsed.seconds % 3600) // 60
    return f'{elapsed.days:02d}{whole_hours:02d}{leftover_minutes:02d}'

In [11]:
# Wrap convert_duration_to_DDHHMM as a Spark UDF returning a string column,
# so it can be applied to DataFrame columns.
convert_duration_udf = udf(convert_duration_to_DDHHMM, StringType())

In [12]:
# Function to use GraphFrames to detect communities
def detect_communities(df):
    """Assign every call to a connected-component community.

    Args:
        df: Spark DataFrame of calls with the columns ``Client1``,
            ``Client2``, ``Start_Time`` and ``End_Time``.

    Returns:
        A tuple ``(df_final, community_members)``:
        ``df_final`` holds the call rows with ``duration_DDHHMM``,
        ``community_id`` (the component of ``Client1``) and ``community_size``
        added; ``community_members`` has one row per community with a
        ``members`` list of ``(Client1, Client2, duration_DDHHMM)`` structs.
    """
    # Annotate every call with its duration in DDHHMM format.
    calls = df.withColumn(
        'duration_DDHHMM',
        convert_duration_udf(col('Start_Time'), col('End_Time')),
    )

    # Graph inputs: one vertex per distinct client, one directed edge per call.
    callers = calls.selectExpr("Client1 as id")
    callees = calls.selectExpr("Client2 as id")
    vertices = callers.union(callees).distinct()
    edges = calls.selectExpr("Client1 as src", "Client2 as dst")

    # Cache vertices and edges
    vertices.cache()
    edges.cache()

    # Build the graph and label each vertex with its connected component.
    g = GraphFrame(vertices, edges)
    print(g)
    result = g.connectedComponents()
    print(result)
    # Validate result DataFrame
    result.show(truncate=False)

    # Attach the component of the calling client to every call row.
    caller_matches_vertex = calls['Client1'] == result['id']
    calls_with_community = calls.join(result, caller_matches_vertex, 'inner')
    calls_with_community = calls_with_community.withColumnRenamed('component', 'community_id')

    # Community size = number of distinct clients seen on either end of a call.
    caller_membership = calls_with_community.select("community_id", "Client1")
    callee_membership = calls_with_community.select("community_id", "Client2")
    community_sizes = (
        caller_membership.union(callee_membership)
        .distinct()
        .groupBy("community_id")
        .agg(countDistinct("Client1").alias("community_size"))
    )

    # Merge the community sizes into the main DataFrame
    df_final = calls_with_community.join(community_sizes, 'community_id')

    # One list of (Client1, Client2, duration) structs per community.
    member_struct = struct(col("Client1"), col("Client2"), col("duration_DDHHMM"))
    community_members = (
        df_final.select("community_id", "Client1", "Client2", "duration_DDHHMM")
        .distinct()
        .groupBy("community_id")
        .agg(collect_list(member_struct).alias("members"))
    )

    return df_final, community_members


In [13]:
def update_neo4j_with_communities(df_final):
    """Write each client's community id and size back to its Neo4j node.

    The rows of ``df_final`` are collected to the driver and reduced to one
    assignment per distinct client, so each client is updated in a single
    write transaction instead of once per call row it appears in. When a
    client appears in several rows the last row wins, which matches the final
    state produced by writing every row in order.

    Args:
        df_final: Spark DataFrame with the columns ``Client1``, ``Client2``,
            ``community_id`` and ``community_size``.
    """
    assignments = {}
    for row in df_final.collect():
        community = (row['community_id'], row['community_size'])
        # Both ends of a call receive the community recorded on the row.
        assignments[row['Client1']] = community
        assignments[row['Client2']] = community

    with driver.session() as session:
        for client_id, (community_id, community_size) in assignments.items():
            session.execute_write(
                update_client_community,
                client_id,
                community_id,
                community_size
            )

In [14]:
# Function to update community information in Neo4j nodes
def update_client_community(tx, client_id, community_id, community_size):
    """Set ``communityId`` and ``communitySize`` on one Client node.

    Args:
        tx: Transaction object exposing ``run(query, **parameters)``.
        client_id: Id of the Client node to update.
        community_id: Community identifier; stored as its ``str()`` form.
        community_size: Community size, stored unchanged.
    """
    parameters = {
        'client_id': client_id,
        'community_id': str(community_id),
        'community_size': community_size,
    }
    query = """
    MATCH (c:Client {id: $client_id})
    SET c.communityId = $community_id, c.communitySize = $community_size
    """
    tx.run(query, **parameters)

In [15]:
def output_full_database():
    """Print every CALL relationship with the caller's community information.

    Queries all ``(Client)-[CALL]->(Client)`` pairs ordered by the caller's
    ``communityId`` and prints them as a pipe-separated table. Missing
    start/end times, community ids and community sizes are shown as ``N/A``.
    """
    with driver.session() as session:
        get_full_table_query = """
        MATCH (c1:Client)-[r:CALL]->(c2:Client)
        RETURN c1.id AS client1, c2.id AS client2, r.start_time AS startTime, r.end_time AS endTime,
               c1.communityId AS communityId, c1.communitySize AS communitySize
        ORDER BY communityId
        """
        result = session.run(get_full_table_query)

        print("Client1 | Client2 | Start Time | End Time | Community ID | Community Size")
        print("----------------------------------------------------------------------------")
        for record in result:
            community_size = record['communitySize'] if record['communitySize'] else 'N/A'
            # A caller that has not been assigned a community yet has no
            # communityId; formatting None with a width spec raises TypeError.
            community_id = record['communityId'] if record['communityId'] is not None else 'N/A'
            start_time = record['startTime'] if record['startTime'] else 'N/A'
            end_time = record['endTime'] if record['endTime'] else 'N/A'
            print(f"{record['client1']:7} | {record['client2']:7} | {start_time:10} | {end_time:8} | "
                  f"{community_id:12} | {community_size:14}")

In [16]:
def output_community_members(community_members):
    """Display the per-community member table without truncating cell contents.

    Args:
        community_members: Object exposing ``show(truncate=...)``, such as the
            second DataFrame returned by ``detect_communities``.
    """
    show_options = {'truncate': False}
    community_members.show(**show_options)


In [20]:
# End-to-end pipeline: CSV -> Neo4j -> Spark/GraphFrames -> Neo4j -> printed table.
# The steps depend on each other and must run in this order.
csv_file_path = "/content/drive/MyDrive/DIS/adjusted_phone_calls.csv"

# Clear the database so earlier runs do not leave stale nodes behind
clear_database()

# Load data from CSV into Neo4j
load_csv_to_neo4j(csv_file_path)

# Load data from Neo4j into Spark DataFrame
df = load_data_from_neo4j()

# Detect communities using Spark GraphFrames
# (community_members is returned here but not displayed in this cell)
df_final, community_members = detect_communities(df)

# Update Neo4j nodes with community information
update_neo4j_with_communities(df_final)

# Output the full database table
output_full_database()


GraphFrame(v:[id: string], e:[src: string, dst: string])
DataFrame[id: string, component: bigint]
+---+-----------+
|id |component  |
+---+-----------+
|1  |0          |
|14 |8589934592 |
|5  |8589934593 |
|12 |17179869184|
|3  |0          |
|8  |17179869186|
|10 |17179869184|
|11 |17179869184|
|6  |8589934593 |
|7  |8589934593 |
|9  |17179869186|
|2  |0          |
|4  |8589934593 |
|13 |8589934592 |
|15 |8589934592 |
+---+-----------+

Client1 | Client2 | Start Time | End Time | Community ID | Community Size
----------------------------------------------------------------------------
1       | 2       | 2408191800 | 2408202000 | 0            |              3
2       | 3       | 2411171605 | 2411171640 | 0            |              3
10      | 11      | 2402220756 | 2402220825 | 17179869184  |              3
11      | 12      | 2410032109 | 2410032324 | 17179869184  |              3
8       | 9       | 2404230735 | 2404231008 | 17179869186  |              2
13      | 14      | 24021722

In [None]:
# Peek at the first ten call rows of the DataFrame loaded from Neo4j.
df.head(n=10)

[Row(Client1='13', Client2='14', End_Time='2402180007', Start_Time='2402172235'),
 Row(Client1='14', Client2='15', End_Time='2412161713', Start_Time='2412161605'),
 Row(Client1='1', Client2='2', End_Time='2408202000', Start_Time='2408191800'),
 Row(Client1='2', Client2='3', End_Time='2411171640', Start_Time='2411171605'),
 Row(Client1='4', Client2='5', End_Time='2406291448', Start_Time='2406291350'),
 Row(Client1='5', Client2='6', End_Time='2411070319', Start_Time='2411070022'),
 Row(Client1='6', Client2='7', End_Time='2408202005', Start_Time='2408201805'),
 Row(Client1='8', Client2='9', End_Time='2404231008', Start_Time='2404230735'),
 Row(Client1='10', Client2='11', End_Time='2402220825', Start_Time='2402220756'),
 Row(Client1='11', Client2='12', End_Time='2410032324', Start_Time='2410032109')]