# SetUp

In [27]:
import os
import sys
import glob
from os.path import abspath
os.environ['SPARK_HOME'] = '/home/chiara/Documenti/BigData/CountingTriangles/spark-3.5.0-bin-hadoop3'
os.environ['HADOOP_HOME'] = '/home/chiara/Documenti/BigData/CountingTriangles/spark-3.5.0-bin-hadoop3'
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-17-openjdk-amd64'
os.environ['SPARK_LOCAL_IP'] = '172.17.0.1'

In [28]:
spark_python = os.path.join(os.environ.get('SPARK_HOME',None),'python')
py4j = glob.glob(os.path.join(spark_python,'lib','py4j-*.zip'))[0]
graphf = glob.glob(os.path.join(spark_python,'graphframes.zip'))[0]
sparkmeasure = glob.glob(os.path.join(spark_python,'sparkmeasure.zip'))[0]
sys.path[:0]=[spark_python,py4j]
sys.path[:0]=[spark_python,graphf]
sys.path[:0]=[spark_python, sparkmeasure]
os.environ['PYTHONPATH']=py4j+os.pathsep+graphf+sparkmeasure

In [29]:
import findspark
findspark.init()
findspark.find()

'/home/chiara/Documenti/BigData/CountingTriangles/spark-3.5.0-bin-hadoop3'

In [30]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Counting Triangles").enableHiveSupport().getOrCreate()

In [31]:
from graphframes import *
import networkx as nx
import matplotlib.pyplot as plt

# Parsing Dataset

In [32]:
with open('dataset/Wiki-Vote.txt','r') as f:
    content = f.readlines()

edges_list = list(filter( lambda x: not x.startswith('#') ,content))
edges = list(map(lambda x: tuple(x.split('\t')), edges_list))
edges_tuples = list(map(lambda x: (int(x[0]), int(x[1].replace('\n',''))), edges))

In [33]:
from pyspark.sql.types import IntegerType, Row

list1, list2 = zip(*edges_tuples)
nodes = list(set(list1 + list2))
nodes_tuple = [Row(x) for x in nodes]

In [34]:
from pyspark.sql.functions import col, collect_list

#get list of nodes, with columns renamed value and id
vertices = spark.createDataFrame(nodes, IntegerType())
vertices = vertices.withColumnRenamed('value','id')

#get edges such that the src node is always smaller then the dst node
edges_n = spark.createDataFrame(edges_tuples,["src", "dst"],IntegerType())
edges_inverted = edges_n.filter(edges_n.src > edges_n.dst)
edges_normal = edges_n.filter(edges_n.src < edges_n.dst)
edges_normal2 = edges_inverted.select(col('dst').alias('src'),col('src').alias('dst'))
edges = edges_normal.union(edges_normal2).distinct()

# Query Implementation

In [35]:
e1 = edges.alias('e1')
e2 = edges.alias('e2')
e3 = edges.alias('e3')

In [36]:
e1.show()



+---+----+
|src| dst|
+---+----+
|  3| 586|
| 25| 255|
| 25| 590|
|  7|1193|
|  8| 232|
|  8| 607|
| 11| 958|
| 11|1437|
| 11|2595|
| 19|  61|
| 23| 302|
| 47|3352|
| 29|3717|
| 29|4088|
| 86|1305|
| 99|5812|
|103|2871|
|108|8283|
|109|7052|
|122|3258|
+---+----+
only showing top 20 rows



                                                                                

In [37]:
from sparkmeasure import StageMetrics

stagemetrics = StageMetrics(spark)
stagemetrics.begin()

In [38]:
from pyspark.sql.functions import col

result = e1.join(e2, col("e1.src") == col("e2.src")) \
    .join(e3, (col("e1.dst") == col("e3.src")) & (col("e2.dst") == col("e3.dst"))) \
    .select(col("e1.src").alias("node1"), col("e1.dst").alias("node2"), col("e2.dst").alias("node3")).distinct()

In [39]:
result.count()

                                                                                

608389

In [40]:
stagemetrics.end()

In [41]:
result.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[node1#289L, node2#290L, node3#291L], functions=[])
   +- HashAggregate(keys=[node1#289L, node2#290L, node3#291L], functions=[])
      +- Project [src#240L AS node1#289L, dst#241L AS node2#290L, dst#262L AS node3#291L]
         +- SortMergeJoin [dst#241L, dst#262L], [src#273L, dst#274L], Inner
            :- Sort [dst#241L ASC NULLS FIRST, dst#262L ASC NULLS FIRST], false, 0
            :  +- Exchange hashpartitioning(dst#241L, dst#262L, 200), ENSURE_REQUIREMENTS, [plan_id=5347]
            :     +- Project [src#240L, dst#241L, dst#262L]
            :        +- SortMergeJoin [src#240L], [src#261L], Inner
            :           :- Sort [src#240L ASC NULLS FIRST], false, 0
            :           :  +- Exchange hashpartitioning(src#240L, 200), ENSURE_REQUIREMENTS, [plan_id=5337]
            :           :     +- HashAggregate(keys=[src#240L, dst#241L], functions=[])
            :           :        +- Exchange 

## Measure the performances

In [42]:
stagemetrics.print_report()


Scheduling mode = FIFO
Spark Context default degree of parallelism = 12

Aggregated Spark stage metrics:
numStages => 5
numTasks => 28
elapsedTime => 2995 (3 s)
stageDuration => 2965 (3 s)
executorRunTime => 10837 (11 s)
executorCpuTime => 3348 (3 s)
executorDeserializeTime => 80 (80 ms)
executorDeserializeCpuTime => 73 (73 ms)
resultSerializationTime => 0 (0 ms)
jvmGCTime => 778 (0,8 s)
shuffleFetchWaitTime => 0 (0 ms)
shuffleWriteTime => 1663 (2 s)
resultSize => 821221 (802,0 KB)
diskBytesSpilled => 0 (0 Bytes)
memoryBytesSpilled => 0 (0 Bytes)
peakExecutionMemory => 205586176
recordsRead => 0
bytesRead => 0 (0 Bytes)
recordsWritten => 0
bytesWritten => 0 (0 Bytes)
shuffleRecordsRead => 311068
shuffleTotalBlocksFetched => 73
shuffleLocalBlocksFetched => 73
shuffleRemoteBlocksFetched => 0
shuffleTotalBytesRead => 4207403 (4,0 MB)
shuffleLocalBytesRead => 4207403 (4,0 MB)
shuffleRemoteBytesRead => 0 (0 Bytes)
shuffleRemoteBytesReadToDisk => 0 (0 Bytes)
shuffleBytesWritten => 1402507 (

In [44]:
stagemetrics.print_memory_report()


Additional stage-level executor metrics (memory usage info):

Stage 48 JVMHeapMemory maxVal bytes => 494025688 (471,1 MB)
Stage 48 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes)
Stage 50 JVMHeapMemory maxVal bytes => 494025688 (471,1 MB)
Stage 50 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes)
Stage 51 JVMHeapMemory maxVal bytes => 494025688 (471,1 MB)
Stage 51 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes)
Stage 53 JVMHeapMemory maxVal bytes => 494025688 (471,1 MB)
Stage 53 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes)
Stage 56 JVMHeapMemory maxVal bytes => 494025688 (471,1 MB)
Stage 56 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes)


# Confront with another implementation

In [45]:
graph = GraphFrame(vertices,edges)

In [46]:
stagemetrics = StageMetrics(spark)
stagemetrics.begin()

In [47]:
triangles = graph.triangleCount()

In [48]:
triangles.show()

                                                                                

+-----+---+
|count| id|
+-----+---+
|  197| 12|
|    0| 22|
|   12| 13|
| 3143|  6|
|   23| 16|
|  280|  3|
| 1078| 20|
|   88|  5|
|  482| 19|
| 4847| 15|
|  321|  9|
|  156| 17|
|   95|  4|
| 2294|  8|
|  661| 23|
|   42|  7|
|  700| 10|
|  164| 21|
|13401| 11|
|  361| 14|
+-----+---+
only showing top 20 rows



                                                                                

In [49]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import sum

triangle_count = triangles.select(sum("count")/3)
triangle_count.select(col('(sum(count) / 3)').cast(IntegerType()).alias('count')).show()

                                                                                

+------+
| count|
+------+
|608389|
+------+



In [50]:
stagemetrics.end()

In [51]:
triangles.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [CASE WHEN isnull(count#403L) THEN 0 ELSE count#403L END AS count#413L, id#238]
   +- SortMergeJoin [id#238], [id#399], LeftOuter
      :- Sort [id#238 ASC NULLS FIRST], false, 0
      :  +- Exchange hashpartitioning(id#238, 200), ENSURE_REQUIREMENTS, [plan_id=9165]
      :     +- Project [value#236 AS id#238]
      :        +- Scan ExistingRDD[value#236]
      +- Sort [id#399 ASC NULLS FIRST], false, 0
         +- HashAggregate(keys=[id#399], functions=[count(1)])
            +- Exchange hashpartitioning(id#399, 200), ENSURE_REQUIREMENTS, [plan_id=9161]
               +- HashAggregate(keys=[id#399], functions=[partial_count(1)])
                  +- Filter isnotnull(id#399)
                     +- Generate explode(array(a#320.id, b#322.id, c#343.id)), false, [id#399]
                        +- Project [a#320, b#322, c#343]
                           +- SortMergeJoin [cast(a#320.id as bigint), cast(c#343.id as bigint)],

In [52]:
stagemetrics.print_report()


Scheduling mode = FIFO
Spark Context default degree of parallelism = 12

Aggregated Spark stage metrics:
numStages => 19
numTasks => 109
elapsedTime => 5180 (5 s)
stageDuration => 8620 (9 s)
executorRunTime => 36159 (36 s)
executorCpuTime => 6009 (6 s)
executorDeserializeTime => 386 (0,4 s)
executorDeserializeCpuTime => 274 (0,3 s)
resultSerializationTime => 89 (89 ms)
jvmGCTime => 2723 (3 s)
shuffleFetchWaitTime => 0 (0 ms)
shuffleWriteTime => 5688 (6 s)
resultSize => 860225 (840,1 KB)
diskBytesSpilled => 0 (0 Bytes)
memoryBytesSpilled => 0 (0 Bytes)
peakExecutionMemory => 196745616
recordsRead => 0
bytesRead => 0 (0 Bytes)
recordsWritten => 0
bytesWritten => 0 (0 Bytes)
shuffleRecordsRead => 458282
shuffleTotalBlocksFetched => 159
shuffleLocalBlocksFetched => 159
shuffleRemoteBlocksFetched => 0
shuffleTotalBytesRead => 6649352 (6,3 MB)
shuffleLocalBytesRead => 6649352 (6,3 MB)
shuffleRemoteBytesRead => 0 (0 Bytes)
shuffleRemoteBytesReadToDisk => 0 (0 Bytes)
shuffleBytesWritten => 34

In [53]:
stagemetrics.print_memory_report()


Additional stage-level executor metrics (memory usage info):

Stage 57 JVMHeapMemory maxVal bytes => 313968720 (299,4 MB)
Stage 57 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes)
Stage 58 JVMHeapMemory maxVal bytes => 313968720 (299,4 MB)
Stage 58 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes)
Stage 59 JVMHeapMemory maxVal bytes => 313968720 (299,4 MB)
Stage 59 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes)
Stage 61 JVMHeapMemory maxVal bytes => 313968720 (299,4 MB)
Stage 61 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes)
Stage 62 JVMHeapMemory maxVal bytes => 313968720 (299,4 MB)
Stage 62 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes)
Stage 64 JVMHeapMemory maxVal bytes => 313968720 (299,4 MB)
Stage 64 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes)
Stage 66 JVMHeapMemory maxVal bytes => 313968720 (299,4 MB)
Stage 66 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes)
Stage 69 JVMHeapMemory maxVal bytes => 313968720 (299,4 MB)
Stage 69 OnHeapExecutionMemory maxVal bytes 

# Stop session

In [69]:
spark.stop()