# SetUp

In [1]:
import os
import sys
import glob
from os.path import abspath
os.environ['SPARK_HOME'] = '/home/chiara/Documenti/BigData/CountingTriangles/spark-3.5.0-bin-hadoop3'
os.environ['HADOOP_HOME'] = '/home/chiara/Documenti/BigData/CountingTriangles/spark-3.5.0-bin-hadoop3'
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-17-openjdk-amd64'
os.environ['SPARK_LOCAL_IP'] = '172.17.0.1'

In [2]:
spark_python = os.path.join(os.environ.get('SPARK_HOME',None),'python')
py4j = glob.glob(os.path.join(spark_python,'lib','py4j-*.zip'))[0]
graphf = glob.glob(os.path.join(spark_python,'graphframes.zip'))[0]
sparkmeasure = glob.glob(os.path.join(spark_python,'sparkmeasure.zip'))[0]
sys.path[:0]=[spark_python,py4j]
sys.path[:0]=[spark_python,graphf]
sys.path[:0]=[spark_python, sparkmeasure]
os.environ['PYTHONPATH']=py4j+os.pathsep+graphf+sparkmeasure

In [3]:
import findspark
findspark.init()
findspark.find()

'/home/chiara/Documenti/BigData/CountingTriangles/spark-3.5.0-bin-hadoop3'

In [4]:
#from pyspark.sql import SparkSession

#spark = SparkSession.builder.appName("Counting Triangles").set("spark.driver.memory", "8g").enableHiveSupport().getOrCreate()

In [5]:
from pyspark.sql import SparkSession

spark = (SparkSession.builder
    .appName('Counting Triangles')
    .config('spark.driver.extraClassPath', '/usr/local/bin/postgresql-42.2.5.jar')
    .config('spark.executor.memory', '8g')
    .config("spark.driver.memory", "8g")
    .config('spark.memory.offHeap.enabled', True)
    .config('spark.memory.offHeap.size', '20g') 
    .enableHiveSupport()
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/23 10:33:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
from graphframes import *
import networkx as nx
import matplotlib.pyplot as plt

# Parsing Dataset

In [7]:
dataset = 'datasets/soc-Epinions1.txt'
with open(dataset,'r') as f:
    content = f.readlines()
edges_list = list(filter( lambda x: not x.startswith('#') ,content))


if 'facebook' in dataset:
    edges = list(map(lambda x: tuple(x.split(' ')), edges_list))
else:
    edges = list(map(lambda x: tuple(x.split('\t')), edges_list))

edges_tuples = list(map(lambda x: (int(x[0]), int(x[1].replace('\n',''))), edges))

In [8]:
from pyspark.sql.types import IntegerType, Row

list1, list2 = zip(*edges_tuples)
nodes = list(set(list1 + list2))
nodes_tuple = [Row(x) for x in nodes]

In [9]:
from pyspark.sql.functions import col, collect_list

#get list of nodes, with columns renamed value and id
vertices = spark.createDataFrame(nodes, IntegerType())
vertices = vertices.withColumnRenamed('value','id')

#get edges such that the src node is always smaller then the dst node
edges_n = spark.createDataFrame(edges_tuples,["src", "dst"],IntegerType())
edges_inverted = edges_n.filter(edges_n.src > edges_n.dst)
edges_normal = edges_n.filter(edges_n.src < edges_n.dst)
edges_normal2 = edges_inverted.select(col('dst').alias('src'),col('src').alias('dst'))
edges = edges_normal.union(edges_normal2).distinct()

### Check we read the right data

In [10]:
vertices.count()

                                                                                

75879

For dataset soc-Epinions1 the found edges are not the same length as referred in the dataset page, this is because of the distinct() operation at the end of the selection of edges: we treat each graph as undirected and only consider edges where srcId < dstId, so when we encounter two edges of the kind (srcId, dstId) (dstId, srcId), with srcId < dstId or viceversa, we're going to keep just one edge.

In [11]:
edges.count()

                                                                                

405740

# Query Implementation

In [12]:
e1 = edges.alias('e1')
e2 = edges.alias('e2')
e3 = edges.alias('e3')

In [13]:
e1.show()

                                                                                

+---+----+
|src| dst|
+---+----+
|  4|  19|
|  5| 443|
| 11|  97|
| 11| 329|
| 12| 382|
| 12| 386|
| 12| 780|
| 15|  18|
| 17|1777|
| 19|2030|
| 22|  25|
| 25|2461|
| 28| 130|
| 28|1512|
| 28|2198|
| 31|1343|
| 31|2491|
| 34| 185|
| 34|1171|
| 34|1370|
+---+----+
only showing top 20 rows



In [14]:
from sparkmeasure import StageMetrics

stagemetrics = StageMetrics(spark)
stagemetrics.begin()

In [15]:
from pyspark.sql.functions import col

result = e1.join(e2, col("e1.src") == col("e2.src")) \
    .join(e3, (col("e1.dst") == col("e3.src")) & (col("e2.dst") == col("e3.dst"))) \
    .select(col("e1.src").alias("node1"), col("e1.dst").alias("node2"), col("e2.dst").alias("node3")).distinct()

In [16]:
result.count()

                                                                                

1624481

In [17]:
stagemetrics.end()

In [18]:
result.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[node1#66L, node2#67L, node3#68L], functions=[])
   +- HashAggregate(keys=[node1#66L, node2#67L, node3#68L], functions=[])
      +- Project [src#4L AS node1#66L, dst#5L AS node2#67L, dst#39L AS node3#68L]
         +- SortMergeJoin [dst#5L, dst#39L], [src#50L, dst#51L], Inner
            :- Sort [dst#5L ASC NULLS FIRST, dst#39L ASC NULLS FIRST], false, 0
            :  +- Exchange hashpartitioning(dst#5L, dst#39L, 200), ENSURE_REQUIREMENTS, [plan_id=1168]
            :     +- Project [src#4L, dst#5L, dst#39L]
            :        +- SortMergeJoin [src#4L], [src#38L], Inner
            :           :- Sort [src#4L ASC NULLS FIRST], false, 0
            :           :  +- Exchange hashpartitioning(src#4L, 200), ENSURE_REQUIREMENTS, [plan_id=1158]
            :           :     +- HashAggregate(keys=[src#4L, dst#5L], functions=[])
            :           :        +- Exchange hashpartitioning(src#4L, dst#5L, 200), EN

## Measure the performances

In [19]:
stagemetrics.print_report()


Scheduling mode = FIFO
Spark Context default degree of parallelism = 12

Aggregated Spark stage metrics:
numStages => 6
numTasks => 53
elapsedTime => 41589 (42 s)
stageDuration => 40958 (41 s)
executorRunTime => 252319 (4,2 min)
executorCpuTime => 232442 (3,9 min)
executorDeserializeTime => 413 (0,4 s)
executorDeserializeCpuTime => 280 (0,3 s)
resultSerializationTime => 4 (4 ms)
jvmGCTime => 838 (0,8 s)
shuffleFetchWaitTime => 0 (0 ms)
shuffleWriteTime => 4124 (4 s)
resultSize => 3694599 (3,5 MB)
diskBytesSpilled => 0 (0 Bytes)
memoryBytesSpilled => 0 (0 Bytes)
peakExecutionMemory => 14706526584
recordsRead => 0
bytesRead => 0 (0 Bytes)
recordsWritten => 0
bytesWritten => 0 (0 Bytes)
shuffleRecordsRead => 91283228
shuffleTotalBlocksFetched => 604
shuffleLocalBlocksFetched => 604
shuffleRemoteBlocksFetched => 0
shuffleTotalBytesRead => 936325617 (892,9 MB)
shuffleLocalBytesRead => 936325617 (892,9 MB)
shuffleRemoteBytesRead => 0 (0 Bytes)
shuffleRemoteBytesReadToDisk => 0 (0 Bytes)
shu

In [21]:
stagemetrics.print_memory_report()


Additional stage-level executor metrics (memory usage info):

Stage 12 JVMHeapMemory maxVal bytes => 1006347088 (959,7 MB)
Stage 12 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes)
Stage 14 JVMHeapMemory maxVal bytes => 1006347088 (959,7 MB)
Stage 14 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes)
Stage 17 JVMHeapMemory maxVal bytes => 1006347088 (959,7 MB)
Stage 17 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes)
Stage 20 JVMHeapMemory maxVal bytes => 1113126536 (1061,6 MB)
Stage 20 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes)
Stage 24 JVMHeapMemory maxVal bytes => 1113126536 (1061,6 MB)
Stage 24 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes)
Stage 29 JVMHeapMemory maxVal bytes => 565638064 (539,4 MB)
Stage 29 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes)


# Confront with another implementation

In [22]:
graph = GraphFrame(vertices,edges)

In [23]:
stagemetrics = StageMetrics(spark)
stagemetrics.begin()

In [24]:
triangles = graph.triangleCount()

In [25]:
triangles.show()



+-----+---+
|count| id|
+-----+---+
|10229| 12|
|16511|  1|
| 2572| 13|
| 1971|  6|
|  969| 16|
|  302|  3|
| 2260| 20|
| 4854|  5|
|10770| 19|
|  238| 15|
|   83|  9|
|  234| 17|
| 2716|  4|
|  612|  8|
|  107|  7|
| 5865| 10|
| 5729| 11|
| 1222| 14|
| 3885|  2|
|12462|  0|
+-----+---+
only showing top 20 rows



                                                                                

In [26]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import sum

triangle_count = triangles.select(sum("count")/3)
triangle_count.select(col('(sum(count) / 3)').cast(IntegerType()).alias('count')).show()



+-------+
|  count|
+-------+
|1624481|
+-------+



                                                                                

In [27]:
stagemetrics.end()

In [28]:
triangles.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [CASE WHEN isnull(count#180L) THEN 0 ELSE count#180L END AS count#190L, id#2]
   +- SortMergeJoin [id#2], [id#176], LeftOuter
      :- Sort [id#2 ASC NULLS FIRST], false, 0
      :  +- Exchange hashpartitioning(id#2, 200), ENSURE_REQUIREMENTS, [plan_id=5573]
      :     +- Project [value#0 AS id#2]
      :        +- Scan ExistingRDD[value#0]
      +- Sort [id#176 ASC NULLS FIRST], false, 0
         +- HashAggregate(keys=[id#176], functions=[count(1)])
            +- Exchange hashpartitioning(id#176, 200), ENSURE_REQUIREMENTS, [plan_id=5569]
               +- HashAggregate(keys=[id#176], functions=[partial_count(1)])
                  +- Filter isnotnull(id#176)
                     +- Generate explode(array(a#97.id, b#99.id, c#120.id)), false, [id#176]
                        +- Project [a#97, b#99, c#120]
                           +- SortMergeJoin [cast(a#97.id as bigint), cast(c#120.id as bigint)], [__tmp-10438860910

In [29]:
stagemetrics.print_report()


Scheduling mode = FIFO
Spark Context default degree of parallelism = 12

Aggregated Spark stage metrics:
numStages => 23
numTasks => 177
elapsedTime => 26516 (27 s)
stageDuration => 31733 (32 s)
executorRunTime => 239041 (4,0 min)
executorCpuTime => 173399 (2,9 min)
executorDeserializeTime => 955 (1,0 s)
executorDeserializeCpuTime => 611 (0,6 s)
resultSerializationTime => 34 (34 ms)
jvmGCTime => 1993 (2 s)
shuffleFetchWaitTime => 0 (0 ms)
shuffleWriteTime => 11638 (12 s)
resultSize => 395143 (385,9 KB)
diskBytesSpilled => 0 (0 Bytes)
memoryBytesSpilled => 0 (0 Bytes)
peakExecutionMemory => 15923869408
recordsRead => 0
bytesRead => 0 (0 Bytes)
recordsWritten => 0
bytesWritten => 0 (0 Bytes)
shuffleRecordsRead => 49117580
shuffleTotalBlocksFetched => 1053
shuffleLocalBlocksFetched => 1053
shuffleRemoteBlocksFetched => 0
shuffleTotalBytesRead => 577715523 (551,0 MB)
shuffleLocalBytesRead => 577715523 (551,0 MB)
shuffleRemoteBytesRead => 0 (0 Bytes)
shuffleRemoteBytesReadToDisk => 0 (0 By

In [33]:
stagemetrics.print_memory_report()


Additional stage-level executor metrics (memory usage info):

Stage 30 JVMHeapMemory maxVal bytes => 1018885408 (971,7 MB)
Stage 30 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes)
Stage 31 JVMHeapMemory maxVal bytes => 1018885408 (971,7 MB)
Stage 31 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes)
Stage 32 JVMHeapMemory maxVal bytes => 1018885408 (971,7 MB)
Stage 32 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes)
Stage 34 JVMHeapMemory maxVal bytes => 1018885408 (971,7 MB)
Stage 34 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes)
Stage 35 JVMHeapMemory maxVal bytes => 1018885408 (971,7 MB)
Stage 35 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes)
Stage 37 JVMHeapMemory maxVal bytes => 1018885408 (971,7 MB)
Stage 37 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes)
Stage 40 JVMHeapMemory maxVal bytes => 1018885408 (971,7 MB)
Stage 40 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes)
Stage 44 JVMHeapMemory maxVal bytes => 1092642840 (1042,0 MB)
Stage 44 OnHeapExecutionMemory maxV

# Stop session

In [34]:
spark.stop()