Helpful links for processing graphs with PySpark and GraphFrames:

https://docs.databricks.com/spark/latest/graph-analysis/graphframes/user-guide-python.html

https://graphframes.github.io/graphframes/docs/_site/user-guide.html

https://pysparktutorial.blogspot.com/2017/10/graphframes-pyspark.html

In [1]:
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql.functions import col, size, lit
import pyspark.sql.functions as F
from graphframes import *

In [2]:
# Start Spark in local mode with a 54 GB driver heap.
# Local mode runs on a single node only, but it spreads the work over the
# requested cores (44 of the 48 this machine has).
conf = (
    SparkConf()
    .setAppName("test")
    # Swap in .setMaster("yarn") when running on Hadoop; ignore for now.
    .setMaster("local[44]")
    .set('spark.driver.memory', '54g')
    # GraphFrames build targeting Spark 2.4 / Scala 2.11.
    .set('spark.jars.packages', 'graphframes:graphframes:0.7.0-spark2.4-s_2.11')
)

# getOrCreate() makes this cell safe to re-run: constructing a second
# SparkContext directly raises ValueError while one is already active.
# NOTE: when a context already exists it is returned as-is and `conf` is not
# re-applied, so restart the kernel to change any of the settings above.
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)

print("Spark Version: ", sc.version)
print("defaultParallelism: ", sc.defaultParallelism)
# The web UI lists running jobs. So far it has only been reachable through VNC;
# SSH tunnelling to the port may also work (untested).
print("Spark WebURLL ", sc.uiWebUrl)

Spark Version:  2.4.4
defaultParallelism:  44
Spark WebURLL  http://c251-117.wrangler.tacc.utexas.edu:4040


In [3]:
# List every current Spark configuration setting as (key, value) pairs.
# Goes through the public getConf() accessor rather than the private `_conf`
# attribute, which is an implementation detail of SparkContext.
sc.getConf().getAll()

[('spark.jars.packages', 'graphframes:graphframes:0.7.0-spark2.4-s_2.11'),
 ('spark.app.name', 'test'),
 ('spark.driver.memory', '54g'),
 ('spark.files',
  'file:///home/06271/cju256/.ivy2/jars/graphframes_graphframes-0.7.0-spark2.4-s_2.11.jar,file:///home/06271/cju256/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),
 ('spark.app.id', 'local-1572750743715'),
 ('spark.executor.id', 'driver'),
 ('spark.local.dir', '/data/06271/cju256/temp'),
 ('spark.driver.host', 'c251-117.wrangler.tacc.utexas.edu'),
 ('spark.master', 'local[44]'),
 ('spark.submit.pyFiles',
  '/home/06271/cju256/.ivy2/jars/graphframes_graphframes-0.7.0-spark2.4-s_2.11.jar,/home/06271/cju256/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),
 ('spark.driver.port', '45129'),
 ('spark.rdd.compress', 'True'),
 ('spark.repl.local.jars',
  'file:///home/06271/cju256/.ivy2/jars/graphframes_graphframes-0.7.0-spark2.4-s_2.11.jar,file:///home/06271/cju256/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),
 ('spark.serializer.objectStreamReset', 

In [6]:
# Input files for the graph: one JSON file of vertices, one of edges.
nodes_path = '/data/06271/cju256/nodes.json'
edges_path = '/data/06271/cju256/edges.json'

# Read both files (nodes first, then edges) and let Spark infer each schema.
nodes, edges = (
    sqlContext.read.json(json_path)
    for json_path in (nodes_path, edges_path)
)

In [7]:
# Print the schema Spark inferred for the vertex file as a column tree.
nodes.printSchema()

root
 |-- about: string (nullable = true)
 |-- cancelled: boolean (nullable = true)
 |-- date_created: string (nullable = true)
 |-- email: string (nullable = true)
 |-- external_id: string (nullable = true)
 |-- firstname: string (nullable = true)
 |-- friends: string (nullable = true)
 |-- id: long (nullable = true)
 |-- is_business: boolean (nullable = true)
 |-- lastname: string (nullable = true)
 |-- name: string (nullable = true)
 |-- num_friends: long (nullable = true)
 |-- phone: string (nullable = true)
 |-- picture: string (nullable = true)
 |-- username: string (nullable = true)



In [10]:
# Print the schema Spark inferred for the edge file as a column tree.
# The recorded output shows src, dst and weight all inferred as strings.
edges.printSchema()

root
 |-- src: string (nullable = true)
 |-- dst: string (nullable = true)
 |-- weight: string (nullable = true)



In [11]:
from pyspark.sql.types import StringType, IntegerType, LongType

# Re-type the graph key columns as 64-bit integers so vertex ids and edge
# endpoints share one type. Each key is rebuilt under a temporary name, the
# original column is dropped, and the temporary is renamed back, which leaves
# the re-typed column at the end of the column list.
# (The recorded schema shows `id` already inferred as long, while `src` and
# `dst` were inferred as strings.)
just_nodes = (
    nodes
    .withColumn("id_long", col("id").cast(LongType()))
    .drop("id")
    .withColumnRenamed("id_long", "id")
)

just_edges = (
    edges
    .withColumn("src_long", col("src").cast(LongType()))
    .drop("src")
    .withColumnRenamed("src_long", "src")
    .withColumn("dst_long", col("dst").cast(LongType()))
    .drop("dst")
    .withColumnRenamed("dst_long", "dst")
)

In [12]:
# Assemble the graph from the re-typed vertex and edge DataFrames, then print
# its summary (column names and types of both sides).
g = GraphFrame(v=just_nodes, e=just_edges)
print(g)

GraphFrame(v:[id: bigint, about: string ... 13 more fields], e:[src: bigint, dst: bigint ... 1 more field])


In [13]:
# Show the vertex DataFrame followed by the edge DataFrame; IPython's display()
# renders each argument in order.
display(g.vertices, g.edges)

DataFrame[about: string, cancelled: boolean, date_created: string, email: string, external_id: string, firstname: string, friends: string, is_business: boolean, lastname: string, name: string, num_friends: bigint, phone: string, picture: string, username: string, id: bigint]

DataFrame[weight: string, src: bigint, dst: bigint]

In [11]:
# Largest vertex id present in the graph.
g.vertices.agg(F.max("id")).show()

+--------+
| max(id)|
+--------+
|41493705|
+--------+



In [12]:
# Size of the graph: count the vertices, report, then count the edges.
vertex_total = g.vertices.count()
print("Nodes: ", vertex_total)
edge_total = g.edges.count()
print("Edges: ", edge_total)

Nodes:  23133264
Edges:  342281006


In [14]:
# Report the graph size again (vertices first, then edges).
# NOTE(review): this repeats an earlier, identical count cell, yet the recorded
# edge totals differ (342281006 there vs 132514256 here) and the execution
# counts are out of order. The notebook was presumably re-run against different
# edge data; confirm which figure is current and drop the stale cell.
for label, frame in (("Nodes: ", g.vertices), ("Edges: ", g.edges)):
    print(label, frame.count())

Nodes:  23133264
Edges:  132514256


In [None]:
# Motif query: bind (a, b) wherever one edge runs a -> b and another runs
# b -> a, i.e. vertices connected in both directions.
reciprocal_pattern = "(a)-[e]->(b); (b)-[e2]->(a)"
motifs = g.find(reciprocal_pattern)

# Further conditions can be layered on with DataFrame filters.
# NOTE(review): the disabled example below refers to `c`, which this pattern
# never binds (only a, b, e, e2), so it needs fixing before being re-enabled.
#motifs.filter("a.id != c.id").show()

# NOTE(review): each mutual pair presumably matches twice, once as (a, b) and
# once as (b, a). Confirm before reading this count as a number of pairs.
motifs.count()

In [None]:
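# TODO(review): disabled experiment. `c` is not bound by the
# "(a)-[e]->(b); (b)-[e2]->(a)" motif, so this filter would presumably fail if
# re-enabled; fix the column reference or remove this cell.
#motifs.filter("a.id != c.id").count()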