In [1]:
import graphframes

In [2]:
import pyspark
from pyspark.sql import SparkSession
# Configure the session builder one option at a time: local master using all
# available cores, an application name, and the GraphFrames package
# coordinates passed through "spark.jars.packages".
session_builder = SparkSession.builder.master("local[*]")
session_builder = session_builder.appName("Graphframes test")
session_builder = session_builder.config("spark.driver.cores", 1)
session_builder = session_builder.config(
    "spark.jars.packages", "graphframes:graphframes:0.6.0-spark2.3-s_2.11"
)

# Obtain the session and keep a handle on its SparkContext for later cells.
spark = session_builder.getOrCreate()
sc = spark.sparkContext

# Print both handles as a quick sanity check that the session is up.
print(spark)
print(sc)

<pyspark.sql.session.SparkSession object at 0x7fb241725160>
<SparkContext master=local[*] appName=Graphframes test>


In [3]:
# Inspect the effective Spark configuration as a list of (key, value) pairs;
# useful for checking that the GraphFrames jars were picked up.
spark_conf = sc.getConf()
spark_conf.getAll()

[('spark.jars',
  'file:///home/jovyan/.ivy2/jars/graphframes_graphframes-0.6.0-spark2.3-s_2.11.jar,file:///home/jovyan/.ivy2/jars/com.typesafe.scala-logging_scala-logging-api_2.11-2.1.2.jar,file:///home/jovyan/.ivy2/jars/com.typesafe.scala-logging_scala-logging-slf4j_2.11-2.1.2.jar,file:///home/jovyan/.ivy2/jars/org.scala-lang_scala-reflect-2.11.0.jar,file:///home/jovyan/.ivy2/jars/org.slf4j_slf4j-api-1.7.7.jar'),
 ('spark.executor.id', 'driver'),
 ('spark.jars.packages', 'graphframes:graphframes:0.6.0-spark2.3-s_2.11'),
 ('spark.submit.pyFiles',
  '/home/jovyan/.ivy2/jars/graphframes_graphframes-0.6.0-spark2.3-s_2.11.jar,/home/jovyan/.ivy2/jars/com.typesafe.scala-logging_scala-logging-api_2.11-2.1.2.jar,/home/jovyan/.ivy2/jars/com.typesafe.scala-logging_scala-logging-slf4j_2.11-2.1.2.jar,/home/jovyan/.ivy2/jars/org.scala-lang_scala-reflect-2.11.0.jar,/home/jovyan/.ivy2/jars/org.slf4j_slf4j-api-1.7.7.jar'),
 ('spark.app.name', 'Graphframes test'),
 ('spark.driver.host', 'a1f7a7315a08'

In [4]:
# Vertex DataFrame: one row per person, keyed by the unique "id" column.
v = spark.createDataFrame(
    [("a", "Alice", 34), ("b", "Bob", 36), ("c", "Charlie", 30)],
    schema=["id", "name", "age"]
)

In [5]:
# Edge DataFrame: each row is a directed edge from "src" to "dst", labelled
# with the kind of relationship it represents.
e = spark.createDataFrame(
    [("a", "b", "friend"), ("b", "c", "follow"), ("c", "b", "follow")],
    schema=["src", "dst", "relationship"]
)


In [6]:
# Create a GraphFrame from the vertex DataFrame `v` and edge DataFrame `e`.
# Import the class by name rather than with a wildcard so it is obvious
# where `GraphFrame` comes from and nothing else leaks into the namespace.
from graphframes import GraphFrame
g = GraphFrame(v, e)

In [7]:
# Query: in-degree (number of incoming edges) per vertex.
in_degrees = g.inDegrees
in_degrees.show()

+---+--------+
| id|inDegree|
+---+--------+
|  c|       1|
|  b|       2|
+---+--------+



In [8]:
# Query: how many edges in the graph have relationship "follow"?
follow_edges = g.edges.where("relationship = 'follow'")
follow_edges.count()

2

In [9]:
# Run PageRank for a fixed 20 iterations with reset probability 0.01.
results = g.pageRank(maxIter=20, resetProbability=0.01)

# Show only the vertex id and its computed "pagerank" score.
ranked_vertices = results.vertices.select("id", "pagerank")
ranked_vertices.show()

+---+------------------+
| id|          pagerank|
+---+------------------+
|  b|1.0905890109440908|
|  a|              0.01|
|  c|1.8994109890559092|
+---+------------------+

