In [0]:
		
from graphframes import *

# Vertex DataFrame
v = sqlContext.createDataFrame([
  (1, "Alice", 28),
  (2, "Bob", 27),
  (3, "Charlie", 65),
  (4, "David", 42),
  (5, "Ed", 55),
  (6, "Fran", 50)
], ["id", "name", "age"])

# Edge DataFrame
e = sqlContext.createDataFrame([
(2, 1, 7),     
(2, 4, 2),    
(3, 2, 4),    
(3, 6, 3),    
(4, 1, 1),    
(5, 2, 2),    
(5, 3, 8),    
(5, 6, 3),
(3, 5, 1),
], ["src", "dst", "likes"])

# Create a GraphFrame
g = GraphFrame(v, e)

In [0]:
g.vertices.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 28|
|  2|    Bob| 27|
|  3|Charlie| 65|
|  4|  David| 42|
|  5|     Ed| 55|
|  6|   Fran| 50|
+---+-------+---+



In [0]:
g.vertices.filter("age > 50").show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  3|Charlie| 65|
|  5|     Ed| 55|
+---+-------+---+



In [0]:
g.triplets.count()

Out[16]: 8

In [0]:
g.triplets.show()

+----------------+---------+----------------+
|             src|     edge|             dst|
+----------------+---------+----------------+
|  {4, David, 42}|{4, 1, 1}|  {1, Alice, 28}|
|    {2, Bob, 27}|{2, 1, 7}|  {1, Alice, 28}|
|     {5, Ed, 55}|{5, 2, 2}|    {2, Bob, 27}|
|{3, Charlie, 65}|{3, 2, 4}|    {2, Bob, 27}|
|     {5, Ed, 55}|{5, 3, 8}|{3, Charlie, 65}|
|    {2, Bob, 27}|{2, 4, 2}|  {4, David, 42}|
|     {5, Ed, 55}|{5, 6, 3}|   {6, Fran, 50}|
|{3, Charlie, 65}|{3, 6, 3}|   {6, Fran, 50}|
+----------------+---------+----------------+



In [0]:
# Q5
g.triangleCount().show()

+-----+---+-------+---+
|count| id|   name|age|
+-----+---+-------+---+
|    1|  1|  Alice| 28|
|    2|  2|    Bob| 27|
|    2|  3|Charlie| 65|
|    1|  4|  David| 42|
|    2|  5|     Ed| 55|
|    1|  6|   Fran| 50|
+-----+---+-------+---+



In [0]:
# Q6
import pyspark.sql.functions as sf


joined = g.edges.join(g.vertices, g.edges.src == g.vertices.id)
joined.show()
joined.groupBy(["name", "id"]).agg(sf.sum("likes").alias("outLikes")).orderBy(sf.desc("outLikes")).show()
type(sf.sum("likes"))
type(joined.groupBy(["name", "id"]).sum("likes"))

+---+---+-----+---+-------+---+
|src|dst|likes| id|   name|age|
+---+---+-----+---+-------+---+
|  2|  1|    7|  2|    Bob| 27|
|  2|  4|    2|  2|    Bob| 27|
|  3|  2|    4|  3|Charlie| 65|
|  3|  6|    3|  3|Charlie| 65|
|  4|  1|    1|  4|  David| 42|
|  5|  2|    2|  5|     Ed| 55|
|  5|  3|    8|  5|     Ed| 55|
|  5|  6|    3|  5|     Ed| 55|
+---+---+-----+---+-------+---+

+-------+---+--------+
|   name| id|outLikes|
+-------+---+--------+
|     Ed|  5|      13|
|    Bob|  2|       9|
|Charlie|  3|       7|
|  David|  4|       1|
+-------+---+--------+

Out[61]: pyspark.sql.dataframe.DataFrame

In [0]:
# Q7
results = g.pageRank(maxIter = 10)
results.vertices.orderBy(desc("pagerank")).show()

+---+-------+---+------------------+
| id|   name|age|          pagerank|
+---+-------+---+------------------+
|  1|  Alice| 28|1.7924127957615186|
|  2|    Bob| 27|0.9969646507526428|
|  6|   Fran| 50|0.9969646507526428|
|  4|  David| 42|0.9688717814927128|
|  3|Charlie| 65|0.6996243163176442|
|  5|     Ed| 55|0.5451618049228396|
+---+-------+---+------------------+



In [0]:
# Q8
joined = g.edges.join(g.vertices, g.edges.dst == g.vertices.id)
joined.show()
joined.groupBy("name").count().filter("name == 'Bob'").show()

+---+---+-----+---+-------+---+
|src|dst|likes| id|   name|age|
+---+---+-----+---+-------+---+
|  2|  1|    7|  1|  Alice| 28|
|  4|  1|    1|  1|  Alice| 28|
|  3|  2|    4|  2|    Bob| 27|
|  5|  2|    2|  2|    Bob| 27|
|  5|  3|    8|  3|Charlie| 65|
|  2|  4|    2|  4|  David| 42|
|  3|  6|    3|  6|   Fran| 50|
|  5|  6|    3|  6|   Fran| 50|
+---+---+-----+---+-------+---+

+----+-----+
|name|count|
+----+-----+
| Bob|    2|
+----+-----+



In [0]:
joined = g.edges.join(g.vertices, g.edges.dst == g.vertices.id)
joined.groupBy("name").sum("likes").show()

+-------+----------+
|   name|sum(likes)|
+-------+----------+
|Charlie|         8|
|    Bob|         6|
|  Alice|         8|
|  David|         2|
|   Fran|         6|
+-------+----------+



In [0]:
motifs = g.find("(a)-[e]->(b); (b)-[e2]->(a)")
motifs.show()

+----------------+---------+----------------+---------+
|               a|        e|               b|       e2|
+----------------+---------+----------------+---------+
|{3, Charlie, 65}|{3, 5, 1}|     {5, Ed, 55}|{5, 3, 8}|
|     {5, Ed, 55}|{5, 3, 8}|{3, Charlie, 65}|{3, 5, 1}|
+----------------+---------+----------------+---------+



In [0]:
# Q10
results = g.shortestPaths(landmarks=["1", "2"])
results.select("id", "distances").show()

+---+----------------+
| id|       distances|
+---+----------------+
|  1|        {1 -> 0}|
|  2|{2 -> 0, 1 -> 1}|
|  3|{2 -> 1, 1 -> 2}|
|  4|        {1 -> 1}|
|  5|{2 -> 1, 1 -> 2}|
|  6|              {}|
+---+----------------+



In [0]:
results = g.shortestPaths(to=["1"]) 
results.select("id", "path").show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
[0;32m<command-479980166246984>[0m in [0;36m<module>[0;34m[0m
[0;32m----> 1[0;31m [0mresults[0m [0;34m=[0m [0mg[0m[0;34m.[0m[0mshortestPaths[0m[0;34m([0m[0mto[0m[0;34m=[0m[0;34m[[0m[0;34m"1"[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      2[0m [0mresults[0m[0;34m.[0m[0mselect[0m[0;34m([0m[0;34m"id"[0m[0;34m,[0m [0;34m"path"[0m[0;34m)[0m[0;34m.[0m[0mshow[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m

[0;31mTypeError[0m: shortestPaths() got an unexpected keyword argument 'to'