In [2]:
# Vertex DataFrame: one row per person, keyed by the unique "id" column.
vertices = sqlContext.createDataFrame(
    [
        ("1", "Jiangnan Song", 18),
        ("2", "Juan Xu", 18),
        ("4", "Donny Norlander", 20),
        ("3", "Julius Yang", 20),
        ("6", "Stuti Jain", 18),
        ("5", "Kris Larsen", 18),
    ],
    schema=["id", "name", "age"],
)

# Edge DataFrame: one row per "follow" edge, pointing from "src" to "dst".
edges = sqlContext.createDataFrame(
    [
        ("1", "3", "follow"),
        ("1", "6", "follow"),
        ("2", "6", "follow"),
        ("2", "5", "follow"),
        ("3", "2", "follow"),
        ("3", "4", "follow"),
        ("4", "1", "follow"),
        ("5", "6", "follow"),
        ("5", "3", "follow"),
        ("6", "5", "follow"),
    ],
    schema=["src", "dst", "relationship"],
)



In [3]:
# Build the GraphFrame from the vertex and edge DataFrames.
# GraphFrame is imported by name (not via a wildcard) so it is obvious where
# the class comes from and no other names leak into the notebook namespace.
from graphframes import GraphFrame
g = GraphFrame(vertices, edges)

In [5]:
# Display the graph's vertex DataFrame, converted to pandas for rich rendering.
g.vertices.toPandas()


Unnamed: 0,id,name,age
0,1,Jiangnan Song,18
1,2,Juan Xu,18
2,4,Donny Norlander,20
3,3,Julius Yang,20
4,6,Stuti Jain,18
5,5,Kris Larsen,18


In [6]:
# Display the graph's edge DataFrame, converted to pandas for rich rendering.
g.edges.toPandas()

Unnamed: 0,src,dst,relationship
0,1,3,follow
1,1,6,follow
2,2,6,follow
3,2,5,follow
4,3,2,follow
5,3,4,follow
6,4,1,follow
7,5,6,follow
8,5,3,follow
9,6,5,follow


# In Degree

In [9]:
# Print each vertex id together with its inDegree (number of incoming edges).
g.inDegrees.show()

+---+--------+
| id|inDegree|
+---+--------+
|  3|       2|
|  5|       2|
|  6|       3|
|  1|       1|
|  4|       1|
|  2|       1|
+---+--------+



# Shortest Path

In [35]:
# For every vertex, compute the hop count to each landmark vertex ("1", "2", "4").
results = g.shortestPaths(landmarks=["1", "2", "4"])
(
    results
    .select("id", "name", "distances")
    .toPandas()
)

Unnamed: 0,id,name,distances
0,1,Jiangnan Song,"{'4': 2, '1': 0, '2': 2}"
1,3,Julius Yang,"{'4': 1, '1': 2, '2': 1}"
2,2,Juan Xu,"{'4': 3, '1': 4, '2': 0}"
3,4,Donny Norlander,"{'4': 0, '1': 1, '2': 3}"
4,6,Stuti Jain,"{'4': 3, '1': 4, '2': 3}"
5,5,Kris Larsen,"{'4': 2, '1': 3, '2': 2}"


# Triangle Count

In [11]:
# Count the triangles each vertex takes part in.
# NOTE(review): the saved output below lists ids 5 and 6 as "Ken Reily" / "De Liu"
# (age 30), which does not match the vertices defined at the top of this notebook
# ("Kris Larsen" / "Stuti Jain", age 18). The output looks stale; re-run from a
# fresh kernel (Restart & Run All) before sharing.
g.triangleCount().toPandas()

Unnamed: 0,count,id,name,age
0,2,3,Julius Yang,20
1,2,5,Ken Reily,30
2,1,6,De Liu,30
3,1,1,Jiangnan Song,18
4,1,4,Donny Norlander,20
5,2,2,Juan Xu,18


# PageRank

In [3]:
# Iterate PageRank until the scores converge to within the tolerance `tol`.
results = g.pageRank(tol=0.01, resetProbability=0.15)
# Show the PageRank score of every vertex as a pandas DataFrame.
# (With show() the displayed values may be truncated, e.g. missing the E
# notation; in Spark 1.5+ show(truncate=False) avoids that.)
# The final edge weights are available from results.edges
# (columns "src", "dst", "weight") if they are needed.
(
    results.vertices
    .select("id", "name", "pagerank")
    .toPandas()
)

Unnamed: 0,id,name,pagerank
0,1,Jiangnan Song,0.678428
1,3,Julius Yang,1.112098
2,2,Juan Xu,0.630664
3,4,Donny Norlander,0.630664
4,6,Stuti Jain,1.37048
5,5,Kris Larsen,1.577666


# Motif Finding

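Motif finding is how you discover structural patterns in the graph. The pattern used below looks for:

1. vertices A and B, connected by an edge from A to B
2. vertices B and C, connected by an edge from B to C

Essentially, it finds two-hop chains A → B → C in which A and C are not directly connected in either direction, so C can be recommended for A to follow.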

In [32]:
# Recommend C for A to follow: A->B->C chains where A and C are not linked
# in either direction.
results = (
    g.find("(A)-[]->(B); (B)-[]->(C); !(A)-[]->(C); !(C)-[]->(A)")
    # Drop chains that loop back to the starting vertex (A == C).
    .filter("A.id != C.id")
    # Keep only the two endpoints of each chain.
    .select("A", "C")
)
results.toPandas()

Unnamed: 0,A,C
0,"(6, Stuti Jain, 18)","(3, Julius Yang, 20)"
1,"(5, Kris Larsen, 18)","(4, Donny Norlander, 20)"
2,"(1, Jiangnan Song, 18)","(5, Kris Larsen, 18)"
3,"(1, Jiangnan Song, 18)","(2, Juan Xu, 18)"
4,"(3, Julius Yang, 20)","(6, Stuti Jain, 18)"
5,"(4, Donny Norlander, 20)","(6, Stuti Jain, 18)"


# Strongly Connected Components

In [41]:
# Label every vertex with the id of its strongly connected component
# (maxIter limits the number of iterations the algorithm runs).
result = g.stronglyConnectedComponents(maxIter=2)
(
    result
    .select("id", "name", "component")
    .orderBy("component")
    .toPandas()
)

Unnamed: 0,id,name,component
0,1,Jiangnan Song,154618822656
1,3,Julius Yang,154618822656
2,2,Juan Xu,154618822656
3,4,Donny Norlander,154618822656
4,6,Stuti Jain,154618822656
5,5,Kris Larsen,154618822656
