## Exploring GraphX

In [31]:
# All imports are grouped ahead of any executable statement. GraphFrame is
# imported by name rather than through a wildcard so the origin of every
# identifier used below is explicit.
from graphframes import GraphFrame
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists; otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

In [32]:
# Vertex table: one row per person, keyed by the identifier column "id".
vertex_columns = ["id", "name", "favorite_animal"]
vertex_rows = [
    ("1", "Adam", "koala"),
    ("2", "Callie", "flamingo"),
    ("3", "Elle", "panda"),
    ("4", "Jacqui", "fox"),
]
v = spark.createDataFrame(vertex_rows, schema=vertex_columns)

# Edge table: "src" and "dst" hold vertex ids, "relationship" labels the edge.
# Every ordered pair of distinct people appears exactly once, so each vertex
# has three outgoing and three incoming edges.
edge_columns = ["src", "dst", "relationship"]
edge_rows = [
    ("1", "2", "dad"),
    ("1", "3", "husband"),
    ("1", "4", "son_in_law"),
    ("2", "1", "daughter"),
    ("2", "3", "daughter"),
    ("2", "4", "granddaughter"),
    ("3", "1", "wife"),
    ("3", "2", "mom"),
    ("3", "4", "daughter"),
    ("4", "1", "mother_in_law"),
    ("4", "2", "grandmother"),
    ("4", "3", "mom"),
]
e = spark.createDataFrame(edge_rows, schema=edge_columns)

In [33]:
# Assemble the graph from the vertex and edge DataFrames defined above.
g = GraphFrame(v=v, e=e)

In [34]:
# Sanity check: list the vertices held by the graph.
graph_vertices = g.vertices
graph_vertices.show()

+---+------+---------------+
| id|  name|favorite_animal|
+---+------+---------------+
|  1|  Adam|          koala|
|  2|Callie|       flamingo|
|  3|  Elle|          panda|
|  4|Jacqui|            fox|
+---+------+---------------+



In [35]:
# Count the edges whose relationship label is exactly 'grandmother'.
grandmother_edges = g.edges.where("relationship = 'grandmother'")
numGmother = grandmother_edges.count()
print(numGmother)

1


In [36]:
# Run PageRank for a fixed 20 iterations with a reset probability of 0.25.
# Only the vertex side of the returned graph is kept, since that is where
# the `pagerank` column is displayed from.
pagerank_options = {"resetProbability": 0.25, "maxIter": 20}
results = g.pageRank(**pagerank_options)
vertices = results.vertices

In [37]:
# Show each vertex together with its PageRank score.
results.vertices.show()

+---+------+---------------+--------+
| id|  name|favorite_animal|pagerank|
+---+------+---------------+--------+
|  1|  Adam|          koala|     1.0|
|  3|  Elle|          panda|     1.0|
|  2|Callie|       flamingo|     1.0|
|  4|Jacqui|            fox|     1.0|
+---+------+---------------+--------+



In [38]:
# Rebuild the graph with a deliberately lopsided edge list so that PageRank
# no longer assigns every vertex the same score. The scores end up in the
# `pagerank` column of the PageRank result's vertex DataFrame.

# Vertex table: same four people as before, keyed by "id".
vertex_columns = ["id", "name", "favorite_animal"]
vertex_rows = [
    ("1", "Adam", "koala"),
    ("2", "Callie", "flamingo"),
    ("3", "Elle", "panda"),
    ("4", "Jacqui", "fox"),
]
v = spark.createDataFrame(vertex_rows, schema=vertex_columns)

# Edge table ("src" -> "dst"): most edges now point at vertices 1 and 2.
# Vertex 3 is only reached by its own self-loop and vertex 4 has no incoming
# edges at all. Repeated rows are intentional parallel edges.
edge_columns = ["src", "dst", "relationship"]
edge_rows = [
    ("1", "2", "dad"),
    ("1", "2", "husband"),
    ("1", "2", "son_in_law"),
    ("2", "1", "daughter"),
    ("2", "1", "daughter"),
    ("2", "1", "granddaughter"),
    ("3", "3", "wife"),
    ("3", "2", "mom"),
    ("3", "2", "mom"),
    ("4", "1", "mother_in_law"),
    ("4", "2", "mom"),
    ("4", "2", "mom"),
]
e = spark.createDataFrame(edge_rows, schema=edge_columns)

# Replace the earlier graph with the skewed one.
g = GraphFrame(v, e)

In [39]:
# Re-run PageRank on the current graph with the same settings as before
# (20 iterations, reset probability 0.25) so the two runs are comparable.
results = g.pageRank(maxIter=20, resetProbability=0.25)
vertices = results.vertices

In [40]:
# Show the PageRank score of every vertex in the skewed graph.
results.vertices.show()

+---+------+---------------+-------------------+
| id|  name|favorite_animal|           pagerank|
+---+------+---------------+-------------------+
|  1|  Adam|          koala| 1.6439897185491648|
|  3|  Elle|          panda| 0.3333333333339397|
|  2|Callie|       flamingo| 1.7726769481168958|
|  4|Jacqui|            fox|0.25000000000000006|
+---+------+---------------+-------------------+



Yes, these results make sense. I intentionally made vertices 1 and 2 important by pointing many edges at them, while giving vertices 3 and 4 almost no incoming edges (vertex 3 is only reached by its own self-loop, and vertex 4 has no incoming edges at all). Therefore, the PageRank scores are higher for the first two vertices and lower for the latter two.

In [None]:
By attaching many different edges to vertices 1 and 2, I made them more important. By attaching hardly any to 