Helpful links for processing graphs on pyspark:

https://docs.databricks.com/spark/latest/graph-analysis/graphframes/user-guide-python.html

https://graphframes.github.io/graphframes/docs/_site/user-guide.html

https://pysparktutorial.blogspot.com/2017/10/graphframes-pyspark.html

In [1]:
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql.functions import col, size
import pyspark.sql.functions as fn

In [2]:
# Start spark in local mode using 54gb of memory
# local mode only runs on a single node, but it will utilize all cores (We have 48!)
conf = SparkConf().setAppName("test") \
    .setMaster("local[44]") \
    .set('spark.driver.memory','54g') \
    .set('spark.jars.packages', 'graphframes:graphframes:0.7.0-spark2.4-s_2.11')
#.setMaster("yarn") # this is used when we run on hadoop, ignore for now

sc = SparkContext(conf = conf)
sqlContext = SQLContext(sc)

print("Spark Version: ", sc.version)
print("defaultParallelism: ", sc.defaultParallelism)
print("Spark WebURLL ", sc.uiWebUrl) # you can view running jobs here, but I am only able to connect to it via VNC rn, maybe SSH tunneling will fix this? idk

Spark Version:  2.4.4
defaultParallelism:  44
Spark WebURLL  http://c251-117.wrangler.tacc.utexas.edu:4040


In [3]:
sc._conf.getAll() # See all the current Spark configuration settings

[('spark.jars.packages', 'graphframes:graphframes:0.7.0-spark2.4-s_2.11'),
 ('spark.app.name', 'test'),
 ('spark.driver.memory', '54g'),
 ('spark.files',
  'file:///home/06271/cju256/.ivy2/jars/graphframes_graphframes-0.7.0-spark2.4-s_2.11.jar,file:///home/06271/cju256/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),
 ('spark.app.id', 'local-1572750743715'),
 ('spark.executor.id', 'driver'),
 ('spark.local.dir', '/data/06271/cju256/temp'),
 ('spark.driver.host', 'c251-117.wrangler.tacc.utexas.edu'),
 ('spark.master', 'local[44]'),
 ('spark.submit.pyFiles',
  '/home/06271/cju256/.ivy2/jars/graphframes_graphframes-0.7.0-spark2.4-s_2.11.jar,/home/06271/cju256/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),
 ('spark.driver.port', '45129'),
 ('spark.rdd.compress', 'True'),
 ('spark.repl.local.jars',
  'file:///home/06271/cju256/.ivy2/jars/graphframes_graphframes-0.7.0-spark2.4-s_2.11.jar,file:///home/06271/cju256/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),
 ('spark.serializer.objectStreamReset', 

In [4]:
from functools import reduce
from pyspark.sql.functions import col, lit, when
from graphframes import *

In [6]:
nodes_subset_path = '/data/06271/cju256/nodes.json'
edges_subset_path = '/data/06271/cju256/weighted_edge_list.csv'

nodes_subset = sqlContext.read.json(nodes_subset_path)
edges_subset = sqlContext.read.csv(edges_subset_path)

In [7]:
nodes_subset.printSchema()

root
 |-- about: string (nullable = true)
 |-- cancelled: boolean (nullable = true)
 |-- date_created: string (nullable = true)
 |-- email: string (nullable = true)
 |-- external_id: string (nullable = true)
 |-- firstname: string (nullable = true)
 |-- friends: string (nullable = true)
 |-- id: long (nullable = true)
 |-- is_business: boolean (nullable = true)
 |-- lastname: string (nullable = true)
 |-- name: string (nullable = true)
 |-- num_friends: long (nullable = true)
 |-- phone: string (nullable = true)
 |-- picture: string (nullable = true)
 |-- username: string (nullable = true)



In [10]:
edges_subset = edges_subset.withColumnRenamed('_c0', 'src') \
                    .withColumnRenamed('_c1', 'dst') \
                    .withColumnRenamed('_c2', 'weight')

edges_subset.printSchema()

root
 |-- src: string (nullable = true)
 |-- dst: string (nullable = true)
 |-- weight: string (nullable = true)



In [11]:
from pyspark.sql.types import StringType, IntegerType, LongType

just_nodes = nodes_subset.withColumn("id_string", col('id').cast(LongType())).drop('id').withColumnRenamed("id_string",'id')
just_edges = edges_subset \
                .withColumn("src_string", col('src').cast(LongType())).drop('src').withColumnRenamed("src_string",'src') \
                .withColumn("dst_string", col('dst').cast(LongType())).drop('dst').withColumnRenamed("dst_string",'dst')

In [12]:
g = GraphFrame(just_nodes, just_edges)
print(g)

GraphFrame(v:[id: bigint, about: string ... 13 more fields], e:[src: bigint, dst: bigint ... 1 more field])


In [13]:
display(g.vertices)
display(g.edges)

DataFrame[about: string, cancelled: boolean, date_created: string, email: string, external_id: string, firstname: string, friends: string, is_business: boolean, lastname: string, name: string, num_friends: bigint, phone: string, picture: string, username: string, id: bigint]

DataFrame[weight: string, src: bigint, dst: bigint]

In [11]:
g.vertices.agg({"id": "max"}).show()

+--------+
| max(id)|
+--------+
|41493705|
+--------+



In [12]:
print("Nodes: ", g.vertices.count())
print("Edges: ", g.edges.count())

Nodes:  23133264
Edges:  342281006


In [14]:
print("Nodes: ", g.vertices.count())
print("Edges: ", g.edges.count())

Nodes:  23133264
Edges:  132514256


In [15]:
triangles = g.triangleCount()
degree_links = g.degrees.withColumn('links', col('degree') * ( col('degree') - 1))
clustering_coef = triangles.select("id", "count").join(degree_links, on='id')

In [None]:
clustering_coef = clustering_coef.withColumn("clustering_coef", col('count') / (col('links') - (2 / col('degree'))) ).persist()
clustering_coef.show()

In [None]:
clustering_coef.agg({"clustering_coef": "avg"}).show()

In [25]:
cluster_df = sqlContext.read.csv('/data/06271/cju256/clustering.csv')
cluster_df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)



In [26]:
cluster_df = cluster_df.withColumnRenamed('_c0', 'id') \
                .withColumnRenamed('_c1', 'tri_count') \
                .withColumnRenamed('_c2', 'degree') \
                .withColumnRenamed('_c3', 'links') \
                .withColumnRenamed('_c4', 'old_cc')

cluster_df.show()

+--------+---------+------+-----+--------------------+
|      id|tri_count|degree|links|              old_cc|
+--------+---------+------+-----+--------------------+
|10000108|        3|    15|  210| 0.02857142857142857|
|10000172|       63|    47| 2162|0.058279370952821465|
|10000304|        1|     8|   56| 0.03571428571428571|
|10000454|        4|    29|  812|0.009852216748768473|
|10000472|        1|     9|   72|0.027777777777777776|
|10000591|       33|   124|15252|0.004327301337529505|
|10000670|       38|    35| 1190| 0.06386554621848739|
|10000720|       22|   111|12210|0.003603603603603...|
|10000989|      568|   291|84390| 0.01346131058182249|
|  100010|        0|     5|   20|                 0.0|
|10001989|        3|     7|   42| 0.14285714285714285|
|10002011|       10|    44| 1892|0.010570824524312896|
|10002280|       58|   105|10920|0.010622710622710623|
| 1000240|       55|   172|29412|0.003739970080239358|
| 1000280|       88|   260|67340|0.002613602613602...|
|10002811|

In [27]:
cluster_df = cluster_df.withColumn("id_string", col('id').cast(LongType())).drop('id').withColumnRenamed("id_string",'id') \
                    .withColumn("tri_count_string", col('tri_count').cast(LongType())).drop('tri_count').withColumnRenamed("tri_count_string",'tri_count') \
                    .withColumn("degree_string", col('degree').cast(LongType())).drop('degree').withColumnRenamed("degree_string",'degree') \
                    .withColumn("links_string", col('links').cast(LongType())).drop('links').withColumnRenamed("links_string",'links') \
                    .drop('old_cc')
cluster_df.show()

+--------+---------+------+-----+
|      id|tri_count|degree|links|
+--------+---------+------+-----+
|10000108|        3|    15|  210|
|10000172|       63|    47| 2162|
|10000304|        1|     8|   56|
|10000454|        4|    29|  812|
|10000472|        1|     9|   72|
|10000591|       33|   124|15252|
|10000670|       38|    35| 1190|
|10000720|       22|   111|12210|
|10000989|      568|   291|84390|
|  100010|        0|     5|   20|
|10001989|        3|     7|   42|
|10002011|       10|    44| 1892|
|10002280|       58|   105|10920|
| 1000240|       55|   172|29412|
| 1000280|       88|   260|67340|
|10002811|        2|     3|    6|
|10003360|        2|    25|  600|
|10003366|        0|     5|   20|
|10004759|        3|    28|  756|
|10004786|        0|     1|    0|
+--------+---------+------+-----+
only showing top 20 rows



In [28]:
cluster_df = cluster_df.withColumn('degree_recip', 1/col('degree'))
cluster_df.show()

+--------+---------+------+-----+--------------------+
|      id|tri_count|degree|links|        degree_recip|
+--------+---------+------+-----+--------------------+
|10000108|        3|    15|  210| 0.06666666666666667|
|10000172|       63|    47| 2162| 0.02127659574468085|
|10000304|        1|     8|   56|               0.125|
|10000454|        4|    29|  812|0.034482758620689655|
|10000472|        1|     9|   72|  0.1111111111111111|
|10000591|       33|   124|15252|0.008064516129032258|
|10000670|       38|    35| 1190| 0.02857142857142857|
|10000720|       22|   111|12210|0.009009009009009009|
|10000989|      568|   291|84390|0.003436426116838488|
|  100010|        0|     5|   20|                 0.2|
|10001989|        3|     7|   42| 0.14285714285714285|
|10002011|       10|    44| 1892|0.022727272727272728|
|10002280|       58|   105|10920|0.009523809523809525|
| 1000240|       55|   172|29412|0.005813953488372093|
| 1000280|       88|   260|67340|0.003846153846153...|
|10002811|

In [29]:
cluster_df = cluster_df.withColumn('cc', col('tri_count') / (col('links') - 2 * col('degree_recip')))
cluster_df.show()

+--------+---------+------+-----+--------------------+--------------------+
|      id|tri_count|degree|links|        degree_recip|                  cc|
+--------+---------+------+-----+--------------------+--------------------+
|10000108|        3|    15|  210| 0.06666666666666667|0.014294790343074968|
|10000172|       63|    47| 2162| 0.02127659574468085|0.029140259024524662|
|10000304|        1|     8|   56|               0.125|0.017937219730941704|
|10000454|        4|    29|  812|0.034482758620689655|0.004926526798606982|
|10000472|        1|     9|   72|  0.1111111111111111|0.013931888544891642|
|10000591|       33|   124|15252|0.008064516129032258|0.002163652956833749|
|10000670|       38|    35| 1190| 0.02857142857142857| 0.03193430656934307|
|10000720|       22|   111|12210|0.009009009009009009|0.001801804460683...|
|10000989|      568|   291|84390|0.003436426116838488|0.006730655839066276|
|  100010|        0|     5|   20|                 0.2|                 0.0|
|10001989|  

In [30]:
cluster_df.agg({"cc": "avg"}).show()

+------------------+
|           avg(cc)|
+------------------+
|0.0222826487989539|
+------------------+



In [31]:
cluster_df.agg({"id": "max"}).show()

+--------+
| max(id)|
+--------+
|41493705|
+--------+



In [32]:
cluster_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- tri_count: long (nullable = true)
 |-- degree: long (nullable = true)
 |-- links: long (nullable = true)
 |-- degree_recip: double (nullable = true)
 |-- cc: double (nullable = true)

