In [None]:
# --Analyzing Social Networks using GraphX/GraphFrame--

In [None]:
from graphframes import GraphFrame
from pyspark.sql.functions import desc

# Data source: Bitcoin Alpha trust social network (SNAP),
# https://snap.stanford.edu/data/soc-sign-bitcoin-alpha.html
# Each row of the dataset contains: source, target, rating, time.

# Read the header-less CSV (columns come back as _c0, _c1, ...), keep only the
# first two columns, and rename them to src/dst, the edge column names that
# GraphFrame requires later on.
edges_df = (
    spark.read.csv("dbfs:/FileStore/soc_sign_bitcoinalpha.csv", header=False, inferSchema=True)
    .select("_c0", "_c1")
    .toDF("src", "dst")
)
edges_df.show(10)  # Preview the first 10 edges

+----+---+
| src|dst|
+----+---+
|7188|  1|
| 430|  1|
|3134|  1|
|3026|  1|
|3010|  1|
| 804|  1|
| 160|  1|
|  95|  1|
| 377|  1|
| 888|  1|
+----+---+
only showing top 10 rows



In [None]:
# Build the vertex DataFrame: collect every node id that appears as the source
# or the target of an edge into a single "id" column, then drop duplicates.
src_ids = edges_df.select(edges_df["src"].alias("id"))
dst_ids = edges_df.select(edges_df["dst"].alias("id"))
vertices_df = src_ids.union(dst_ids).distinct()
vertices_df.show(10)  # Preview the first 10 vertices

+----+
|  id|
+----+
| 804|
|3026|
|7188|
|3010|
| 377|
| 430|
| 160|
|  95|
| 888|
|3134|
+----+
only showing top 10 rows



In [None]:
# Assemble the graph from the vertex ("id") and edge ("src", "dst") DataFrames
graph = GraphFrame(v=vertices_df, e=edges_df)



In [None]:
# Cache the graph (keep it in memory) for faster access by the queries below.
# The call is the last expression in the cell, so its return value is displayed.
graph.cache()

Out[5]: GraphFrame(v:[id: int], e:[src: int, dst: int])

In [None]:
# Running Queries:
# Run the following queries using the GraphX/GraphFrame API and write your output to a file on the cluster.

In [None]:
# a. Find the top 5 nodes with the highest outdegree and find the count of the number of outgoing edges in each

# graph.outDegrees gives (id, outDegree) rows; order them from the largest
# outgoing-edge count to the smallest and keep the first five.
out_degrees = graph.outDegrees
top5_out_degree = out_degrees.orderBy("outDegree", ascending=False).limit(5)
top5_out_degree.show()  # Print the five nodes with the most outgoing edges



+---+---------+
| id|outDegree|
+---+---------+
|  1|      490|
|  8|      259|
|  3|      243|
|  4|      215|
|  7|      212|
+---+---------+



In [None]:
# b. Find the top 5 nodes with the highest indegree and find the count of the number of incoming edges in each

# graph.inDegrees gives (id, inDegree) rows; order them from the largest
# incoming-edge count to the smallest and keep the first five.
in_degrees = graph.inDegrees
top5_in_degree = in_degrees.orderBy("inDegree", ascending=False).limit(5)
top5_in_degree.show()  # Print the five nodes with the most incoming edges

+---+--------+
| id|inDegree|
+---+--------+
|  1|     398|
|  3|     251|
|  2|     205|
| 11|     203|
|  4|     201|
+---+--------+



In [None]:
# c. Calculate PageRank for each of the nodes and output the top 5 nodes with the highest PageRank values. You are free to define any suitable parameters.

# Run PageRank on the graph with the parameters from the class lab:
# reset probability 0.15 and 10 iterations.
page_rank_results = graph.pageRank(resetProbability=0.15, maxIter=10)

# Rank the resulting vertices by their "pagerank" column, highest first, and
# keep the five best-ranked nodes.
ranked_vertices = page_rank_results.vertices
top5_page_rank = ranked_vertices.sort("pagerank", ascending=False).limit(5)
top5_page_rank.show()  # Print the five nodes with the highest PageRank

+---+------------------+
| id|          pagerank|
+---+------------------+
|  1|  64.0285169308339|
|  3|33.939965394258394|
|  4|30.478870193560947|
|  2|25.205056808837032|
|177| 25.12373691609342|
+---+------------------+



In [None]:
# d. Run the connected components algorithm on it and find the top 5 components with the largest number of nodes.

# The connected components algorithm requires a checkpoint directory, so set it
# before running (same setup as in class).
sc.setCheckpointDir("/tmp/checkpoints")

# Run connected components on the graph
connected_components = graph.connectedComponents()

# Count how many nodes fall into each value of the "component" column
grouped_components_count = connected_components.groupBy("component").count()

# Largest components first; keep the five biggest
top5_components = grouped_components_count.sort("count", ascending=False).limit(5)
top5_components.show()  # Print the five components with the most nodes

+---------+-----+
|component|count|
+---------+-----+
|        1| 3775|
|     3228|    2|
|     1389|    2|
|     5837|    2|
|     1870|    2|
+---------+-----+



In [None]:
# e. Run the triangle counts algorithm on each of the vertices and output the top 5 vertices with the largest triangle count. In case of ties, you can randomly select the top 5 vertices.

# Run the triangle count algorithm on the graph
triangles_count = graph.triangleCount()

# Order by the "count" column, largest first, and keep the first five vertices
top5_triangles = triangles_count.sort("count", ascending=False).limit(5)
top5_triangles.show()  # Print the five vertices with the largest triangle count

+-----+---+
|count| id|
+-----+---+
| 1815| 11|
| 1628|  2|
| 1414|177|
| 1336|  3|
| 1181|  7|
+-----+---+

