# Graph Analytics

This notebook tests the graph analytics capabilities of GraphFrames on a sample dataset.

In [1]:
# Import libraries 
import graphframe as GF
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.functions import col

In [2]:
# Build the Spark configuration, then the Spark and SQL contexts used by the rest of the notebook
sparkConf = (
    SparkConf()
    .setMaster("local")
    .setAppName("SocialGraph")
    .set("spark.executor.memory", "2g")
    .set("spark.executor.instances", "4")
)
sparkContext = SparkContext(conf=sparkConf)
sql_context = SQLContext(sparkContext)

In [3]:
# Load the group-membership CSV (first line is the header) into a Spark dataframe
df_group_members = (
    sql_context.read
    .format('com.databricks.spark.csv')
    .options(header='true')
    .load('data/group_members.csv')
)

In [None]:
# Preview the first five rows of the group-membership data
df_group_members.show(5)

In [4]:
# Number of rows in the group-membership dataframe
df_group_members.count()

20917620

In [None]:
# Keep only the user ids and expose them as `id`, the column later used for the graph vertices
df_users = df_group_members.select(['user_id']).selectExpr("user_id as id")

In [None]:
# Remove duplicate user_id entries and create vertices dataframe
# (df_users has one row per membership record, so the same id can occur many times)
vertices=df_users.drop_duplicates()

In [None]:
# Fetch five vertex rows as a quick sanity check
vertices.take(5)

In [None]:
# Create the edges dataframe: link every pair of distinct users that share a group.
# The membership data is joined with itself on group_id, once with user_id exposed
# as `src` and once as `dst`. Each unordered pair therefore appears twice
# (a -> b and b -> a), once per direction.
# An inner join is used rather than an outer one: the only rows an outer join would
# add have a null src or dst, and the `src != dst` filter below discards those anyway.
members_src = df_group_members.select(col('user_id').alias('src'), col('group_id'))
members_dst = df_group_members.select(col('user_id').alias('dst'), col('group_id'))
edges = members_src.join(members_dst, on=['group_id'], how='inner')
# Drop self-pairs produced by a user matching their own membership row
edges = edges.select(col('src'), col('dst'), col('group_id')).filter(edges.src != edges.dst)

In [None]:
# Fetch five edge rows as a quick sanity check
edges.take(5)

In [None]:
# Generate the graph from the vertices dataframe (column `id`) and the
# edges dataframe (columns `src`, `dst`, `group_id`) built in the cells above
graph = GF.GraphFrame(vertices, edges)

In [None]:
# Remove the intermediate notebook-level names now that the graph has been built.
# NOTE(review): presumably the dataframes stay reachable through `graph` — confirm.
del vertices, edges, df_users

In [None]:
# Save the graph to file
#graph.vertices.write.parquet('store/gv.parquet')
#graph.edges.write.parquet('store/ge.parquet')

In [None]:
# Read the graph from file
#vertices = sql_context.read.parquet('store/gv.parquet')
#edges = sql_context.read.parquet('store/ge.parquet')

#graph = GF.GraphFrame(vertices, edges)
#graph.vertices.show(5)
#graph.edges.show(5)

In [None]:
def first_connects(vertex):
    """Return the first connects of a given vertex as a dataframe.

    Args:
        vertex: Id of the vertex whose outgoing edges are looked up. It is
            compared against the `id` field of the motif's source vertex.

    Returns:
        A dataframe with one row per edge leaving `vertex`, holding the
        id of the vertex at the other end (`v2.id`) and the edge itself (`e`).
    """
    # Reads the notebook-level `graph`. The comparison is a Column expression
    # rather than a SQL string spliced together with `vertex`, so ids that
    # contain quotes cannot break the filter and non-string ids are accepted.
    first_connect_motifs = graph.find("(v1)-[e]->(v2)").filter(col("v1.id") == vertex)
    return first_connect_motifs.select("v2.id", "e")

In [None]:
# Compute strongly connected components (one iteration) and list each vertex id
# with its component, ordered by component.
# NOTE(review): this cell was previously labelled "Fetch 1st connects and 2nd connects",
# which is not what stronglyConnectedComponents computes — confirm the intent.
graph_connection = graph.stronglyConnectedComponents(maxIter=1)
(
    graph_connection
    .select("id", "component")
    .orderBy("component")
    .show()
)

In [5]:
# Load the group-to-channel mapping CSV (first line is the header) into a Spark dataframe
df_group_channels = (
    sql_context.read
    .format('com.databricks.spark.csv')
    .options(header='true')
    .load('data/group_channel.csv')
)

In [6]:
# Number of rows in the group-to-channel mapping dataframe
df_group_channels.count()

13150793