# Graph Analytics

This notebook tests the graph analytics capabilities of GraphFrames on a sample dataset.

In [1]:
# Import libraries 
import graphframe as GF
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.functions import col

In [2]:
# Create (or reuse) the Spark context and the SQL context built on top of it.
# NOTE(review): with master "local" the executor memory/instances settings
# probably have no effect -- confirm against the intended deployment.
sparkConf = (
    SparkConf()
    .setMaster("local")
    .setAppName("SocialGraph")
    .set("spark.executor.memory", "2g")
    .set("spark.executor.instances", "4")
)
# getOrCreate() hands back the already-active context when this cell is re-run
# in a live kernel, instead of failing because a SparkContext already exists.
# In that case the configuration above is not re-applied.
sparkContext = SparkContext.getOrCreate(conf=sparkConf)
sql_context = SQLContext(sparkContext)

In [3]:
# Load the group-membership CSV (first row is the header) into a Spark dataframe
df_group_members = (
    sql_context.read
    .format('com.databricks.spark.csv')
    .options(header='true')
    .load('data/group_members.csv')
)

In [4]:
# Preview the first five rows of the group-membership data
df_group_members.show(n=5)

+--------------------+--------------------+--------------------+--------------------+--------+--------+
|         create_time|         update_time|            group_id|             user_id|is_admin|added_by|
+--------------------+--------------------+--------------------+--------------------+--------+--------+
|2016-06-25 12:12:...|2016-06-25 12:12:...|c8a5ca5947cc7d925...|fcaa201f972dfc428...|       f|    null|
|2018-03-26 18:30:...|2018-03-26 18:30:...|d2c8410bb78af4615...|26cf4a8044866c1bc...|       f|    null|
|2017-12-18 10:23:...|2017-12-18 10:23:...|2c07f080d9f041295...|570a2ddfdccc2b775...|       f|    null|
|2018-02-08 16:58:...|2018-02-08 16:58:...|1430c9033779d7585...|df8d3021822ef6fed...|       f|    null|
|2018-03-26 18:30:...|2018-03-26 18:30:...|d2c8410bb78af4615...|777862a1db383bcd7...|       f|    null|
+--------------------+--------------------+--------------------+--------------------+--------+--------+
only showing top 5 rows



In [5]:
# Keep only the user_id column, renamed to `id`, as the basis for the vertices dataframe
df_users = df_group_members.select(col('user_id').alias('id'))

In [6]:
# Build the vertices dataframe: one row per distinct user id
vertices = df_users.dropDuplicates()

In [7]:
# Fetch five vertex rows to check the result
vertices.take(num=5)

[Row(id=u'b84ececc601c97d81eb0be2cac4b7950b3c2feb0'),
 Row(id=u'b988970fb04bc223621b0b8dbb8d3be615d1c7eb'),
 Row(id=u'e8c49ddb70edc78009cbd80195aca9938c415f87'),
 Row(id=u'21e9d66847d7c6392b9fcd105db203304e32416a'),
 Row(id=u'8f8cbcd1a3364ba1d44102963fc11f6bd09e1be8')]

In [24]:
# Create the edges dataframe: one row (src, dst, group_id) for every ordered
# pair of distinct users that belong to the same group. Because the membership
# table is joined with itself, each pair appears in both directions.
#
# An inner join is sufficient here: the extra null-padded rows a full outer
# join would add (for group_ids that do not match) would be discarded by the
# src != dst filter anyway, since a comparison against null is not true.
edges = (
    df_group_members.select(col('user_id').alias('src'), col('group_id'))
    .join(
        df_group_members.select(col('user_id').alias('dst'), col('group_id')),
        on=['group_id'],
        how='inner'
    )
    .select(col('src'), col('dst'), col('group_id'))
    # Drop self-loops. Columns are resolved by name on the projected frame
    # rather than through the pre-projection DataFrame object, which is the
    # more robust form after a self-join.
    .filter(col('src') != col('dst'))
)

In [23]:
# Fetch five edge rows to check the result
edges.take(num=5)

[]

In [None]:
# Generate the graph from the vertices dataframe (column `id`) and the
# edges dataframe (columns `src`, `dst`, `group_id`) built in the cells above.
graph = GF.GraphFrame(vertices, edges)

In [None]:
# Drop the intermediate dataframe names that were only needed to build the graph
del vertices
del edges
del df_users

In [None]:
# Save the graph to file
#graph.vertices.write.parquet('store/gv.parquet')
#graph.edges.write.parquet('store/ge.parquet')

In [None]:
# Read the graph from file
#vertices = sql_context.read.parquet('store/gv.parquet')
#edges = sql_context.read.parquet('store/ge.parquet')

#graph = GF.GraphFrame(vertices, edges)
#graph.vertices.show(5)
#graph.edges.show(5)

In [None]:
# TODO: fetch first-degree and second-degree connections

In [None]:
# Create Spark dataframe for group to channel mapping data
df_group_members = sql_context.read.format('com.databricks.spark.csv').options(header='true').load('data/group_channel.csv')

In [None]:
df_group_members.show(5)