# Graph Analytics

This notebook tests the graph analytics capabilities of GraphFrames on a sample dataset.

In [None]:
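# Import libraries
# NOTE(review): the GraphFrames Python package is normally imported as
# `graphframes` - confirm that a module named `graphframe` is available here.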
import graphframe as GF
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.functions import col

In [None]:
# Create (or reuse) the Spark context and the SQL context built on top of it.
# SparkContext.getOrCreate is used instead of the SparkContext constructor so
# that re-running this cell in a live notebook session reuses the running
# context instead of failing because only one SparkContext may be active.
# NOTE(review): with a "local" master the executor memory/instances settings
# presumably have no effect - confirm whether a cluster master was intended.
sparkConf = (
    SparkConf()
    .setMaster("local")
    .setAppName("SocialGraph")
    .set("spark.executor.memory", "2g")
    .set("spark.executor.instances", "4")
)
sparkContext = SparkContext.getOrCreate(conf=sparkConf)
sql_context = SQLContext(sparkContext)

In [None]:
# Load the group members CSV (read with header='true') into a Spark dataframe
csv_reader = sql_context.read.format('com.databricks.spark.csv')
csv_reader = csv_reader.options(header='true')
df_group_members = csv_reader.load('group_members.csv')

In [None]:
# Preview the first 5 rows of the group members data
df_group_members.show(n=5)

In [None]:
# Project the membership data down to the user column, renamed to "id"
# (the column name GraphFrames uses to identify a vertex)
df_users = df_group_members.select(col('user_id').alias('id'))

In [None]:
# Build the vertices dataframe: one row per distinct user id
vertices = df_users.dropDuplicates()

In [None]:
# Peek at the first 5 vertex rows
vertices.take(num=5)

In [None]:
# Create the edges dataframe: link every pair of users that share a group.
# Both sides of the self-join are projections of the membership data, one
# naming the user "src" and the other "dst"; the join key group_id stays on
# each row as an edge attribute.
# An inner join is used because an outer join would also emit rows for
# memberships whose group_id is null (null never matches in a join), leaving
# src or dst null, i.e. an edge without an endpoint.
# NOTE(review): the result still contains self-loops (src == dst) and both
# directions of every pair - confirm this is intended for the analytics.
src_members = df_group_members.select(col('user_id').alias('src'), col('group_id'))
dst_members = df_group_members.select(col('user_id').alias('dst'), col('group_id'))
edges = src_members.join(dst_members, on=['group_id'], how='inner')

In [None]:
# Peek at the first 5 edge rows
edges.take(num=5)

In [None]:
# Assemble the graph from the vertex and edge dataframes built above
graph = GF.GraphFrame(
    vertices,
    edges,
)

In [None]:
# Persist the graph as two Parquet datasets: vertices first, then edges
graph.vertices.write.format('parquet').save('store/gv.parquet')
graph.edges.write.format('parquet').save('store/ge.parquet')