# Graph Analytics

This notebook tests the graph analytics capabilities of the GraphFrames library on a sample dataset.

In [None]:
# Import libraries 
import graphframe as GF
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, size, lit, collect_list

In [None]:
def createContext(app_name="SocialGraph", executor_memory="2g", no_of_executors="4"):
    """Build a local-mode Spark SQLContext for this notebook.

    Parameters:
        app_name: Application name set on the Spark configuration.
        executor_memory: Value stored under ``spark.executor.memory``.
        no_of_executors: Value stored under ``spark.executor.instances``.

    Returns:
        A ``SQLContext`` wrapping the active (or newly created) SparkContext.
    """
    spark_conf = (
        SparkConf()
        .setMaster("local")
        .setAppName(app_name)
        .set("spark.executor.memory", executor_memory)
        .set("spark.executor.instances", no_of_executors)
    )
    # getOrCreate reuses a SparkContext that is already running, so executing
    # this cell a second time no longer fails with "Cannot run multiple
    # SparkContexts at once". The conf is only applied when a new context is
    # actually created.
    spark_context = SparkContext.getOrCreate(conf=spark_conf)
    return SQLContext(spark_context)

In [None]:
def loadData(file_path, sql_context):
    """Read a CSV file with a header row into a Spark DataFrame.

    Parameters:
        file_path: Location of the CSV file to load.
        sql_context: Object exposing the Spark ``read`` interface.

    Returns:
        Whatever the reader's ``load`` call returns for ``file_path``.
    """
    reader = sql_context.read.format('com.databricks.spark.csv')
    # The first line of the file is treated as column names.
    reader = reader.options(header='true')
    return reader.load(file_path)

In [None]:
def getStats(df):
    """Print a short summary of a DataFrame.

    Writes the row count, the number of columns and the column names to
    stdout, then asks the DataFrame to show its first 5 rows.

    Parameters:
        df: DataFrame to summarise.

    Returns:
        None.
    """
    report = "\n\n".join([
        "\nRow count: %d" % df.count(),
        "Column Count: %d" % len(df.columns),
        "Column headers: %s" % (df.columns,),
        "Sample Data:\n",
    ])
    print(report)
    df.show(5)

In [None]:
def createVertices(df):
    """Derive the vertices DataFrame from the membership data.

    Parameters:
        df: DataFrame containing a ``user_id`` column.

    Returns:
        DataFrame with a single ``id`` column holding each distinct
        ``user_id`` once.
    """
    print("Creating Vertices DataFrame..")
    vertices = (
        df.select(['user_id'])          # keep only the user column
        .selectExpr("user_id as id")    # rename it to ``id``
        .dropDuplicates()               # one row per user
    )
    print("Vertices DataFrame creation complete.")
    return vertices

In [None]:
def createEdges(df, drop_self_loops=False):
    """Derive the edges DataFrame from the membership data.

    Every pair of rows in ``df`` that share a ``group_id`` becomes a
    (``src``, ``dst``) pair via a full outer self-join. Pairs are then
    grouped so that each (``src``, ``dst``) appears once, carrying the list
    of shared groups and its length.

    Parameters:
        df: DataFrame containing ``user_id`` and ``group_id`` columns.
        drop_self_loops: When True, rows whose ``src`` equals ``dst`` are
            removed before aggregation. Rows where either side is null are
            removed as well, because the comparison does not evaluate to
            true for them. Defaults to False, which keeps every joined row.

    Returns:
        DataFrame with columns ``src``, ``dst``, ``relationship`` (always
        ``"group_member"``), ``common_groups`` (list of shared group ids)
        and ``weight`` (size of ``common_groups``).
    """
    print("Creating Edges DataFrame..")
    members_as_src = df.select(col('user_id').alias('src'), col('group_id'))
    members_as_dst = df.select(col('user_id').alias('dst'), col('group_id'))
    # Self-join on group_id: each member of a group is paired with every
    # member of that same group (including itself).
    edges = members_as_src.join(members_as_dst, on=['group_id'], how='outer')
    if drop_self_loops:
        edges = edges.filter(col('src') != col('dst'))
    edges = edges.withColumn("relationship", lit("group_member"))
    # Collapse duplicate (src, dst) pairs, remembering which groups they share.
    edges = (
        edges.select("src", "dst", "relationship", "group_id")
        .groupBy('src', 'dst', 'relationship')
        .agg(collect_list('group_id').alias('common_groups'))
    )
    # Edge weight = number of groups the two users have in common.
    edges = edges.select('*', size('common_groups').alias('weight'))
    print("Edges DataFrame creation complete.")
    return edges

In [None]:
def createGraph(vertices, edges):
    """Assemble a GraphFrame from vertex and edge DataFrames.

    Parameters:
        vertices: DataFrame of vertices.
        edges: DataFrame of edges.

    Returns:
        The ``GF.GraphFrame`` built from the two DataFrames.
    """
    print("Creating graph..")
    social_graph = GF.GraphFrame(vertices, edges)
    print("Graph creation complete.")
    return social_graph

In [None]:
def saveGraph(graph, vertices_path='store/gv.parquet', edges_path='store/ge.parquet', mode=None):
    """Write a graph's vertices and edges to Parquet.

    Parameters:
        graph: Object exposing ``vertices`` and ``edges`` DataFrames.
        vertices_path: Destination for the vertices DataFrame.
        edges_path: Destination for the edges DataFrame.
        mode: Save mode forwarded to the Parquet writer (for example
            ``"overwrite"`` to replace a previous save). ``None`` leaves the
            writer's own default in effect, as in the original call.

    Returns:
        None.
    """
    print("Saving graph to file..")
    graph.vertices.write.parquet(vertices_path, mode=mode)
    graph.edges.write.parquet(edges_path, mode=mode)
    print("Graph has been saved successfully.")

In [None]:
def loadGraph(graph=None, sql_context=None, vertices_path='store/gv.parquet', edges_path='store/ge.parquet'):
    """Rebuild a GraphFrame from Parquet files.

    Parameters:
        graph: Unused; accepted only so existing calls that pass a graph
            keep working.
        sql_context: Object exposing the Spark ``read`` interface. When
            omitted, a ``SQLContext`` is created on the active (or a new)
            SparkContext.
        vertices_path: Parquet location of the vertices DataFrame.
        edges_path: Parquet location of the edges DataFrame.

    Returns:
        The ``GF.GraphFrame`` built from the loaded DataFrames.
    """
    # The original body read from an undefined global named ``spark``;
    # resolve the reader explicitly instead.
    if sql_context is None:
        sql_context = SQLContext(SparkContext.getOrCreate())
    print("\nLoading graph data..")
    vertices = sql_context.read.parquet(vertices_path)
    edges = sql_context.read.parquet(edges_path)
    print("\nGenerating graph..")
    loaded_graph = GF.GraphFrame(vertices, edges)
    print("\nGraph load complete.")
    return loaded_graph

In [None]:
def first_connects(graph, vertex):
    """Find the direct (one-hop, outgoing) connections of a vertex.

    Parameters:
        graph: GraphFrame to search.
        vertex: Id of the source vertex.

    Returns:
        DataFrame with the id of each destination vertex (``v2.id``) and
        the connecting edge (``e``).
    """
    # Compare through a Column expression rather than splicing the id into
    # SQL text, so ids containing quotes and non-string ids are handled.
    outgoing = graph.find("(v1)-[e]->(v2)").filter(col("v1.id") == vertex)
    return outgoing.select("v2.id", "e")

In [None]:
def cleanUp(df_list):
    """Release cached storage held by the given DataFrames.

    The previous implementation ran ``del df`` on the loop variable, which
    only unbinds that local name and frees nothing. This version calls
    ``unpersist()`` on each element that provides it; elements without that
    method are skipped.

    Parameters:
        df_list: Iterable of DataFrames.

    Returns:
        None.
    """
    for frame in df_list:
        release = getattr(frame, "unpersist", None)
        if callable(release):
            release()
    print("\nDataFrame clean up complete.")

In [None]:
# Driver: load the membership data, derive vertices and edges from it, and
# build the graph. The steps run in order and each one uses the results of
# the previous ones.
if __name__ == "__main__":
    
    # Create the SQL context used for every read below
    sql_context = createContext(app_name="SocialGraph", executor_memory="2g", no_of_executors="4")
    
    # Load group-to-member data into a Pyspark DataFrame
    df_group_members = loadData('data/group_members.csv',sql_context)
    
    # Load group-to-channel data into a Pyspark DataFrame
    # (only summarised below; the graph is built from the member data alone)
    df_group_channels = loadData('data/group_channel.csv',sql_context)
    
    # Print row/column stats and a sample for both input DataFrames
    getStats(df_group_members)
    getStats(df_group_channels)
    
    # Create vertices DataFrame from the member data
    vertices = createVertices(df_group_members)
    
    # Create edges DataFrame from the same member data
    edges = createEdges(df_group_members)
    
    # Print stats for the derived vertices and edges DataFrames
    getStats(vertices)
    getStats(edges)
    
    # Create the graph from the vertices and edges
    duta_graph = createGraph(vertices,edges)
    
    # Hand the intermediate DataFrames to the clean-up helper
    cleanUp([vertices, edges, df_group_members])
    
    # Optional step (disabled): save the graph to file
    #saveGraph(duta_graph)
    
    # Optional step (disabled): load the graph back from file
    #duta_graph = loadGraph()