In [1]:
import sys
# Put the notebook directory on sys.path so the local `utilities` module can be imported.
# NOTE(review): absolute, container-specific path — confirm it matches the deployment image.
fileDir = "/home/jovyan/notebooks/"
sys.path.append(fileDir)

# NOTE(review): the wildcard import hides where names come from. Later cells use
# create_spark_session, SparkConnector, get_default_options and spark_read, which
# presumably come from `utilities` — confirm and consider importing them explicitly.
from utilities import *
import pyspark.sql.functions as F

In [2]:
import time

import pyspark.sql.types as T
from arango import ArangoClient
from pyspark import pandas as ps



In [4]:
# Scenario 4 - creating job

# Initialize the ArangoDB client.
client = ArangoClient(hosts='http://arangodb:8529')

# Connect to the "_system" database as the root user.
# NOTE(review): credentials are hard-coded (root with an empty password); acceptable
# for a local sandbox only — move them to environment variables before sharing.
db = client.db('_system', username='root', password='')

# Get the Pregel API wrapper (also used by later cells).
pregel = db.pregel

# Start a label-propagation Pregel job on "github_graph".
# store=False means the result is not written back to the vertex documents; it is
# read afterwards through the AQL function PREGEL_RESULT (see the Scenario 4 result cell).
job_id = pregel.create_job(
    graph='github_graph',
    algorithm='labelpropagation',
    store=False,
    max_gss=250,
    thread_count=1,
    async_mode=False,
    result_field='community'
)

# The server runs Pregel jobs in the background, so submitting a job does not mean
# it has finished. Poll the job details until it leaves the in-progress states;
# otherwise a later PREGEL_RESULT query may read an unfinished job.
job = pregel.job(job_id)
while job.get("state") in {"none", "loading", "running", "storing", "recovering"}:
    time.sleep(1)
    job = pregel.job(job_id)

# Fail loudly if the job ended in a non-successful state; a missing state is
# treated as "unknown" and left for the reader to inspect.
final_state = job.get("state")
if final_state is not None and final_state != "done":
    raise RuntimeError(f"Pregel job {job_id} ended in state {final_state!r}")

In [5]:
# Spark session with the ArangoDB connector selected; every scenario cell below
# reads through this session and the final cell stops it.
session = create_spark_session(
    "ArangoDB GitHub",
    SparkConnector.ARANGO,
)

Added dependencies: 
 ['arangodb-java-driver-shaded-7.1.0.jar', 'arangodb-spark-commons-3.3_2.12-1.5.0.jar', 'arangodb-spark-datasource-3.3_2.12-1.5.0.jar', 'commons-codec-1.11.jar', 'commons-logging-1.2.jar', 'httpclient-4.5.13.jar', 'httpcore-4.4.13.jar', 'jackson-dataformat-velocypack-4.1.0.jar', 'slf4j-api-2.0.7.jar']


In [55]:
# Scenario 1
# Contributors (grouped by name) ranked by the number of distinct
# (contributor, repository) pairs reachable via AUTHOR -> BELONGS_TO.
MIN_REPOS = 1   # keep only contributors with strictly more repositories than this
TOP_N = 10      # number of contributors to return

# Triple-quoted f-string, like the other scenarios: backslash continuations inside a
# plain string break silently on trailing whitespace. Doubled braces {{ }} are
# f-string escapes for literal AQL object braces.
query = f"""
    LET distinctValues = (
        FOR c IN GitContributor
            FOR commit IN OUTBOUND c AUTHOR
                FOR r IN OUTBOUND commit BELONGS_TO
                    RETURN DISTINCT {{c, r}}
    )
    FOR d IN distinctValues
        COLLECT contrib = d.c.name WITH COUNT INTO repo_count
        SORT repo_count DESC
        FILTER repo_count > {MIN_REPOS}
        LIMIT {TOP_N}
        RETURN {{contrib, repo_count}}
    """

options = get_default_options(SparkConnector.ARANGO)
options["query"] = query

df = spark_read(SparkConnector.ARANGO, session, options=options)
# The query already caps the result at TOP_N rows, so take exactly that many.
display(df.take(TOP_N))

Dataframe loaded from arangodb


[Row(contrib='Ikko Ashimine', repo_count=4),
 Row(contrib='dependabot[bot]', repo_count=3),
 Row(contrib='MichaÃ«l De Boey', repo_count=3),
 Row(contrib='Kohei TAKATA', repo_count=3),
 Row(contrib='Ronald Eddy Jr', repo_count=3),
 Row(contrib='C. T. Lin', repo_count=3),
 Row(contrib='James George', repo_count=3),
 Row(contrib='Prayag Verma', repo_count=3),
 Row(contrib='James Reggio', repo_count=2),
 Row(contrib='Shingo Sato', repo_count=2)]

In [None]:
# Disabled scratch query: list the commits of one specific contributor (selected by _key) via COMMITTED edges

#options = get_default_options(SparkConnector.ARANGO)
#options["query"] = "FOR contributor IN GitContributor\
#                        FILTER contributor._key == 'f08905f3496a7b7d60e0da97307f6ad7594abc92::users.noreply.github.com'\
#                        FOR commit IN OUTBOUND contributor COMMITTED\
#                            RETURN {contributor: contributor, commit: commit}"

#df = spark_read(SparkConnector.ARANGO, session, options=options)
#display(df.take(20))

In [10]:
# Scenario 2
# Repositories in which LANGUAGE accounts for more than BYTES_PERCENTAGE of the
# bytes recorded on the repository's WRITTEN_IN edges.
LANGUAGE = "JavaScript"
BYTES_PERCENTAGE = 0.5  # ratio threshold (0.5 == 50 %), compared with a strict `>`

# How the AQL below works:
#   1. For every repository, `repoTotalBytes` sums `bytes` over its outbound
#      WRITTEN_IN neighbours; `byteInfo` collects the bytes of the edges linking
#      that repo/language pair and only its first entry is used.
#   2. Repositories with no WRITTEN_IN edges yield an empty subquery result and
#      are dropped by the LENGTH(...) > 0 filter.
#   3. For every (repo, language) pair the language's bytes are divided by the
#      repository total, then filtered on LANGUAGE and BYTES_PERCENTAGE.
# Doubled braces {{ }} are f-string escapes for literal AQL object braces.
# NOTE(review): `byteInfo` re-scans the whole WRITTEN_IN collection for every
# (repo, language) pair; binding the edge in the traversal
# (`FOR lan, edge IN OUTBOUND repo WRITTEN_IN`) would presumably give the same
# value without the scan — confirm there is at most one edge per pair first.
query = f"""
    FOR repo IN GitRepository
    LET repoTotalBytes = (
    FOR lan IN OUTBOUND repo WRITTEN_IN
        LET byteInfo = (
                FOR info IN WRITTEN_IN
                FILTER info._from == repo._id AND info._to == lan._id
                RETURN info.bytes
            )
        COLLECT repository = repo._key
        AGGREGATE repoTotalBytes = SUM(byteInfo[0])
        RETURN {{repository, repoTotalBytes}}
        )
    FILTER LENGTH(repoTotalBytes) > 0 //for the repos with 0 WRITTEN_IN edges. FIX mini-batch
    
    FOR lan IN OUTBOUND repo WRITTEN_IN
        LET byteInfo = (
          FOR info IN WRITTEN_IN
            FILTER info._from == repo._id AND info._to == lan._id
            RETURN info.bytes
        )
        COLLECT repo_name = repo._key, language = lan._key, percentageOfBytes = (byteInfo[0]/repoTotalBytes[0].repoTotalBytes)
        FILTER language == "{LANGUAGE}" AND percentageOfBytes > {BYTES_PERCENTAGE}
        RETURN {{
          repo_name, 
          language,
          percentageOfBytes
        }}
        """

# Run the query through the ArangoDB Spark connector and show the first rows.
options = get_default_options(SparkConnector.ARANGO)
options["query"] = query

df = spark_read(SparkConnector.ARANGO, session, options=options)
display(df.take(20))

Dataframe loaded from arangodb


[Row(language='JavaScript', percentageOfBytes=0.9577282863121208, repo_name='facebook::react'),
 Row(language='JavaScript', percentageOfBytes=0.6090344172523278, repo_name='FreeCodeCamp::FreeCodeCamp')]

In [6]:
# Scenario 3
# Count the commits of REPO_NAME that have more than one parent (merge commits).
REPO_NAME = "tensorflow::tensorflow"

# How the AQL below works:
#   - commits are reached via INBOUND BELONGS_TO edges of the selected repository;
#   - the `parents` subquery groups the commit's outbound PARENT neighbours under
#     the commit key, so `lun` is the number of parents of that commit;
#   - commits without parents produce an empty subquery result and fail the filter;
#   - the surviving commits are counted into `num_merge`.
# Doubled braces {{ }} are f-string escapes for literal AQL object braces.
# NOTE(review): `parents[0].comm == commit._key` looks always true whenever
# `parents` is non-empty, since `comm` is collected from commit._key — confirm
# before simplifying.
query = f"""
        FOR repo IN GitRepository
            FILTER repo._key == "{REPO_NAME}"
            FOR commit IN INBOUND repo BELONGS_TO
                LET parents = ( 
                FOR parent IN OUTBOUND commit PARENT
                    COLLECT comm = commit._key INTO parents
                    RETURN {{lun: length(parents), comm}}
                )
            FILTER parents[0].lun>1 AND parents[0].comm == commit._key
            COLLECT WITH COUNT INTO n_merge
            RETURN {{num_merge: n_merge}}
        """

# Run the query through the ArangoDB Spark connector; the result is a single row.
options = get_default_options(SparkConnector.ARANGO)
options["query"] = query

df = spark_read(SparkConnector.ARANGO, session, options=options)
display(df.take(20))

Dataframe loaded from arangodb


[Row(num_merge=12127)]

In [23]:
# Show the id of the Pregel job whose details are currently held in `job`.
job["id"]


'6747565'

In [6]:
# Scenario 4 - getting job result
# Read the in-memory result of the Pregel job held in `job` (created with
# store=False) and return each vertex key with its `community` value.
# Doubled braces {{ }} are f-string escapes for literal AQL object braces.
query = f"""
        FOR v IN PREGEL_RESULT({job["id"]})
        RETURN {{key: v._key,
                community: v.community}}
        """

options = get_default_options(SparkConnector.ARANGO)
options["query"] = query

df = spark_read(SparkConnector.ARANGO, session, options=options)
# DataFrame.show() prints the table itself and returns None, so wrapping it in
# display() only added a stray "None" to the cell output.
df.show(20)

Dataframe loaded from arangodb
+---------+--------------------+
|community|                 key|
+---------+--------------------+
|        0|32ccc6a0699d6cdb4...|
|   214052|ec7ef50e8b7a61639...|
|     5287|e14bd919f4190cda8...|
|        3|7841569d4d48db86b...|
|        4|71f558ffc155d70e0...|
|        5|4d5677a66efdc13a4...|
|     6754|5b70f3abdf2b74808...|
|     1837|15502617a56103750...|
|     4657|5ab86342e63388663...|
|     7332|e06e0764e33dd3840...|
|      829|f4e223bcebdbda8a4...|
|       11|e193db130bc4d662c...|
|      610|d5fd0b2931c85ac39...|
|       13|1047980dca0830cd5...|
|       14|b11dd87047f40c4c4...|
|      717|f27624a1e9e89dca3...|
|       16|a4ca98ed322d4290f...|
|     4698|be8b0acdd0cee84c5...|
|       18|0e42ad5756e4675b5...|
|       19|e90710ad4640668fa...|
+---------+--------------------+
only showing top 20 rows



None

In [None]:
# Scenario 5:
# Run PageRank on "github_graph" and load each vertex key with its rank.
# NOTE(review): this cell rebinds `job_id` and `job`, so re-running the Scenario 4
# result cell afterwards would read the PageRank job instead.
job_id = pregel.create_job(
    graph='github_graph',
    algorithm='pagerank',
    store=False,
    max_gss=250,
    thread_count=1,
    async_mode=False,
    result_field='result',
    algorithm_params={'threshold': 0.000001}
)

# The server runs Pregel jobs in the background, so the job is usually still
# running right after submission. Poll its details until it leaves the
# in-progress states before querying PREGEL_RESULT.
job = pregel.job(job_id)
while job.get("state") in {"none", "loading", "running", "storing", "recovering"}:
    time.sleep(1)
    job = pregel.job(job_id)

# Fail loudly if the job ended in a non-successful state; a missing state is
# treated as "unknown" and the query is attempted anyway.
final_state = job.get("state")
if final_state is not None and final_state != "done":
    raise RuntimeError(f"Pregel job {job_id} ended in state {final_state!r}")

# Doubled braces {{ }} are f-string escapes for literal AQL object braces.
query = f"""
        FOR v IN PREGEL_RESULT({job["id"]})
        RETURN {{key: v._key,
                rank: v.result}}
        """

options = get_default_options(SparkConnector.ARANGO)
options["query"] = query

df = spark_read(SparkConnector.ARANGO, session, options=options)
# Show a sample of the ranks, as the other scenario cells do for their results.
df.show(20)

In [25]:
# Shut Spark down. SparkSession.stop() also stops the underlying SparkContext,
# so a separate session.sparkContext.stop() call is not needed.
session.stop()