In [1]:
import sys
# Directory added to the module search path so that the `utilities` import
# further down in this cell can be resolved (presumably that is where the
# helper module lives -- confirm for your environment).
fileDir = "/home/jovyan/notebooks/"
sys.path += [fileDir]

from utilities import *
import pyspark.sql.functions as F

In [2]:
from pyspark import pandas as ps
import pyspark.sql.types as T



In [3]:
# Create the Spark session that every query cell below reads through.
# `create_spark_session` and `SparkConnector` come from the local `utilities`
# module; the first argument looks like the Spark application name and the
# second selects the ArangoDB connector -- TODO confirm against `utilities`.
session = create_spark_session("ArangoDB GitHub", SparkConnector.ARANGO)

Added dependencies: 
 ['arangodb-java-driver-shaded-7.1.0.jar', 'arangodb-spark-commons-3.3_2.12-1.5.0.jar', 'arangodb-spark-datasource-3.3_2.12-1.5.0.jar', 'commons-codec-1.11.jar', 'commons-logging-1.2.jar', 'httpclient-4.5.13.jar', 'httpcore-4.4.13.jar', 'jackson-dataformat-velocypack-4.1.0.jar', 'slf4j-api-2.0.7.jar']


In [None]:
# Scenario 1
# Rank contributors by the number of distinct repositories they have authored
# commits in, keeping only those active in more than MIN_REPO_COUNT repositories.

TOP_N = 10          # number of contributors to return
MIN_REPO_COUNT = 1  # keep contributors with strictly more repositories than this

# Written as a triple-quoted f-string, like the Scenario 2/3 queries, instead of
# backslash continuations inside a plain string (a single trailing space after a
# backslash would silently break those).
#
# Query notes:
#   * DISTINCT is taken over document ids plus the name rather than over whole
#     contributor/repository documents; the set of pairs is the same but far
#     less data is compared and carried around.
#   * FILTER runs before SORT so that discarded groups are never sorted.
#   * The secondary sort key makes the top-N deterministic when counts tie.
#
# NOTE(review): grouping is by contributor *name*, so two GitContributor
# documents sharing a name are merged (and a repository reached through both is
# counted twice). Confirm this is intended.
# NOTE(review): the disabled query in the next cell traverses COMMITTED rather
# than AUTHOR -- confirm which edge collection links contributors to commits.
query = f"""
    LET contributorRepoPairs = (
        FOR c IN GitContributor
            FOR commit IN OUTBOUND c AUTHOR
                FOR r IN OUTBOUND commit BELONGS_TO
                    RETURN DISTINCT {{contributor: c._id, name: c.name, repo: r._id}}
    )
    FOR pair IN contributorRepoPairs
        COLLECT contrib = pair.name WITH COUNT INTO repo_count
        FILTER repo_count > {MIN_REPO_COUNT}
        SORT repo_count DESC, contrib ASC
        LIMIT {TOP_N}
        RETURN {{contrib, repo_count}}
    """

options = get_default_options(SparkConnector.ARANGO)
options["query"] = query

df = spark_read(SparkConnector.ARANGO, session, options=options)
# The query already caps the result at TOP_N rows.
display(df.take(TOP_N))

In [None]:
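# Disabled helper query (kept for reference): list the commits of one specific contributor.
# NOTE(review): the heading here originally read "Find contributors who have contributed to the
# largest number of repositories", which describes the Scenario 1 query rather than this one.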

#options = get_default_options(SparkConnector.ARANGO)
#options["query"] = "FOR contributor IN GitContributor\
#                        FILTER contributor._key == 'f08905f3496a7b7d60e0da97307f6ad7594abc92::users.noreply.github.com'\
#                        FOR commit IN OUTBOUND contributor COMMITTED\
#                            RETURN {contributor: contributor, commit: commit}"

#df = spark_read(SparkConnector.ARANGO, session, options=options)
#display(df.take(20))

In [10]:
# Scenario 2
# Find repositories in which LANGUAGE accounts for more than BYTES_PERCENTAGE
# of all bytes recorded on the repository's WRITTEN_IN edges.
LANGUAGE = "JavaScript"
BYTES_PERCENTAGE = 0.5  # a fraction (it is compared with bytes / totalBytes), not 0-100

# Query notes:
#   * The byte count is read from the traversal's own edge variable
#     (`FOR lan, edge IN OUTBOUND ...`) instead of looking the WRITTEN_IN edge up
#     again with a subquery for every traversed language.
#   * `FILTER repoTotalBytes > 0` skips repositories without WRITTEN_IN edges
#     (SUM of an empty list is 0) and also avoids a division by zero for
#     repositories whose recorded bytes add up to 0.
#   * The language filter is applied before the percentage is computed, and a
#     plain RETURN replaces the COLLECT that was only used as a projection.
# Assumes at most one WRITTEN_IN edge per (repository, language) pair; with
# parallel edges each edge would produce its own row -- TODO confirm.
query = f"""
    FOR repo IN GitRepository
        LET languageBytes = (
            FOR lan, edge IN OUTBOUND repo WRITTEN_IN
                RETURN edge.bytes
        )
        LET repoTotalBytes = SUM(languageBytes)
        FILTER repoTotalBytes > 0

        FOR lan, edge IN OUTBOUND repo WRITTEN_IN
            FILTER lan._key == "{LANGUAGE}"
            LET percentageOfBytes = edge.bytes / repoTotalBytes
            FILTER percentageOfBytes > {BYTES_PERCENTAGE}
            RETURN {{
                repo_name: repo._key,
                language: lan._key,
                percentageOfBytes
            }}
    """

options = get_default_options(SparkConnector.ARANGO)
options["query"] = query

df = spark_read(SparkConnector.ARANGO, session, options=options)
display(df.take(20))

Dataframe loaded from arangodb


[Row(language='JavaScript', percentageOfBytes=0.9577282863121208, repo_name='facebook::react'),
 Row(language='JavaScript', percentageOfBytes=0.6090344172523278, repo_name='FreeCodeCamp::FreeCodeCamp')]

In [6]:
# Scenario 3
# Count the merge commits of REPO_NAME, i.e. commits that belong to the
# repository and have more than one outgoing PARENT edge.
REPO_NAME = "tensorflow::tensorflow"

# Query notes:
#   * Parents are counted with LENGTH over a subquery that returns a constant,
#     so no parent documents are materialised just to be counted.
#   * This replaces a COLLECT ... INTO on the (constant) commit key followed by
#     a filter comparing that key with itself, which was always true whenever
#     the commit had at least one parent.
query = f"""
    FOR repo IN GitRepository
        FILTER repo._key == "{REPO_NAME}"
        FOR commit IN INBOUND repo BELONGS_TO
            LET parentMarkers = (
                FOR parent IN OUTBOUND commit PARENT
                    RETURN 1
            )
            FILTER LENGTH(parentMarkers) > 1
            COLLECT WITH COUNT INTO n_merge
            RETURN {{num_merge: n_merge}}
    """

options = get_default_options(SparkConnector.ARANGO)
options["query"] = query

df = spark_read(SparkConnector.ARANGO, session, options=options)
display(df.take(20))

Dataframe loaded from arangodb


[Row(num_merge=12127)]

In [None]:
# Shut Spark down at the end of the notebook. Stopping the session also stops
# its underlying SparkContext, so a separate `session.sparkContext.stop()` call
# is not needed (assumes `session` is a regular pyspark SparkSession).
session.stop()