## Importing libraries and creating the Spark session

In [2]:
import sys
# Make the project-local `utilities` module importable from the notebook directory.
# NOTE(review): absolute path tied to the Jupyter container layout — adjust when running elsewhere.
fileDir = "/home/jovyan/notebooks/"
sys.path.append(fileDir)

# NOTE(review): the star import hides where names come from. This notebook appears to rely on it
# for create_spark_session, SparkConnector, get_default_options, spark_write and spark_read —
# confirm against utilities before switching to explicit imports.
from utilities import *
import pyspark.sql.functions as F
import pyspark.sql.types as T

import time as time

Collecting PyGithub
  Downloading PyGithub-1.59.1-py3-none-any.whl (342 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m342.2/342.2 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting pynacl>=1.4.0
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m856.7/856.7 kB[0m [31m820.0 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting deprecated
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting types-cryptography>=3.3.21
  Downloading types_cryptography-3.3.23.2-py3-none-any.whl (30 kB)
Collecting wrapt<2,>=1.10
  Downloading wrapt-1.15.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.7/75.7 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing coll

In [3]:
# Build the Spark session through the project helper, requesting the Neo4j connector
# (the cell output lists the connector jar that gets added as a dependency).
session = create_spark_session("Neo4j GitHub", SparkConnector.NEO4J)

Added dependencies: 
 ['neo4j-connector-apache-spark_2.12-5.0.1_for_spark_3.jar']


In [4]:
# Base HDFS location of the input files read by this notebook.
HDFS_URL = "hdfs://namenode:9000//data-team"
# The JSON inputs are addressed as f"{HDFS_URL}/{PREFIX}<dataset>{SUFFIX}.json";
# PREFIX and SUFFIX together select which variant of each dataset is loaded.
PREFIX = "sample_" # "sample_" or ""
SUFFIX = "_100" # "_10" or "_100" or ""

### Reading from HDFS, preprocessing and writing to Neo4j

In [13]:
# ETL cell: read the GitHub datasets from HDFS, shape them into node and
# relationship DataFrames, and write them to Neo4j. Each phase is timed; the
# results are left in `load_time`, `preproc_time` and `writing_time` (seconds).
#
# NOTE(review): Spark transformations are lazy, so the "preprocessing" phase
# mostly builds query plans; the computation itself is presumably paid for
# while writing. Keep that in mind when comparing the three timings.


def _contributor_commit_edges(role):
    """Return (email, commit, ts) rows linking contributors to commits.

    `role` is the struct column of `commits` to read from ("author" or
    "committer"). Rows with an empty email are dropped, the timestamp
    (`<role>.date.seconds`) is cast to an integer and exposed as `ts`, and
    (email, commit) pairs are de-duplicated.
    """
    return (
        commits.select(f"{role}.email", "commit", f"{role}.date.seconds")
        .withColumnRenamed("seconds", "ts")
        .filter(F.col("email") != "")
        .withColumn("ts", F.col("ts").cast(T.IntegerType()))
        .dropDuplicates(["email", "commit"])
    )


def _write_nodes(df, label, key):
    """Write the rows of `df` to Neo4j as `label` nodes keyed on property `key`.

    Uses save mode "Overwrite" with `node.keys` set to `key`, and asks the
    connector for an index on the key (`schema.optimization.type = INDEX`).
    """
    node_options = get_default_options(SparkConnector.NEO4J)
    node_options["node.keys"] = key
    node_options["labels"] = f":{label}"
    node_options["schema.optimization.type"] = "INDEX"
    spark_write(SparkConnector.NEO4J, df, "Overwrite", options=node_options)


def _write_relationships(df, rel_type, source, target, properties=None):
    """Append `rel_type` relationships between nodes that already exist.

    `source` and `target` are `(label, "df_column:node_property")` pairs. Both
    ends use save mode "Match", so a relationship is only created when both
    nodes are found: a misspelled label silently produces no relationships.
    `properties`, when given, is the DataFrame column stored on the relationship.
    """
    rel_options = get_default_options(SparkConnector.NEO4J)
    rel_options["relationship"] = rel_type
    rel_options["relationship.save.strategy"] = "keys"
    if properties is not None:
        rel_options["relationship.properties"] = properties
    for side, (label, keys) in (("source", source), ("target", target)):
        rel_options[f"relationship.{side}.labels"] = label
        rel_options[f"relationship.{side}.node.keys"] = keys
        rel_options[f"relationship.{side}.save.mode"] = "Match"
    spark_write(SparkConnector.NEO4J, df, "Append", options=rel_options)


# ---------------------------------------------------------------- load
load_start_time = time.time()

repositories_json = (
    session.read.json(f"{HDFS_URL}/{PREFIX}repositories{SUFFIX}.json")
    .withColumnRenamed("repo_name", "repo")
)

# Extra per-repository metadata; this file is read without PREFIX/SUFFIX.
repositories_csv = (
    session.read.csv(f"{HDFS_URL}/repo_API_data.csv", header=True, inferSchema=True)
    .select("repo_name", "stargazers_count", "topics")
)

# Left join: every repository from the JSON dataset is kept, with CSV columns
# left null when no metadata row matches.
repositories = repositories_json.join(
    repositories_csv,
    repositories_json.repo == repositories_csv.repo_name,
    "left",
).select(
    repositories_json["repo"].alias("repo_name"),
    repositories_json["watch_count"],
    repositories_csv["stargazers_count"],
    repositories_csv["topics"],
)

languages = session.read.json(f"{HDFS_URL}/{PREFIX}languages{SUFFIX}.json")

licences = session.read.json(f"{HDFS_URL}/{PREFIX}licences{SUFFIX}.json")

commits = session.read.json(f"{HDFS_URL}/{PREFIX}commits{SUFFIX}.json")  # change this if you need the full dataset

load_end_time = time.time()
load_time = load_end_time - load_start_time

# ---------------------------------------------------------------- preprocess
preprc_start_time = time.time()

# --- node DataFrames ---
# Both date columns come out of the select named "seconds"; toDF renames all
# columns positionally.
newColumns = ["id", "title", "message", "committer_date", "author_date"]
git_commits = commits.select(
    "commit", "subject", "message", "committer.date.seconds", "author.date.seconds"
).toDF(*newColumns)

git_repositories = repositories.withColumnRenamed("repo_name", "name")

git_languages = (
    languages.withColumn("name", F.explode(languages["language.name"]))
    .dropDuplicates(["name"])
    .select("name")
)

git_licenses = (
    licences.select("license")
    .withColumnRenamed("license", "name")
    .dropDuplicates(["name"])
)

# Authors and committers form a single contributor set, unique by email.
git_contributor = (
    commits.select("author.*")
    .union(commits.select("committer.*"))
    .dropDuplicates(["email"])
    .select("name", "email")
)

# --- relationship DataFrames ---
belongs_to = commits.select("commit", "repo")
contains = commits.select("repo", "commit")

# One row per (commit, parent) pair.
parent = (
    commits.select("commit", "parent")
    .withColumn("parent", F.explode(commits["parent"]))
    .dropDuplicates(["commit", "parent"])
)

has = licences.select("repo_name", "license").dropDuplicates(["repo_name", "license"])

author = _contributor_commit_edges("author")
committed = _contributor_commit_edges("committer")

# One row per (repository, language) with the number of bytes written in it.
writted_in = (
    languages.withColumn("lang", F.explode(languages["language"]))
    .withColumn("language", F.col("lang.name"))
    .withColumn("bytes", F.col("lang.bytes").cast(T.IntegerType()))
    .select("repo_name", "language", "bytes")
)

preproc_end_time = time.time()
preproc_time = preproc_end_time - preprc_start_time

# ---------------------------------------------------------------- write
writing_start_time = time.time()

# Nodes first: the relationship writes below only match existing nodes.
_write_nodes(git_repositories, "GitRepository", "name")
_write_nodes(git_contributor, "GitContributor", "email")
_write_nodes(git_languages, "GitLanguage", "name")
_write_nodes(git_licenses, "GitLicense", "name")
_write_nodes(git_commits, "GitCommit", "id")

_write_relationships(
    belongs_to, "BELONGS_TO",
    source=("GitCommit", "commit:id"),
    target=("GitRepository", "repo:name"),
)
# Fixed: the source label used to be misspelled "GitrRepository", which matches
# no node, so no CONTAINS relationship could be created.
_write_relationships(
    contains, "CONTAINS",
    source=("GitRepository", "repo:name"),
    target=("GitCommit", "commit:id"),
)
_write_relationships(
    parent, "PARENT",
    source=("GitCommit", "commit:id"),
    target=("GitCommit", "parent:id"),
)
_write_relationships(
    has, "HAS",
    source=("GitRepository", "repo_name:name"),
    target=("GitLicense", "license:name"),
)
_write_relationships(
    author, "AUTHOR",
    source=("GitContributor", "email:email"),
    target=("GitCommit", "commit:id"),
    properties="ts",
)
_write_relationships(
    committed, "COMMITTED",
    source=("GitContributor", "email:email"),
    target=("GitCommit", "commit:id"),
    properties="ts",
)
_write_relationships(
    writted_in, "WRITTED_IN",
    source=("GitRepository", "repo_name:name"),
    target=("GitLanguage", "language:name"),
    properties="bytes",
)

writing_end_time = time.time()
writing_time = writing_end_time - writing_start_time

Dataframe saved to NEO4J
Dataframe saved to NEO4J
Dataframe saved to NEO4J
Dataframe saved to NEO4J
Dataframe saved to NEO4J
Dataframe saved to NEO4J
Dataframe saved to NEO4J
Dataframe saved to NEO4J
Dataframe saved to NEO4J
Dataframe saved to NEO4J
Dataframe saved to NEO4J
Dataframe saved to NEO4J
Load time: 0.34276530345280964 min
Preprocessing time: 0.003538235028584798 min
Writing time: 3.3934589425722756 min


In [14]:
# Report the duration of each ETL phase, one line per phase, in seconds.
print(
    f"Load time: {load_time} sec",
    f"Preprocessing time: {preproc_time} sec",
    f"Writing time: {writing_time} sec",
    sep="\n",
)

Load time: 20.56591820716858 sec
Preprocessing time: 0.2122941017150879 sec
Writing time: 203.60753655433655 sec


In [15]:
# Scenario 1: contributors ranked by the number of distinct repositories in
# which they authored at least one commit.
# NOTE(review): only the `spark_read` call is inside the timed window; the ten
# rows shown are fetched afterwards. If `spark_read` is lazy, the printed
# duration under-reports the query cost — confirm against utilities.
start_time = time.time()
options = get_default_options(SparkConnector.NEO4J)
options["query"] =  """
                        MATCH (contrib:GitContributor)-[:AUTHOR]->(commit:GitCommit)-[:BELONGS_TO]->(repo:GitRepository)
                        WITH contrib, COUNT(DISTINCT repo) as repo_count
                        RETURN contrib, repo_count ORDER BY repo_count DESC
                    """
top10contributors = spark_read(SparkConnector.NEO4J, session, options=options)
end_time = time.time()
print(f"Scenario 1: {end_time - start_time} sec")
# The query returns every contributor; only the first ten rows are displayed.
display(top10contributors.limit(10).collect())

Dataframe loaded from neo4j
Scenario 1: 1.9277441501617432 sec


[Row(contrib=Row(<id>=678, <labels>=['GitContributor'], name='dependabot[bot]', email='1c358da00a777d4e9898c1280ab801e2df165188@users.noreply.github.com'), repo_count=25),
 Row(contrib=Row(<id>=10014, <labels>=['GitContributor'], name='Ikko Ashimine', email='d41f8067726d843438db002d5555099b4901d7c1@gmail.com'), repo_count=16),
 Row(contrib=Row(<id>=21466, <labels>=['GitContributor'], name='Prayag Verma', email='35a46e17bc00e93336a001ea5a30f33595fd0d03@gmail.com'), repo_count=16),
 Row(contrib=Row(<id>=33615, <labels>=['GitContributor'], name='Josh Soref', email='dc510c92cc1794ea84000fde88becdce67bf7624@users.noreply.github.com'), repo_count=13),
 Row(contrib=Row(<id>=25567, <labels>=['GitContributor'], name='ReadmeCritic', email='76d2ec468599ff7a19f706781bc84ca0c636b360@gmail.com'), repo_count=12),
 Row(contrib=Row(<id>=1386, <labels>=['GitContributor'], name='jamesgeorge007', email='3e5d4505bd1e679d62cbd9e85b63ce0b6e249349@gmail.com'), repo_count=9),
 Row(contrib=Row(<id>=34726, <labe

In [16]:
# Scenario 2: repositories in which LANGUAGE accounts for more than PERCENTAGE
# of the bytes (share rounded to two decimals before the comparison).

start_time = time.time()
LANGUAGE = "C++"
PERCENTAGE = 0.5
options = get_default_options(SparkConnector.NEO4J)
# Doubled braces are literal Cypher map braces; single braces are f-string fields.
options["query"] = f"""
                    MATCH (r:GitRepository)-[w:WRITTED_IN]->(l:GitLanguage)
                    WITH r, SUM(w.bytes) AS totalBytesForRepo, collect({{language_name:l.name,bytes: w.bytes}}) AS bytesForLanguages
                    UNWIND bytesForLanguages AS bytesForLanguage
                    WITH r.name AS repo_name, bytesForLanguage.language_name AS lang, round((bytesForLanguage.bytes*1.0/totalBytesForRepo),2) AS percOfBytes
                        WHERE lang = "{LANGUAGE}" AND percOfBytes > {PERCENTAGE}
                    RETURN repo_name, lang, percOfBytes 
                  """
bytesPercentageInRepos = spark_read(SparkConnector.NEO4J, session, options=options)
end_time = time.time()
print(f"Scenario 2: {end_time - start_time} sec")
# Show at most ten matching repositories.
display(bytesPercentageInRepos.limit(10).collect())

Dataframe loaded from neo4j
Scenario 2: 0.3414914608001709 sec


[Row(repo_name='tensorflow/tensorflow', lang='C++', percOfBytes=0.63),
 Row(repo_name='electron/electron', lang='C++', percOfBytes=0.58)]

In [17]:
# Scenario 3: number of merge commits (commits with more than one PARENT
# relationship) that belong to REPO_NAME.
start_time = time.time()
REPO_NAME = "tensorflow/tensorflow"
options = get_default_options(SparkConnector.NEO4J)
# Doubled braces are literal Cypher map braces; {REPO_NAME} is an f-string field.
options["query"] =  f"""
                        MATCH (repository:GitRepository {{name: "{REPO_NAME}"}})<-[:BELONGS_TO]-(commit:GitCommit), 
                            r = (commit)-[:PARENT]->()
                        WITH commit, collect(r) AS parents
                        WHERE size(parents) > 1
                        RETURN count(commit) AS mergeCount
                    """
# Stored under its own name: the original reused `bytesPercentageInRepos`
# (copy-paste from Scenario 2), silently replacing that result with a merge count.
mergeCommitsInRepo = spark_read(SparkConnector.NEO4J, session, options=options)
end_time = time.time()
print(f"Scenario 3: {end_time - start_time} sec")
mergeCommitsInRepo.show(10)

Dataframe loaded from neo4j
Scenario 3: 0.6592416763305664 sec
+----------+
|mergeCount|
+----------+
|     12127|
+----------+



In [6]:
# Scenario 4 - project graph contributors, commits and repos:
# Cypher text that creates the GDS graph projection 'contribRepoAndCommits':
# nodes labelled GitContributor, GitCommit or GitRepository, connected by
# PARENT, BELONGS_TO, COMMITTED and AUTHOR relationships.
# The text is only stored in `query`; this cell never sends it to Neo4j.
query = """     
        CALL gds.graph.project.cypher(
              'contribRepoAndCommits',
              'MATCH (n) WHERE n:GitContributor OR n:GitCommit OR n:GitRepository RETURN ID(n) AS id',
              'MATCH (n)-[r]->(m) WHERE r:PARENT OR r:BELONGS_TO OR r:COMMITTED OR r:AUTHOR RETURN ID(n) AS source, ID(m) AS target'
        )
        """
# Projection name; must stay in sync with the literal inside `query`.
PROJ_NAME="contribRepoAndCommits"
# Presumably a deliberate stop: it halts "Run All" here as a reminder to run the
# projection query by hand in the Neo4j GUI before the GDS cells that follow.
raise Exception("Put query text in the neo4j GUI")


Exception: Put query text in the neo4j GUI

In [19]:
# Export the edge list (source id, target id) of the contributor / commit /
# repository graph to HDFS as CSV with a header row.
options = get_default_options(SparkConnector.NEO4J)
# Plain string: the query has no placeholders, so no f-string is needed.
options["query"] =  """
                        MATCH (n)-[r]->(m) WHERE r:PARENT OR r:BELONGS_TO OR r:COMMITTED OR r:AUTHOR RETURN ID(n) AS source, ID(m) AS target 
                    """
# Named for what it holds; the original stored this edge list in `louvain`.
edges = spark_read(SparkConnector.NEO4J, session, options=options)
# Path built from the HDFS_URL config constant instead of repeating the literal.
edges.write.option("header", True).mode("overwrite").csv(f"{HDFS_URL}/edges")

Dataframe loaded from neo4j


In [8]:
# Scenario 4: community detection over the GDS projection PROJ_NAME.
# NOTE(review): the algorithm invoked is label propagation, although the result
# variable and the output directory are called "louvain". Both names are kept
# because other consumers may read them — confirm which algorithm is intended.
start_time = time.time()
PROJ_NAME = "contribRepoAndCommits"

options = get_default_options(SparkConnector.NEO4J)
options["query"] =  f"""
                        CALL gds.labelPropagation.stream('{PROJ_NAME}')
                        YIELD nodeId, communityId
                        RETURN nodeId as ID, communityId 
                    """
louvain = spark_read(SparkConnector.NEO4J, session, options=options)
end_time = time.time()
print(f"Scenario 4: {end_time - start_time} sec")
# Written outside the timed window. The path is built from the HDFS_URL config
# constant instead of repeating the literal; the resulting location is unchanged.
louvain.write.option("header", True).mode("overwrite").csv(f"{HDFS_URL}/louvain")

Dataframe loaded from neo4j
Scenario 4: 7.501660585403442 sec


In [9]:
# Scenario 5: PageRank scores for every node of the GDS projection PROJ_NAME,
# highest score first.
start_time = time.time()
options = get_default_options(SparkConnector.NEO4J)
# Uses PROJ_NAME (set in the Scenario 4 cells) instead of repeating the
# projection name as a literal, so both scenarios target the same graph.
options["query"] = f"""
                        CALL gds.pageRank.stream('{PROJ_NAME}')
                        YIELD nodeId, score
                        RETURN nodeId, score
                        ORDER BY score DESC
                    """
# Stored under its own name; the original overwrote `louvain` with these scores.
pageRankScores = spark_read(SparkConnector.NEO4J, session, options=options)
end_time = time.time()
print(f"Scenario 5: {end_time - start_time} sec")

Dataframe loaded from neo4j
Scenario 5: 2.8513994216918945 sec


In [10]:
# Release cluster resources. SparkSession.stop() also stops the underlying
# SparkContext, so a separate sparkContext.stop() call is not needed.
session.stop()