## Importing libraries and creating the Spark session

In [1]:
import sys
fileDir = "/home/jovyan/notebooks/"
sys.path.append(fileDir)

from utilities import *
import pyspark.sql.functions as F
import pyspark.sql.types as T

import time as time

In [2]:
# Create the Spark session shared by every cell below. The connector selector
# is passed to the helper (the recorded output shows it adding the Neo4j
# connector jar as a dependency).
session = create_spark_session(
    "Neo4j GitHub",
    SparkConnector.NEO4J,
)

Added dependencies: 
 ['neo4j-connector-apache-spark_2.12-5.0.1_for_spark_3.jar']


In [3]:
# Base HDFS location that the dataset files are read from.
HDFS_URL = "hdfs://namenode:9000//data-team"
# File-name prefix of the JSON datasets.
PREFIX = "sample_" # "sample_" or ""
# File-name suffix of the JSON datasets (presumably the sample size - TODO confirm).
SUFFIX = "_1000" # "_10" or "_100" or "_1000" or ""

### Reading from HDFS, preprocessing, and writing to Neo4j

In [5]:
# --- Load phase -------------------------------------------------------------
# Read the raw datasets from HDFS and record how long it takes in `load_time`.
# NOTE(review): no explicit schema is supplied, so Spark presumably scans the
# files to infer one; that is likely what dominates `load_time` - confirm.
load_start_time = time.time()

# Repository list. The name column is renamed to "repo" so that the join keys
# of the two repository sources have distinct names.
repositories_json = (
    session.read.json(f"{HDFS_URL}/{PREFIX}repositories{SUFFIX}.json")
    .withColumnRenamed("repo_name", "repo")
)

# Extra per-repository attributes; only the columns used below are kept.
repositories_csv = (
    session.read.csv(f"{HDFS_URL}/repo_API_data.csv", header=True, inferSchema=True)
    .select("repo_name", "stargazers_count", "topics")
)

# Left join: every repository from the JSON list is kept, with the CSV
# attributes attached where a matching repo_name exists.
repositories = (
    repositories_json
    .join(repositories_csv, repositories_json.repo == repositories_csv.repo_name, "left")
    .select(
        repositories_json["repo"].alias("repo_name"),
        repositories_json["watch_count"],
        repositories_csv["stargazers_count"],
        repositories_csv["topics"],
    )
)

languages = session.read.json(f"{HDFS_URL}/{PREFIX}languages{SUFFIX}.json")

licences = session.read.json(f"{HDFS_URL}/{PREFIX}licences{SUFFIX}.json")

commits = session.read.json(f"{HDFS_URL}/{PREFIX}commits{SUFFIX}.json") # change this if you need the full dataset

load_end_time = time.time()

load_time = (load_end_time - load_start_time)

# --- Preprocessing phase ----------------------------------------------------
# Derive one DataFrame per node type and per relationship type from the raw
# inputs. The elapsed time is stored in `preproc_time`.
# NOTE(review): only DataFrame transformations are built here (no action is
# called), so this timer presumably measures plan construction only - confirm.
preprc_start_time = time.time()


def _signature_edges(role):
    """Build the contributor -> commit edge DataFrame for `role`.

    `role` is the name of the struct column in `commits` to read
    ("author" or "committer"). The result has the columns email, commit and
    ts (the struct's date.seconds cast to IntegerType), with empty e-mails
    removed and one row per (email, commit) pair.
    """
    return (
        commits.select(f"{role}.email", "commit", f"{role}.date.seconds")
        .withColumnRenamed("seconds", "ts")
        .filter(F.col("email") != "")
        .withColumn("ts", F.col("ts").cast(T.IntegerType()))
        .dropDuplicates(["email", "commit"])
    )


# ---- Nodes ----
# Commits: pick the fields of interest, then rename them positionally.
newColumns = ["id","title","message","committer_date","author_date"]
git_commits = commits.select(
    "commit", "subject", "message", "committer.date.seconds", "author.date.seconds"
).toDF(*newColumns)

git_repositories = repositories.withColumnRenamed("repo_name", "name")

# Languages: one row per distinct language name.
git_languages = (
    languages
    .withColumn("name", F.explode(F.col("language.name")))
    .dropDuplicates(["name"])
    .select("name")
)

# Licenses: one row per distinct license.
git_licenses = (
    licences
    .select(F.col("license").alias("name"))
    .dropDuplicates(["name"])
)

# Contributors: authors and committers stacked together, one row per e-mail.
_author_fields = commits.select("author.*")
_committer_fields = commits.select("committer.*")
git_contributor = (
    _author_fields
    .union(_committer_fields)
    .dropDuplicates(["email"])
    .select("name","email")
)

# ---- Relationships ----
belongs_to = commits.select("commit","repo")
contains = commits.select("repo","commit")

# One row per (commit, parent commit) pair.
parent = (
    commits.select("commit", "parent")
    .withColumn("parent", F.explode(F.col("parent")))
    .dropDuplicates(["commit","parent"])
)

has = (
    licences.select("repo_name", "license")
    .dropDuplicates(["repo_name", "license"])
)

author = _signature_edges("author")
committed = _signature_edges("committer")

# One row per (repository, language) with the byte count as IntegerType.
writted_in = (
    languages
    .withColumn("lang", F.explode(F.col("language")))
    .withColumn("language", F.col("lang.name"))
    .withColumn("bytes", F.col("lang.bytes").cast(T.IntegerType()))
    .select("repo_name", "language", "bytes")
)

preproc_end_time = time.time()
preproc_time = (preproc_end_time - preprc_start_time)

# --- Writing phase, part 1: nodes --------------------------------------------
# `writing_start_time` marks the start of the whole Neo4j write.
writing_start_time = time.time()

# (DataFrame, key property, node label) for every node type, in write order.
node_writes = [
    (git_repositories, "name", ":GitRepository"),
    (git_contributor, "email", ":GitContributor"),
    (git_languages, "name", ":GitLanguage"),
    (git_licenses, "name", ":GitLicense"),
    (git_commits, "id", ":GitCommit"),
]

for node_df, node_key, node_label in node_writes:
    # Start from fresh connector defaults for each write so that no option
    # leaks from one node type to the next.
    options = get_default_options(SparkConnector.NEO4J)
    options["node.keys"] = node_key
    options["labels"] = node_label
    options["schema.optimization.type"] = "INDEX"
    spark_write(SparkConnector.NEO4J, node_df, "Overwrite", options=options)

# --- Writing phase, part 2: relationships -----------------------------------
def _write_relationship(edges_df, rel_type, source, target, properties=None):
    """Append one relationship type to Neo4j and return the options used.

    Parameters
    ----------
    edges_df : DataFrame with one row per relationship to create.
    rel_type : relationship type, e.g. "BELONGS_TO".
    source, target : (node label, "dataframe_column:node_property") pairs
        describing how each endpoint is looked up.
    properties : optional name of the DataFrame column(s) stored as
        relationship properties.

    Both endpoints use save mode "Match": per the Neo4j Spark connector
    documentation the nodes are looked up, not created, so a label or key that
    matches nothing yields no relationships without raising an error.
    """
    rel_options = get_default_options(SparkConnector.NEO4J)
    rel_options["relationship"] = rel_type
    rel_options["relationship.save.strategy"] = "keys"
    if properties is not None:
        rel_options["relationship.properties"] = properties
    for side, (label, node_keys) in (("source", source), ("target", target)):
        rel_options[f"relationship.{side}.labels"] = label
        rel_options[f"relationship.{side}.node.keys"] = node_keys
        rel_options[f"relationship.{side}.save.mode"] = "Match"
    spark_write(SparkConnector.NEO4J, edges_df, "Append", options=rel_options)
    return rel_options


# (DataFrame, relationship type, source endpoint, target endpoint, properties)
# in write order. The CONTAINS source label previously read "GitrRepository",
# which matches no node, so that relationship was never created.
relationship_writes = [
    (belongs_to, "BELONGS_TO", ("GitCommit", "commit:id"), ("GitRepository", "repo:name"), None),
    (contains, "CONTAINS", ("GitRepository", "repo:name"), ("GitCommit", "commit:id"), None),
    (parent, "PARENT", ("GitCommit", "commit:id"), ("GitCommit", "parent:id"), None),
    (has, "HAS", ("GitRepository", "repo_name:name"), ("GitLicense", "license:name"), None),
    (author, "AUTHOR", ("GitContributor", "email:email"), ("GitCommit", "commit:id"), "ts"),
    (committed, "COMMITTED", ("GitContributor", "email:email"), ("GitCommit", "commit:id"), "ts"),
    (writted_in, "WRITTED_IN", ("GitRepository", "repo_name:name"), ("GitLanguage", "language:name"), "bytes"),
]

for edges_df, rel_type, source, target, properties in relationship_writes:
    # `options` is kept as a notebook-level name holding the last options used.
    options = _write_relationship(edges_df, rel_type, source, target, properties)

# End of the whole Neo4j write (nodes + relationships).
writing_end_time = time.time()
writing_time = (writing_end_time - writing_start_time)

Dataframe saved to NEO4J
Dataframe saved to NEO4J
Dataframe saved to NEO4J
Dataframe saved to NEO4J
Dataframe saved to NEO4J
Dataframe saved to NEO4J
Dataframe saved to NEO4J
Dataframe saved to NEO4J
Dataframe saved to NEO4J
Dataframe saved to NEO4J
Dataframe saved to NEO4J
Dataframe saved to NEO4J


In [6]:
# Report the wall-clock duration of each ingestion phase, one line per phase.
timing_lines = (
    f"Load time: {load_time} sec",
    f"Preprocessing time: {preproc_time} sec",
    f"Writing time: {writing_time} sec",
)
for timing_line in timing_lines:
    print(timing_line)

Load time: 592.001033782959 sec
Preprocessing time: 1.1469089984893799 sec
Writing time: 4111.822649002075 sec


In [7]:
# Scenario 1: contributors ranked by the number of distinct repositories that
# contain commits they authored.
# The query itself returns every contributor (no LIMIT); only the final
# take(10) trims the display to ten rows. The timer stops before take(10).
start_time = time.time()
scenario1_query = """
                        MATCH (contrib:GitContributor)-[:AUTHOR]->(commit:GitCommit)-[:BELONGS_TO]->(repo:GitRepository)
                        WITH contrib, COUNT(DISTINCT repo) as repo_count
                        RETURN contrib, repo_count ORDER BY repo_count DESC
                    """
options = get_default_options(SparkConnector.NEO4J)
options["query"] = scenario1_query
top10contributors = spark_read(SparkConnector.NEO4J, session, options=options)
end_time = time.time()
print(f"Scenario 1: {end_time - start_time} sec")

first_rows = top10contributors.take(10)
display(first_rows)

Dataframe loaded from neo4j
Scenario 1: 163.90637874603271 sec


[Row(contrib=Row(<id>=26033, <labels>=['GitContributor'], name='dependabot[bot]', email='1c358da00a777d4e9898c1280ab801e2df165188@users.noreply.github.com'), repo_count=176),
 Row(contrib=Row(<id>=53118, <labels>=['GitContributor'], name='Prayag Verma', email='35a46e17bc00e93336a001ea5a30f33595fd0d03@gmail.com'), repo_count=94),
 Row(contrib=Row(<id>=47589, <labels>=['GitContributor'], name='The Gitter Badger', email='4e199b4a1c40b497a95fcd1cd896351733849949@gitter.im'), repo_count=81),
 Row(contrib=Row(<id>=136378, <labels>=['GitContributor'], name='Ikko Ashimine', email='d41f8067726d843438db002d5555099b4901d7c1@gmail.com'), repo_count=75),
 Row(contrib=Row(<id>=69981, <labels>=['GitContributor'], name='ReadmeCritic', email='76d2ec468599ff7a19f706781bc84ca0c636b360@gmail.com'), repo_count=74),
 Row(contrib=Row(<id>=85625, <labels>=['GitContributor'], name='Tim Gates', email='af3615aeb885952b6b34e8cd8afd1feb0f18c9a5@iress.com'), repo_count=49),
 Row(contrib=Row(<id>=95221, <labels>=['G

In [8]:
# Scenario 2: repositories in which LANGUAGE accounts for more than PERCENTAGE
# of the repository's bytes. The share is rounded to two decimals inside the
# query before it is compared with the threshold.
start_time = time.time()
LANGUAGE = "C++"
PERCENTAGE = 0.5

# Doubled braces are literal Cypher map braces; single braces are filled in
# from the two constants above.
scenario2_query = f"""
                    MATCH (r:GitRepository)-[w:WRITTED_IN]->(l:GitLanguage)
                    WITH r, SUM(w.bytes) AS totalBytesForRepo, collect({{language_name:l.name,bytes: w.bytes}}) AS bytesForLanguages
                    UNWIND bytesForLanguages AS bytesForLanguage
                    WITH r.name AS repo_name, bytesForLanguage.language_name AS lang, round((bytesForLanguage.bytes*1.0/totalBytesForRepo),2) AS percOfBytes
                        WHERE lang = "{LANGUAGE}" AND percOfBytes > {PERCENTAGE}
                    RETURN repo_name, lang, percOfBytes 
                  """
options = get_default_options(SparkConnector.NEO4J)
options["query"] = scenario2_query
bytesPercentageInRepos = spark_read(SparkConnector.NEO4J, session, options=options)
end_time = time.time()
print(f"Scenario 2: {end_time - start_time} sec")

first_rows = bytesPercentageInRepos.take(10)
display(first_rows)

Dataframe loaded from neo4j
Scenario 2: 0.6484222412109375 sec


[Row(repo_name='tensorflow/tensorflow', lang='C++', percOfBytes=0.63),
 Row(repo_name='godotengine/godot', lang='C++', percOfBytes=0.88),
 Row(repo_name='grpc/grpc', lang='C++', percOfBytes=0.65),
 Row(repo_name='google/ion', lang='C++', percOfBytes=0.9),
 Row(repo_name='google/googletest', lang='C++', percOfBytes=0.9),
 Row(repo_name='alibaba/AndFix', lang='C++', percOfBytes=0.62),
 Row(repo_name='facebook/redex', lang='C++', percOfBytes=0.87),
 Row(repo_name='facebook/folly', lang='C++', percOfBytes=0.95),
 Row(repo_name='johang/btfs', lang='C++', percOfBytes=0.85),
 Row(repo_name='tesseract-ocr/tesseract', lang='C++', percOfBytes=0.96)]

In [9]:
# Scenario 3: number of merge commits in REPO_NAME, where a merge commit is a
# commit of that repository with more than one outgoing PARENT relationship.
start_time = time.time()
REPO_NAME = "tensorflow/tensorflow"
options = get_default_options(SparkConnector.NEO4J)
options["query"] =  f"""
                        MATCH (repository:GitRepository {{name: "{REPO_NAME}"}})<-[:BELONGS_TO]-(commit:GitCommit), 
                            r = (commit)-[:PARENT]->()
                        WITH commit, collect(r) AS parents
                        WHERE size(parents) > 1
                        RETURN count(commit) AS mergeCount
                    """
# Stored under its own name so that Scenario 2's `bytesPercentageInRepos`
# DataFrame is not overwritten by an unrelated result.
mergeCommitCount = spark_read(SparkConnector.NEO4J, session, options=options)
end_time = time.time()
print(f"Scenario 3: {end_time - start_time} sec")
mergeCommitCount.show(10)

Dataframe loaded from neo4j
Scenario 3: 0.8158688545227051 sec
+----------+
|mergeCount|
+----------+
|     12127|
+----------+



In [10]:
# Scenario 4 - project graph contributors, commits and repos:
# The GDS projection used by the graph-algorithm cells is not created from this
# notebook; the statement below has to be executed manually in the Neo4j GUI.
PROJ_NAME = "contribRepoAndCommits"
query = f"""     
        CALL gds.graph.project.cypher(
              '{PROJ_NAME}',
              'MATCH (n) WHERE n:GitContributor OR n:GitCommit OR n:GitRepository RETURN ID(n) AS id',
              'MATCH (n)-[r]->(m) WHERE r:PARENT OR r:BELONGS_TO OR r:COMMITTED OR r:AUTHOR RETURN ID(n) AS source, ID(m) AS target'
        )
        """
# Show the statement so that it can be copied into the Neo4j GUI.
print(query)
# Deliberate stop: presumably keeps "Run All" from reaching the GDS cells
# before the projection exists - confirm before removing.
raise Exception("Put query text in the neo4j GUI")


Exception: Put query text in the neo4j GUI

In [11]:
# Export every PARENT / BELONGS_TO / COMMITTED / AUTHOR relationship as a
# (source, target) pair of Neo4j node ids and save the edge list as CSV with a
# header row under the data-team HDFS directory, replacing any previous export.
options = get_default_options(SparkConnector.NEO4J)
options["query"] =  """
                        MATCH (n)-[r]->(m) WHERE r:PARENT OR r:BELONGS_TO OR r:COMMITTED OR r:AUTHOR RETURN ID(n) AS source, ID(m) AS target 
                    """
# Named for what it holds: a plain edge list, not a community-detection result.
edges = spark_read(SparkConnector.NEO4J, session, options=options)
# HDFS_URL resolves to the same location that was previously spelled out here.
edges.write.option("header", True).mode("overwrite").csv(f"{HDFS_URL}/edges")

Dataframe loaded from neo4j


In [12]:
# Scenario 4: community detection on the PROJ_NAME graph projection, streamed
# back as (ID, communityId) rows and saved as CSV on HDFS.
# NOTE(review): the variable and the output directory are called "louvain",
# but the procedure invoked is gds.labelPropagation - confirm which algorithm
# is intended.
start_time = time.time()
PROJ_NAME = "contribRepoAndCommits"

scenario4_query = f"""
                        CALL gds.labelPropagation.stream('{PROJ_NAME}')
                        YIELD nodeId, communityId
                        RETURN nodeId as ID, communityId 
                    """
options = get_default_options(SparkConnector.NEO4J)
options["query"] = scenario4_query
louvain = spark_read(SparkConnector.NEO4J, session, options=options)
end_time = time.time()
print(f"Scenario 4: {end_time - start_time} sec")

# The timer stops before the CSV export, so the export is not included above.
community_writer = louvain.write.option("header", True).mode("overwrite")
community_writer.csv("hdfs://namenode:9000//data-team/louvain")

Dataframe loaded from neo4j
Scenario 4: 123.08715415000916 sec


In [4]:
# Scenario 5: PageRank over the 'contribRepoAndCommits' projection, returning
# each node's `name` property and score, highest score first.
# NOTE(review): the result is stored in `louvain` although it holds PageRank
# scores; the name is kept because the following cell reads it.
start_time = time.time()
scenario5_query = f"""
                        CALL gds.pageRank.stream('contribRepoAndCommits')
                        YIELD nodeId, score
                        RETURN gds.util.asNode(nodeId).name as name, score
                        ORDER BY score DESC
                    """
options = get_default_options(SparkConnector.NEO4J)
options["query"] = scenario5_query
louvain = spark_read(SparkConnector.NEO4J, session, options=options)
end_time = time.time()
print(f"Scenario 5: {end_time - start_time} sec")

Dataframe loaded from neo4j
Scenario 5: 350.89533853530884 sec


In [5]:
# Preview the first ten rows of the DataFrame currently bound to `louvain`.
first_rows = louvain.take(10)
display(first_rows)

[Row(name='tensorflow/tensorflow', score=15630.063491839835),
 Row(name='apple/swift', score=15427.057959425001),
 Row(name='kubernetes/kubernetes', score=12170.155320558157),
 Row(name='dotnet/roslyn', score=10044.6008681273),
 Row(name='Microsoft/vscode', score=9750.781070613648),
 Row(name='rails/rails', score=9693.129889366524),
 Row(name='Homebrew/homebrew', score=7593.8009527367085),
 Row(name='symfony/symfony', score=6869.918907495444),
 Row(name='ansible/ansible', score=6393.512080307458),
 Row(name='golang/go', score=6262.886339651076)]

In [6]:
# Shut Spark down. SparkSession.stop() also stops the underlying SparkContext,
# so a separate session.sparkContext.stop() call is not needed.
session.stop()