### Importing libraries and creating the Spark session

In [None]:
import sys
# Directory appended to sys.path before the wildcard import below.
# NOTE(review): presumably this is where the local `utilities` module lives — confirm.
fileDir = "/home/jovyan/notebooks/"
sys.path.append(fileDir)

# Wildcard import: helper names used later in this notebook without a visible definition
# (create_spark_session, SparkConnector, get_default_options, spark_write) presumably
# come from here — verify against utilities.
from utilities import *
import pyspark.sql.functions as F

In [None]:
# Session object used for every `session.read` call in this notebook.
# NOTE(review): "Neo4j GitHub" looks like the application name and the second argument
# selects the connector configuration — confirm against create_spark_session.
session = create_spark_session("Neo4j GitHub", SparkConnector.NEO4J)

In [None]:
# Prefix that every dataset path in this notebook is built from (prefix + file name).
# NOTE(review): there is a double slash after the port — confirm it is intentional.
common_URI = "hdfs://namenode:9000//data-team/"

### Reading from HDFS

In [None]:
# Load the per-file dataset as JSON and print its schema.
files = session.read.format("json").load(f"{common_URI}files.json")
files.printSchema()

In [None]:
# Load the per-repository languages dataset as JSON and print its schema.
languages = session.read.format("json").load(f"{common_URI}languages.json")
languages.printSchema()

In [None]:
# Load the per-repository licenses dataset as JSON and print its schema.
licences = session.read.format("json").load(f"{common_URI}licenses.json")
licences.printSchema()

In [None]:
# Load the reduced commits dataset; change the file name if you need the full dataset.
commits = session.read.format("json").load(f"{common_URI}commits_min.json")
commits.printSchema()

In [None]:
# Keep only the commit fields that are written to the graph. Both nested `date`
# fields are selected under the same leaf name, so the order here matters for the
# positional rename applied to this frame afterwards.
git_commits = commits.select(
    "commit",
    "subject",
    "message",
    "committer.date",
    "author.date",
)

In [None]:
# Positional rename: one new name per column of git_commits, in column order.
newColumns = [
    "id",
    "title",
    "message",
    "committer_date",
    "author_date",
]
git_commits = git_commits.toDF(*newColumns)
git_commits.printSchema()

In [None]:
# Load the reduced repositories dataset; change the file name if you need the full dataset.
repositories = session.read.format("json").load(f"{common_URI}repositories_min.json")
repositories.printSchema()

In [None]:
# Repository frame for the graph: same columns, with `repo_name` exposed as `name`.
git_repositories = repositories.withColumnRenamed(existing="repo_name", new="name")
git_repositories.printSchema()

In [None]:
# One row per distinct language name: explode the nested `language.name` array,
# de-duplicate on the exploded value and keep only that column.
git_languages = (
    languages
    .withColumn("name", F.explode(languages["language.name"]))
    .dropDuplicates(["name"])
    .select("name")
)
git_languages.printSchema()

In [None]:
# Single-column frame of license values, exposed as `name` (no de-duplication here).
git_licenses = licences.select(F.col("license").alias("name"))
git_licenses.printSchema()

In [None]:
# Projection of the file attributes kept for the graph.
git_files = files.select(
    "id",
    "ref",
    "path",
    "mode",
    "symlink_target",
)
git_files.printSchema()

### Getting a mini-batch of the dataset

In [None]:
# Mini-batch: the first 100 repositories in `repo_name` order.
# NOTE(review): the result overwrites repositories_min.json. If `repositories` was itself
# loaded from that same path, this job reads and overwrites one location — load the
# full dataset before running this section.
repositories_min = repositories.sort("repo_name").limit(100)
repositories_min.printSchema()
(
    repositories_min.write
    .format("json")
    .mode("overwrite")
    .save(f"{common_URI}repositories_min.json")
)

In [None]:
# Keep only commits whose `repo` matches a sampled repository, then remove the
# `repo_name` column that the join brought in (via a temporary rename to `repo2`).
# Any other column of repositories_min stays in the result.
commits_min = (
    commits
    .join(repositories_min, commits["repo"] == repositories_min["repo_name"])
    .withColumnRenamed("repo_name", "repo2")
    .drop("repo_name")
    .drop("repo2")
)
commits_min.printSchema()

In [None]:
# Persist the reduced commits dataset, replacing any previous version.
# NOTE(review): if `commits` was loaded from commits_min.json, this overwrites the
# path it is being read from — load the full dataset before running this cell.
commits_min.write.format("json").mode("overwrite").save(f"{common_URI}commits_min.json")

In [None]:
# Languages restricted to the sampled repositories.
# The repository name is called `repo_name` on both sides of the join; no `name`
# column is visible in either input, so it is produced here by aliasing `repo_name`.
# Joining on the column name keeps a single, unambiguous `repo_name` column.
languages_min = (
    languages
    .join(repositories_min.select("repo_name"), on="repo_name")
    .select("language", F.col("repo_name").alias("name"))
)

In [None]:
# Persist the reduced languages dataset, replacing any previous version.
languages_min.write.format("json").mode("overwrite").save(f"{common_URI}languages_min.json")

In [None]:
# Licenses restricted to the sampled repositories.
# The repository name is called `repo_name` on both sides of the join; no `name`
# column is visible in either input, so it is produced here by aliasing `repo_name`.
# Joining on the column name keeps a single, unambiguous `repo_name` column.
licenses_min = (
    licences
    .join(repositories_min.select("repo_name"), on="repo_name")
    .select("license", F.col("repo_name").alias("name"))
)

In [None]:
# Persist the reduced licenses dataset, replacing any previous version.
licenses_min.write.format("json").mode("overwrite").save(f"{common_URI}licenses_min.json")

In [None]:
# Files restricted to the sampled repositories.
# The repository name is called `repo_name` on both sides of the join; no `name`
# column is visible in either input, so it is produced here by aliasing `repo_name`.
# Joining on the column name keeps a single, unambiguous `repo_name` column.
files_min = (
    files
    .join(repositories_min.select("repo_name"), on="repo_name")
    .select(
        "id",
        "ref",
        "path",
        "mode",
        "symlink_target",
        F.col("repo_name").alias("name"),
    )
)

In [None]:
# Persist the reduced files dataset, replacing any previous version.
files_min.write.format("json").mode("overwrite").save(f"{common_URI}files_min.json")

### Writing the nodes in the graph

In [None]:
"""options = get_default_options(SparkConnector.NEO4J)

options["node.keys"] = "name"
options["labels"] = ":GitLanguage"

spark_write(SparkConnector.NEO4J, git_languages, "Overwrite", options=options)"""

In [None]:
"""options = get_default_options(SparkConnector.NEO4J)

options["node.keys"] = "name"
options["labels"] = ":GitLicense"

spark_write(SparkConnector.NEO4J, git_licenses, "Overwrite", options=options)"""

In [None]:
"""options = get_default_options(SparkConnector.NEO4J)

options["node.keys"] = "id"
options["labels"] = ":GitFile"

spark_write(SparkConnector.NEO4J, git_files, "Overwrite", options=options)"""

In [9]:
# Write one :GitCommit node per row of git_commits, keyed on `id`.
options = get_default_options(SparkConnector.NEO4J)
options.update({
    "node.keys": "id",
    "labels": ":GitCommit",
})

spark_write(SparkConnector.NEO4J, git_commits, "Overwrite", options=options)

Dataframe saved to NEO4J


In [10]:
# Write one :GitRepository node per row of git_repositories, keyed on `name`.
options = get_default_options(SparkConnector.NEO4J)
options.update({
    "node.keys": "name",
    "labels": ":GitRepository",
})

spark_write(SparkConnector.NEO4J, git_repositories, "Overwrite", options=options)

Dataframe saved to NEO4J


### Writing the relationships in the graph

In [19]:
# Re-inspect the commits schema before choosing the columns that link commits to repositories.
commits.printSchema()

root
 |-- author: struct (nullable = true)
 |    |-- date: struct (nullable = true)
 |    |    |-- seconds: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- time_sec: string (nullable = true)
 |    |-- tz_offset: string (nullable = true)
 |-- commit: string (nullable = true)
 |-- committer: struct (nullable = true)
 |    |-- date: struct (nullable = true)
 |    |    |-- seconds: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- time_sec: string (nullable = true)
 |    |-- tz_offset: string (nullable = true)
 |-- difference: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- new_mode: string (nullable = true)
 |    |    |-- new_path: string (nullable = true)
 |    |    |-- new_sha1: string (nullable = true)
 |    |    |-- old_mode: string (nullable = true)
 |    |    |-- old_path: string (nullable = true)
 |    |    

In [22]:
# Edge list for the commit -> repository relationship: commit hash and repository name.
rel = commits.select(["commit", "repo"])
rel.show(n=10)

+--------------------+--------------------+
|              commit|                repo|
+--------------------+--------------------+
|98da69bf13dc56259...|Automattic/wp-cal...|
|35a6b40e63459c80a...|FreeCodeCamp/Free...|
|c575b2d77ad47238f...|ButchersBoy/Mater...|
|6d5e3865e748dad30...|FreeCodeCamp/Free...|
|dfd09c9da391e57bf...|BrowserSync/brows...|
|1457ff59ad3921eeb...|Automattic/wp-cal...|
|4eb66ac3b5d9ed670...|BlackrockDigital/...|
|227175820b8feac8b...| CosmicMind/Material|
|38d71abda65d25d1d...|FreeCodeCamp/Free...|
|c0605249cd2891e58...|FreeCodeCamp/Free...|
+--------------------+--------------------+
only showing top 10 rows



In [None]:
# Create (GitCommit)-[:BELONGS_TO]->(GitRepository) relationships from `rel`.
# Each "<dataframe column>:<node property>" pair tells the connector how to look up
# the endpoint: rel.commit -> GitCommit.id, rel.repo -> GitRepository.name.
options = get_default_options(SparkConnector.NEO4J)
options.update({
    "relationship": "BELONGS_TO",
    "relationship.source.labels": "GitCommit",
    "relationship.save.strategy": "keys",
    "relationship.source.node.keys": "commit:id",
    "relationship.target.labels": "GitRepository",
    "relationship.target.node.keys": "repo:name",
})

spark_write(SparkConnector.NEO4J, rel, "Overwrite", options=options)