### Importing libraries and creating the Spark session

In [4]:
# Put the notebooks directory on the import path before importing `utilities`
# (presumably that is where the local module lives — confirm).
import sys
fileDir = "/home/jovyan/notebooks/"
sys.path.append(fileDir)

# NOTE(review): the star import hides where names come from. The cells below
# use create_spark_session, SparkConnector, get_default_options and
# spark_write, which are presumably provided by `utilities` — consider
# importing them explicitly once that is confirmed.
from utilities import *
import pyspark.sql.functions as F

In [5]:
# Build the session used by every cell below, requesting the Neo4j connector.
session = create_spark_session(
    "Neo4j GitHub",
    SparkConnector.NEO4J,
)

Added dependencies: 
 ['neo4j-connector-apache-spark_2.12-5.0.1_for_spark_3.jar']


In [6]:
# Base HDFS location from which every dataset path in this notebook is built.
# NOTE(review): the "//" after the port looks like a typo; Hadoop path handling
# presumably normalises it — confirm before changing.
common_URI = "hdfs://namenode:9000//data-team/"

### Reading from HDFS

In [7]:
# Load the raw file listing (schema inferred from the JSON) and inspect it.
files = (
    session.read
    .format("json")
    .load(f"{common_URI}files.json")
)
files.printSchema()

root
 |-- id: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- path: string (nullable = true)
 |-- ref: string (nullable = true)
 |-- repo_name: string (nullable = true)
 |-- symlink_target: string (nullable = true)



In [8]:
# Load the per-repository language breakdown and inspect the inferred schema.
languages = (
    session.read
    .format("json")
    .load(f"{common_URI}languages.json")
)
languages.printSchema()

root
 |-- language: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- bytes: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- repo_name: string (nullable = true)



In [9]:
# Load the repository -> license mapping and inspect the inferred schema.
licences = (
    session.read
    .format("json")
    .load(f"{common_URI}licenses.json")
)
licences.printSchema()

root
 |-- license: string (nullable = true)
 |-- repo_name: string (nullable = true)



In [10]:
# Load the raw commit records and inspect the inferred (nested) schema.
commits = (
    session.read
    .format("json")
    .load(f"{common_URI}commits.json")
)
commits.printSchema()

root
 |-- author: struct (nullable = true)
 |    |-- date: struct (nullable = true)
 |    |    |-- seconds: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- time_sec: string (nullable = true)
 |    |-- tz_offset: string (nullable = true)
 |-- commit: string (nullable = true)
 |-- committer: struct (nullable = true)
 |    |-- date: struct (nullable = true)
 |    |    |-- seconds: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- time_sec: string (nullable = true)
 |    |-- tz_offset: string (nullable = true)
 |-- difference: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- new_mode: string (nullable = true)
 |    |    |-- new_path: string (nullable = true)
 |    |    |-- new_repo: string (nullable = true)
 |    |    |-- new_sha1: string (nullable = true)
 |    |    |-- old_mode: string (nullable = true)
 |    |    

In [11]:
# Project the fields that describe a commit node.
# Both nested `date` structs would otherwise surface as two columns that are
# each called "date", leaving the frame ambiguous to reference by name; alias
# them at selection time so every column is uniquely named from the start.
git_commits = commits.select(
    "commit",
    "subject",
    "message",
    F.col("committer.date").alias("committer_date"),
    F.col("author.date").alias("author_date"),
)

In [12]:
# Final column names for the commit frame, applied positionally by toDF.
newColumns = [
    "id",
    "title",
    "message",
    "committer_date",
    "author_date",
]
git_commits = git_commits.toDF(*newColumns)
git_commits.printSchema()

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- message: string (nullable = true)
 |-- committer_date: struct (nullable = true)
 |    |-- seconds: string (nullable = true)
 |-- author_date: struct (nullable = true)
 |    |-- seconds: string (nullable = true)



In [13]:
# One row per distinct repository name: flatten each commit's `repo_name`
# array into a `name` column and drop repeated names.
repositories = (
    commits
    .select(F.explode("repo_name").alias("name"))
    .dropDuplicates(["name"])
)

In [14]:
# One row per distinct language name.
# `name` is used as the node key when this frame is written to the graph, so
# repeating the same language once per repository only produces redundant
# (and potentially concurrent) writes against the same key; de-duplicate
# here, as is already done for `repositories`.
git_languages = (
    languages
    .select(F.explode("language.name").alias("name"))
    .distinct()
)
git_languages.printSchema()

root
 |-- name: string (nullable = true)



In [15]:
# One row per distinct license name.
# `name` is used as the node key when this frame is written to the graph, so
# keeping one row per repository would only repeat writes against the same
# key; de-duplicate here, as is already done for `repositories`.
git_licenses = (
    licences
    .select(F.col("license").alias("name"))
    .distinct()
)
git_licenses.printSchema()

root
 |-- name: string (nullable = true)



In [16]:
# Keep only the attributes stored on a file node (everything but repo_name).
git_files = files.select(
    [
        "id",
        "ref",
        "path",
        "mode",
        "symlink_target",
    ]
)
git_files.printSchema()

root
 |-- id: string (nullable = true)
 |-- ref: string (nullable = true)
 |-- path: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- symlink_target: string (nullable = true)



### Getting a mini-batch of the dataset

In [17]:
# Mini-batch seed: the first 100 repository names in ascending order,
# persisted to HDFS as JSON (replacing any previous output).
repositories_min = repositories.orderBy("name").limit(100)
repositories_min.write.json(
    f"{common_URI}repositories_min.json",
    mode="overwrite",
)

In [18]:
# Commits that touch at least one mini-batch repository. A commit whose
# `repo_name` array contains several selected repositories yields one row per
# matching repository (the matched name is kept in `name`).
commits_min = (
    commits
    .join(
        repositories_min,
        F.array_contains(commits["repo_name"], repositories_min["name"]),
        "inner",
    )
    .select(
        "commit",
        "subject",
        "message",
        commits["committer.date"].alias("comm_date"),
        commits["author.date"].alias("author_date"),
        "name",
    )
)

In [19]:
# Persist the commit mini-batch to HDFS as JSON, replacing any previous output.
commits_min.write.json(
    f"{common_URI}commits_min.json",
    mode="overwrite",
)

In [24]:
# Language breakdown restricted to the mini-batch repositories.
languages_min = (
    languages
    .join(
        repositories_min,
        languages["repo_name"] == repositories_min["name"],
        "inner",
    )
    .select("language", "name")
)

In [25]:
# Persist the language mini-batch to HDFS as JSON, replacing any previous output.
languages_min.write.json(
    f"{common_URI}languages_min.json",
    mode="overwrite",
)

In [26]:
# Licenses restricted to the mini-batch repositories.
licenses_min = (
    licences
    .join(
        repositories_min,
        licences["repo_name"] == repositories_min["name"],
        "inner",
    )
    .select("license", "name")
)

In [28]:
# Persist the license mini-batch to HDFS as JSON, replacing any previous output.
licenses_min.write.json(
    f"{common_URI}licenses_min.json",
    mode="overwrite",
)

In [None]:
# File records restricted to the mini-batch repositories.
files_min = (
    files
    .join(
        repositories_min,
        files["repo_name"] == repositories_min["name"],
        "inner",
    )
    .select("id", "ref", "path", "mode", "symlink_target", "name")
)

In [None]:
# Persist the file mini-batch to HDFS as JSON, replacing any previous output.
files_min.write.json(
    f"{common_URI}files_min.json",
    mode="overwrite",
)

### Writing the nodes in the graph

In [None]:
# Write language nodes: label :GitLanguage, keyed on `name`.
options = get_default_options(SparkConnector.NEO4J)
options.update({"node.keys": "name", "labels": ":GitLanguage"})

spark_write(SparkConnector.NEO4J, git_languages, "Overwrite", options=options)

In [None]:
# Write license nodes: label :GitLicense, keyed on `name`.
options = get_default_options(SparkConnector.NEO4J)
options.update({"node.keys": "name", "labels": ":GitLicense"})

spark_write(SparkConnector.NEO4J, git_licenses, "Overwrite", options=options)

In [None]:
# Write file nodes: label :GitFile, keyed on `id`.
options = get_default_options(SparkConnector.NEO4J)
options.update({"node.keys": "id", "labels": ":GitFile"})

spark_write(SparkConnector.NEO4J, git_files, "Overwrite", options=options)

In [None]:
# Write commit nodes: label :GitCommit, keyed on `id`.
options = get_default_options(SparkConnector.NEO4J)
options.update({"node.keys": "id", "labels": ":GitCommit"})

spark_write(SparkConnector.NEO4J, git_commits, "Overwrite", options=options)