## Importing libraries and creating the Spark session

In [2]:
import sys
# Make the shared notebook helpers importable.
# NOTE(review): absolute, environment-specific path — consider making it configurable.
fileDir = "/home/jovyan/notebooks/"
sys.path.append(fileDir)

# NOTE(review): wildcard import; this notebook uses create_spark_session, SparkConnector,
# get_default_options and spark_write, which presumably all come from `utilities` — confirm
# and import them explicitly.
from utilities import *
import pyspark.sql.functions as F

In [3]:
# Create the Spark session used by every cell below; the helper is asked for the Neo4j connector
# (the cell output lists the neo4j-connector-apache-spark jar as an added dependency).
session = create_spark_session("Neo4j GitHub", SparkConnector.NEO4J)

Added dependencies: 
 ['neo4j-connector-apache-spark_2.12-5.0.1_for_spark_3.jar']


In [4]:
# Base location of the JSON datasets; every read below is f"{HDFS_URL}/{PREFIX}<dataset>.json".
# NOTE(review): the double slash before "data-team" looks unintentional; the commits read
# below evidently works with it — confirm before changing.
HDFS_URL = "hdfs://namenode:9000//data-team"
# File-name prefix selecting which dataset variant is read.
PREFIX = "sample_" # or "" (presumably selects the full dataset — confirm)

### Reading from HDFS

In [None]:
# Load the repositories dataset (change PREFIX if you need the full dataset).
repositories = session.read.format("json").load(f"{HDFS_URL}/{PREFIX}repositories.json")
repositories.printSchema()

In [None]:
# Load the languages dataset and show the schema Spark inferred for it.
languages = session.read.format("json").load(f"{HDFS_URL}/{PREFIX}languages.json")
languages.printSchema()

In [None]:
# Load the licences dataset and show the schema Spark inferred for it.
licences = session.read.format("json").load(f"{HDFS_URL}/{PREFIX}licences.json")
licences.printSchema()

In [None]:
# Load the files dataset and show the schema Spark inferred for it.
files = session.read.format("json").load(f"{HDFS_URL}/{PREFIX}files.json")
files.printSchema()

In [5]:
# Load the commits dataset (change PREFIX if you need the full dataset).
commits = session.read.format("json").load(f"{HDFS_URL}/{PREFIX}commits.json")
commits.printSchema()

root
 |-- author: struct (nullable = true)
 |    |-- date: struct (nullable = true)
 |    |    |-- seconds: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- time_sec: string (nullable = true)
 |    |-- tz_offset: string (nullable = true)
 |-- commit: string (nullable = true)
 |-- committer: struct (nullable = true)
 |    |-- date: struct (nullable = true)
 |    |    |-- seconds: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- time_sec: string (nullable = true)
 |    |-- tz_offset: string (nullable = true)
 |-- difference: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- new_mode: string (nullable = true)
 |    |    |-- new_path: string (nullable = true)
 |    |    |-- new_repo: string (nullable = true)
 |    |    |-- new_sha1: string (nullable = true)
 |    |    |-- old_mode: string (nullable = true)
 |    |    

## Data Processing

In [None]:
# Project the commit fields that become GitCommit node properties, renaming each source
# field to its property name. The two nested "seconds" fields get distinct names here.
newColumns = ["id","title","message","committer_date", "author_date"]
commit_fields = ["commit", "subject", "message", "committer.date.seconds", "author.date.seconds"]

git_commits = commits.select(
    *[F.col(field).alias(column) for field, column in zip(commit_fields, newColumns)]
)
git_commits.printSchema()

In [None]:
# Repositories keep all their columns; only repo_name is renamed to name.
git_repositories = repositories.withColumnRenamed(existing="repo_name", new="name")
git_repositories.printSchema()

In [None]:
# One row per distinct language name: flatten the language.name arrays, then de-duplicate.
language_names = languages.select(F.explode(languages["language.name"]).alias("name"))
git_languages = language_names.dropDuplicates(["name"])
git_languages.printSchema()

In [None]:
# One row per distinct license, exposed under the column name "name".
license_names = licences.select(F.col("license").alias("name"))
git_licenses = license_names.dropDuplicates(["name"])
git_licenses.printSchema()

In [None]:
# Keep only the file attributes that are written as GitFile node properties.
file_columns = ["id", "ref", "path", "mode", "symlink_target"]
git_files = files.select(*file_columns)
git_files.printSchema()

In [None]:
# Contributors are all people that appear as the author or the committer of a commit.
# Only name and email are projected before the union, so the unused struct fields
# (date, time_sec, tz_offset) are not carried through the shuffle; unionByName avoids
# relying on both structs listing their fields in the same order.
authors = commits.select("author.name", "author.email")
committers = commits.select("committer.name", "committer.email")

# De-duplicate on email rather than name: the GitContributor nodes are keyed on email
# when they are written, so de-duplicating on name could keep several rows for one
# email and would drop distinct people who share a name.
git_contributor = authors.unionByName(committers).dropDuplicates(["email"])
git_contributor.printSchema()

## Writing the nodes in the graph

In [None]:
# Write the repositories as :GitRepository nodes keyed on "name".
options = get_default_options(SparkConnector.NEO4J)
options.update({
    "node.keys": "name",
    "labels": ":GitRepository",
})

spark_write(SparkConnector.NEO4J, git_repositories, "Append", options=options)

In [None]:
# Write the contributors as :GitContributor nodes keyed on "email".
options = get_default_options(SparkConnector.NEO4J)
options.update({
    "node.keys": "email",
    "labels": ":GitContributor",
})

spark_write(SparkConnector.NEO4J, git_contributor, "Append", options=options)

In [None]:
# Write the languages as :GitLanguage nodes keyed on "name".
options = get_default_options(SparkConnector.NEO4J)
options.update({
    "node.keys": "name",
    "labels": ":GitLanguage",
})

spark_write(SparkConnector.NEO4J, git_languages, "Append", options=options)

In [None]:
# Write the licenses as :GitLicense nodes keyed on "name".
options = get_default_options(SparkConnector.NEO4J)
options.update({
    "node.keys": "name",
    "labels": ":GitLicense",
})

spark_write(SparkConnector.NEO4J, git_licenses, "Append", options=options)

In [None]:
# Write the files as :GitFile nodes keyed on "id".
options = get_default_options(SparkConnector.NEO4J)
options.update({
    "node.keys": "id",
    "labels": ":GitFile",
})

spark_write(SparkConnector.NEO4J, git_files, "Append", options=options)

In [None]:
# Write the commits as :GitCommit nodes keyed on "id".
options = get_default_options(SparkConnector.NEO4J)
options.update({
    "node.keys": "id",
    "labels": ":GitCommit",
})

spark_write(SparkConnector.NEO4J, git_commits, "Append", options=options)

## Writing the relationships in the graph

In [6]:
# Pair every commit hash with the repository it was found in, capped at 1000 rows.
# No ordering is applied before the cap.
commit_repo_pairs = commits.select(F.col("commit"), F.col("repo"))
belongs_to = commit_repo_pairs.limit(1000)
belongs_to.show(n=10)

+--------------------+--------------------+
|              commit|                repo|
+--------------------+--------------------+
|ad3a65642b24879f6...|         apple/swift|
|f8fa982c8480e697d...|         apple/swift|
|1bf9fe6098de5fc30...|      matryer/bitbar|
|62263967b91ca841c...|       docker/docker|
|ee77c87994ea2dd62...|callemall/materia...|
|eea514412bfadf123...|         apple/swift|
|1bd115c4fa20b53e2...|kadirahq/react-st...|
|ff3885dc11caddd98...|           golang/go|
|b7760e645d56e52f1...|    Microsoft/vscode|
|8065ccf9fdac83ce9...|         apple/swift|
+--------------------+--------------------+
only showing top 10 rows



In [None]:
# Write (:GitCommit)-[:BELONGS_TO]->(:GitRepository) using the connector's "keys" strategy:
# the source node is looked up by the "commit" column against the node property "id",
# the target node by the "repo" column against the node property "name".
options = get_default_options(SparkConnector.NEO4J)
options.update({
    "relationship": "BELONGS_TO",
    "relationship.save.strategy": "keys",
    "relationship.source.labels": "GitCommit",
    "relationship.source.node.keys": "commit:id",
    "relationship.target.labels": "GitRepository",
    "relationship.target.node.keys": "repo:name",
})

spark_write(SparkConnector.NEO4J, belongs_to, "Append", options=options)

In [7]:
# Alternative way of writing the BELONGS_TO relationships: a custom Cypher query in which
# each DataFrame row is referenced as `event`.
#
# The relationship goes from the commit to its repository, matching the source/target
# configuration of the "keys"-strategy cell; the earlier version of this query created it
# the other way round, from the repository to the commit.
# MERGE is used instead of CREATE so that re-running this cell, or running it after the
# "keys"-strategy cell, does not produce duplicate BELONGS_TO relationships.
# NOTE(review): the last recorded run of this cell was interrupted by hand; the lookups on
# GitCommit.id and GitRepository.name presumably need indexes/constraints in Neo4j to be
# fast — confirm.
options = get_default_options(SparkConnector.NEO4J)
options["query"] = (
    "MATCH (n:GitCommit {id: event.commit}) "
    "MATCH (m:GitRepository {name: event.repo}) "
    "MERGE (n)-[:BELONGS_TO]->(m)"
)

spark_write(SparkConnector.NEO4J, belongs_to, "Append", options=options)

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.7/socket.py", line 589, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

## Stopping the Spark context and the Spark session

In [8]:

# Release the cluster resources held by this notebook. SparkSession.stop() also stops the
# underlying SparkContext, so a separate session.sparkContext.stop() call is not needed.
session.stop()