## Importing libraries and creating the Spark session

In [1]:
import re
import sys

fileDir = "/home/jovyan/notebooks/"
sys.path.append(fileDir)

from utilities import *
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [2]:
# Create the Spark session through the project helper, selecting the Neo4j
# connector (the helper reports the connector jar it adds as a dependency).
session = create_spark_session("Neo4j GitHub", SparkConnector.NEO4J)

Added dependencies: 
 ['neo4j-connector-apache-spark_2.12-5.0.1_for_spark_3.jar']


In [3]:
# Base HDFS location; every input path below is built as f"{HDFS_URL}/{PREFIX}<file>".
# NOTE(review): the double slash before "data-team" looks unintentional — confirm.
HDFS_URL = "hdfs://namenode:9000//data-team"
# File-name prefix; presumably "sample_" selects the sample files and "" the full dataset.
PREFIX = "sample_" # "sample_" or ""

### Reading from HDFS

In [4]:
# Repository list read from JSON (columns: repo_name, watch_count).
repositories_json = session.read.json(f"{HDFS_URL}/{PREFIX}repositories.json") # change this if you need the full dataset
repositories_json.printSchema()

root
 |-- repo_name: string (nullable = true)
 |-- watch_count: string (nullable = true)



In [5]:
# Read the API export (header row, inferred column types) and keep only the
# three columns that are joined onto the repository list.
repositories_csv = (
    session.read
    .csv(f"{HDFS_URL}/{PREFIX}repo_API_data.csv", header=True, inferSchema=True)
    .select("repo_name", "stargazers_count", "topics")
)
repositories_csv.printSchema()

root
 |-- repo_name: string (nullable = true)
 |-- stargazers_count: integer (nullable = true)
 |-- topics: string (nullable = true)



In [6]:
# Left join: every repository from the JSON list is kept, enriched with the
# API columns when available. Joining on the column name yields a single
# repo_name column, so no explicit drop of the right-hand key is needed.
repositories = repositories_json.join(repositories_csv, on="repo_name", how="left")
repositories.printSchema()

root
 |-- repo_name: string (nullable = true)
 |-- watch_count: string (nullable = true)
 |-- stargazers_count: integer (nullable = true)
 |-- topics: string (nullable = true)



In [7]:
# Per-repository language breakdown: an array of {name, bytes} structs per repo_name.
languages = session.read.json(f"{HDFS_URL}/{PREFIX}languages.json")
languages.printSchema()

root
 |-- language: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- bytes: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- repo_name: string (nullable = true)



In [8]:
# Licence of each repository (columns: license, repo_name).
licences = session.read.json(f"{HDFS_URL}/{PREFIX}licences.json")
licences.printSchema()

root
 |-- license: string (nullable = true)
 |-- repo_name: string (nullable = true)



In [9]:
# Commit records with nested author/committer structs and a `difference` array.
commits = session.read.json(f"{HDFS_URL}/{PREFIX}commits.json") # change this if you need the full dataset
commits.printSchema()

root
 |-- author: struct (nullable = true)
 |    |-- date: struct (nullable = true)
 |    |    |-- seconds: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- time_sec: string (nullable = true)
 |    |-- tz_offset: string (nullable = true)
 |-- commit: string (nullable = true)
 |-- committer: struct (nullable = true)
 |    |-- date: struct (nullable = true)
 |    |    |-- seconds: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- time_sec: string (nullable = true)
 |    |-- tz_offset: string (nullable = true)
 |-- difference: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- new_mode: string (nullable = true)
 |    |    |-- new_path: string (nullable = true)
 |    |    |-- new_repo: string (nullable = true)
 |    |    |-- new_sha1: string (nullable = true)
 |    |    |-- old_mode: string (nullable = true)
 |    |    

## Data Processing

In [10]:
def remove_not_allowed(txt):
    """Sanitise an identifier such as a repository name or an e-mail address.

    The steps are applied in this order:
      1. every "/" becomes "::";
      2. every character outside the allowed set of the regex is dropped;
      3. every "@" becomes "::".

    Args:
        txt: The raw string, or None. Spark hands None to a Python UDF for
            SQL NULL cells.

    Returns:
        The sanitised string, or None when ``txt`` is None.
    """
    # Propagate NULLs instead of raising AttributeError on ``None.replace``,
    # which would abort the whole Spark job at write time.
    if txt is None:
        return None
    txt = txt.replace("/", "::")
    txt = re.sub(r"[^a-zA-Z0-9_\-.@()+,=;$!*'%:]", "", txt)
    txt = txt.replace("@", "::")
    return txt

def remove_c_sharp(text):
    """Normalise a language name: "#" becomes "s", spaces are removed, "++" becomes "pp"."""
    # The substitutions are applied one after another, in this order.
    for old, new in (("#", "s"), (" ", ""), ("++", "pp")):
        text = text.replace(old, new)
    return text

# Wrap both helpers as Spark UDFs returning strings. From here on the name
# remove_c_sharp refers to the UDF rather than the plain Python function;
# it is not applied in this cell.
remove_c_sharp = F.udf(remove_c_sharp, T.StringType())
remove_udf = F.udf(remove_not_allowed, T.StringType())

# Sanitise the repository name in every DataFrame that carries one.
repositories = repositories.withColumn("repo_name", remove_udf("repo_name"))
licences = licences.withColumn("repo_name", remove_udf("repo_name"))
languages = languages.withColumn("repo_name", remove_udf("repo_name"))

# For commits: sanitise the repository and add sanitised copies of both e-mails.
commits = (
    commits
    .withColumn("repo", remove_udf("repo"))
    .withColumn("author_email", remove_udf("author.email"))
    .withColumn("committer_email", remove_udf("committer.email"))
)

#### Nodes

In [11]:
# Fields taken from `commits` and, position by position, the names they get in
# `git_commits`. Both timestamp fields are called "seconds" at the source, so
# every field is aliased explicitly.
sourceColumns = ["commit", "subject", "message", "committer.date.seconds", "author.date.seconds"]
newColumns = ["id","title","message","committer_date","author_date"]

git_commits = commits.select(
    *[F.col(src).alias(dst) for src, dst in zip(sourceColumns, newColumns)]
)
git_commits.printSchema()

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- message: string (nullable = true)
 |-- committer_date: string (nullable = true)
 |-- author_date: string (nullable = true)



In [12]:
# Repository node frame: same data as `repositories`, with repo_name exposed as `name`.
git_repositories = repositories.withColumnRenamed("repo_name", "name")
git_repositories.printSchema()

root
 |-- name: string (nullable = true)
 |-- watch_count: string (nullable = true)
 |-- stargazers_count: integer (nullable = true)
 |-- topics: string (nullable = true)



In [13]:
# One row per distinct language name found in any repository's language array.
git_languages = languages.select(F.explode("language.name").alias("name")).distinct()
git_languages.printSchema()

root
 |-- name: string (nullable = true)



In [14]:
# Distinct licence values, exposed under the column name `name`.
git_licenses = licences.select(F.col("license").alias("name")).distinct()
git_licenses.printSchema()

root
 |-- name: string (nullable = true)



In [15]:
# Contributors are the union of commit authors and committers, one row per
# e-mail address. When an address appears with several names, which name is
# kept is arbitrary.
#
# Rows with a NULL or empty e-mail are dropped: the `author` and `committed`
# relationship frames apply the same `email != ""` filter, and `email` is the
# node key used when the contributor nodes are written, so a blank key would
# only produce an unusable node (or a failed write for NULL).
#
# Only (name, email) are selected before the union, so the positional union no
# longer depends on both structs having identical field order.
git_contributor = (
    commits.select("author.name", "author.email")
    .union(commits.select("committer.name", "committer.email"))
    .filter(F.col("email") != "")
    .dropDuplicates(["email"])
)
git_contributor.printSchema()

root
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)



#### Relationships

In [16]:
# (commit, repo) pairs: the repository each commit is recorded against.
belongs_to = commits.select("commit","repo")
belongs_to.printSchema()

root
 |-- commit: string (nullable = true)
 |-- repo: string (nullable = true)



In [17]:
# One distinct (commit, parent) row per element of each commit's `parent` array.
parent = commits.select("commit", F.explode("parent").alias("parent")).distinct()
parent.printSchema()

root
 |-- commit: string (nullable = true)
 |-- parent: string (nullable = true)



In [18]:
# Distinct (repo_name, license) pairs.
has = licences.select("repo_name", "license").distinct()
has.printSchema()

root
 |-- repo_name: string (nullable = true)
 |-- license: string (nullable = true)



In [19]:
# (email, commit, ts) rows for commit authors: the author timestamp is cast to
# an integer, rows without a usable e-mail are dropped, and one row is kept per
# (email, commit) pair.
author = (
    commits.select(
        F.col("author.email").alias("email"),
        "commit",
        F.col("author.date.seconds").cast(T.IntegerType()).alias("ts"),
    )
    .filter(F.col("email") != "")
    .dropDuplicates(["email", "commit"])
)
author.printSchema()

root
 |-- email: string (nullable = true)
 |-- commit: string (nullable = true)
 |-- ts: integer (nullable = true)



In [20]:
# (email, commit, ts) rows for committers: the committer timestamp is cast to
# an integer, rows without a usable e-mail are dropped, and one row is kept per
# (email, commit) pair.
committed = (
    commits.select(
        F.col("committer.email").alias("email"),
        "commit",
        F.col("committer.date.seconds").cast(T.IntegerType()).alias("ts"),
    )
    .filter(F.col("email") != "")
    .dropDuplicates(["email", "commit"])
)
committed.printSchema()

root
 |-- email: string (nullable = true)
 |-- commit: string (nullable = true)
 |-- ts: integer (nullable = true)



In [21]:
# One row per (repository, language) entry: the language array is exploded and
# each struct is flattened into a language name and an integer byte count.
writted_in = (
    languages
    .select("repo_name", F.explode("language").alias("lang"))
    .select(
        "repo_name",
        F.col("lang.name").alias("language"),
        F.col("lang.bytes").cast(T.IntegerType()).alias("bytes"),
    )
)

writted_in.printSchema()

root
 |-- repo_name: string (nullable = true)
 |-- language: string (nullable = true)
 |-- bytes: integer (nullable = true)



## Writing the nodes in the graph

In [22]:
# Write `git_repositories` as nodes labelled :GitRepository, keyed by `name`,
# starting from the project's default Neo4j options.
options = get_default_options(SparkConnector.NEO4J)

options["node.keys"] = "name"
options["labels"] = ":GitRepository"
options["schema.optimization.type"] = "INDEX"

spark_write(SparkConnector.NEO4J, git_repositories, "Overwrite", options=options)

Py4JJavaError: An error occurred while calling o273.save.
: org.neo4j.driver.exceptions.ServiceUnavailableException: Unable to connect to neo4j:7687, ensure the database is running and that there is a working network connection to it.
	at org.neo4j.driver.internal.util.Futures.blockingGet(Futures.java:111)
	at org.neo4j.driver.internal.InternalSession.run(InternalSession.java:62)
	at org.neo4j.driver.internal.InternalSession.run(InternalSession.java:47)
	at org.neo4j.driver.internal.AbstractQueryRunner.run(AbstractQueryRunner.java:34)
	at org.neo4j.driver.internal.AbstractQueryRunner.run(AbstractQueryRunner.java:49)
	at org.neo4j.spark.util.ValidateConnection.validate(Validations.scala:75)
	at org.neo4j.spark.util.ValidateWrite.validate(Validations.scala:110)
	at org.neo4j.spark.util.Validations$.$anonfun$validate$1(Validations.scala:12)
	at org.neo4j.spark.util.Validations$.$anonfun$validate$1$adapted(Validations.scala:12)
	at scala.collection.immutable.Set$Set1.foreach(Set.scala:141)
	at org.neo4j.spark.util.Validations$.validate(Validations.scala:12)
	at org.neo4j.spark.writer.Neo4jWriterBuilder.validOptions(Neo4jWriterBuilder.scala:19)
	at org.neo4j.spark.writer.Neo4jWriterBuilder.buildForBatch(Neo4jWriterBuilder.scala:31)
	at org.apache.spark.sql.connector.write.WriteBuilder$1.toBatch(WriteBuilder.java:44)
	at org.apache.spark.sql.execution.datasources.v2.V2ExistingTableWriteExec.run(WriteToDataSourceV2Exec.scala:332)
	at org.apache.spark.sql.execution.datasources.v2.V2ExistingTableWriteExec.run$(WriteToDataSourceV2Exec.scala:331)
	at org.apache.spark.sql.execution.datasources.v2.OverwriteByExpressionExec.run(WriteToDataSourceV2Exec.scala:262)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:49)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:560)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:116)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:860)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:318)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:247)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
	Suppressed: org.neo4j.driver.internal.util.ErrorUtil$InternalExceptionCause
		at org.neo4j.driver.internal.async.connection.ChannelConnectedListener.databaseUnavailableError(ChannelConnectedListener.java:72)
		at org.neo4j.driver.internal.async.connection.ChannelConnectedListener.operationComplete(ChannelConnectedListener.java:66)
		at org.neo4j.driver.internal.async.connection.ChannelConnectedListener.operationComplete(ChannelConnectedListener.java:36)
		at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.DefaultPromise.notifyListener0(DefaultPromise.java:590)
		at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.DefaultPromise.notifyListeners0(DefaultPromise.java:583)
		at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.DefaultPromise.notifyListenersNow(DefaultPromise.java:559)
		at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.DefaultPromise.notifyListeners(DefaultPromise.java:492)
		at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.DefaultPromise.setValue0(DefaultPromise.java:636)
		at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.DefaultPromise.setFailure0(DefaultPromise.java:629)
		at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.DefaultPromise.setFailure(DefaultPromise.java:110)
		at org.neo4j.driver.internal.shaded.io.netty.channel.DefaultChannelPromise.setFailure(DefaultChannelPromise.java:89)
		at org.neo4j.driver.internal.shaded.io.netty.bootstrap.Bootstrap.doResolveAndConnect0(Bootstrap.java:214)
		at org.neo4j.driver.internal.shaded.io.netty.bootstrap.Bootstrap.access$000(Bootstrap.java:46)
		at org.neo4j.driver.internal.shaded.io.netty.bootstrap.Bootstrap$1.operationComplete(Bootstrap.java:180)
		at org.neo4j.driver.internal.shaded.io.netty.bootstrap.Bootstrap$1.operationComplete(Bootstrap.java:166)
		at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.DefaultPromise.notifyListener0(DefaultPromise.java:590)
		at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.DefaultPromise.notifyListenersNow(DefaultPromise.java:557)
		at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.DefaultPromise.notifyListeners(DefaultPromise.java:492)
		at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.DefaultPromise.setValue0(DefaultPromise.java:636)
		at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.DefaultPromise.setSuccess0(DefaultPromise.java:625)
		at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.DefaultPromise.trySuccess(DefaultPromise.java:105)
		at org.neo4j.driver.internal.shaded.io.netty.channel.DefaultChannelPromise.trySuccess(DefaultChannelPromise.java:84)
		at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannel$AbstractUnsafe.safeSetSuccess(AbstractChannel.java:990)
		at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannel$AbstractUnsafe.register0(AbstractChannel.java:516)
		at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannel$AbstractUnsafe.access$200(AbstractChannel.java:429)
		at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannel$AbstractUnsafe$1.run(AbstractChannel.java:486)
		at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.AbstractEventExecutor.runTask(AbstractEventExecutor.java:174)
		at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:167)
		at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:470)
		at org.neo4j.driver.internal.shaded.io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:569)
		at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:997)
		at org.neo4j.driver.internal.shaded.io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
		at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
		... 1 more
Caused by: java.net.UnknownHostException: neo4j
	at java.base/java.net.InetAddress$CachedAddresses.get(InetAddress.java:801)
	at java.base/java.net.InetAddress.getAllByName0(InetAddress.java:1519)
	at java.base/java.net.InetAddress.getAllByName(InetAddress.java:1377)
	at java.base/java.net.InetAddress.getAllByName(InetAddress.java:1305)
	at org.neo4j.driver.internal.DefaultDomainNameResolver.resolve(DefaultDomainNameResolver.java:35)
	at org.neo4j.driver.internal.async.connection.NettyDomainNameResolver.doResolve(NettyDomainNameResolver.java:41)
	at org.neo4j.driver.internal.shaded.io.netty.resolver.SimpleNameResolver.resolve(SimpleNameResolver.java:61)
	at org.neo4j.driver.internal.shaded.io.netty.resolver.SimpleNameResolver.resolve(SimpleNameResolver.java:53)
	at org.neo4j.driver.internal.shaded.io.netty.resolver.InetSocketAddressResolver.doResolve(InetSocketAddressResolver.java:55)
	at org.neo4j.driver.internal.shaded.io.netty.resolver.InetSocketAddressResolver.doResolve(InetSocketAddressResolver.java:31)
	at org.neo4j.driver.internal.shaded.io.netty.resolver.AbstractAddressResolver.resolve(AbstractAddressResolver.java:106)
	at org.neo4j.driver.internal.shaded.io.netty.bootstrap.Bootstrap.doResolveAndConnect0(Bootstrap.java:206)
	at org.neo4j.driver.internal.shaded.io.netty.bootstrap.Bootstrap.access$000(Bootstrap.java:46)
	at org.neo4j.driver.internal.shaded.io.netty.bootstrap.Bootstrap$1.operationComplete(Bootstrap.java:180)
	at org.neo4j.driver.internal.shaded.io.netty.bootstrap.Bootstrap$1.operationComplete(Bootstrap.java:166)
	at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.DefaultPromise.notifyListener0(DefaultPromise.java:590)
	at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.DefaultPromise.notifyListenersNow(DefaultPromise.java:557)
	at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.DefaultPromise.notifyListeners(DefaultPromise.java:492)
	at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.DefaultPromise.setValue0(DefaultPromise.java:636)
	at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.DefaultPromise.setSuccess0(DefaultPromise.java:625)
	at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.DefaultPromise.trySuccess(DefaultPromise.java:105)
	at org.neo4j.driver.internal.shaded.io.netty.channel.DefaultChannelPromise.trySuccess(DefaultChannelPromise.java:84)
	at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannel$AbstractUnsafe.safeSetSuccess(AbstractChannel.java:990)
	at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannel$AbstractUnsafe.register0(AbstractChannel.java:516)
	at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannel$AbstractUnsafe.access$200(AbstractChannel.java:429)
	at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannel$AbstractUnsafe$1.run(AbstractChannel.java:486)
	at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.AbstractEventExecutor.runTask(AbstractEventExecutor.java:174)
	at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:167)
	at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:470)
	at org.neo4j.driver.internal.shaded.io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:569)
	at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:997)
	at org.neo4j.driver.internal.shaded.io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
	at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
	... 1 more


In [None]:
# Write `git_contributor` as nodes labelled :GitContributor, keyed by `email`.
options = get_default_options(SparkConnector.NEO4J)

options["node.keys"] = "email"
options["labels"] = ":GitContributor"
options["schema.optimization.type"] = "INDEX"

spark_write(SparkConnector.NEO4J, git_contributor, "Overwrite", options=options)

In [None]:
# Write `git_languages` as nodes labelled :GitLanguage, keyed by `name`.
options = get_default_options(SparkConnector.NEO4J)

options["node.keys"] = "name"
options["labels"] = ":GitLanguage"
options["schema.optimization.type"] = "INDEX"

spark_write(SparkConnector.NEO4J, git_languages, "Overwrite", options=options)

In [None]:
# Write `git_licenses` as nodes labelled :GitLicense, keyed by `name`.
options = get_default_options(SparkConnector.NEO4J)

options["node.keys"] = "name"
options["labels"] = ":GitLicense"
options["schema.optimization.type"] = "INDEX"

spark_write(SparkConnector.NEO4J, git_licenses, "Overwrite", options=options)

In [None]:
# Write `git_commits` as nodes labelled :GitCommit, keyed by `id`.
options = get_default_options(SparkConnector.NEO4J)

options["node.keys"] = "id"
options["labels"] = ":GitCommit"
options["schema.optimization.type"] = "INDEX"

spark_write(SparkConnector.NEO4J, git_commits, "Overwrite", options=options)

### Writing the relationships in the graph

In [None]:
# Link each commit to its repository with a custom Cypher query. event.commit
# and event.repo correspond to the columns of `belongs_to`. Both nodes must
# already exist (MATCH); the BELONGS_TO / CONTAINS pair is MERGEd as a single
# pattern.
options = get_default_options(SparkConnector.NEO4J)

options["query"] = """ 
    MATCH (c:GitCommit {id: event.commit}), (r:GitRepository {name: event.repo})
    MERGE (c)-[:BELONGS_TO]->(r)-[:CONTAINS]->(c)"""

spark_write(SparkConnector.NEO4J, belongs_to, "Append", options=options)

In [None]:
# Write PARENT relationships between GitCommit nodes from the `parent` frame.
options = get_default_options(SparkConnector.NEO4J)

options["relationship"] = "PARENT"
options["relationship.save.strategy"] = "keys"

# Source node: column `commit` mapped to the node property `id`; save mode "Match".
options["relationship.source.labels"] = "GitCommit"
options["relationship.source.node.keys"] = "commit:id"
options["relationship.source.save.mode"] = "Match"

# Target node: column `parent` mapped to the node property `id`; save mode "Match".
options["relationship.target.labels"] = "GitCommit"
options["relationship.target.node.keys"] = "parent:id"
options["relationship.target.save.mode"] = "Match"

spark_write(SparkConnector.NEO4J, parent, "Append", options=options)

In [None]:
# Write HAS relationships from GitRepository to GitLicense nodes using the `has` frame.
options = get_default_options(SparkConnector.NEO4J)

options["relationship"] = "HAS"
options["relationship.save.strategy"] = "keys"

# Source node: column `repo_name` mapped to the node property `name`.
options["relationship.source.labels"] = "GitRepository"
options["relationship.source.node.keys"] = "repo_name:name"
options["relationship.source.save.mode"] = "Match"

# Target node: column `license` mapped to the node property `name`.
options["relationship.target.labels"] = "GitLicense"
options["relationship.target.node.keys"] = "license:name"
options["relationship.target.save.mode"] = "Match"

spark_write(SparkConnector.NEO4J, has, "Append", options=options)

In [None]:
# Link contributors to the commits they authored with a custom Cypher query.
# event.email, event.commit and event.ts correspond to the columns of `author`;
# the timestamp is part of the MERGEd relationship pattern.
options = get_default_options(SparkConnector.NEO4J)

options["query"] = """ 
    MATCH (a:GitContributor {email: event.email}), (c:GitCommit {id: event.commit})
    MERGE (a)-[:AUTHOR{ts: event.ts}]->(c)"""

spark_write(SparkConnector.NEO4J, author, "Append", options=options)

In [None]:
# Link contributors to the commits they committed with a custom Cypher query.
# event.email, event.commit and event.ts correspond to the columns of
# `committed`; the timestamp is part of the MERGEd relationship pattern.
options = get_default_options(SparkConnector.NEO4J)

options["query"] = """ 
    MATCH (a:GitContributor {email: event.email}), (c:GitCommit {id: event.commit})
    MERGE (a)-[:COMMITTED{ts: event.ts}]->(c)"""

spark_write(SparkConnector.NEO4J, committed, "Append", options=options)

In [None]:
# Write WRITTED_IN relationships from GitRepository to GitLanguage nodes using
# the `writted_in` frame; the `bytes` column is stored on the relationship.
options = get_default_options(SparkConnector.NEO4J)

options["relationship"] = "WRITTED_IN"
options["relationship.save.strategy"] = "keys"
options["relationship.properties"] = "bytes"

# Source node: column `repo_name` mapped to the node property `name`.
options["relationship.source.labels"] = "GitRepository"
options["relationship.source.node.keys"] = "repo_name:name"
options["relationship.source.save.mode"] = "Match"

# Target node: column `language` mapped to the node property `name`.
options["relationship.target.labels"] = "GitLanguage"
options["relationship.target.node.keys"] = "language:name"
options["relationship.target.save.mode"] = "Match"

spark_write(SparkConnector.NEO4J, writted_in, "Append", options=options)

## Stop spark context and spark session

In [None]:
# SparkSession.stop() also stops the underlying SparkContext, so a single call
# is enough; the separate session.sparkContext.stop() was redundant.
session.stop()