## Importing library and creating spark-session

In [1]:
import sys
fileDir = "/home/jovyan/notebooks/"
sys.path.append(fileDir)

from utilities import *
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [2]:
session = create_spark_session("Neo4j GitHub", SparkConnector.NEO4J)

Added dependencies: 
 ['neo4j-connector-apache-spark_2.12-5.0.1_for_spark_3.jar']


In [3]:
HDFS_URL = "hdfs://namenode:9000//data-team"
PREFIX = "sample_" # "sample_" or ""

### Reading from HDFS

In [4]:
repositories = session.read.json(f"{HDFS_URL}/{PREFIX}repositories.json") # cambia se ti serve il dataset completo
repositories.printSchema()

root
 |-- repo_name: string (nullable = true)
 |-- watch_count: string (nullable = true)



In [5]:
languages = session.read.json(f"{HDFS_URL}/{PREFIX}languages.json")
languages.printSchema()

root
 |-- language: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- bytes: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- repo_name: string (nullable = true)



In [6]:
licences = session.read.json(f"{HDFS_URL}/{PREFIX}licences.json")
licences.printSchema()

root
 |-- license: string (nullable = true)
 |-- repo_name: string (nullable = true)



In [8]:
commits = session.read.json(f"{HDFS_URL}/{PREFIX}commits.json") # cambia se ti serve il dataset completo
commits.printSchema()

root
 |-- author: struct (nullable = true)
 |    |-- date: struct (nullable = true)
 |    |    |-- seconds: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- time_sec: string (nullable = true)
 |    |-- tz_offset: string (nullable = true)
 |-- commit: string (nullable = true)
 |-- committer: struct (nullable = true)
 |    |-- date: struct (nullable = true)
 |    |    |-- seconds: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- time_sec: string (nullable = true)
 |    |-- tz_offset: string (nullable = true)
 |-- difference: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- new_mode: string (nullable = true)
 |    |    |-- new_path: string (nullable = true)
 |    |    |-- new_repo: string (nullable = true)
 |    |    |-- new_sha1: string (nullable = true)
 |    |    |-- old_mode: string (nullable = true)
 |    |    

## Data Processing

#### Nodes

In [9]:
git_commits = commits.select("commit", "subject", "message", "committer.date.seconds", "author.date.seconds")

newColumns = ["id","title","message","committer_date","author_date"]
git_commits = git_commits.toDF(*newColumns)
git_commits.printSchema()

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- message: string (nullable = true)
 |-- committer_date: string (nullable = true)
 |-- author_date: string (nullable = true)



In [10]:
git_repositories = repositories.withColumnRenamed("repo_name", "name")
git_repositories.printSchema()

root
 |-- name: string (nullable = true)
 |-- watch_count: string (nullable = true)



In [11]:
git_languages = languages.withColumn("name", F.explode(languages["language.name"])).dropDuplicates(["name"]).select("name")
git_languages.printSchema()

root
 |-- name: string (nullable = true)



In [12]:
git_licenses = licences.select("license").withColumnRenamed("license","name").dropDuplicates(["name"])
git_licenses.printSchema()

root
 |-- name: string (nullable = true)



In [14]:
git_contributor = commits.select("author.*") \
    .union(commits.select("committer.*")) \
    .dropDuplicates(["email"]) \
    .select("name","email")
git_contributor.printSchema()

root
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)



#### Relations

In [15]:
belongs_to = commits.select("commit","repo")
belongs_to.printSchema()

root
 |-- commit: string (nullable = true)
 |-- repo: string (nullable = true)



In [17]:
parent = commits.select("commit", "parent") \
    .withColumn("parent", F.explode(commits["parent"])) \
    .dropDuplicates(["commit","parent"])
parent.printSchema()

root
 |-- commit: string (nullable = true)
 |-- parent: string (nullable = true)



In [18]:
has = licences.select("repo_name", "license") \
    .dropDuplicates(["repo_name", "license"])
has.printSchema()

root
 |-- repo_name: string (nullable = true)
 |-- license: string (nullable = true)



In [19]:
author = commits.select("author.email", "commit", "author.date.seconds") \
    .withColumnRenamed("seconds","ts")
author = author \
    .filter(author["email"] != "") \
    .withColumn("ts", author["ts"].cast(T.IntegerType())) \
    .dropDuplicates(["email","commit"])
author.printSchema()

root
 |-- email: string (nullable = true)
 |-- commit: string (nullable = true)
 |-- ts: integer (nullable = true)



In [20]:
committed = commits.select("committer.email", "commit", "committer.date.seconds") \
    .withColumnRenamed("seconds","ts") 
committed = committed \
    .filter(committed["email"] != "") \
    .withColumn("ts", committed["ts"].cast(T.IntegerType())) \
    .dropDuplicates(["email","commit"])
committed.printSchema()

root
 |-- email: string (nullable = true)
 |-- commit: string (nullable = true)
 |-- ts: integer (nullable = true)



In [21]:
writted_in = languages.withColumn("lang", F.explode(languages["language"]))
writted_in = writted_in \
    .withColumn("language", writted_in["lang.name"]) \
    .withColumn("bytes", writted_in["lang.bytes"].cast(T.IntegerType())) \
    .select("repo_name", "language", "bytes")

writted_in.printSchema()

root
 |-- repo_name: string (nullable = true)
 |-- language: string (nullable = true)
 |-- bytes: integer (nullable = true)



## Writing the nodes in the graph

In [22]:
options = get_default_options(SparkConnector.NEO4J)

options["node.keys"] = "name"
options["labels"] = ":GitRepository"
options["schema.optimization.type"] = "INDEX"

spark_write(SparkConnector.NEO4J, git_repositories, "Overwrite", options=options)

Dataframe saved to NEO4J


In [23]:
options = get_default_options(SparkConnector.NEO4J)

options["node.keys"] = "email"
options["labels"] = ":GitContributor"
options["schema.optimization.type"] = "INDEX"

spark_write(SparkConnector.NEO4J, git_contributor, "Overwrite", options=options)

Dataframe saved to NEO4J


In [24]:
options = get_default_options(SparkConnector.NEO4J)

options["node.keys"] = "name"
options["labels"] = ":GitLanguage"
options["schema.optimization.type"] = "INDEX"

spark_write(SparkConnector.NEO4J, git_languages, "Overwrite", options=options)

Dataframe saved to NEO4J


In [25]:
options = get_default_options(SparkConnector.NEO4J)

options["node.keys"] = "name"
options["labels"] = ":GitLicense"
options["schema.optimization.type"] = "INDEX"

spark_write(SparkConnector.NEO4J, git_licenses, "Overwrite", options=options)

Dataframe saved to NEO4J


In [27]:
options = get_default_options(SparkConnector.NEO4J)

options["node.keys"] = "id"
options["labels"] = ":GitCommit"
options["schema.optimization.type"] = "INDEX"

spark_write(SparkConnector.NEO4J, git_commits, "Overwrite", options=options)

Dataframe saved to NEO4J


### Writing the relationships in the graph

In [28]:
options = get_default_options(SparkConnector.NEO4J)

options["query"] = """ 
    MATCH (c:GitCommit {id: event.commit}), (r:GitRepository {name: event.repo})
    MERGE (c)-[:BELONGS_TO]->(r)-[:CONTAINS]->(c)"""

spark_write(SparkConnector.NEO4J, belongs_to, "Append", options=options)

Dataframe saved to NEO4J


In [30]:
options = get_default_options(SparkConnector.NEO4J)

options["relationship"] = "PARENT"
options["relationship.save.strategy"] = "keys"

options["relationship.source.labels"] = "GitCommit"
options["relationship.source.node.keys"] = "commit:id"
options["relationship.source.save.mode"] = "Match"

options["relationship.target.labels"] = "GitCommit"
options["relationship.target.node.keys"] = "parent:id"
options["relationship.target.save.mode"] = "Match"

spark_write(SparkConnector.NEO4J, parent, "Append", options=options)

Dataframe saved to NEO4J


In [31]:
options = get_default_options(SparkConnector.NEO4J)

options["relationship"] = "HAS"
options["relationship.save.strategy"] = "keys"

options["relationship.source.labels"] = "GitRepository"
options["relationship.source.node.keys"] = "repo_name:name"
options["relationship.source.save.mode"] = "Match"

options["relationship.target.labels"] = "GitLicense"
options["relationship.target.node.keys"] = "license:name"
options["relationship.target.save.mode"] = "Match"

spark_write(SparkConnector.NEO4J, has, "Append", options=options)

Dataframe saved to NEO4J


In [33]:
options = get_default_options(SparkConnector.NEO4J)

options["query"] = """ 
    MATCH (a:GitContributor {email: event.email}), (c:GitCommit {id: event.commit})
    MERGE (a)-[:AUTHOR{ts: event.ts}]->(c)"""

spark_write(SparkConnector.NEO4J, author, "Append", options=options)

Dataframe saved to NEO4J


In [34]:
options = get_default_options(SparkConnector.NEO4J)

options["query"] = """ 
    MATCH (a:GitContributor {email: event.email}), (c:GitCommit {id: event.commit})
    MERGE (a)-[:COMMITTED{ts: event.ts}]->(c)"""

spark_write(SparkConnector.NEO4J, committed, "Append", options=options)

Dataframe saved to NEO4J


In [35]:
options = get_default_options(SparkConnector.NEO4J)

options["relationship"] = "WRITTED_IN"
options["relationship.save.strategy"] = "keys"
options["relationship.properties"] = "bytes"

options["relationship.source.labels"] = "GitRepository"
options["relationship.source.node.keys"] = "repo_name:name"
options["relationship.source.save.mode"] = "Match"

options["relationship.target.labels"] = "GitLanguage"
options["relationship.target.node.keys"] = "language:name"
options["relationship.target.save.mode"] = "Match"

spark_write(SparkConnector.NEO4J, writted_in, "Append", options=options)

Dataframe saved to NEO4J


## Stop spark context and spark session

In [36]:
session.sparkContext.stop()
session.stop()