## Importing libraries and creating the Spark session

In [None]:
import sys
# Add the notebooks directory to the module search path
# (presumably where the local `utilities` module lives — confirm).
fileDir = "/home/jovyan/notebooks/"
sys.path.append(fileDir)

# NOTE(review): the star import presumably supplies create_spark_session,
# SparkConnector, get_default_options and spark_write used by later cells.
from utilities import *
import pyspark.sql.functions as F  # column functions, e.g. F.explode
import pyspark.sql.types as T  # Spark SQL data types, e.g. T.IntegerType

In [None]:
# Build the Spark session shared by every later cell, requesting the NEO4J
# connector (the first argument is presumably the application name).
session = create_spark_session(
    "Neo4j GitHub",
    SparkConnector.NEO4J,
)

In [None]:
# Base HDFS location that every input JSON path below is built from.
HDFS_URL = "hdfs://namenode:9000//data-team"
# File-name prefix prepended to each dataset name; presumably "sample_" selects
# the sample files and "" the full dataset — confirm.
PREFIX = "sample_" # "sample_" or ""

### Reading from HDFS

In [None]:
# Load the repositories JSON; change PREFIX if you need the full dataset.
repositories_path = f"{HDFS_URL}/{PREFIX}repositories.json"
repositories = session.read.json(repositories_path)
repositories.printSchema()

In [None]:
# Load the languages JSON and show the inferred schema.
languages_path = f"{HDFS_URL}/{PREFIX}languages.json"
languages = session.read.json(languages_path)
languages.printSchema()

In [None]:
# Load the licences JSON and show the inferred schema.
licences_path = f"{HDFS_URL}/{PREFIX}licences.json"
licences = session.read.json(licences_path)
licences.printSchema()

In [None]:
# Load the files JSON and show the inferred schema.
files_path = f"{HDFS_URL}/{PREFIX}files.json"
files = session.read.json(files_path)
files.printSchema()

In [None]:
# Load the commits JSON; change PREFIX if you need the full dataset.
commits_path = f"{HDFS_URL}/{PREFIX}commits.json"
commits = session.read.json(commits_path)
commits.printSchema()

## Data Processing

#### Nodes

In [None]:
# Names the commit-node columns receive, in the same order as the select below.
newColumns = ["id", "title", "message", "committer_date", "author_date"]

# Project the commit fields and rename them positionally; both nested
# timestamp fields come out of the select named "seconds", so renaming by
# position is what tells them apart.
git_commits = commits.select(
    "commit",
    "subject",
    "message",
    "committer.date.seconds",
    "author.date.seconds",
).toDF(*newColumns)
git_commits.printSchema()

In [None]:
# Repository nodes: same columns as the input, with `repo_name` exposed as `name`.
git_repositories = repositories.withColumnRenamed(
    existing="repo_name",
    new="name",
)
git_repositories.printSchema()

In [None]:
# Language nodes: one row per distinct language name found in any repository.
git_languages = (
    languages
    .withColumn("name", F.explode(F.col("language.name")))
    .dropDuplicates(["name"])
    .select("name")
)
git_languages.printSchema()

In [None]:
# License nodes: the distinct values of the `license` column, exposed as `name`.
git_licenses = (
    licences
    .select(F.col("license").alias("name"))
    .dropDuplicates(["name"])
)
git_licenses.printSchema()

In [None]:
# File nodes: keep only the columns stored on each node.
file_columns = ["id", "ref", "path", "mode", "symlink_target"]
git_files = files.select(*file_columns)
git_files.printSchema()

In [None]:
# Contributor nodes: authors and committers stacked together, one row per email.
# union() is positional — assumes the `author` and `committer` structs share
# the same field layout (TODO confirm against the schema printed above).
git_contributor = (
    commits.select("author.*")
    .union(commits.select("committer.*"))
    .dropDuplicates(["email"])
    .select("name", "email")
)
git_contributor.printSchema()

#### Relations

In [None]:
# Commit -> repository pairs used later for the BELONGS_TO / CONTAINS write.
belongs_to = commits.select(["commit", "repo"])
belongs_to.printSchema()

In [None]:
# File -> repository pairs used later for the STAYS_IN relationship,
# de-duplicated on the (id, repo_name) pair.
stays_in = files.select("id", "repo_name") \
    .dropDuplicates(["id", "repo_name"])
# Print the schema of the frame just built; the cell previously printed
# belongs_to's schema by mistake.
stays_in.printSchema()

In [None]:
# Commit -> parent-commit pairs: one row per element of the `parent` array.
# explode() emits no row for a null or empty array.
parent = (
    commits
    .select("commit", "parent")
    .withColumn("parent", F.explode(F.col("parent")))
    .dropDuplicates(["commit", "parent"])
)
parent.printSchema()

In [None]:
# Repository -> license pairs used later for the HAS relationship.
has = (
    licences
    .select("repo_name", "license")
    .dropDuplicates(["repo_name", "license"])
)
has.printSchema()

In [None]:
# Author -> commit pairs with the author timestamp as `ts`.
# Rows with an empty email are dropped; each (email, commit) pair is kept once.
# NOTE(review): `ts` is cast to a 32-bit integer — confirm the values fit.
author = (
    commits
    .select(
        "author.email",
        "commit",
        F.col("author.date.seconds").alias("ts"),
    )
    .filter(F.col("email") != "")
    .withColumn("ts", F.col("ts").cast(T.IntegerType()))
    .dropDuplicates(["email", "commit"])
)
author.printSchema()

In [None]:
# Committer -> commit pairs with the committer timestamp as `ts`.
# Rows with an empty email are dropped; each (email, commit) pair is kept once.
# NOTE(review): `ts` is cast to a 32-bit integer — confirm the values fit.
committed = (
    commits
    .select(
        "committer.email",
        "commit",
        F.col("committer.date.seconds").alias("ts"),
    )
    .filter(F.col("email") != "")
    .withColumn("ts", F.col("ts").cast(T.IntegerType()))
    .dropDuplicates(["email", "commit"])
)
committed.printSchema()

In [None]:
# Repository -> language rows: explode the `language` array into one struct
# per row, then flatten its `name` and `bytes` fields into plain columns.
# NOTE(review): `bytes` is cast to a 32-bit integer — confirm the values fit.
writted_in = (
    languages
    .withColumn("lang", F.explode(F.col("language")))
    .withColumn("language", F.col("lang.name"))
    .withColumn("bytes", F.col("lang.bytes").cast(T.IntegerType()))
    .select("repo_name", "language", "bytes")
)

writted_in.printSchema()

## Writing the nodes in the graph

In [None]:
# Write the repository rows as :GitRepository nodes.
options = get_default_options(SparkConnector.NEO4J)

# `name` is the node key; presumably used to identify a node on write — confirm.
options["node.keys"] = "name"
options["labels"] = ":GitRepository"

spark_write(SparkConnector.NEO4J, git_repositories, "Append", options=options)

In [None]:
# Write the contributor rows as :GitContributor nodes.
options = get_default_options(SparkConnector.NEO4J)

# `email` is the node key (git_contributor is de-duplicated on email).
options["node.keys"] = "email"
options["labels"] = ":GitContributor"

spark_write(SparkConnector.NEO4J, git_contributor, "Append", options=options)

In [None]:
# Write the language rows as :GitLanguage nodes.
options = get_default_options(SparkConnector.NEO4J)

# `name` is the node key (git_languages is de-duplicated on name).
options["node.keys"] = "name"
options["labels"] = ":GitLanguage"

spark_write(SparkConnector.NEO4J, git_languages, "Append", options=options)

In [None]:
# Write the license rows as :GitLicense nodes.
options = get_default_options(SparkConnector.NEO4J)

# `name` is the node key (git_licenses is de-duplicated on name).
options["node.keys"] = "name"
options["labels"] = ":GitLicense"

spark_write(SparkConnector.NEO4J, git_licenses, "Append", options=options)

In [None]:
# Write the file rows as :GitFile nodes.
options = get_default_options(SparkConnector.NEO4J)

# `id` is the node key.
# NOTE(review): git_files is not de-duplicated on id before this write — confirm ids are unique.
options["node.keys"] = "id"
options["labels"] = ":GitFile"

spark_write(SparkConnector.NEO4J, git_files, "Append", options=options)

In [None]:
# Write the commit rows as :GitCommit nodes.
options = get_default_options(SparkConnector.NEO4J)

# `id` is the node key (the renamed `commit` column of the input).
options["node.keys"] = "id"
options["labels"] = ":GitCommit"

spark_write(SparkConnector.NEO4J, git_commits, "Append", options=options)

### Index creation

In [None]:
# Cypher statements to run by hand in Neo4j Browser. The bare string is a
# no-op expression, kept in the cell only so it can be copied and pasted.
# Each index covers the property that the relationship writes match nodes on.
""" 
CREATE INDEX commits FOR (c:GitCommit) ON (c.id);
CREATE INDEX repositories FOR (r:GitRepository) ON (r.name);
CREATE INDEX languages FOR (l:GitLanguage) ON (l.name);
CREATE INDEX licenses FOR (l:GitLicense) ON (l.name);
CREATE INDEX files FOR (f:GitFile) ON (f.id);
CREATE INDEX contributors FOR (c:GitContributor) ON (c.email);
"""

# Deliberately abort a "Run All" here so the indexes are created before the
# relationship writes start. The statements sit above this line, so the
# message points "above" (it previously said "below").
raise Exception("Stop here and run the query above in Neo4j Browser")

### Writing the relationships in the graph

In [None]:
# Link each commit to its repository with a custom Cypher query.
options = get_default_options(SparkConnector.NEO4J)

# Row columns are referenced as event.<column>. The query matches the existing
# commit and repository nodes, then MERGEs the two-relationship path
# (c)-[:BELONGS_TO]->(r)-[:CONTAINS]->(c) between them.
options["query"] = """ 
    MATCH (c:GitCommit {id: event.commit}), (r:GitRepository {name: event.repo})
    MERGE (c)-[:BELONGS_TO]->(r)-[:CONTAINS]->(c)"""

spark_write(SparkConnector.NEO4J, belongs_to, "Append", options=options)

In [None]:
# Write STAYS_IN relationships from files to repositories.
options = get_default_options(SparkConnector.NEO4J)

options["relationship"] = "STAYS_IN"
options["relationship.save.strategy"] = "keys"

# Source end: GitFile nodes, looked up with the `id` column against property `id`
# (the value reads as column:property). "Match" presumably only matches
# existing nodes rather than creating them — confirm.
options["relationship.source.labels"] = "GitFile"
options["relationship.source.node.keys"] = "id:id"
options["relationship.source.save.mode"] = "Match"

# Target end: GitRepository nodes, `repo_name` column against property `name`.
options["relationship.target.labels"] = "GitRepository"
options["relationship.target.node.keys"] = "repo_name:name"
options["relationship.target.save.mode"] = "Match"

spark_write(SparkConnector.NEO4J, stays_in, "Append", options=options)

In [None]:
# Write PARENT relationships from each commit to its parent commit.
options = get_default_options(SparkConnector.NEO4J)

options["relationship"] = "PARENT"
options["relationship.save.strategy"] = "keys"

# Source end: GitCommit nodes, `commit` column against property `id`.
options["relationship.source.labels"] = "GitCommit"
options["relationship.source.node.keys"] = "commit:id"
options["relationship.source.save.mode"] = "Match"

# Target end: GitCommit nodes again, `parent` column against property `id`.
options["relationship.target.labels"] = "GitCommit"
options["relationship.target.node.keys"] = "parent:id"
options["relationship.target.save.mode"] = "Match"

spark_write(SparkConnector.NEO4J, parent, "Append", options=options)

In [None]:
# Write HAS relationships from repositories to their licenses.
options = get_default_options(SparkConnector.NEO4J)

options["relationship"] = "HAS"
options["relationship.save.strategy"] = "keys"

# Source end: GitRepository nodes, `repo_name` column against property `name`.
options["relationship.source.labels"] = "GitRepository"
options["relationship.source.node.keys"] = "repo_name:name"
options["relationship.source.save.mode"] = "Match"

# Target end: GitLicense nodes, `license` column against property `name`.
options["relationship.target.labels"] = "GitLicense"
options["relationship.target.node.keys"] = "license:name"
options["relationship.target.save.mode"] = "Match"

spark_write(SparkConnector.NEO4J, has, "Append", options=options)

In [None]:
# Link each contributor to the commits they authored with a custom Cypher query.
options = get_default_options(SparkConnector.NEO4J)

# Row columns are referenced as event.<column>. The query matches the existing
# contributor and commit nodes, then MERGEs the two-relationship path
# (a)-[:AUTHOR]->(c)-[:MADE_BY]->(a).
# NOTE(review): the `ts` column of `author` is not referenced by this query.
options["query"] = """ 
    MATCH (a:GitContributor {email: event.email}), (c:GitCommit {id: event.commit})
    MERGE (a)-[:AUTHOR]->(c)-[:MADE_BY]->(a)"""

spark_write(SparkConnector.NEO4J, author, "Append", options=options)

In [None]:
# Link each contributor to the commits they committed with a custom Cypher query.
options = get_default_options(SparkConnector.NEO4J)

# Row columns are referenced as event.<column>. The query matches the existing
# contributor and commit nodes, then MERGEs the two-relationship path
# (a)-[:COMMITTED]->(c)-[:COMMITTED_BY]->(a).
# NOTE(review): the `ts` column of `committed` is not referenced by this query.
options["query"] = """ 
    MATCH (a:GitContributor {email: event.email}), (c:GitCommit {id: event.commit})
    MERGE (a)-[:COMMITTED]->(c)-[:COMMITTED_BY]->(a)"""

spark_write(SparkConnector.NEO4J, committed, "Append", options=options)

In [None]:
# Write WRITTED_IN relationships from repositories to languages, carrying the
# per-language byte count as a relationship property.
options = get_default_options(SparkConnector.NEO4J)

options["relationship"] = "WRITTED_IN"
options["relationship.save.strategy"] = "keys"
# With the "keys" strategy the connector takes relationship properties from
# "relationship.properties"; the previous "relationship.bytes" key is not a
# documented connector option.
options["relationship.properties"] = "bytes"

# Source end: GitRepository nodes, `repo_name` column against property `name`.
options["relationship.source.labels"] = "GitRepository"
options["relationship.source.node.keys"] = "repo_name:name"
options["relationship.source.save.mode"] = "Match"

# Target end: GitLanguage nodes, `language` column against property `name`.
options["relationship.target.labels"] = "GitLanguage"
options["relationship.target.node.keys"] = "language:name"
options["relationship.target.save.mode"] = "Match"

spark_write(SparkConnector.NEO4J, writted_in, "Append", options=options)

## Stopping the Spark context and Spark session

In [None]:
# Shut down Spark: stop the context first, then the session that wraps it.
spark_context = session.sparkContext
spark_context.stop()
session.stop()