In [10]:
import sys
# Directory added to the import path so that the local `utilities` module
# can be imported below.
fileDir = "/home/jovyan/notebooks/"
sys.path.append(fileDir)

# NOTE(review): the wildcard import presumably supplies the helpers that later
# cells call without a module prefix (create_spark_session, SparkConnector,
# get_default_options, spark_write, set_df_columns_nullable) - confirm against
# utilities.py and consider importing them explicitly.
from utilities import *
import pyspark.sql.functions as F

In [11]:
from pyspark import pandas as ps
import pyspark.sql.types as T

In [12]:
# Create the Spark session used by every later cell; the ARANGO connector
# constant is passed to the project helper (presumably so that it loads the
# ArangoDB datasource jars listed in the cell output - confirm in utilities).
session = create_spark_session("ArangoDB GitHub", SparkConnector.ARANGO)

Added dependencies: 
 ['arangodb-java-driver-shaded-7.1.0.jar', 'arangodb-spark-commons-3.3_2.12-1.5.0.jar', 'arangodb-spark-datasource-3.3_2.12-1.5.0.jar', 'commons-codec-1.11.jar', 'commons-logging-1.2.jar', 'httpclient-4.5.13.jar', 'httpcore-4.4.13.jar', 'jackson-dataformat-velocypack-4.1.0.jar', 'slf4j-api-2.0.7.jar']


In [13]:
# Base HDFS location of the input JSON files; the read cells build their
# paths as f"{common_URI}/{prefix}<dataset>.json".
common_URI = "hdfs://namenode:9000//data-team"
# File-name prefix prepended to every dataset name when building the paths.
prefix = "sample_"

### Reading from HDFS

In [14]:
# Remove `prefix` from the path if you need the full dataset.
languages = session.read.format("json").load(f"{common_URI}/{prefix}languages.json")
languages.printSchema()

root
 |-- language: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- bytes: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- repo_name: string (nullable = true)



In [15]:
# Remove `prefix` from the path if you need the full dataset.
repositories = session.read.format("json").load(f"{common_URI}/{prefix}repositories.json")
repositories.printSchema()

root
 |-- repo_name: string (nullable = true)
 |-- watch_count: string (nullable = true)



In [16]:
# Load the repository -> license records and show the inferred schema.
licences = session.read.format("json").load(f"{common_URI}/{prefix}licences.json")
licences.printSchema()

root
 |-- license: string (nullable = true)
 |-- repo_name: string (nullable = true)



In [17]:
# Load the file records and show the inferred schema.
files = session.read.format("json").load(f"{common_URI}/{prefix}files.json")
files.printSchema()

root
 |-- id: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- path: string (nullable = true)
 |-- ref: string (nullable = true)
 |-- repo_name: string (nullable = true)
 |-- symlink_target: string (nullable = true)



In [18]:
# Load the commit records and show the inferred schema.
commits = session.read.format("json").load(f"{common_URI}/{prefix}commits.json")
commits.printSchema()

root
 |-- author: struct (nullable = true)
 |    |-- date: struct (nullable = true)
 |    |    |-- seconds: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- time_sec: string (nullable = true)
 |    |-- tz_offset: string (nullable = true)
 |-- commit: string (nullable = true)
 |-- committer: struct (nullable = true)
 |    |-- date: struct (nullable = true)
 |    |    |-- seconds: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- time_sec: string (nullable = true)
 |    |-- tz_offset: string (nullable = true)
 |-- difference: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- new_mode: string (nullable = true)
 |    |    |-- new_path: string (nullable = true)
 |    |    |-- new_repo: string (nullable = true)
 |    |    |-- new_sha1: string (nullable = true)
 |    |    |-- old_mode: string (nullable = true)
 |    |    

### Data Pre-processing

In [19]:
def remove_back(text):
    """Replace every "/" in ``text`` with "::".

    Used (wrapped in a Spark UDF) on the repository-name columns before they
    become document keys; presumably because "/" separates collection and key
    in ArangoDB document ids - confirm.

    Args:
        text: The string to rewrite, or ``None`` for a null column value.

    Returns:
        The rewritten string, or ``None`` when ``text`` is ``None`` so that
        null values pass through instead of raising ``AttributeError``.
    """
    if text is None:
        return None
    return text.replace("/", "::")

# Expose the helper to Spark as a string-returning UDF.
remove_udf = F.udf(remove_back, T.StringType())

# Rewrite the repository identifier of every dataset in place; the column is
# called "repo" in commits and "repo_name" in all the other frames.
languages = languages.withColumn("repo_name", remove_udf(F.col("repo_name")))
licences = licences.withColumn("repo_name", remove_udf(F.col("repo_name")))
commits = commits.withColumn("repo", remove_udf(F.col("repo")))
files = files.withColumn("repo_name", remove_udf(F.col("repo_name")))
repositories = repositories.withColumn("repo_name", remove_udf(F.col("repo_name")))

In [20]:
def remove_at_and_blank(text):
    """Replace "@" with "::" and strip every space character from ``text``.

    Args:
        text: The string to rewrite (an e-mail address in this notebook), or
            ``None`` for a null column value.

    Returns:
        The rewritten string, or ``None`` when ``text`` is ``None`` so that
        null values pass through instead of raising ``AttributeError``.
    """
    if text is None:
        return None
    return text.replace("@", "::").replace(" ", "")

# Wrap the helper as a string-returning Spark UDF. Note that this rebinds the
# name `remove_at_and_blank` from the plain function to the UDF.
remove_at_and_blank = F.udf(remove_at_and_blank, T.StringType())

# Add flat, rewritten copies of the nested author/committer e-mail fields.
commits = (
    commits
    .withColumn("author_email", remove_at_and_blank(F.col("author.email")))
    .withColumn("committer_email", remove_at_and_blank(F.col("committer.email")))
)

### Data Processing

In [21]:
# Commit vertices: the commit hash becomes the document key and the subject
# becomes the title; the message is kept under its own name.
newColumns = ["_key", "title", "message"]
git_commits = commits.select(
    *[
        F.col(source_name).alias(target_name)
        for source_name, target_name in zip(["commit", "subject", "message"], newColumns)
    ]
)
git_commits.printSchema()

root
 |-- _key: string (nullable = true)
 |-- title: string (nullable = true)
 |-- message: string (nullable = true)



In [22]:
# Repository vertices: the (already rewritten) repository name is the key.
git_repositories = repositories.withColumnRenamed(existing="repo_name", new="_key")
git_repositories.show(n=10)

+--------------------+-----------+
|                _key|watch_count|
+--------------------+-----------+
|FreeCodeCamp::Fre...|      90457|
|    firehol::netdata|      13208|
|    joshbuchea::HEAD|      13125|
|braydie::HowToBeA...|      12019|
|sindresorhus::awe...|      11063|
|tensorflow::tenso...|      10728|
|     facebook::react|      10458|
|ParsePlatform::pa...|      10339|
|  loverajoel::jstips|       9585|
|facebook::react-n...|       9437|
+--------------------+-----------+



In [23]:
def remove_c_sharp(text):
    """Rewrite a language name: "#" -> "s", spaces removed, "++" -> "pp".

    The replacements are applied in that order, e.g. "C#" -> "Cs" and
    "C++" -> "Cpp".

    Args:
        text: The language name to rewrite, or ``None`` for a null value.

    Returns:
        The rewritten name, or ``None`` when ``text`` is ``None`` so that
        null values pass through instead of raising ``AttributeError``.
    """
    if text is None:
        return None
    return text.replace("#", "s").replace(" ", "").replace("++", "pp")

# Wrap the helper as a string-returning Spark UDF. Note that this rebinds the
# name `remove_c_sharp` from the plain function to the UDF.
remove_c_sharp = F.udf(remove_c_sharp, T.StringType())

# Language vertices: one row per distinct language key found in any repository.
git_languages = (
    languages
    .select(F.explode(languages["language.name"]).alias("_key"))
    .withColumn("_key", remove_c_sharp(F.col("_key")))
    # De-duplicate AFTER the rewrite: two different raw names that collapse
    # to the same key must produce a single document, not a key collision.
    .dropDuplicates(["_key"])
)
git_languages.printSchema()

root
 |-- _key: string (nullable = true)



In [24]:
# License vertices: one row per distinct license identifier, used as the key.
git_licenses = (
    licences
    .select(F.col("license").alias("_key"))
    .dropDuplicates(["_key"])
)
git_licenses.printSchema()

root
 |-- _key: string (nullable = true)



In [25]:
# File vertices: the file id becomes the document key.
# NOTE(review): rows are not de-duplicated on the id here - confirm that an id
# cannot appear more than once in the input.
git_files = files.select(
    F.col("id").alias("_key"),
    "ref",
    "path",
    "mode",
    "symlink_target",
)
git_files.printSchema()

root
 |-- _key: string (nullable = true)
 |-- ref: string (nullable = true)
 |-- path: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- symlink_target: string (nullable = true)



In [26]:
# Contributor vertices: everyone who appears as an author or a committer.
# The rewritten e-mail is the document key, so uniqueness must be enforced on
# the key itself. De-duplicating on the display name (as before) both allowed
# duplicate keys (one e-mail, several names) and dropped contributors who
# share a name but have different e-mails, although the AUTHOR / COMMITTED
# edges reference contributors by e-mail.
git_contributor = (
    commits.select(F.col("author.name").alias("name"), F.col("author_email").alias("_key"))
    .union(
        commits.select(F.col("committer.name").alias("name"), F.col("committer_email").alias("_key"))
    )
    # An empty (or null) e-mail cannot serve as a key.
    .filter(F.col("_key") != "")
    .dropDuplicates(["_key"])
)
git_contributor.printSchema()

root
 |-- name: string (nullable = true)
 |-- _key: string (nullable = true)



### Writing the nodes in the graph

In [27]:
# Write the commit vertices to the "GitCommit" collection.
options = get_default_options(SparkConnector.ARANGO)
options.update({
    "table.type": "document",
    "table": "GitCommit",
    "createCollection": "true",
})

spark_write(SparkConnector.ARANGO, git_commits, "Overwrite", options=options)

Dataframe saved to ARANGO


In [28]:
# Write the repository vertices to the "GitRepository" collection.
options = get_default_options(SparkConnector.ARANGO)
options.update({
    "table.type": "document",
    "table": "GitRepository",
    "createCollection": "true",
})

spark_write(SparkConnector.ARANGO, git_repositories, "Overwrite", options=options)

Dataframe saved to ARANGO


In [29]:
# Write the contributor vertices to the "GitContributor" collection.
options = get_default_options(SparkConnector.ARANGO)
options.update({
    "table.type": "document",
    "table": "GitContributor",
    "createCollection": "true",
})

spark_write(SparkConnector.ARANGO, git_contributor, "Overwrite", options=options)

Dataframe saved to ARANGO


In [30]:
# Write the language vertices to the "GitLanguage" collection.
options = get_default_options(SparkConnector.ARANGO)
options.update({
    "table.type": "document",
    "table": "GitLanguage",
    "createCollection": "true",
})

spark_write(SparkConnector.ARANGO, git_languages, "Overwrite", options=options)

Dataframe saved to ARANGO


In [31]:
# Write the license vertices to the "GitLicense" collection.
options = get_default_options(SparkConnector.ARANGO)
options.update({
    "table.type": "document",
    "table": "GitLicense",
    "createCollection": "true",
})

spark_write(SparkConnector.ARANGO, git_licenses, "Overwrite", options=options)

Dataframe saved to ARANGO


In [32]:
# Write the file vertices to the "GitFile" collection.
options = get_default_options(SparkConnector.ARANGO)
options.update({
    "table.type": "document",
    "table": "GitFile",
    "createCollection": "true",
})

spark_write(SparkConnector.ARANGO, git_files, "Overwrite", options=options)

Dataframe saved to ARANGO


### Writing the relationships in the graph

In [33]:
# BELONGS_TO edges: commit -> repository.
# Edge endpoints are written as "<collection>/<key>". The prefixes are added
# with plain column expressions instead of a pandas-on-Spark round trip:
# to_pandas_on_spark() is deprecated and attaches a default index only to
# prepend a constant string.
# commits["repo"] has already been rewritten by remove_udf in the
# pre-processing cell, so the UDF is not applied a second time here.
edges_df = commits.select(
    F.concat(F.lit("GitCommit/"), F.col("commit")).alias("_from"),
    F.concat(F.lit("GitRepository/"), F.col("repo")).alias("_to"),
)
# Mark both endpoint columns as non-nullable via the project helper.
belongs_to_df = set_df_columns_nullable(session, edges_df, ["_from", "_to"], False)
belongs_to_df.printSchema()



root
 |-- _from: string (nullable = false)
 |-- _to: string (nullable = false)





In [34]:
# STAYS_IN edges: file -> repository, one edge per distinct pair.
# Edge endpoints are written as "<collection>/<key>". The prefixes are added
# with plain column expressions instead of a pandas-on-Spark round trip:
# to_pandas_on_spark() is deprecated and attaches a default index only to
# prepend a constant string.
edges_df = (
    files
    .select(
        F.concat(F.lit("GitFile/"), F.col("id")).alias("_from"),
        F.concat(F.lit("GitRepository/"), F.col("repo_name")).alias("_to"),
    )
    .dropDuplicates(["_from", "_to"])
)

# Mark both endpoint columns as non-nullable via the project helper.
stays_in_df = set_df_columns_nullable(session, edges_df, ["_from", "_to"], False)
stays_in_df.printSchema()

root
 |-- _from: string (nullable = false)
 |-- _to: string (nullable = false)



In [35]:
# PARENT edges: commit -> each of its parent commits, one edge per pair.
# Edge endpoints are written as "<collection>/<key>". The prefixes are added
# with plain column expressions instead of a pandas-on-Spark round trip:
# to_pandas_on_spark() is deprecated and attaches a default index only to
# prepend a constant string.
edges_df = (
    commits
    # One row per (commit, parent) element of the "parent" array.
    .select("commit", F.explode("parent").alias("parent"))
    .select(
        F.concat(F.lit("GitCommit/"), F.col("commit")).alias("_from"),
        F.concat(F.lit("GitCommit/"), F.col("parent")).alias("_to"),
    )
    .dropDuplicates(["_from", "_to"])
)

# Mark both endpoint columns as non-nullable via the project helper.
parent_df = set_df_columns_nullable(session, edges_df, ["_from", "_to"], False)
parent_df.printSchema()

root
 |-- _from: string (nullable = false)
 |-- _to: string (nullable = false)



In [36]:
# HAS edges: repository -> license, one edge per distinct pair.
# Edge endpoints are written as "<collection>/<key>". The prefixes are added
# with plain column expressions instead of a pandas-on-Spark round trip:
# to_pandas_on_spark() is deprecated and attaches a default index only to
# prepend a constant string. A single de-duplication is enough; the original
# performed the same one twice.
edges_df = (
    licences
    .select(
        F.concat(F.lit("GitRepository/"), F.col("repo_name")).alias("_from"),
        F.concat(F.lit("GitLicense/"), F.col("license")).alias("_to"),
    )
    .dropDuplicates(["_from", "_to"])
)

# Mark both endpoint columns as non-nullable via the project helper.
has_df = set_df_columns_nullable(session, edges_df, ["_from", "_to"], False)
has_df.printSchema()

root
 |-- _from: string (nullable = false)
 |-- _to: string (nullable = false)



In [37]:
# AUTHOR edges: contributor (by rewritten author e-mail) -> commit, carrying
# the author date as the integer attribute "ts".
# Edge endpoints are written as "<collection>/<key>". The prefixes are added
# with plain column expressions instead of a pandas-on-Spark round trip:
# to_pandas_on_spark() is deprecated and attaches a default index only to
# prepend a constant string.
# NOTE(review): IntegerType is 32-bit; if "seconds" is an epoch timestamp,
# values that do not fit would not survive the cast - confirm whether
# LongType is wanted.
edges_df = (
    commits
    # Commits without an author e-mail cannot be linked to a contributor.
    .filter(F.col("author_email") != "")
    .select(
        F.concat(F.lit("GitContributor/"), F.col("author_email")).alias("_from"),
        F.concat(F.lit("GitCommit/"), F.col("commit")).alias("_to"),
        F.col("author.date.seconds").cast(T.IntegerType()).alias("ts"),
    )
    .dropDuplicates(["_from", "_to"])
)

# Mark both endpoint columns as non-nullable via the project helper.
author_df = set_df_columns_nullable(session, edges_df, ["_from", "_to"], False)
author_df.printSchema()

root
 |-- _from: string (nullable = false)
 |-- _to: string (nullable = false)
 |-- ts: integer (nullable = true)



In [38]:
# COMMITTED edges: contributor (by rewritten committer e-mail) -> commit,
# carrying the committer date as the integer attribute "ts".
# Edge endpoints are written as "<collection>/<key>". The prefixes are added
# with plain column expressions instead of a pandas-on-Spark round trip:
# to_pandas_on_spark() is deprecated and attaches a default index only to
# prepend a constant string.
# NOTE(review): IntegerType is 32-bit; if "seconds" is an epoch timestamp,
# values that do not fit would not survive the cast - confirm whether
# LongType is wanted.
edges_df = (
    commits
    # Commits without a committer e-mail cannot be linked to a contributor.
    .filter(F.col("committer_email") != "")
    .select(
        F.concat(F.lit("GitContributor/"), F.col("committer_email")).alias("_from"),
        F.concat(F.lit("GitCommit/"), F.col("commit")).alias("_to"),
        F.col("committer.date.seconds").cast(T.IntegerType()).alias("ts"),
    )
    .dropDuplicates(["_from", "_to"])
)

# Mark both endpoint columns as non-nullable via the project helper.
committed_df = set_df_columns_nullable(session, edges_df, ["_from", "_to"], False)
committed_df.printSchema()

root
 |-- _from: string (nullable = false)
 |-- _to: string (nullable = false)
 |-- ts: integer (nullable = true)



In [49]:
# WRITTEN_IN edges: repository -> language, carrying the byte count as the
# integer attribute "bytes".
# Edge endpoints are written as "<collection>/<key>". The prefixes are added
# with plain column expressions instead of a pandas-on-Spark round trip:
# to_pandas_on_spark() is deprecated and attaches a default index only to
# prepend a constant string.
# NOTE(review): IntegerType is 32-bit; byte counts that do not fit would not
# survive the cast - confirm whether LongType is wanted.
edges_df = (
    languages
    # One row per (repository, language struct) element of the array.
    .select("repo_name", F.explode("language").alias("lang"))
    .select(
        F.col("repo_name").alias("_from"),
        F.col("lang.name").alias("_to"),
        F.col("lang.bytes").cast(T.IntegerType()).alias("bytes"),
    )
    .dropDuplicates(["_from", "_to"])
    .select(
        F.concat(F.lit("GitRepository/"), F.col("_from")).alias("_from"),
        # `remove_c_sharp` is the UDF at this point; it applies the same
        # rewrite that produced the GitLanguage keys.
        F.concat(F.lit("GitLanguage/"), remove_c_sharp(F.col("_to"))).alias("_to"),
        "bytes",
    )
)

# Mark both endpoint columns as non-nullable via the project helper.
writted_in_df = set_df_columns_nullable(session, edges_df, ["_from", "_to"], False)
writted_in_df.printSchema()

root
 |-- _from: string (nullable = false)
 |-- _to: string (nullable = false)
 |-- bytes: integer (nullable = true)



In [40]:
# Write the commit -> repository edges to the "BELONGS_TO" edge collection.
options = get_default_options(SparkConnector.ARANGO)
options.update({
    "table.type": "edge",
    "table": "BELONGS_TO",
    "createCollection": "true",
})

spark_write(SparkConnector.ARANGO, belongs_to_df, "Overwrite", options=options)

Dataframe saved to ARANGO


In [41]:
# Write the file -> repository edges to the "STAYS_IN" edge collection.
options = get_default_options(SparkConnector.ARANGO)
options.update({
    "table.type": "edge",
    "table": "STAYS_IN",
    "createCollection": "true",
})

spark_write(SparkConnector.ARANGO, stays_in_df, "Overwrite", options=options)

Dataframe saved to ARANGO


In [42]:
# Write the commit -> parent commit edges to the "PARENT" edge collection.
options = get_default_options(SparkConnector.ARANGO)
options.update({
    "table.type": "edge",
    "table": "PARENT",
    "createCollection": "true",
})

spark_write(SparkConnector.ARANGO, parent_df, "Overwrite", options=options)

Dataframe saved to ARANGO


In [43]:
# Write the repository -> license edges to the "HAS" edge collection.
options = get_default_options(SparkConnector.ARANGO)
options.update({
    "table.type": "edge",
    "table": "HAS",
    "createCollection": "true",
})

spark_write(SparkConnector.ARANGO, has_df, "Overwrite", options=options)

Dataframe saved to ARANGO


In [44]:
# Write the contributor -> commit authorship edges to the "AUTHOR" edge
# collection.
options = get_default_options(SparkConnector.ARANGO)
options["table.type"] = "edge"
options["table"] = "AUTHOR"
options["createCollection"] = "true"

# Use "Overwrite" like every other write in this notebook. With "Append",
# re-running the notebook would presumably add a second copy of every edge,
# because these edge rows carry no key of their own.
spark_write(SparkConnector.ARANGO, author_df, "Overwrite", options=options)

Dataframe saved to ARANGO


In [45]:
# Write the contributor -> commit committer edges to the "COMMITTED" edge
# collection.
options = get_default_options(SparkConnector.ARANGO)
options.update({
    "table.type": "edge",
    "table": "COMMITTED",
    "createCollection": "true",
})

spark_write(SparkConnector.ARANGO, committed_df, "Overwrite", options=options)

Dataframe saved to ARANGO


In [51]:
# Write the repository -> language edges to the "WRITTEN_IN" edge collection.
options = get_default_options(SparkConnector.ARANGO)
options.update({
    "table.type": "edge",
    "table": "WRITTEN_IN",
    "createCollection": "true",
})

spark_write(SparkConnector.ARANGO, writted_in_df, "Overwrite", options=options)

Dataframe saved to ARANGO


### Stop the Spark context and the Spark session

In [None]:
# SparkSession.stop() also stops the underlying SparkContext, so a separate
# session.sparkContext.stop() call beforehand is redundant.
session.stop()