In [None]:
import sys
# Directory appended to the module search path; presumably this is where the
# `utilities` module imported below lives — TODO confirm.
# NOTE(review): absolute, environment-specific path.
fileDir = "/home/jovyan/notebooks/"
sys.path.append(fileDir)

from utilities import *
import pyspark.sql.functions as F

In [None]:
from pyspark import pandas as ps
import pyspark.sql.types as T

In [None]:
# Spark session shared by every cell below. `create_spark_session` and
# `SparkConnector` are not defined in this notebook; presumably they come from
# the `utilities` star-import — TODO confirm.
session = create_spark_session("ArangoDB GitHub", SparkConnector.ARANGO)

In [None]:
# Base HDFS location of the input files, and the file-name prefix prepended to
# every dataset name when building a path. Per the notes on the read cells,
# dropping the prefix selects the full datasets instead of the samples.
# NOTE(review): the URI has a double slash before "data-team" — confirm it is intended.
common_URI = "hdfs://namenode:9000//data-team"
prefix = "sample_"

### Reading from HDFS

In [None]:
# Drop `prefix` from the path if you need the full dataset.
languages_path = f"{common_URI}/{prefix}languages.json"
languages = session.read.format("json").load(languages_path)
languages.printSchema()

In [None]:
# Drop `prefix` from the path if you need the full dataset.
repositories_json_path = f"{common_URI}/{prefix}repositories.json"
repositories_json = session.read.format("json").load(repositories_json_path)
repositories_json.printSchema()

In [None]:
# Repository metadata exported as CSV; only the columns needed for the graph
# (name, star count, topics) are kept.
repositories_csv_path = f"{common_URI}/{prefix}repo_API_data.csv"
repositories_csv = (
    session.read
    .options(header=True, inferSchema=True)
    .csv(repositories_csv_path)
    .select("repo_name", "stargazers_count", "topics")
)
repositories_csv.printSchema()

In [None]:
# Enrich the JSON repository records with the CSV metadata (left join, so
# repositories without CSV data are kept).
# The join is expressed on the column *name*: joining on an equality
# expression keeps both `repo_name` columns in the result, which makes every
# later reference to "repo_name" ambiguous. Joining by name yields one column.
repositories = repositories_json.join(repositories_csv, on="repo_name", how="left")
repositories.printSchema()

In [None]:
# Licence records, read from the JSON file under the common HDFS location.
licences_path = f"{common_URI}/{prefix}licences.json"
licences = session.read.format("json").load(licences_path)
licences.printSchema()

In [None]:
# Commit records, read from the JSON file under the common HDFS location.
commits_path = f"{common_URI}/{prefix}commits.json"
commits = session.read.format("json").load(commits_path)
commits.printSchema()

### Data Pre-processing

In [None]:
def remove_back(text):
    """Replace every "/" in ``text`` with "::".

    Args:
        text: The string to sanitise, or ``None``.

    Returns:
        The sanitised string, or ``None`` when ``text`` is ``None``. The guard
        matters because this function is wrapped in a Spark UDF, which passes
        ``None`` for null column values.
    """
    if text is None:
        return None
    return text.replace("/", "::")

# Wrap the sanitiser as a string-returning UDF and apply it to the repository
# name column of every dataset, so they all share the same name format.
remove_udf = F.udf(remove_back, returnType=T.StringType())
repositories = repositories.withColumn("repo_name", remove_udf(F.col("repo_name")))
commits = commits.withColumn("repo", remove_udf(F.col("repo")))
licences = licences.withColumn("repo_name", remove_udf(F.col("repo_name")))
languages = languages.withColumn("repo_name", remove_udf(F.col("repo_name")))

In [None]:
def remove_at_and_blank(text):
    """Replace "@" with "::" and strip all spaces from ``text``.

    Args:
        text: The string to sanitise (an e-mail address in this notebook), or
            ``None``.

    Returns:
        The sanitised string, or ``None`` when ``text`` is ``None``. The guard
        matters because this function is wrapped in a Spark UDF, which passes
        ``None`` for null column values.
    """
    if text is None:
        return None
    return text.replace("@", "::").replace(" ", "")

# From here on the name `remove_at_and_blank` refers to the Spark UDF wrapping
# the plain function defined above.
remove_at_and_blank = F.udf(remove_at_and_blank, returnType=T.StringType())

# Add flat, sanitised copies of the nested author / committer e-mail fields.
commits = (
    commits
    .withColumn("author_email", remove_at_and_blank(F.col("author.email")))
    .withColumn("committer_email", remove_at_and_blank(F.col("committer.email")))
)

### Data Processing

In [None]:
# Commit nodes: `commit` becomes the document `_key`, `subject` the `title`;
# `message` is kept under its own name.
git_commits = commits.select(
    F.col("commit").alias("_key"),
    F.col("subject").alias("title"),
    F.col("message"),
)
git_commits.printSchema()

In [None]:
# Repository nodes: the (already sanitised) repository name becomes the
# document `_key`; all other columns are carried over unchanged.
git_repositories = repositories.withColumnRenamed("repo_name", "_key")
# Preview the first 10 rows.
git_repositories.show(10)

In [None]:
def remove_c_sharp(text):
    """Rewrite a language name by substituting "#", spaces and "++".

    The substitutions are applied in this order: "#" becomes "s", spaces are
    removed, and "++" becomes "pp" (e.g. "C#" -> "Cs", "C++" -> "Cpp").

    Args:
        text: The language name, or ``None``.

    Returns:
        The rewritten name, or ``None`` when ``text`` is ``None``. The guard
        matters because this function is wrapped in a Spark UDF, which passes
        ``None`` for null column values.
    """
    if text is None:
        return None
    return text.replace("#", "s").replace(" ", "").replace("++", "pp")

# From here on the name `remove_c_sharp` refers to the Spark UDF wrapping the
# plain function defined above.
remove_c_sharp = F.udf(remove_c_sharp, T.StringType())

# Language nodes: one document per distinct language name.
# The name is sanitised *before* de-duplicating. Distinct raw names can map to
# the same sanitised value (for instance names that differ only by spaces),
# and `_key` has to be unique within the collection.
git_languages = (
    languages
    .select(F.explode(languages["language.name"]).alias("_key"))
    .withColumn("_key", remove_c_sharp(F.col("_key")))
    .dropDuplicates(["_key"])
)
git_languages.printSchema()

In [None]:
# Licence nodes: one document per distinct licence, keyed by the licence value.
git_licenses = (
    licences
    .select(F.col("license").alias("_key"))
    .dropDuplicates(["_key"])
)
git_licenses.printSchema()

In [None]:
# Contributor nodes: everyone who appears as author or committer of a commit,
# keyed by their sanitised e-mail address.
authors = commits.select(
    F.col("author.name").alias("name"),
    F.col("author_email").alias("_key"),
)
committers = commits.select(
    F.col("committer.name").alias("name"),
    F.col("committer_email").alias("_key"),
)

# De-duplicate on the key (e-mail), not on the display name:
#  * one e-mail used with several names must yield a single document, since
#    `_key` has to be unique within the collection;
#  * two people sharing a name but not an e-mail must both be kept, because
#    the AUTHOR / COMMITTED edges reference contributors by e-mail.
# Which name survives for a given e-mail is arbitrary.
git_contributor = (
    authors.union(committers)
    .filter(F.col("_key") != "")  # also discards null keys
    .dropDuplicates(["_key"])
)
git_contributor.printSchema()

### Writing the nodes in the graph

In [None]:
# Write the commit nodes to the "GitCommit" document collection.
options = get_default_options(SparkConnector.ARANGO)
options.update({
    "table.type": "document",
    "table": "GitCommit",
    "createCollection": "true",
})

spark_write(SparkConnector.ARANGO, git_commits, "Overwrite", options=options)

In [None]:
# Write the repository nodes to the "GitRepository" document collection.
options = get_default_options(SparkConnector.ARANGO)
options.update({
    "table.type": "document",
    "table": "GitRepository",
    "createCollection": "true",
})

spark_write(SparkConnector.ARANGO, git_repositories, "Overwrite", options=options)

In [None]:
# Write the contributor nodes to the "GitContributor" document collection.
options = get_default_options(SparkConnector.ARANGO)
options.update({
    "table.type": "document",
    "table": "GitContributor",
    "createCollection": "true",
})

spark_write(SparkConnector.ARANGO, git_contributor, "Overwrite", options=options)

In [None]:
# Write the language nodes to the "GitLanguage" document collection.
options = get_default_options(SparkConnector.ARANGO)
options.update({
    "table.type": "document",
    "table": "GitLanguage",
    "createCollection": "true",
})

spark_write(SparkConnector.ARANGO, git_languages, "Overwrite", options=options)

In [None]:
# Write the licence nodes to the "GitLicense" document collection.
options = get_default_options(SparkConnector.ARANGO)
options.update({
    "table.type": "document",
    "table": "GitLicense",
    "createCollection": "true",
})

spark_write(SparkConnector.ARANGO, git_licenses, "Overwrite", options=options)

### Writing the relationships in the graph

In [None]:
# BELONGS_TO edges: GitCommit -> GitRepository.
# The "<collection>/<key>" document handles are built with native Spark column
# functions rather than a round trip through pandas-on-Spark
# (`to_pandas_on_spark` is deprecated in favour of `pandas_api`, and the
# conversion is not needed for a plain string concatenation).
# `repo` was already sanitised ("/" -> "::") in the pre-processing step, and
# that replacement is idempotent, so it is not applied a second time here.
belongs_to_df = commits.select(
    F.concat(F.lit("GitCommit/"), F.col("commit")).alias("_from"),
    F.concat(F.lit("GitRepository/"), F.col("repo")).alias("_to"),
)
# Mark both endpoint columns as non-nullable in the schema.
belongs_to_df = set_df_columns_nullable(session, belongs_to_df, ["_from", "_to"], False)
belongs_to_df.printSchema()

In [None]:
# CONTAINS edges: GitRepository -> GitCommit (the reverse of BELONGS_TO).
# The "<collection>/<key>" document handles are built with native Spark column
# functions rather than a round trip through pandas-on-Spark
# (`to_pandas_on_spark` is deprecated in favour of `pandas_api`, and the
# conversion is not needed for a plain string concatenation).
# `repo` was already sanitised ("/" -> "::") in the pre-processing step, and
# that replacement is idempotent, so it is not applied a second time here.
# Column order is kept as in the original frame: `_to` first, then `_from`.
contains_df = commits.select(
    F.concat(F.lit("GitCommit/"), F.col("commit")).alias("_to"),
    F.concat(F.lit("GitRepository/"), F.col("repo")).alias("_from"),
)
# Mark both endpoint columns as non-nullable in the schema.
contains_df = set_df_columns_nullable(session, contains_df, ["_from", "_to"], False)
contains_df.printSchema()

In [None]:
# PARENT edges: GitCommit -> GitCommit, one edge per (commit, parent) pair.
# `explode` emits one row per element of the `parent` array; commits with an
# empty or null array produce no rows.
edges_df = (
    commits
    .select(F.col("commit").alias("_from"), F.explode("parent").alias("_to"))
    .dropDuplicates(["_from", "_to"])
)

# Build the "<collection>/<key>" document handles with native Spark column
# functions rather than a round trip through pandas-on-Spark
# (`to_pandas_on_spark` is deprecated in favour of `pandas_api`).
parent_df = edges_df.select(
    F.concat(F.lit("GitCommit/"), F.col("_from")).alias("_from"),
    F.concat(F.lit("GitCommit/"), F.col("_to")).alias("_to"),
)
# Mark both endpoint columns as non-nullable in the schema.
parent_df = set_df_columns_nullable(session, parent_df, ["_from", "_to"], False)
parent_df.printSchema()

In [None]:
# HAS edges: GitRepository -> GitLicense, one edge per distinct pair.
# A single de-duplication is enough; the original ran the same one twice
# (before and after renaming the columns).
edges_df = (
    licences
    .select(F.col("repo_name").alias("_from"), F.col("license").alias("_to"))
    .dropDuplicates(["_from", "_to"])
)

# Build the "<collection>/<key>" document handles with native Spark column
# functions rather than a round trip through pandas-on-Spark
# (`to_pandas_on_spark` is deprecated in favour of `pandas_api`).
has_df = edges_df.select(
    F.concat(F.lit("GitRepository/"), F.col("_from")).alias("_from"),
    F.concat(F.lit("GitLicense/"), F.col("_to")).alias("_to"),
)
# Mark both endpoint columns as non-nullable in the schema.
has_df = set_df_columns_nullable(session, has_df, ["_from", "_to"], False)
has_df.printSchema()

In [None]:
# AUTHOR edges: GitContributor -> GitCommit, carrying the author timestamp
# (`author.date.seconds`) as the `ts` attribute.
edges_df = (
    commits
    .select(
        F.col("author_email").alias("_from"),
        F.col("commit").alias("_to"),
        # Cast to a 64-bit integer: second counts that do not fit in 32 bits
        # would otherwise overflow.
        F.col("author.date.seconds").cast(T.LongType()).alias("ts"),
    )
    .filter(F.col("_from") != "")  # no contributor node exists for empty/null e-mails
    .dropDuplicates(["_from", "_to"])
)

# Build the "<collection>/<key>" document handles with native Spark column
# functions rather than a round trip through pandas-on-Spark
# (`to_pandas_on_spark` is deprecated in favour of `pandas_api`).
author_df = edges_df.select(
    F.concat(F.lit("GitContributor/"), F.col("_from")).alias("_from"),
    F.concat(F.lit("GitCommit/"), F.col("_to")).alias("_to"),
    F.col("ts"),
)
# Mark both endpoint columns as non-nullable in the schema.
author_df = set_df_columns_nullable(session, author_df, ["_from", "_to"], False)
author_df.printSchema()

In [None]:
# COMMITTED edges: GitContributor -> GitCommit, carrying the committer
# timestamp (`committer.date.seconds`) as the `ts` attribute.
edges_df = (
    commits
    .select(
        F.col("committer_email").alias("_from"),
        F.col("commit").alias("_to"),
        # Cast to a 64-bit integer: second counts that do not fit in 32 bits
        # would otherwise overflow.
        F.col("committer.date.seconds").cast(T.LongType()).alias("ts"),
    )
    .filter(F.col("_from") != "")  # no contributor node exists for empty/null e-mails
    .dropDuplicates(["_from", "_to"])
)

# Build the "<collection>/<key>" document handles with native Spark column
# functions rather than a round trip through pandas-on-Spark
# (`to_pandas_on_spark` is deprecated in favour of `pandas_api`).
committed_df = edges_df.select(
    F.concat(F.lit("GitContributor/"), F.col("_from")).alias("_from"),
    F.concat(F.lit("GitCommit/"), F.col("_to")).alias("_to"),
    F.col("ts"),
)
# Mark both endpoint columns as non-nullable in the schema.
committed_df = set_df_columns_nullable(session, committed_df, ["_from", "_to"], False)
committed_df.printSchema()

In [None]:
# WRITTEN_IN edges: GitRepository -> GitLanguage, carrying the per-language
# byte count as the `bytes` attribute.
# One row per element of the `language` array.
lang_entries = languages.select("repo_name", F.explode("language").alias("lang"))

edges_df = (
    lang_entries
    .select(
        F.col("repo_name").alias("_from"),
        # Sanitise the language name before de-duplicating so the endpoint
        # goes through the same rewrite as the GitLanguage keys and pairs that
        # collapse to the same key are merged.
        remove_c_sharp(F.col("lang.name")).alias("_to"),
        # 64-bit cast: byte counts can exceed the 32-bit integer range.
        F.col("lang.bytes").cast(T.LongType()).alias("bytes"),
    )
    .dropDuplicates(["_from", "_to"])
)

# Build the "<collection>/<key>" document handles with native Spark column
# functions rather than a round trip through pandas-on-Spark
# (`to_pandas_on_spark` is deprecated in favour of `pandas_api`).
writted_in_df = edges_df.select(
    F.concat(F.lit("GitRepository/"), F.col("_from")).alias("_from"),
    F.concat(F.lit("GitLanguage/"), F.col("_to")).alias("_to"),
    F.col("bytes"),
)
# Mark both endpoint columns as non-nullable in the schema.
writted_in_df = set_df_columns_nullable(session, writted_in_df, ["_from", "_to"], False)
writted_in_df.printSchema()

In [None]:
# Write the commit -> repository edges to the "BELONGS_TO" edge collection.
options = get_default_options(SparkConnector.ARANGO)
options.update({
    "table.type": "edge",
    "table": "BELONGS_TO",
    "createCollection": "true",
})

spark_write(SparkConnector.ARANGO, belongs_to_df, "Overwrite", options=options)


In [None]:
# Write the repository -> commit edges to the "CONTAINS" edge collection.
options = get_default_options(SparkConnector.ARANGO)
options.update({
    "table.type": "edge",
    "table": "CONTAINS",
    "createCollection": "true",
})

spark_write(SparkConnector.ARANGO, contains_df, "Overwrite", options=options)


In [None]:
# Write the commit -> parent-commit edges to the "PARENT" edge collection.
options = get_default_options(SparkConnector.ARANGO)
options.update({
    "table.type": "edge",
    "table": "PARENT",
    "createCollection": "true",
})

spark_write(SparkConnector.ARANGO, parent_df, "Overwrite", options=options)

In [None]:
# Write the repository -> licence edges to the "HAS" edge collection.
options = get_default_options(SparkConnector.ARANGO)
options.update({
    "table.type": "edge",
    "table": "HAS",
    "createCollection": "true",
})

spark_write(SparkConnector.ARANGO, has_df, "Overwrite", options=options)

In [None]:
# Write the contributor -> commit authorship edges to the "AUTHOR" edge
# collection.
options = get_default_options(SparkConnector.ARANGO)
options["table.type"] = "edge"
options["table"] = "AUTHOR"
options["createCollection"] = "true"

# Use "Overwrite" like every other write in this notebook. With "Append",
# each re-run of the notebook adds another copy of every AUTHOR edge, because
# these edges carry no explicit `_key`.
spark_write(SparkConnector.ARANGO, author_df, "Overwrite", options=options)

In [None]:
# Write the contributor -> commit committer edges to the "COMMITTED" edge
# collection.
options = get_default_options(SparkConnector.ARANGO)
options.update({
    "table.type": "edge",
    "table": "COMMITTED",
    "createCollection": "true",
})

spark_write(SparkConnector.ARANGO, committed_df, "Overwrite", options=options)

In [None]:
# Write the repository -> language edges to the "WRITTEN_IN" edge collection.
options = get_default_options(SparkConnector.ARANGO)
options.update({
    "table.type": "edge",
    "table": "WRITTEN_IN",
    "createCollection": "true",
})

spark_write(SparkConnector.ARANGO, writted_in_df, "Overwrite", options=options)

### Stop the Spark context and the Spark session

In [None]:
# Shut Spark down explicitly once all writes have been issued.
# NOTE(review): `SparkSession.stop()` also stops the underlying SparkContext,
# so the first call looks redundant — confirm before removing.
session.sparkContext.stop()
session.stop()