In [1]:
import sys
# Put the notebooks directory on the import path before the `utilities` import
# below (presumably that is where utilities.py lives — TODO confirm).
# NOTE(review): absolute, environment-specific path; consider making it configurable.
fileDir = "/home/jovyan/notebooks/"
sys.path.append(fileDir)

from utilities import *
import pyspark.sql.functions as F

In [13]:
from pyspark import pandas as ps
import pyspark.sql.types as T

In [3]:
# Create the Spark session used by every cell below. The helper and the
# `SparkConnector` enum are not defined in this notebook (presumably provided by
# the `utilities` star-import — TODO confirm); the ARANGO connector is requested.
session = create_spark_session("ArangoDB GitHub", SparkConnector.ARANGO)

Added dependencies: 
 ['arangodb-java-driver-shaded-7.1.0.jar', 'arangodb-spark-commons-3.3_2.12-1.5.0.jar', 'arangodb-spark-datasource-3.3_2.12-1.5.0.jar', 'commons-codec-1.11.jar', 'commons-logging-1.2.jar', 'httpclient-4.5.13.jar', 'httpcore-4.4.13.jar', 'jackson-dataformat-velocypack-4.1.0.jar', 'slf4j-api-2.0.7.jar']


In [4]:
# Base HDFS location that the input JSON file names are appended to.
# NOTE(review): the double slash after the port ("9000//data-team/") looks
# unintentional — confirm before normalising it.
common_URI = "hdfs://namenode:9000//data-team/"

In [16]:
# Load the commits sample from HDFS and display the inferred schema.
commits_path = f"{common_URI}commits_min.json"
commits = session.read.json(commits_path)
commits.printSchema()

root
 |-- author: struct (nullable = true)
 |    |-- date: struct (nullable = true)
 |    |    |-- seconds: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- time_sec: string (nullable = true)
 |    |-- tz_offset: string (nullable = true)
 |-- commit: string (nullable = true)
 |-- committer: struct (nullable = true)
 |    |-- date: struct (nullable = true)
 |    |    |-- seconds: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- time_sec: string (nullable = true)
 |    |-- tz_offset: string (nullable = true)
 |-- difference: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- new_mode: string (nullable = true)
 |    |    |-- new_path: string (nullable = true)
 |    |    |-- new_sha1: string (nullable = true)
 |    |    |-- old_mode: string (nullable = true)
 |    |    |-- old_path: string (nullable = true)
 |    |    

In [17]:
# Project the three commit fields that become GitCommit documents and rename
# them positionally: commit -> _key, subject -> title, message -> message.
source_columns = ["commit", "subject", "message"]
newColumns = ["_key", "title", "message"]
git_commits = commits.select(
    *[F.col(src).alias(dst) for src, dst in zip(source_columns, newColumns)]
)
git_commits.printSchema()

root
 |-- _key: string (nullable = true)
 |-- title: string (nullable = true)
 |-- message: string (nullable = true)



In [18]:
# Load the repositories sample from HDFS and display the inferred schema.
# Change the file name here if you need the full dataset.
repositories_path = f"{common_URI}repositories_min.json"
repositories = session.read.json(repositories_path)
repositories.printSchema()

root
 |-- repo_name: string (nullable = true)
 |-- watch_count: string (nullable = true)



In [30]:
def remove_back(text):
    """Return ``text`` with every "/" replaced by "::".

    This notebook uses it (through a Spark UDF) to turn a repository name such
    as ``owner/project`` into a value usable as an ArangoDB ``_key`` and as the
    key part of an edge ``_to`` id, where "/" separates collection and key.

    Args:
        text: The string to sanitise, or ``None``.

    Returns:
        The sanitised string, or ``None`` when ``text`` is ``None``.
    """
    # Spark hands SQL NULL to a Python UDF as None; without this guard a single
    # null value raises AttributeError and fails the whole job.
    if text is None:
        return None
    return text.replace("/", "::")

# Expose the plain-Python sanitiser as a Spark UDF producing a string column.
remove_udf = F.udf(remove_back, returnType=T.StringType())

# Derive `_key` from the repository name, then discard the raw name so only
# `watch_count` and `_key` remain.
git_repositories = (
    repositories
    .withColumn("_key", remove_udf(F.col("repo_name")))
    .drop("repo_name")
)
git_repositories.show(10)

+-----------+--------------------+
|watch_count|                _key|
+-----------+--------------------+
|       4609|0x5e::wechat-dele...|
|       2574| 0xAX::linux-insides|
|       1983|         1000ch::grd|
|        659|  100apps::openshare|
|       1450|   10up::flexibility|
|        671|360Controller::36...|
|        818|   3lvis::Networking|
|       1131|500px::greedo-lay...|
|        753|500px::greedo-lay...|
|        880|81813780::AVLoadi...|
+-----------+--------------------+
only showing top 10 rows



In [20]:
# Write the commit documents to the "GitCommit" document collection.
options = get_default_options(SparkConnector.ARANGO)
options.update(
    {
        "table.type": "document",
        "table": "GitCommit",
        "createCollection": "true",
    }
)

spark_write(SparkConnector.ARANGO, git_commits, "Overwrite", options=options)

Dataframe saved to ARANGO


In [26]:
# Write the repository documents to the "GitRepository" document collection.
options = get_default_options(SparkConnector.ARANGO)
options.update(
    {
        "table.type": "document",
        "table": "GitRepository",
        "createCollection": "true",
    }
)

spark_write(SparkConnector.ARANGO, git_repositories, "Overwrite", options=options)

Dataframe saved to ARANGO


In [31]:
# Build the commit -> repository edge list.
#
# `_from` is the commit hash and `_to` is the repository name sanitised with the
# same UDF used for GitRepository `_key`, so both sides resolve to the same key.
# Rows with a missing endpoint are dropped first: an edge needs both ends, and
# the columns are declared non-nullable further down.
edges_df = (
    commits
    .select("commit", "repo")
    .dropna(subset=["commit", "repo"])
    .select(
        F.col("commit").alias("_from"),
        remove_udf(F.col("repo")).alias("_to"),
    )
)

# Prefix each endpoint with its collection name ("<collection>/<key>") using
# native Spark column functions. This replaces a round trip through
# pandas-on-Spark (`to_pandas_on_spark()`, deprecated in favour of
# `pandas_api()`) that was only used to prepend the two string literals.
spark_df = edges_df.select(
    F.concat(F.lit("GitCommit/"), F.col("_from")).alias("_from"),
    F.concat(F.lit("GitRepository/"), F.col("_to")).alias("_to"),
)

# Mark both endpoint columns as non-nullable via the project helper
# (defined outside this notebook), then inspect the result.
df = set_df_columns_nullable(session, spark_df, ["_from", "_to"], False)
df.printSchema()
df.show(10)


root
 |-- _from: string (nullable = false)
 |-- _to: string (nullable = false)

+--------------------+--------------------+
|               _from|                 _to|
+--------------------+--------------------+
|GitCommit/98da69b...|GitRepository/Aut...|
|GitCommit/35a6b40...|GitRepository/Fre...|
|GitCommit/c575b2d...|GitRepository/But...|
|GitCommit/6d5e386...|GitRepository/Fre...|
|GitCommit/dfd09c9...|GitRepository/Bro...|
|GitCommit/1457ff5...|GitRepository/Aut...|
|GitCommit/4eb66ac...|GitRepository/Bla...|
|GitCommit/2271758...|GitRepository/Cos...|
|GitCommit/38d71ab...|GitRepository/Fre...|
|GitCommit/c060524...|GitRepository/Fre...|
+--------------------+--------------------+
only showing top 10 rows



In [33]:
# Write the commit -> repository edges to the "BELONGS_TO" edge collection.
options = get_default_options(SparkConnector.ARANGO)
options.update(
    {
        "table.type": "edge",
        "table": "BELONGS_TO",
        "createCollection": "true",
    }
)

spark_write(SparkConnector.ARANGO, df, "Overwrite", options=options)

Dataframe saved to ARANGO
