## Importing library and creating spark-session

In [1]:
import sys
fileDir = "/home/jovyan/notebooks/"
sys.path.append(fileDir)

from utilities import *
import pyspark.sql.types as T
import pyspark.sql.functions as F

In [2]:
session = create_spark_session("Tigergraph GitHub", SparkConnector.TIGERGRAPH)

Added dependencies: 
 ['postgresql-42.5.0.jar', 'tigergraph-jdbc-driver-1.3.6.jar']


In [3]:
HDFS_URL = "hdfs://namenode:9000//data-team"
PREFIX = "sample_" # or ""

### Reading from HDFS

In [4]:
repositories = session.read.json(f"{HDFS_URL}/{PREFIX}repositories.json") # cambia se ti serve il dataset completo
repositories.printSchema()

root
 |-- repo_name: string (nullable = true)
 |-- watch_count: string (nullable = true)



In [5]:
languages = session.read.json(f"{HDFS_URL}/{PREFIX}languages.json")
languages.printSchema()

root
 |-- language: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- bytes: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- repo_name: string (nullable = true)



In [6]:
licences = session.read.json(f"{HDFS_URL}/{PREFIX}licences.json")
licences.printSchema()

root
 |-- license: string (nullable = true)
 |-- repo_name: string (nullable = true)



In [7]:
files = session.read.json(f"{HDFS_URL}/{PREFIX}files.json")
files.printSchema()

root
 |-- id: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- path: string (nullable = true)
 |-- ref: string (nullable = true)
 |-- repo_name: string (nullable = true)
 |-- symlink_target: string (nullable = true)



In [8]:
commits = session.read.json(f"{HDFS_URL}/{PREFIX}commits.json") # cambia se ti serve il dataset completo
commits.printSchema()

root
 |-- author: struct (nullable = true)
 |    |-- date: struct (nullable = true)
 |    |    |-- seconds: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- time_sec: string (nullable = true)
 |    |-- tz_offset: string (nullable = true)
 |-- commit: string (nullable = true)
 |-- committer: struct (nullable = true)
 |    |-- date: struct (nullable = true)
 |    |    |-- seconds: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- time_sec: string (nullable = true)
 |    |-- tz_offset: string (nullable = true)
 |-- difference: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- new_mode: string (nullable = true)
 |    |    |-- new_path: string (nullable = true)
 |    |    |-- new_repo: string (nullable = true)
 |    |    |-- new_sha1: string (nullable = true)
 |    |    |-- old_mode: string (nullable = true)
 |    |    

## Data Processing

In [9]:
git_commits = commits.select("commit", "subject", "message", "committer.date.seconds", "author.date.seconds")

newColumns = ["v_id","title","message","committer_date", "author_date"]
git_commits = git_commits.toDF(*newColumns)

git_commits.printSchema()

root
 |-- v_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- message: string (nullable = true)
 |-- committer_date: string (nullable = true)
 |-- author_date: string (nullable = true)



In [10]:
git_repositories = repositories.withColumnRenamed("repo_name", "v_id")
git_repositories = git_repositories.withColumn("watch_count", git_repositories["watch_count"].cast(T.IntegerType()))
git_repositories.printSchema()

root
 |-- v_id: string (nullable = true)
 |-- watch_count: integer (nullable = true)



In [11]:
git_languages = languages.withColumn("v_id", F.explode(languages["language.name"])).dropDuplicates(["v_id"]).select("v_id")
git_languages.printSchema()

root
 |-- v_id: string (nullable = true)



In [12]:
git_licenses = licences.select("license").withColumnRenamed("license","v_id").dropDuplicates(["v_id"])
git_licenses.printSchema()

root
 |-- v_id: string (nullable = true)



In [13]:
git_files = files.select("id","ref","path","mode","symlink_target")
git_files = git_files.withColumnRenamed("id","v_id") \
    .withColumnRenamed("ref","reference") \
    .withColumn("mode", git_files["mode"].cast(T.IntegerType()))

git_files.printSchema()

root
 |-- v_id: string (nullable = true)
 |-- reference: string (nullable = true)
 |-- path: string (nullable = true)
 |-- mode: integer (nullable = true)
 |-- symlink_target: string (nullable = true)



In [21]:
git_contributor = commits.select("author.*") \
    .union(commits.select("committer.*")) \
    .dropDuplicates(["email"])
git_contributor = git_contributor.select("email","name") \
    .filter(git_contributor["email"]!="") \
    .withColumnRenamed("email","v_id")
git_contributor.printSchema()

root
 |-- v_id: string (nullable = true)
 |-- name: string (nullable = true)



## Writing the nodes in the graph

In [15]:
options = get_default_options(SparkConnector.TIGERGRAPH)

options["dbtable"] = "vertex GitRepository"

spark_write(SparkConnector.TIGERGRAPH, git_repositories, "Append", options=options)

Dataframe saved to TIGERGRAPH


In [22]:
options = get_default_options(SparkConnector.TIGERGRAPH)

options["dbtable"] = "vertex GitContributor"

spark_write(SparkConnector.TIGERGRAPH, git_contributor, "Append", options=options)

Dataframe saved to TIGERGRAPH


In [23]:
options = get_default_options(SparkConnector.TIGERGRAPH)

options["dbtable"] = "vertex GitLanguage"

spark_write(SparkConnector.TIGERGRAPH, git_languages, "Append", options=options)

Dataframe saved to TIGERGRAPH


In [24]:
options = get_default_options(SparkConnector.TIGERGRAPH)

options["dbtable"] = "vertex GitLicense"

spark_write(SparkConnector.TIGERGRAPH, git_licenses, "Append", options=options)

Dataframe saved to TIGERGRAPH


In [25]:
options = get_default_options(SparkConnector.TIGERGRAPH)

options["dbtable"] = "vertex GitFile"

spark_write(SparkConnector.TIGERGRAPH, git_files, "Append", options=options)

Dataframe saved to TIGERGRAPH


In [26]:
options = get_default_options(SparkConnector.TIGERGRAPH)

options["dbtable"] = "vertex GitCommit"

spark_write(SparkConnector.TIGERGRAPH, git_commits, "Append", options=options)

Dataframe saved to TIGERGRAPH


## Writing the relationships in the graph

In [27]:
belongs_to = commits.select("commit","repo") \
    .withColumnRenamed("commit","GitCommit") \
    .withColumnRenamed("repo","GitRepository")
belongs_to.printSchema()

root
 |-- GitCommit: string (nullable = true)
 |-- GitRepository: string (nullable = true)



In [28]:
stays_in = files.select("id","repo_name") \
    .withColumnRenamed("id","GitFile") \
    .withColumnRenamed("repo_name","GitRepository") \
    .dropDuplicates(["GitFile","GitRepository"])
belongs_to.printSchema()

root
 |-- GitCommit: string (nullable = true)
 |-- GitRepository: string (nullable = true)



In [29]:
parent = commits.select("commit", "parent") \
    .withColumn("parent", F.explode(commits["parent"])) \
    .withColumnRenamed("commit","GitCommit") \
    .withColumnRenamed("parent","GitCommit")\
    .dropDuplicates(["GitCommit","GitCommit"])
parent.printSchema()

root
 |-- GitCommit: string (nullable = true)
 |-- GitCommit: string (nullable = true)



In [32]:
stays_in = files.select("id","repo_name") \
    .withColumnRenamed("id","GitFile") \
    .withColumnRenamed("repo_name","GitRepository") \
    .dropDuplicates(["GitFile","GitRepository"])
stays_in.printSchema()

root
 |-- GitFile: string (nullable = true)
 |-- GitRepository: string (nullable = true)



In [33]:
has = licences.select("repo_name", "license") \
    .withColumnRenamed("repo_name","GitRepository") \
    .withColumnRenamed("license","GitLicense")\
    .dropDuplicates(["GitRepository","GitLicense"])
has.printSchema()

root
 |-- GitRepository: string (nullable = true)
 |-- GitLicense: string (nullable = true)



In [45]:
author = commits.select("author.email", "commit", "author.date.seconds") \
    .withColumnRenamed("email","GitContributor") \
    .withColumnRenamed("commit","GitCommit") \
    .withColumnRenamed("seconds","ts") 
author = author \
    .filter(author["GitContributor"]!="") \
    .withColumn("ts", author["ts"].cast(T.IntegerType())) \
    .dropDuplicates(["GitContributor","GitCommit"])
author.printSchema()

root
 |-- GitContributor: string (nullable = true)
 |-- GitCommit: string (nullable = true)
 |-- ts: integer (nullable = true)



In [50]:
committed = commits.select("committer.email", "commit", "committer.date.seconds") \
    .withColumnRenamed("email","GitContributor") \
    .withColumnRenamed("commit","GitCommit") \
    .withColumnRenamed("seconds","ts") 
committed = committed \
    .filter(committed["GitContributor"]!="") \
    .withColumn("ts", committed["ts"].cast(T.IntegerType())) \
    .dropDuplicates(["GitContributor","GitCommit"])
committed.printSchema()

root
 |-- GitContributor: string (nullable = true)
 |-- GitCommit: string (nullable = true)
 |-- ts: integer (nullable = true)



In [74]:
writted_in = languages.withColumn("language", F.explode(languages["language"]))
writted_in = writted_in \
    .withColumn("GitLanguage", writted_in["language.name"]) \
    .withColumn("bytes", writted_in["language.bytes"].cast(T.IntegerType())) \
    .withColumnRenamed("repo_name", "GitRepository") \
    .select("GitRepository", "GitLanguage", "bytes")

writted_in.printSchema()

root
 |-- GitRepository: string (nullable = true)
 |-- GitLanguage: string (nullable = true)
 |-- bytes: integer (nullable = true)



## Writing relations between nodes in the graph

In [53]:
options = get_default_options(SparkConnector.TIGERGRAPH)

options["dbtable"] = "edge BELONGS_TO"

spark_write(SparkConnector.TIGERGRAPH, belongs_to, "Append", options=options)

Dataframe saved to TIGERGRAPH


In [54]:
options = get_default_options(SparkConnector.TIGERGRAPH)

options["dbtable"] = "edge HAS"

spark_write(SparkConnector.TIGERGRAPH, has, "Append", options=options)

Dataframe saved to TIGERGRAPH


In [55]:
options = get_default_options(SparkConnector.TIGERGRAPH)

options["dbtable"] = "edge STAYS_IN"

spark_write(SparkConnector.TIGERGRAPH, stays_in, "Append", options=options)

Dataframe saved to TIGERGRAPH


In [None]:
options = get_default_options(SparkConnector.TIGERGRAPH)

options["dbtable"] = "edge PARENT"

spark_write(SparkConnector.TIGERGRAPH, parent, "Append", options=options)

Dataframe saved to TIGERGRAPH


In [75]:
options = get_default_options(SparkConnector.TIGERGRAPH)

options["dbtable"] = "edge WRITTEN_IN"

spark_write(SparkConnector.TIGERGRAPH, writted_in, "Append", options=options)

Dataframe saved to TIGERGRAPH


In [47]:
options = get_default_options(SparkConnector.TIGERGRAPH)

options["dbtable"] = "edge AUTHOR"

spark_write(SparkConnector.TIGERGRAPH, author, "Append", options=options)

Dataframe saved to TIGERGRAPH


In [52]:
options = get_default_options(SparkConnector.TIGERGRAPH)

options["dbtable"] = "edge COMMITED"

spark_write(SparkConnector.TIGERGRAPH, committed, "Append", options=options)

Dataframe saved to TIGERGRAPH


## Stop spark context and spark session

In [76]:
session.sparkContext.stop()
session.stop()