## Importing libraries and creating the Spark session

In [1]:
!pip install PyGithub

Collecting PyGithub
  Downloading PyGithub-1.59.1-py3-none-any.whl (342 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m342.2/342.2 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting pynacl>=1.4.0
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m856.7/856.7 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting deprecated
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting types-cryptography>=3.3.21
  Downloading types_cryptography-3.3.23.2-py3-none-any.whl (30 kB)
Collecting wrapt<2,>=1.10
  Downloading wrapt-1.15.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.7/75.7 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing coll

In [9]:
import sys
# Add the notebook directory to the module search path
# (presumably where the local `utilities` module lives — confirm).
fileDir = "/home/jovyan/notebooks/"
sys.path.append(fileDir)

# NOTE(review): this star import is presumably what supplies create_spark_session,
# SparkConnector, get_default_options, spark_write and spark_read used in the cells
# below — no other import in this notebook provides them. Confirm against utilities.py.
from utilities import *
import pyspark.sql.functions as F
import pyspark.sql.types as T

import time as time

In [10]:
# Create the shared Spark session through the project helper, selecting the
# TigerGraph connector.
APP_NAME = "Tigergraph GitHub"
session = create_spark_session(APP_NAME, SparkConnector.TIGERGRAPH)

Added dependencies: 
 ['postgresql-42.5.0.jar', 'tigergraph-jdbc-driver-1.3.6.jar']


In [11]:
# Input location and dataset selection. PREFIX and SUFFIX are interpolated into the
# JSON file names as f"{HDFS_URL}/{PREFIX}<name>{SUFFIX}.json" by the loading cell.
HDFS_URL = "hdfs://namenode:9000//data-team"
PREFIX = "sample_" # file-name prefix: "sample_" or "" (empty)
SUFFIX = "_100" # file-name suffix: "_10", "_100" or "" (empty)

### Reading from HDFS

In [6]:
# Load stage: read every input DataFrame and record how long the reads take.
load_start_time = time.time()

# Repository list from the JSON dump. The key column is renamed to "repo" so it does
# not share a name with the CSV's "repo_name" in the join below.
repositories_json = (
    session.read.json(f"{HDFS_URL}/{PREFIX}repositories{SUFFIX}.json")
    .withColumnRenamed("repo_name", "repo")
)

# Per-repository metadata from the CSV, narrowed to the columns used downstream.
repositories_csv = (
    session.read.csv(f"{HDFS_URL}/repo_API_data.csv", header=True, inferSchema=True)
    .select("repo_name", "forks_count", "open_issues_count", "stargazers_count", "topics")
)

# Left join keeps every repository from the JSON side; CSV columns are null when
# there is no matching row.
repositories = repositories_json.join(
    repositories_csv,
    repositories_json.repo == repositories_csv.repo_name,
    "left",
).select(
    repositories_json["repo"].alias("repo_name"),
    repositories_json["watch_count"],
    repositories_csv["stargazers_count"],
    repositories_csv["topics"],
    repositories_csv["forks_count"],
    repositories_csv["open_issues_count"],
)
# Use 0 as the default value for the listed columns when they are null.
# NOTE(review): na.fill ignores subset columns whose type does not match the fill
# value, so "topics" is probably untouched if it was inferred as a string — confirm.
repositories = repositories.na.fill(
    0, ["stargazers_count", "watch_count", "topics", "forks_count", "open_issues_count"]
)

languages = session.read.json(f"{HDFS_URL}/{PREFIX}languages{SUFFIX}.json")

licences = session.read.json(f"{HDFS_URL}/{PREFIX}licences{SUFFIX}.json")

# Change PREFIX/SUFFIX if you need the full dataset.
commits = session.read.json(f"{HDFS_URL}/{PREFIX}commits{SUFFIX}.json")

load_end_time = time.time()
load_time = load_end_time - load_start_time

Dataframe saved to TIGERGRAPH
Dataframe saved to TIGERGRAPH
Dataframe saved to TIGERGRAPH
Dataframe saved to TIGERGRAPH
Dataframe saved to TIGERGRAPH
Dataframe saved to TIGERGRAPH
Dataframe saved to TIGERGRAPH
Dataframe saved to TIGERGRAPH
Dataframe saved to TIGERGRAPH
Dataframe saved to TIGERGRAPH
Dataframe saved to TIGERGRAPH
Dataframe saved to TIGERGRAPH


In [None]:

# Preprocessing stage: shape the loaded DataFrames into one DataFrame per vertex type
# and per edge type, and record how long building them takes.
preprc_start_time = time.time()

# ---- Vertices ---------------------------------------------------------------

# GitCommit: the commit id becomes the vertex id; "subject" is exposed as "title".
git_commits = commits.select(
    F.col("commit").alias("v_id"),
    F.col("subject").alias("title"),
    F.col("message"),
)

# GitRepository: repository name as vertex id, with every counter cast to integer.
git_repositories = repositories.withColumnRenamed("repo_name", "v_id")
for count_column in ("watch_count", "forks_count", "stargazers_count", "open_issues_count"):
    git_repositories = git_repositories.withColumn(
        count_column, F.col(count_column).cast(T.IntegerType())
    )

# GitLanguage: one vertex per distinct language name.
git_languages = (
    languages.select(F.explode(F.col("language.name")).alias("v_id"))
    .dropDuplicates(["v_id"])
)

# GitLicense: one vertex per distinct license.
git_licenses = licences.select(F.col("license").alias("v_id")).dropDuplicates(["v_id"])

# GitContributor: authors and committers stacked together, one row per e-mail,
# dropping rows whose e-mail is the empty string.
contributors_by_email = (
    commits.select("author.*")
    .union(commits.select("committer.*"))
    .dropDuplicates(["email"])
)
git_contributor = (
    contributors_by_email.select("email", "name")
    .filter(contributors_by_email["email"] != "")
    .withColumnRenamed("email", "v_id")
)

# ---- Edges ------------------------------------------------------------------

# BELONGS_TO: commit -> repository.
belongs_to = commits.select(
    F.col("commit").alias("GitCommit"),
    F.col("repo").alias("GitRepository"),
)

# CONTAINS: the same pairs with the columns in the opposite order.
contains = belongs_to.select("GitRepository", "GitCommit")

# PARENT: commit -> each of its parents (one row per element of the "parent" array).
# NOTE(review): both columns deliberately end up named "GitCommit"; presumably the
# TigerGraph connector maps them by position to source/target — confirm.
parent = (
    commits.select("commit", "parent")
    .withColumn("parent", F.explode(commits["parent"]))
    .withColumnRenamed("commit", "GitCommit")
    .withColumnRenamed("parent", "GitCommit")
    .dropDuplicates(["GitCommit", "GitCommit"])
)

# HAS: repository -> license, de-duplicated.
has = (
    licences.select(
        F.col("repo_name").alias("GitRepository"),
        F.col("license").alias("GitLicense"),
    )
    .dropDuplicates(["GitRepository", "GitLicense"])
)


def _contributor_edges(role):
    """Build (GitContributor, GitCommit, ts) rows from the `role` struct of `commits`.

    `role` is the name of the struct column to read ("author" or "committer"). Rows
    with an empty e-mail are dropped, `ts` is `<role>.date.seconds` cast to integer,
    and the result holds one row per (GitContributor, GitCommit) pair.
    """
    return (
        commits.select(
            F.col(f"{role}.email").alias("GitContributor"),
            F.col("commit").alias("GitCommit"),
            F.col(f"{role}.date.seconds").cast(T.IntegerType()).alias("ts"),
        )
        .filter(F.col("GitContributor") != "")
        .dropDuplicates(["GitContributor", "GitCommit"])
    )


# AUTHOR / COMMITTED: contributor -> commit, stamped with the respective date.
author = _contributor_edges("author")
committed = _contributor_edges("committer")

# WRITTEN_IN: repository -> language, carrying the byte count as an integer.
exploded_languages = languages.withColumn("language", F.explode(languages["language"]))
writted_in = exploded_languages.select(
    F.col("repo_name").alias("GitRepository"),
    F.col("language.name").alias("GitLanguage"),
    F.col("language.bytes").cast(T.IntegerType()).alias("bytes"),
)


preproc_end_time = time.time()
preproc_time = preproc_end_time - preprc_start_time

In [None]:

# Writing stage: append every vertex and edge DataFrame to TigerGraph and record the
# total time. The twelve formerly copy-pasted write stanzas are driven from one table;
# the order, targets and write mode are unchanged.
writing_start_time = time.time()

# (dbtable target, DataFrame) pairs, in write order.
tigergraph_writes = [
    # Vertices
    ("vertex GitRepository", git_repositories),
    ("vertex GitContributor", git_contributor),
    ("vertex GitLanguage", git_languages),
    ("vertex GitLicense", git_licenses),
    ("vertex GitCommit", git_commits),
    # Relationships in the graph
    ("edge BELONGS_TO", belongs_to),
    ("edge CONTAINS", contains),
    ("edge HAS", has),
    ("edge PARENT", parent),
    ("edge WRITTEN_IN", writted_in),
    ("edge AUTHOR", author),
    ("edge COMMITTED", committed),
]

for dbtable, frame in tigergraph_writes:
    # Fetch a fresh options object for every write, as the original stanzas did,
    # so no setting carries over from one target to the next.
    options = get_default_options(SparkConnector.TIGERGRAPH)
    options["dbtable"] = dbtable
    spark_write(SparkConnector.TIGERGRAPH, frame, "Append", options=options)

writing_end_time = time.time()
writing_time = (writing_end_time - writing_start_time)

In [7]:
# Report the wall-clock duration of each pipeline stage, one line per stage.
stage_timings = (
    ("Load time", load_time),
    ("Preprocessing time", preproc_time),
    ("Writing time", writing_time),
)
for stage_label, stage_seconds in stage_timings:
    print(f"{stage_label}: {stage_seconds} sec")

Load time: 21.67391324043274 sec
Preprocessing time: 0.8274252414703369 sec
Writing time: 1609.4341628551483 sec


In [None]:
# Scenario 1: time a read of the installed query TopNAuthorsWithMoreContributes,
# parameterised by N.
N = 10
start_time = time.time()
options = get_default_options(SparkConnector.TIGERGRAPH)
scenario_query = f'query TopNAuthorsWithMoreContributes(N={N})'
options["dbtable"] = scenario_query
top10contributors = spark_read(SparkConnector.TIGERGRAPH, session, options=options)
end_time = time.time()
elapsed = end_time - start_time
print(f"Scenario 1: {elapsed} sec")

In [None]:
# Scenario 2: time a read of the installed query
# ReposWithMoreThenPercentageOnLenguage for the given language and percentage.
LANGUAGE = "C++"
PERCENTAGE = 0.5

start_time = time.time()
options = get_default_options(SparkConnector.TIGERGRAPH)
options["dbtable"] = f'query ReposWithMoreThenPercentageOnLenguage(perc={PERCENTAGE}, lang="{LANGUAGE}")'
repos = spark_read(SparkConnector.TIGERGRAPH, session, options=options)
# The same read used to be issued a second time with identical options inside the
# timed window, which counted the query twice. The second name is kept as an alias
# of the single result so existing references still resolve.
bytesPercentageInRepos = repos
end_time = time.time()
print(f"Scenario 2: {end_time - start_time} sec")

In [None]:
# Scenario 3: time a read of the installed query CountMergeCommits for one repository.
REPO_NAME = "tensorflow/tensorflow"
start_time = time.time()
options = get_default_options(SparkConnector.TIGERGRAPH)
scenario_query = f'query CountMergeCommits(repo_name="{REPO_NAME}")'
options["dbtable"] = scenario_query
repos = spark_read(SparkConnector.TIGERGRAPH, session, options=options)
end_time = time.time()
elapsed = end_time - start_time
print(f"Scenario 3: {elapsed} sec")

In [12]:
# Scenario 4 - community detection: time a read of the tg_label_prop query.
# NOTE(review): this cell is labelled "Louvain" (see the printed message), but the
# query it invokes is tg_label_prop — confirm which algorithm is intended.
# The saved output of this cell shows the run was interrupted (KeyboardInterrupt).
start_time = time.time()
# Vertex and edge type sets, pre-formatted as the literal text spliced into the query.
v_type_set = '["GitContributor", "GitRepository", "GitCommit"]'
e_type_set = '["CONTAINS", "PARENT", "BELONGS_TO"]'

options = get_default_options(SparkConnector.TIGERGRAPH)
# The backslash continuations sit inside the string literal, so the indentation of
# the following lines is part of the query text.
options["dbtable"] = f'query tg_label_prop( \
    v_type_set={v_type_set}, \
    e_type_set={e_type_set}, \
    maximum_iteration=250, \
    print_limit=-1, \
    print_results=TRUE, file_path="", result_attribute="")'
communities = spark_read(SparkConnector.TIGERGRAPH, session, options=options)
end_time = time.time()
print(f"Scenario 4 louvain: {end_time - start_time} sec")

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.7/socket.py", line 589, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [13]:
# Scenario 5: PageRank — time a read of the tg_pagerank query.
# NOTE(review): the original label and the result name refer to contributors, but the
# query is run on "GitCommit" vertices over "PARENT" edges — confirm the intent.
# The positional arguments presumably follow the TigerGraph tg_pagerank signature;
# verify against the installed query.
start_time = time.time()
options = get_default_options(SparkConnector.TIGERGRAPH)
scenario_query = 'query tg_pagerank("GitCommit", "PARENT", 0.001, 25, 0.85, 10, _, _, _, _)'
options["dbtable"] = scenario_query
top10contributors = spark_read(SparkConnector.TIGERGRAPH, session, options=options)
end_time = time.time()
elapsed = end_time - start_time
print(f"Scenario 5: {elapsed} sec")

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.7/socket.py", line 589, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
# Tear down: stop the Spark context first, then the session that owns it.
spark_context = session.sparkContext
spark_context.stop()
session.stop()