In [1]:
import sys
# Directory added to the module search path so that local modules can be imported.
fileDir = "/home/jovyan/notebooks"
sys.path.append(fileDir)

# NOTE(review): SparkSession, SPARK_MASTER_URL and collectAttributes are used below
# without an explicit import, so they presumably come from this star import — TODO confirm.
from utilities import *
from github import Github
import json
from github import Auth
import pandas as pd
import time

In [2]:
# Create (or reuse) the Spark session that every later cell relies on.
session = (
    SparkSession.builder
    .appName("Data integration")
    .master(SPARK_MASTER_URL)
    .config("spark.executor.memory", "5G")
    .config("spark.authenticate", "false")
    .getOrCreate()
)

In [3]:
# Base HDFS location used for both the input JSON and the output CSV of this notebook.
HDFS_URL = "hdfs://namenode:9000//data-team"
# Filename prefix prepended to every dataset file name read or written below.
PREFIX = "sample_"  # "sample_" or "" ("" presumably selects the full dataset — TODO confirm)

### Reading from HDFS

In [4]:
# Change this if you need the full dataset (presumably by setting PREFIX to "" — TODO confirm).
repositories_path = f"{HDFS_URL}/{PREFIX}repositories.json"
repositories = session.read.json(repositories_path)

# Pull the single "repo_name" column back to the driver as a plain Python list.
repositories_names = [row[0] for row in repositories.select("repo_name").collect()]

### Retrieve additional data from the GitHub API and dump it to a CSV

In [5]:
# Load the GitHub access token from the JSON credentials file.
# The context manager guarantees the file handle is closed once the token is read
# (the previous `json.load(open(...))` left it open until garbage collection).
with open(f'{fileDir}/credentials.json') as credentials_file:
    credentials = json.load(credentials_file)

# Authenticate with the access token stored under the "token" key.
auth = Auth.Token(credentials['token'])

# Client for the public GitHub web API.
g = Github(auth=auth)

In [6]:
# Repository attributes requested from the GitHub API for each repository.
features = ["forks_count", "default_branch",
            "open_issues_count", "created_at", "stargazers_count",
            "language", "topics", "visibility"]
columns_names = ["repo_name"] + features

# Pause between consecutive API calls to avoid exceeding the GitHub API rate limit.
API_CALL_DELAY_SECONDS = 1.5

# Accumulate one record per repository and build the DataFrame once at the end:
# appending to a DataFrame row by row re-allocates on every insertion.
repo_records = []

for repo_name in repositories_names:
    attributes_map = collectAttributes(g.get_repo(repo_name), features)
    # Keep the values in the same order as `features` so they line up with the columns.
    attributes = [attributes_map[f] for f in features]
    repo_records.append([repo_name] + attributes)

    time.sleep(API_CALL_DELAY_SECONDS)

# With no repositories this still yields an empty frame carrying the expected columns.
repo_data_df = pd.DataFrame(repo_records, columns=columns_names)

Following Github server redirection from /repos/firehol/netdata to /repositories/10744183
Following Github server redirection from /repos/ParsePlatform/parse-server to /repositories/50603846


In [None]:
# Hand the collected pandas data over to Spark.
spark_df = session.createDataFrame(repo_data_df)

# Write a single CSV output (one partition) to HDFS, replacing any previous result.
# "topics" is cast to a string before being written to CSV.
output_path = f"{HDFS_URL}/{PREFIX}repo_API_data.csv"
topics_as_string = spark_df["topics"].cast("string")
single_partition_df = spark_df.repartition(1).withColumn("topics", topics_as_string)
single_partition_df.write.csv(output_path, header=True, mode="overwrite")

In [None]:
# Shut Spark down. SparkSession.stop() also stops the underlying SparkContext,
# so a separate session.sparkContext.stop() call is not needed.
session.stop()