In [1]:
from pyspark.sql import SparkSession
import random as rand

# Base HDFS location: the input JSON datasets are read from it and the sample
# outputs are written back under it.
# NOTE(review): the path has a double slash after the port ("9000//data-team")
# — confirm this is intentional.
DATA_URL = "hdfs://namenode:9000//data-team"
# Master URL handed to SparkSession.builder.master() when the session is built.
SPARK_MASTER_URL = "spark://spark-master:7077"

In [2]:
# Build (or reuse) the Spark session used by every cell below: it targets the
# master at SPARK_MASTER_URL, asks for 3G per executor and leaves
# authentication disabled.
session = (
    SparkSession.builder
    .appName("Get Sample")
    .master(SPARK_MASTER_URL)
    .config("spark.executor.memory", "3G")
    .config("spark.authenticate", "false")
    .getOrCreate()
)

### Reading from HDFS

In [3]:
# Read files.json from HDFS and print the schema Spark inferred for it.
files_path = f"{DATA_URL}/files.json"
files = session.read.json(files_path)
files.printSchema()

root
 |-- id: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- path: string (nullable = true)
 |-- ref: string (nullable = true)
 |-- repo_name: string (nullable = true)
 |-- symlink_target: string (nullable = true)



In [4]:
# Read languages.json from HDFS and print the schema Spark inferred for it.
languages_path = f"{DATA_URL}/languages.json"
languages = session.read.json(languages_path)
languages.printSchema()

root
 |-- language: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- bytes: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- repo_name: string (nullable = true)



In [5]:
# Read licenses.json from HDFS and print the schema Spark inferred for it.
licences_path = f"{DATA_URL}/licenses.json"
licences = session.read.json(licences_path)
licences.printSchema()

root
 |-- license: string (nullable = true)
 |-- repo_name: string (nullable = true)



In [6]:
# Read commits.json from HDFS and print the schema Spark inferred for it.
commits_path = f"{DATA_URL}/commits.json"
commits = session.read.json(commits_path)
commits.printSchema()

root
 |-- author: struct (nullable = true)
 |    |-- date: struct (nullable = true)
 |    |    |-- seconds: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- time_sec: string (nullable = true)
 |    |-- tz_offset: string (nullable = true)
 |-- commit: string (nullable = true)
 |-- committer: struct (nullable = true)
 |    |-- date: struct (nullable = true)
 |    |    |-- seconds: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- time_sec: string (nullable = true)
 |    |-- tz_offset: string (nullable = true)
 |-- difference: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- new_mode: string (nullable = true)
 |    |    |-- new_path: string (nullable = true)
 |    |    |-- new_repo: string (nullable = true)
 |    |    |-- new_sha1: string (nullable = true)
 |    |    |-- old_mode: string (nullable = true)
 |    |    

In [7]:
# Read repositories.json from HDFS and print the schema Spark inferred for it.
repositories_path = f"{DATA_URL}/repositories.json"
repositories = session.read.json(repositories_path)
repositories.printSchema()

root
 |-- repo_name: string (nullable = true)
 |-- watch_count: string (nullable = true)



# Select a small subset of the data (10 repos)

In [9]:
# Pick 10 repositories and persist them as the reference sample.
# limit() without an ordering gives no guarantee that the same rows come back
# every time the plan is evaluated, so the sample is written exactly once and
# then re-read from HDFS: each later join works on the rows stored on disk.
sample_repositories_path = f"{DATA_URL}/sample_repositories"
repositories.limit(10).write.mode("overwrite").json(sample_repositories_path)

# Reuse the source schema so the re-read frame keeps the same columns and
# types and no second schema-inference pass over the files is needed.
sample_repositories = (
    session.read
    .schema(repositories.schema)
    .json(sample_repositories_path)
)

In [10]:
# Keep only the commits that belong to a sampled repository.
# A left-semi join returns the columns of `commits` alone and emits each commit
# at most once, so no aliasing / "comm.*" projection is needed and a repeated
# repo_name in the sample cannot duplicate rows.
# NOTE(review): the printed commits schema is cut off before any repo column
# is shown — confirm that `commits.repo` exists and is a plain string.
sample_commits = commits.join(
    sample_repositories,
    sample_repositories.repo_name == commits.repo,
    "left_semi",
)

# .json() already selects the JSON format, so no separate .format("json") call.
sample_commits.write.mode("overwrite").json(f"{DATA_URL}/sample_commits")

In [11]:
# Keep only the file records that belong to a sampled repository.
# A left-semi join returns the columns of `files` alone and emits each record
# at most once, so no aliasing / "file.*" projection is needed and a repeated
# repo_name in the sample cannot duplicate rows.
sample_files = files.join(
    sample_repositories,
    sample_repositories.repo_name == files.repo_name,
    "left_semi",
)

# .json() already selects the JSON format, so no separate .format("json") call.
sample_files.write.mode("overwrite").json(f"{DATA_URL}/sample_files")

In [12]:
# Keep only the language records that belong to a sampled repository.
# A left-semi join returns the columns of `languages` alone and emits each
# record at most once, so no aliasing / "lang.*" projection is needed and a
# repeated repo_name in the sample cannot duplicate rows.
sample_languages = languages.join(
    sample_repositories,
    sample_repositories.repo_name == languages.repo_name,
    "left_semi",
)

# .json() already selects the JSON format, so no separate .format("json") call.
sample_languages.write.mode("overwrite").json(f"{DATA_URL}/sample_languages")

In [13]:
# Keep only the licence records that belong to a sampled repository.
# A left-semi join returns the columns of `licences` alone and emits each
# record at most once, so no aliasing / "lic.*" projection is needed and a
# repeated repo_name in the sample cannot duplicate rows.
sample_licences = licences.join(
    sample_repositories,
    sample_repositories.repo_name == licences.repo_name,
    "left_semi",
)

# .json() already selects the JSON format, so no separate .format("json") call.
sample_licences.write.mode("overwrite").json(f"{DATA_URL}/sample_licences")

## Stop the Spark context and the Spark session

In [14]:

# Release the cluster resources held by this notebook. SparkSession.stop()
# also stops the underlying SparkContext, so one call is enough.
session.stop()