# Gold Layer

In [0]:
%run "/Shared/20231023 Demos/ADLS Setup Variables_SP"

In [0]:
from pyspark.sql.types import StructType, StructField, LongType, IntegerType, StringType, BooleanType, DateType, TimestampType, ArrayType
from pyspark.sql.functions import col, explode, expr
from langdetect import detect
from sparknlp.base import *
from sparknlp.annotator import *

[SecretScope(name='20231023 ADLS Access'),
 SecretScope(name='better-scope'),
 SecretScope(name='databricks-app-kv'),
 SecretScope(name='databricks-application'),
 SecretScope(name="Jordan's Access"),
 SecretScope(name='SecretScopeTeam2'),
 SecretScope(name='team1-keyvault'),
 SecretScope(name='tobedeleted')]

### Connect to Azure Data Lake Storage

In [0]:
storage_account = '20231023desa'
container_name = "team3-project2"

# Silver-layer dataset folders. ORDER MATTERS: later cells read these
# locations by their position in `paths`.
silver_tables = [
    'actor',
    'org',
    'repo',
    'commit_comment_event',
    'sponsorship_event',
    'create_event',
    'delete_event',
    'fork_event',
    'push_event',
    'pull_request_review_thread_event',
    'pull_request_review_comment_event',
    'pull_request_review_event',
    'pull_request_event',
    'gollum_event',
    'issue_comment_event',
    'issues_event',
    'member_event',
    'public_event',
    'release_event',
    'watch_event',
]

# Fully qualified abfss:// location of every silver dataset, same order as above.
paths = [
    f'abfss://{container_name}@{storage_account}.dfs.core.windows.net/SilverLayer/{table}'
    for table in silver_tables
]

### Set Azure Service Principal

In [0]:
# Configure OAuth (service principal / client credentials) access to the
# storage account. client_id, client_secret and tenant_id are expected to be
# defined already (presumably by the %run setup notebook -- TODO confirm).
silver_oauth_settings = [
    (f"fs.azure.account.auth.type.{storage_account}.dfs.core.windows.net",
     "OAuth"),
    (f"fs.azure.account.oauth.provider.type.{storage_account}.dfs.core.windows.net",
     "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"),
    (f"fs.azure.account.oauth2.client.id.{storage_account}.dfs.core.windows.net",
     client_id),
    (f"fs.azure.account.oauth2.client.secret.{storage_account}.dfs.core.windows.net",
     client_secret),
    (f"fs.azure.account.oauth2.client.endpoint.{storage_account}.dfs.core.windows.net",
     f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"),
]
for conf_key, conf_value in silver_oauth_settings:
    spark.conf.set(conf_key, conf_value)

### Applying Schemas


In [0]:
def _common_event_fields():
    """Return fresh StructFields for the seven columns shared by every event table."""
    return [
        StructField("event_id", LongType(), False),
        StructField("type", StringType(), True),
        StructField("actor_id", LongType(), True),
        StructField("org_id", LongType(), True),
        StructField("repo_id", LongType(), True),
        StructField("public", BooleanType(), True),
        StructField("created_at", TimestampType(), True),
    ]


def _dimension_schema(id_column, name_column):
    """Build a dimension schema: required long id, required string name, nullable created_at."""
    return StructType([
        StructField(id_column, LongType(), False),
        StructField(name_column, StringType(), False),
        StructField("created_at", TimestampType(), True),
    ])


# Events carrying only the shared columns.
event_schema_basic = StructType(_common_event_fields())

# Shared columns plus a nullable string "action".
event_schema_with_action = StructType(
    _common_event_fields() + [StructField("action", StringType(), True)]
)

# Shared columns plus a nullable string "ref_type".
event_schema_with_ref_type = StructType(
    _common_event_fields() + [StructField("ref_type", StringType(), True)]
)

# Shared columns plus push-specific columns; "message" is an array of strings.
push_event_schema = StructType(
    _common_event_fields() + [
        StructField("num_commits", IntegerType(), True),
        StructField("payload_ref", StringType(), True),
        StructField("author_email", StringType(), True),
        StructField("author_name", StringType(), True),
        StructField("message", ArrayType(StringType(), True), True),
    ]
)

# Dimension tables.
repo_schema = _dimension_schema("repo_id", "repo_name")
org_schema = _dimension_schema("org_id", "org_login")
actor_schema = _dimension_schema("actor_id", "actor_login")



### Read to DataFrames from silver layer

In [0]:
def _read_silver(schema, path_index):
    """Load the silver parquet dataset at paths[path_index] using an explicit schema."""
    return spark.read.format('parquet').schema(schema).load(paths[path_index])


# Dimension tables
actor_df = _read_silver(actor_schema, 0)
org_df = _read_silver(org_schema, 1)
repo_df = _read_silver(repo_schema, 2)

# Event tables (the index is the dataset's position in `paths`)
commit_comment_event_df = _read_silver(event_schema_with_action, 3)
create_event_df = _read_silver(event_schema_with_ref_type, 5)
delete_event_df = _read_silver(event_schema_with_ref_type, 6)
fork_event_df = _read_silver(event_schema_basic, 7)
push_event_df = _read_silver(push_event_schema, 8)
pull_review_request_comment_df = _read_silver(event_schema_with_action, 10)
pull_review_request_df = _read_silver(event_schema_with_action, 11)
pull_request_event_df = _read_silver(event_schema_with_action, 12)
gollum_event_df = _read_silver(event_schema_basic, 13)
issue_comment_event_df = _read_silver(event_schema_with_action, 14)
issues_event_df = _read_silver(event_schema_with_action, 15)
member_event_df = _read_silver(event_schema_with_action, 16)
public_event_df = _read_silver(event_schema_basic, 17)
release_event_df = _read_silver(event_schema_with_action, 18)
watch_event_df = _read_silver(event_schema_with_action, 19)

# Intentionally NOT loaded: paths[4] (sponsorship_event) and paths[9]
# (pull_request_review_thread_event). There were no rows for those event types,
# so their directories have no contents and cannot be loaded.
#sponsorship_event_df = _read_silver(event_schema_with_action, 4)
#pull_review_request_thread_event_df = spark.read.format('parquet').load(paths[9])

### Exploding Message Array in push_event_df

In [0]:
# Columns carried through unchanged next to each exploded commit message.
push_passthrough_columns = [
    "event_id", "type", "actor_id", "org_id", "repo_id", "public",
    "created_at", "num_commits", "payload_ref", "author_email", "author_name",
]

# One output row per element of the "message" array, exposed as "full_message".
push_event_df_exploded = push_event_df.select(
    *push_passthrough_columns,
    explode("message").alias("full_message"),
)

### Creating main table

In [0]:
# Columns every event table contributes to the main table, in output order.
main_columns = ["actor_id", "org_id", "repo_id", "event_id", "type", "created_at"]

# Event tables stacked into the main table, in union order.
event_frames = [
    commit_comment_event_df,
    create_event_df,
    delete_event_df,
    fork_event_df,
    push_event_df,
    pull_review_request_comment_df,
    pull_review_request_df,
    pull_request_event_df,
    gollum_event_df,
    issue_comment_event_df,
    issues_event_df,
    member_event_df,
    public_event_df,
    release_event_df,
    watch_event_df,
]

# union() matches columns by position, so every frame is projected onto the
# same column list before being appended.
main_df = event_frames[0].select(*main_columns)
for event_frame in event_frames[1:]:
    main_df = main_df.union(event_frame.select(*main_columns))


### Creating date table

In [0]:
from pyspark.sql.functions import col, monotonically_increasing_id, row_number
from pyspark.sql.window import Window

# One row per distinct event timestamp.
date_df = main_df.select("created_at").distinct()

# Surrogate key for each timestamp.
# monotonically_increasing_id() is non-deterministic (its value depends on the
# partition layout), and date_df is evaluated twice: once for the join below
# and once when the "date" table is written. The two evaluations could
# therefore assign different ids to the same timestamp, leaving main.date_id
# pointing at the wrong row of the date table. row_number() over a total
# ordering of the distinct timestamps gives the same id on every evaluation.
# The key is zero-based and cast to long to keep the previous column type; the
# un-partitioned window also leaves the rows ordered by created_at.
date_df = date_df.withColumn(
    "date_id",
    (row_number().over(Window.orderBy("created_at")) - 1).cast("long"),
)

# Attach the key to the main table. Dropping any existing date_id first is a
# no-op on the first run and keeps the cell re-runnable (otherwise a second run
# would produce two date_id columns).
main_df = main_df.drop("date_id").join(date_df, on="created_at", how="left_outer")

### Creating commits table

In [0]:
# Commits table: one row per exploded commit message ("full_message"),
# keyed by the event_id of the push event it came from.
commits_df = push_event_df_exploded.select("event_id", "full_message")

### Creating push_event table

In [0]:
# Narrow push events to the push-specific columns kept for the gold table.
# NOTE: this rebinds push_event_df and drops the "message" column (among
# others) that the earlier explode cell selects, so that cell cannot be
# re-run after this one without first reloading push_event_df.
push_event_df = push_event_df.select("event_id", "num_commits", "payload_ref")

### Writing to Gold Layer

In [0]:
# Fetch the service-principal credentials used for the gold-layer write from
# the '20231023 ADLS Access' secret scope (client id, tenant id, secret).
gold_client_id, gold_tenant_id, gold_sp_secret = (
    dbutils.secrets.get('20231023 ADLS Access', secret_key)
    for secret_key in ('client-id', 'tenant-id', 'sp-secret')
)

In [0]:
# Root folder (same container and storage account as the silver layer) under
# which each gold table is saved as "<gold_conn_str>/<table name>".
gold_conn_str = f'abfss://{container_name}@{storage_account}.dfs.core.windows.net/GoldLayer'

In [0]:
# Re-apply the OAuth client-credentials settings for the storage account, this
# time with the gold service principal. The keys are the same ones set for the
# silver reads, so these values replace the earlier ones for this session.
gold_oauth_settings = [
    (f"fs.azure.account.auth.type.{storage_account}.dfs.core.windows.net",
     "OAuth"),
    (f"fs.azure.account.oauth.provider.type.{storage_account}.dfs.core.windows.net",
     "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"),
    (f"fs.azure.account.oauth2.client.id.{storage_account}.dfs.core.windows.net",
     gold_client_id),
    (f"fs.azure.account.oauth2.client.secret.{storage_account}.dfs.core.windows.net",
     gold_sp_secret),
    (f"fs.azure.account.oauth2.client.endpoint.{storage_account}.dfs.core.windows.net",
     f"https://login.microsoftonline.com/{gold_tenant_id}/oauth2/token"),
]
for conf_key, conf_value in gold_oauth_settings:
    spark.conf.set(conf_key, conf_value)

In [0]:
def save_to_gold(df, event_name, mode="overwrite"):
    """Write a DataFrame to the gold layer as parquet.

    Args:
        df: DataFrame to persist (anything exposing the ``df.write`` API).
        event_name: Name of the table; used as the sub-folder under
            ``gold_conn_str``.
        mode: Save mode passed to the writer. Defaults to ``"overwrite"``,
            which replaces any existing data at the target location.

    The data is saved to ``f"{gold_conn_str}/{event_name}"``.
    """
    # The previous `.option("header", True)` was dropped: "header" is a CSV
    # option and has no effect on a parquet write.
    target_path = gold_conn_str + f"/{event_name}"
    df.write.format("parquet").mode(mode).save(target_path)


In [0]:
# Persist every gold table under its own folder name.
# org_df is now written as well: the main table carries org_id, and the org
# dimension was loaded from silver but never saved, unlike actor and repo.
gold_tables = [
    ("main", main_df),
    ("date", date_df),
    ("commits", commits_df),
    ("push_event", push_event_df),
    ("actor", actor_df),
    ("repo", repo_df),
    ("org", org_df),
]
for gold_table_name, gold_table_df in gold_tables:
    save_to_gold(gold_table_df, gold_table_name)