
# Analysis


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import rand


## Connect to Azure Data Lake Storage


In [0]:
storage_account = '20231023desa'
container_name = "team3-project2"
paths=[
    f'abfss://{container_name}@{storage_account}.dfs.core.windows.net/GoldLayer/main',
    f'abfss://{container_name}@{storage_account}.dfs.core.windows.net/GoldLayer/date',
    f'abfss://{container_name}@{storage_account}.dfs.core.windows.net/GoldLayer/commits',
    f'abfss://{container_name}@{storage_account}.dfs.core.windows.net/GoldLayer/push_event',
    f'abfss://{container_name}@{storage_account}.dfs.core.windows.net/GoldLayer/actor',
    f'abfss://{container_name}@{storage_account}.dfs.core.windows.net/GoldLayer/repo'
]

client_id = dbutils.secrets.get('20231023 ADLS Access', 'client-id')
client_secret = dbutils.secrets.get('20231023 ADLS Access', 'sp-secret')
tenant_id = dbutils.secrets.get('20231023 ADLS Access', 'tenant-id')


### Set Azure Service Principal


In [0]:
spark.conf.set(f"fs.azure.account.auth.type.{storage_account}.dfs.core.windows.net", "OAuth")
spark.conf.set(f"fs.azure.account.oauth.provider.type.{storage_account}.dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
spark.conf.set(f"fs.azure.account.oauth2.client.id.{storage_account}.dfs.core.windows.net", client_id)
spark.conf.set(f"fs.azure.account.oauth2.client.secret.{storage_account}.dfs.core.windows.net", client_secret)
spark.conf.set(f"fs.azure.account.oauth2.client.endpoint.{storage_account}.dfs.core.windows.net", f"https://login.microsoftonline.com/{tenant_id}/oauth2/token")


### Read to DataFrames from Gold Layer


In [0]:
main_df = spark.read.format('parquet').load(paths[0])
date_df = spark.read.format('parquet').load(paths[1])
commits_df = spark.read.format('parquet').load(paths[2])
push_event_df = spark.read.format('parquet').load(paths[3])
actor_df = spark.read.format('parquet').load(paths[4])
repo_df = spark.read.format('parquet').load(paths[5])

In [0]:
main_df = main_df.drop("created_at")


## Data aggregated by type of GitHub event per day

In [0]:
events_per_day = main_df.groupBy(
    F.col("type"),
    F.dayofmonth("created_at").alias("day")
).count()

In [0]:
events_per_day.display()

type,day,count
PullRequestEvent,30,145579
CreateEvent,1,261401
PullRequestEvent,23,154579
DeleteEvent,17,152139
PullRequestEvent,1,159441
DeleteEvent,21,148339
DeleteEvent,24,146778
DeleteEvent,10,136403
DeleteEvent,31,145307
DeleteEvent,25,132262


Databricks visualization. Run in Databricks to view.


## PushEvent data aggregated by ref type – whether the commit is on the main branch

In [0]:
filtered_push_events = push_event_df.withColumn(
    "on_main_branch",
    F.when(
        (F.col("payload_ref").endswith("main")) | (F.col("payload_ref").endswith("master")),
        "On Main Branch"
    ).otherwise("Not On Main Branch")
)
agg_counts = filtered_push_events.groupBy("on_main_branch").count()
display(agg_counts)

on_main_branch,count
Not On Main Branch,13241898
On Main Branch,37653225


Databricks visualization. Run in Databricks to view.


## Breakdown of events by type

In [0]:
events_by_type = main_df.groupBy("type").count()

In [0]:
events_by_type = events_by_type.orderBy("count", ascending = False)
events_by_type.display()

type,count
PushEvent,50895123
CreateEvent,14717172
PullRequestEvent,9311768
IssueCommentEvent,5328832
WatchEvent,4996749
DeleteEvent,3623692
PullRequestReviewEvent,2500358
IssuesEvent,2280108
ForkEvent,1773263
PullRequestReviewCommentEvent,1438337


Databricks visualization. Run in Databricks to view.


## Number of commits per push event stats

In [0]:
commit_stats = push_event_df.describe("num_commits")

In [0]:
commit_stats.display()

summary,num_commits
count,50895123.0
mean,3.7137377583309896
stddev,40.37395330248807
min,0.0
max,1000.0



## User activity should be aggregated so that a filterable chart can be populated with breakdowns of user activity by week or month.

In [0]:
joined_df = main_df.join(date_df, main_df.date_id == date_df.date_id)

In [0]:
activity_with_type = (joined_df.alias("d")
                      .join(actor_df.alias("a"), "actor_id")
                      .groupBy("a.actor_id", "a.actor_login", "d.type",
                               F.weekofyear("d.created_at").alias("week"), 
                               F.month("d.created_at").alias("month"))
                      .count().withColumnRenamed("count", "activity_count"))


## Top 5 Users By Activity

In [0]:
total_activity_per_user = activity_with_type.groupBy("actor_login").sum("activity_count").withColumnRenamed("sum(activity_count)", "total_activity")
window_spec = Window.orderBy(F.desc("total_activity"))
top_5_users = total_activity_per_user.withColumn("rank", F.rank().over(window_spec)).filter("rank <= 5")
top_5_users_list = [row['actor_login'] for row in top_5_users.select("actor_login").collect()]

In [0]:
top_5_activity = activity_with_type.filter(activity_with_type.actor_login.isin(top_5_users_list))

In [0]:
display(top_5_activity)

actor_id,actor_login,type,week,month,activity_count
39814207,pull[bot],PushEvent,1,1,56807328160
39814207,pull[bot],PushEvent,3,1,62586340487
39814207,pull[bot],PushEvent,4,1,58488704659
39814207,pull[bot],PushEvent,2,1,63290234282
39814207,pull[bot],PushEvent,5,1,8966455927
39814207,pull[bot],PullRequestEvent,2,1,129584634157
39814207,pull[bot],PullRequestEvent,3,1,127118792372
39814207,pull[bot],PullRequestEvent,4,1,110590126504
39814207,pull[bot],PullRequestEvent,1,1,116529396412
39814207,pull[bot],PushEvent,52,1,10111279420


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.


## Breakdown of activity by project

In [0]:
activity_by_repo_and_type = (main_df
                             .groupBy("repo_id", "type")
                             .count()
                             .withColumnRenamed("count", "activity_count")
                             .orderBy("repo_id", "activity_count"))

total_activity_per_repo = (activity_by_repo_and_type
                           .groupBy("repo_id")
                           .sum("activity_count")
                           .withColumnRenamed("sum(activity_count)", "total_activity"))

activity_threshold = 1000
filtered_by_threshold = total_activity_per_repo.filter(F.col("total_activity") > activity_threshold)

In [0]:

sampled_repos = filtered_by_threshold.orderBy(rand()).limit(2)
sampled_repo_ids = [row['repo_id'] for row in sampled_repos.collect()]

filtered_activity_for_sampled_repos = activity_by_repo_and_type.filter(activity_by_repo_and_type.repo_id.isin(sampled_repo_ids))

activity_breakdown = (filtered_activity_for_sampled_repos
                      .groupBy("repo_id", "type")
                      .sum("activity_count")
                      .withColumnRenamed("sum(activity_count)", "activity_count")
                      .orderBy("repo_id", "type"))

In [0]:
display(activity_breakdown)

repo_id,type,activity_count
443589975,CreateEvent,2
443589975,PushEvent,1432
443946633,CreateEvent,2
443946633,ForkEvent,2
443946633,PullRequestEvent,2
443946633,PushEvent,29435
443946633,WatchEvent,7


Databricks visualization. Run in Databricks to view.