In [52]:
import json
import subprocess
import re
import os
import pandas as pd

In [53]:
# Repository under inspection and where the fetched API output is stored.
owner = "2dot71mily"
repo = "toy_repo"
# REST path prefix shared by every request made against this repository.
gh_repo_base = "/repos/{}/{}".format(owner, repo)
output_data_dir = "output_data"
# Output directory resolved against the directory the notebook runs from.
local_path = os.path.join(os.getcwd(), output_data_dir)

In [54]:
# `gh api` command line requesting the JSON representation; "{cmd}" is filled
# in with the REST path via str.format. The pieces are joined into one line.
api_cmd_json_template = (
    "gh api "
    '   -H "Accept: application/vnd.github+json" '
    '   -H "X-GitHub-Api-Version: 2022-11-28" '
    "    {cmd}?per_page=100"
)


# Same command line, but requesting the patch media type instead of JSON.
api_cmd_patch_template = (
    "gh api "
    '   -H "Accept: application/vnd.github.v3.patch" '
    '   -H "X-GitHub-Api-Version: 2022-11-28" '
    "    {cmd}?per_page=100"
)

In [55]:
# Each "*_template" keeps literal "{...}" placeholders that are filled in later
# with str.format; the owner/repo parts are resolved here, once.

# Browser URL of a single commit.
html_commit_template = "https://github.com/" + owner + "/" + repo + "/commit/{sha}"

# Repository events endpoint and the file its response is saved to.
event_cmd = gh_repo_base + "/events"
event_output_filename = repo + "_events.json"

# Single-commit endpoint and the per-commit patch filename.
commits_cmd_template = gh_repo_base + "/commits/{sha}"
commit_output_filename_template = repo + "_{prefix}_{sha}.patch"

# Commits belonging to a pull request.
pr_commits_cmd_template = gh_repo_base + "/pulls/{pr}/commits"
pr_output_filename_template = repo + "_pull_{pr}_commits.json"

# Check runs recorded against a commit.
checkruns_cmd_template = gh_repo_base + "/commits/{sha}/check-runs"
checkruns_output_filename_template = repo + "_pull_{sha}_checkruns.json"

In [64]:
def create_directory(directory_path):
    """Create `directory_path` (and any missing parents) if it does not exist.

    Args:
        directory_path: Path of the directory to create.

    Raises:
        FileExistsError: If the path exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(directory_path, exist_ok=True)


def get_cleaned_gh_cmd_output(
    cmd,
    output_filename,
    api_cmd_template=api_cmd_json_template,
    output_path=local_path,
    max_display_len=2000,
    verbose=False,
):
    """Run a `gh api` request, strip ANSI codes, save the output and return it.

    Args:
        cmd: REST path substituted for "{cmd}" in `api_cmd_template`.
        output_filename: Name of the file (inside `output_path`) to write.
        api_cmd_template: Command-line template containing a "{cmd}" placeholder.
        output_path: Directory for the output file; created when missing.
        max_display_len: Maximum number of characters printed when `verbose`.
        verbose: When True, print a (possibly truncated) preview of the output
            and the name of the file written.

    Returns:
        The command's standard output with ANSI escape sequences removed.

    Raises:
        RuntimeError: If the command exits with a non-zero status. Nothing is
            written to disk in that case.
    """
    gh_cmd = api_cmd_template.format(cmd=cmd)
    # NOTE(review): the command string is executed through the shell, so `cmd`
    # must not contain untrusted text.
    result = subprocess.run(
        gh_cmd, shell=True, capture_output=True, text=True, encoding="utf-8"
    )

    # Fail loudly instead of saving an empty/error payload as if it were data.
    if result.returncode != 0:
        raise RuntimeError(
            f"Command failed with exit code {result.returncode}: {gh_cmd}\n"
            f"{result.stderr.strip()}"
        )

    # Remove ANSI escape codes
    ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
    clean_output = ansi_escape.sub("", result.stdout)

    create_directory(output_path)
    # Write UTF-8 explicitly so the file reads back the same on every platform.
    with open(
        os.path.join(output_path, output_filename), "w", encoding="utf-8"
    ) as f:
        f.write(clean_output)

    if verbose:
        print(clean_output[:max_display_len])
        if len(clean_output) > max_display_len:
            print("<truncated>\n")

        print(f"Output written to {output_filename}\n\n")

    return clean_output

## Get all GitHub API Events from repo

In [65]:
# Use the shared event_cmd / event_output_filename constants so this cell
# writes exactly the file that is read back with event_output_filename.
event_json = get_cleaned_gh_cmd_output(
    event_cmd, event_output_filename, verbose=False
)
event_data = json.loads(event_json)

## Reload Event data into Pandas for post processing

In [66]:
# Read the saved events file back as one DataFrame row per event record.
events_file = os.path.join(local_path, event_output_filename)
data_df = pd.read_json(events_file, orient="records")
# Number of events loaded.
data_df.shape[0]

10

## Take a quick look at the Event data overall and the EventTypes captured

In [67]:
# Preview the first three events.
data_df.head(n=3)

Unnamed: 0,id,type,actor,repo,payload,public,created_at
0,41323740523,PushEvent,"{'id': 21292059, 'login': '2dot71mily', 'displ...","{'id': 847491164, 'name': '2dot71mily/toy_repo...","{'repository_id': 847491164, 'push_id': 199272...",True,2024-08-26 00:47:58+00:00
1,41323740434,PullRequestEvent,"{'id': 21292059, 'login': '2dot71mily', 'displ...","{'id': 847491164, 'name': '2dot71mily/toy_repo...","{'action': 'closed', 'number': 2, 'pull_reques...",True,2024-08-26 00:47:58+00:00
2,41323737085,PullRequestEvent,"{'id': 21292059, 'login': '2dot71mily', 'displ...","{'id': 847491164, 'name': '2dot71mily/toy_repo...","{'action': 'opened', 'number': 2, 'pull_reques...",True,2024-08-26 00:47:42+00:00


In [68]:
# How many events of each type were captured.
data_df["type"].value_counts()

type
PullRequestEvent    4
CreateEvent         4
PushEvent           2
Name: count, dtype: int64

In [69]:
# Map each event type to the subset of rows of that type.
data_by_type = {}
for event_type in data_df["type"].unique().tolist():
    data_by_type[event_type] = data_df[data_df["type"] == event_type]

## Checking PullRequestEvents first 

In [71]:
# For every PullRequestEvent, fetch the patch of its merge, head and base commits.
for i, pr_event in data_by_type["PullRequestEvent"].iterrows():
    print(f" ******** PR{pr_event.payload['number']} pr_event iter {i} ******** ")
    pull_request = pr_event.payload["pull_request"]
    # Build the mapping fresh per event so no sha can carry over between events.
    shas = {
        "merge_commit": pull_request.get("merge_commit_sha"),
        "head": pull_request["head"]["sha"],
        "base": pull_request["base"]["sha"],
    }

    for commit_type, sha in shas.items():
        if not sha:
            # A missing/null sha would otherwise be formatted into the request
            # path as the literal text "None".
            print(f"{commit_type}: no sha available, skipping")
            continue
        print(f"{commit_type}: {html_commit_template.format(sha=sha)}")
        try:
            get_cleaned_gh_cmd_output(
                commits_cmd_template.format(sha=sha),
                output_filename=commit_output_filename_template.format(
                    prefix=f"pr_event_{commit_type}_commit", sha=sha
                ),
                api_cmd_template=api_cmd_patch_template,
                verbose=True,
            )
        except Exception as e:
            print(f"Failed to retrieve data for sha: {sha}")
            print(e)
            continue

 ******** PR2 pr_event iter 1 ******** 
merge_commit: https://github.com/2dot71mily/toy_repo/commit/a4f9fa6f698db44e1c44e065aca87e73773d2742
From a4f9fa6f698db44e1c44e065aca87e73773d2742 Mon Sep 17 00:00:00 2001
From: Emily McMilin <emily.mcmilin@gmail.com>
Date: Sun, 25 Aug 2024 17:47:57 -0700
Subject: [PATCH] B (#2)

* 3

* del 3

* 4
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 4539bbf..7f2f028 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,4 @@
 0
 1
 2
+4
\ No newline at end of file

Output written to toy_repo_pr_event_merge_commit_commit_a4f9fa6f698db44e1c44e065aca87e73773d2742.patch


head: https://github.com/2dot71mily/toy_repo/commit/673b77704f9908888716050414131ae3878d0a28
From 673b77704f9908888716050414131ae3878d0a28 Mon Sep 17 00:00:00 2001
From: emily <emily.mcmilin@gmail.com>
Date: Sun, 25 Aug 2024 17:47:00 -0700
Subject: [PATCH] 4

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md

## Checking PushEvents as well

In [72]:
# For every PushEvent, fetch the patch of each commit listed in its payload.
for i, push_event in data_by_type["PushEvent"].iterrows():
    print(f" ******** PushEvent iter {i} ******** ")
    # Treat a payload without a "commits" list as having no commits instead of
    # aborting the whole cell with a KeyError.
    for j, commit in enumerate(push_event.payload.get("commits") or []):
        sha = commit["sha"]
        print(f"commit {j}: {html_commit_template.format(sha=sha)}")
        try:
            get_cleaned_gh_cmd_output(
                commits_cmd_template.format(sha=sha),
                output_filename=commit_output_filename_template.format(
                    prefix=f"push_event_{i}_commit_{j}", sha=sha
                ),
                api_cmd_template=api_cmd_patch_template,
                verbose=True,
            )
        except Exception as e:
            print(f"Failed to retrieve data for sha: {sha}")
            print(e)
            continue

 ******** PushEvent iter 0 ******** 
commit 0: https://github.com/2dot71mily/toy_repo/commit/a4f9fa6f698db44e1c44e065aca87e73773d2742
From a4f9fa6f698db44e1c44e065aca87e73773d2742 Mon Sep 17 00:00:00 2001
From: Emily McMilin <emily.mcmilin@gmail.com>
Date: Sun, 25 Aug 2024 17:47:57 -0700
Subject: [PATCH] B (#2)

* 3

* del 3

* 4
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 4539bbf..7f2f028 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,4 @@
 0
 1
 2
+4
\ No newline at end of file

Output written to toy_repo_push_event_0_commit_0_a4f9fa6f698db44e1c44e065aca87e73773d2742.patch


 ******** PushEvent iter 4 ******** 
commit 0: https://github.com/2dot71mily/toy_repo/commit/137552281732197116438c69031d265f036dd3ce
From 137552281732197116438c69031d265f036dd3ce Mon Sep 17 00:00:00 2001
From: emily <emily.mcmilin@gmail.com>
Date: Sun, 25 Aug 2024 17:43:36 -0700
Subject: [PATCH] 0

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --

# PR sub-commits

In [73]:
# Extracting PR numbers from the PullRequestEvent payloads to find PR sub-commits
unique_pr_numbers = set(
    payload["number"] for payload in data_by_type["PullRequestEvent"]["payload"]
)

In [74]:
# For every PR, list its commits, then fetch each commit's patch and check runs.
# Sorted so the processing order (and the printed output) is stable across runs;
# iterating the set directly gives no ordering guarantee.
for pr in sorted(unique_pr_numbers):
    print(f" ******** Processing PR: {pr} ******** ")
    print(f"https://github.com/{owner}/{repo}/pull/{pr}")
    pr_json = get_cleaned_gh_cmd_output(
        pr_commits_cmd_template.format(pr=pr),
        output_filename=pr_output_filename_template.format(pr=pr),
        api_cmd_template=api_cmd_json_template,
    )
    try:
        pr_data = json.loads(pr_json)
    except json.JSONDecodeError as e:
        print(f"Failed to parse commit list for PR: {pr}")
        print(e)
        continue
    if not isinstance(pr_data, list):
        # Anything other than a list of commits (e.g. an error object) cannot
        # be iterated as commits below.
        print(f"Unexpected response for PR: {pr}")
        print(pr_data)
        continue

    for commit in pr_data:
        print(" ****** commit patch ****** ")
        sha = commit["sha"]
        # Plain substring search of the sha in the raw Events API response text.
        print(f" **** Is commit sha in EventsAPI json? {sha in event_json} **** ")

        print("sha: ", sha)
        print("html_url: ", commit["html_url"])
        get_cleaned_gh_cmd_output(
            commits_cmd_template.format(sha=sha),
            output_filename=commit_output_filename_template.format(
                prefix=f"pr_{pr}", sha=sha
            ),
            api_cmd_template=api_cmd_patch_template,
            verbose=True,
        )

        print(" ****** commit checkrun? ****** ")
        get_cleaned_gh_cmd_output(
            checkruns_cmd_template.format(sha=sha),
            output_filename=checkruns_output_filename_template.format(sha=sha),
            api_cmd_template=api_cmd_json_template,
            verbose=True,
        )

 ******** Processing PR: 1 ******** 
https://github.com/2dot71mily/toy_repo/pull/1
 ****** commit patch ****** 
 **** Is commit sha in EventsAPI json? True **** 
sha:  137552281732197116438c69031d265f036dd3ce
html_url:  https://github.com/2dot71mily/toy_repo/commit/137552281732197116438c69031d265f036dd3ce
From 137552281732197116438c69031d265f036dd3ce Mon Sep 17 00:00:00 2001
From: emily <emily.mcmilin@gmail.com>
Date: Sun, 25 Aug 2024 17:43:36 -0700
Subject: [PATCH] 0

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index e69de29..573541a 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1 @@
+0

Output written to toy_repo_pr_1_137552281732197116438c69031d265f036dd3ce.patch


 ****** commit checkrun? ****** 
{
  "total_count": 0,
  "check_runs": []
}

Output written to toy_repo_pull_137552281732197116438c69031d265f036dd3ce_checkruns.json


 ****** commit patch ****** 
 **** Is commit sha in EventsAPI json? True **** 
sha:  818e3e46edc91dceb2d3ee54