In [38]:
import json
import subprocess
import re
import os
import pandas as pd

In [39]:
# GitHub coordinates of the repository under inspection.
owner = "2dot71mily"
repo = "repo_baby"

# REST path prefix shared by every endpoint requested in this notebook.
gh_repo_base = "/repos/{}/{}".format(owner, repo)

# Fetched artifacts are written to this sub-directory of the directory the
# notebook is executed from.
output_data_dir = "output_data"
local_path = os.path.join(os.getcwd(), output_data_dir)

In [40]:
# Shell command lines for the GitHub CLI. "{cmd}" is replaced with the REST
# path via str.format; each request asks for up to 100 items per page.

# Variant requesting the standard JSON representation.
api_cmd_json_template = (
    "gh api "
    '   -H "Accept: application/vnd.github+json" '
    '   -H "X-GitHub-Api-Version: 2022-11-28" '
    "    {cmd}?per_page=100"
)


# Variant requesting the patch representation of the resource.
api_cmd_patch_template = (
    "gh api "
    '   -H "Accept: application/vnd.github.v3.patch" '
    '   -H "X-GitHub-Api-Version: 2022-11-28" '
    "    {cmd}?per_page=100"
)

In [41]:
# Every "*_template" below keeps literal "{sha}" / "{pr}" / "{prefix}"
# placeholders that are filled in later with str.format.

# Browser URL of a single commit.
html_commit_template = "https://github.com/" + owner + "/" + repo + "/commit/{sha}"

# Repository event feed and the file its response is saved to.
event_cmd = gh_repo_base + "/events"
event_output_filename = repo + "_events.json"

# Single commit, saved as a .patch file.
commits_cmd_template = gh_repo_base + "/commits/{sha}"
commit_output_filename_template = repo + "_{prefix}_{sha}.patch"

# Commits belonging to one pull request.
pr_commits_cmd_template = gh_repo_base + "/pulls/{pr}/commits"
pr_output_filename_template = repo + "_pull_{pr}_commits.json"

# Check runs reported for one commit.
checkruns_cmd_template = gh_repo_base + "/commits/{sha}/check-runs"
checkruns_output_filename_template = repo + "_pull_{sha}_checkruns.json"

In [42]:
def create_directory(directory_path):
    """Ensure ``directory_path`` exists, creating missing parents as needed.

    Calling this on a directory that already exists is a no-op.

    Args:
        directory_path: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory_path`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first and then calling makedirs().
    os.makedirs(directory_path, exist_ok=True)


def get_cleaned_gh_cmd_output(
    cmd,
    output_filename,
    api_cmd_template=api_cmd_json_template,
    output_path=local_path,
    max_display_len=400,
    verbose=False,
):
    """Run a ``gh api`` request, save its ANSI-stripped output, and return it.

    Args:
        cmd: REST path substituted for ``{cmd}`` in ``api_cmd_template``.
        output_filename: Name of the file, inside ``output_path``, that the
            cleaned output is written to (overwritten if it exists).
        api_cmd_template: Shell command template containing a ``{cmd}``
            placeholder.
        output_path: Directory for the output file; created if missing.
        max_display_len: Number of leading characters echoed when ``verbose``.
        verbose: When true, print a preview of the output and the filename.

    Returns:
        The command's stdout with ANSI escape sequences removed.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status. Nothing is written to disk in that case.
    """
    gh_cmd = api_cmd_template.format(cmd=cmd)
    # NOTE(review): the command is executed through the shell, so `cmd` and the
    # template must not contain untrusted text.
    result = subprocess.run(
        gh_cmd, shell=True, capture_output=True, text=True, encoding="utf-8"
    )

    if result.returncode != 0:
        # CalledProcessError's message omits stderr, so echo the command's own
        # diagnostics before raising.
        if result.stderr:
            print(result.stderr.strip())
        raise subprocess.CalledProcessError(
            result.returncode, gh_cmd, output=result.stdout, stderr=result.stderr
        )

    # Remove ANSI escape codes
    ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
    clean_output = ansi_escape.sub("", result.stdout)

    create_directory(output_path)
    # Write UTF-8 explicitly so the file does not depend on the locale encoding.
    with open(
        os.path.join(output_path, output_filename), "w", encoding="utf-8"
    ) as f:
        f.write(clean_output)

    if verbose:
        print(clean_output[:max_display_len])
        if len(clean_output) > max_display_len:
            print("<truncated>\n")

        print(f"Output written to {output_filename}\n\n")

    return clean_output

## Get all GitHub API Events from repo

In [43]:
# Fetch the repository's event feed, using the endpoint path and output
# filename defined alongside the other request templates.
event_json = get_cleaned_gh_cmd_output(
    cmd=event_cmd,
    output_filename=event_output_filename,
    verbose=False,
)
event_data = json.loads(event_json)

## Reload Event data into Pandas for post processing

In [44]:
# Read the saved events file back from disk into a DataFrame and show how
# many rows were loaded.
events_path = os.path.join(local_path, event_output_filename)
data_df = pd.read_json(events_path, orient="records")
data_df.shape[0]

35

## Take a quick look at the Event data overall and the EventTypes captured

In [45]:
# Preview the first three rows of the events table.
data_df.iloc[:3]

Unnamed: 0,id,type,actor,repo,payload,public,created_at
0,41327020234,PushEvent,"{'id': 21292059, 'login': '2dot71mily', 'displ...","{'id': 846400152, 'name': '2dot71mily/repo_bab...","{'repository_id': 846400152, 'push_id': 199289...",True,2024-08-26 04:33:08+00:00
1,41327020091,PullRequestEvent,"{'id': 21292059, 'login': '2dot71mily', 'displ...","{'id': 846400152, 'name': '2dot71mily/repo_bab...","{'action': 'closed', 'number': 7, 'pull_reques...",True,2024-08-26 04:33:07+00:00
2,41326983661,PushEvent,"{'id': 21292059, 'login': '2dot71mily', 'displ...","{'id': 846400152, 'name': '2dot71mily/repo_bab...","{'repository_id': 846400152, 'push_id': 199289...",True,2024-08-26 04:31:15+00:00


In [46]:
# Count how many events of each type were captured.
data_df["type"].value_counts()

type
PullRequestEvent    14
PushEvent           11
CreateEvent          9
PublicEvent          1
Name: count, dtype: int64

In [47]:
# Split the events table into one sub-frame per event type, keyed by the
# type name.
data_by_type = {}
for event_type in data_df["type"].unique().tolist():
    type_mask = data_df["type"] == event_type
    data_by_type[event_type] = data_df[type_mask]

## Checking PullRequestEvents first 

In [48]:
# For every PullRequestEvent, download the patch of each commit the event
# references: the merge commit (when present), the head and the base.
for i, pr_event in data_by_type["PullRequestEvent"].iterrows():
    pull_request = pr_event.payload["pull_request"]
    print(f" ******** PR{pr_event.payload['number']} pr_event iter {i} ******** ")
    # Built fresh per event so no sha can carry over from an earlier iteration.
    shas = {
        "merge_commit": pull_request["merge_commit_sha"],
        "head": pull_request["head"]["sha"],
        "base": pull_request["base"]["sha"],
    }

    for commit_type, sha in shas.items():
        if not sha:
            # merge_commit_sha may be null (presumably for PRs without a merge
            # commit yet); requesting /commits/None would only fail.
            print(f"{commit_type}: no sha in this event, skipping")
            continue

        print(f"{commit_type}: {html_commit_template.format(sha=sha)}")
        try:
            get_cleaned_gh_cmd_output(
                commits_cmd_template.format(sha=sha),
                output_filename=commit_output_filename_template.format(
                    prefix=f"pr_event_{commit_type}_commit", sha=sha
                ),
                api_cmd_template=api_cmd_patch_template,
                verbose=True,
            )
        except Exception as e:
            # Report the failure and move on to the next commit.
            print(f"Failed to retrieve data for sha: {sha}")
            print(e)
            continue

 ******** PR7 pr_event iter 1 ******** 
merge_commit: https://github.com/2dot71mily/repo_baby/commit/e8e3e3560a80eecc68ab5439fc67d82030ba0083
From e8e3e3560a80eecc68ab5439fc67d82030ba0083 Mon Sep 17 00:00:00 2001
From: Emily McMilin <emily.mcmilin@gmail.com>
Date: Sun, 25 Aug 2024 21:33:06 -0700
Subject: [PATCH] add workflow (#7)

* add workflow

* add logistic regression

* add test logistic regression

* add e2e

* add e2e test
---
 .github/workflows/python-app.yml | 39 +++++++++++++++++++++++++++
 e2e_logistic_regression.py       | 4
<truncated>

Output written to repo_baby_pr_event_merge_commit_commit_e8e3e3560a80eecc68ab5439fc67d82030ba0083.patch


head: https://github.com/2dot71mily/repo_baby/commit/58f63591312000428772626e100c6c801d0d4ea0
From 58f63591312000428772626e100c6c801d0d4ea0 Mon Sep 17 00:00:00 2001
From: emily <emily.mcmilin@gmail.com>
Date: Sun, 25 Aug 2024 21:31:11 -0700
Subject: [PATCH] add e2e test

---
 test_e2e_logistic_regression.py | 6 ++++++
 1 file changed, 6

## Checking PushEvents as well

In [49]:
# Save the patch of every commit listed in each PushEvent payload.
for event_idx, push_event in data_by_type["PushEvent"].iterrows():
    print(f" ******** PushEvent iter {event_idx} ******** ")
    for commit_idx, commit in enumerate(push_event.payload["commits"]):
        sha = commit["sha"]
        print(f"commit {commit_idx}: {html_commit_template.format(sha=sha)}")
        patch_cmd = commits_cmd_template.format(sha=sha)
        patch_filename = commit_output_filename_template.format(
            prefix=f"push_event_{event_idx}_commit_{commit_idx}", sha=sha
        )
        try:
            get_cleaned_gh_cmd_output(
                patch_cmd,
                output_filename=patch_filename,
                api_cmd_template=api_cmd_patch_template,
                verbose=True,
            )
        except Exception as e:
            # Report the failure; the loop then proceeds to the next commit.
            print(f"Failed to retrieve data for sha: {sha}")
            print(e)

 ******** PushEvent iter 0 ******** 
commit 0: https://github.com/2dot71mily/repo_baby/commit/e8e3e3560a80eecc68ab5439fc67d82030ba0083
From e8e3e3560a80eecc68ab5439fc67d82030ba0083 Mon Sep 17 00:00:00 2001
From: Emily McMilin <emily.mcmilin@gmail.com>
Date: Sun, 25 Aug 2024 21:33:06 -0700
Subject: [PATCH] add workflow (#7)

* add workflow

* add logistic regression

* add test logistic regression

* add e2e

* add e2e test
---
 .github/workflows/python-app.yml | 39 +++++++++++++++++++++++++++
 e2e_logistic_regression.py       | 4
<truncated>

Output written to repo_baby_push_event_0_commit_0_e8e3e3560a80eecc68ab5439fc67d82030ba0083.patch


 ******** PushEvent iter 2 ******** 
commit 0: https://github.com/2dot71mily/repo_baby/commit/5dc0480ed997d2bfeb40e79de900e71e097632b4
From 5dc0480ed997d2bfeb40e79de900e71e097632b4 Mon Sep 17 00:00:00 2001
From: emily <emily.mcmilin@gmail.com>
Date: Sun, 25 Aug 2024 21:31:01 -0700
Subject: [PATCH] add e2e

---
 e2e_logistic_regression.py | 46 +++++++

# PR sub-commits

In [50]:
# Extract the distinct PR numbers from the PullRequestEvent payloads; they are
# used to look up each PR's sub-commits.
pr_event_payloads = data_by_type["PullRequestEvent"]["payload"]
unique_pr_numbers = {payload["number"] for payload in pr_event_payloads}

In [51]:
# For each PR: list its commits, then fetch every commit's patch and check
# runs. Sorted so the output order is the same on every run.
for pr in sorted(unique_pr_numbers):
    print(f" ******** Processing PR: {pr} ******** ")
    print(f"https://github.com/{owner}/{repo}/pull/{pr}")
    try:
        pr_json = get_cleaned_gh_cmd_output(
            pr_commits_cmd_template.format(pr=pr),
            output_filename=pr_output_filename_template.format(pr=pr),
            api_cmd_template=api_cmd_json_template,
        )
        pr_data = json.loads(pr_json)
    except Exception as e:
        # Same report-and-continue handling as the event loops above.
        print(f"Failed to retrieve commits for PR: {pr}")
        print(e)
        continue

    if not isinstance(pr_data, list):
        # The loop below indexes each item as a commit dict, so anything other
        # than a list of commits cannot be processed.
        print(f"Unexpected commits response for PR: {pr}")
        continue

    for commit in pr_data:
        print(" ****** commit patch ****** ")
        sha = commit["sha"]
        # Substring search over the raw events JSON text.
        print(f" **** Is commit sha in EventsAPI json? {sha in event_json} **** ")

        print("sha: ", sha)
        print("html_url: ", commit["html_url"])
        try:
            get_cleaned_gh_cmd_output(
                commits_cmd_template.format(sha=sha),
                output_filename=commit_output_filename_template.format(
                    prefix=f"pr_{pr}", sha=sha
                ),
                api_cmd_template=api_cmd_patch_template,
                verbose=True,
            )
        except Exception as e:
            print(f"Failed to retrieve data for sha: {sha}")
            print(e)

        print(" ****** commit checkrun? ****** ")
        try:
            get_cleaned_gh_cmd_output(
                checkruns_cmd_template.format(sha=sha),
                output_filename=checkruns_output_filename_template.format(sha=sha),
                api_cmd_template=api_cmd_json_template,
                verbose=True,
            )
        except Exception as e:
            print(f"Failed to retrieve check runs for sha: {sha}")
            print(e)

 ******** Processing PR: 1 ******** 
https://github.com/2dot71mily/repo_baby/pull/1
 ****** commit patch ****** 
 **** Is commit sha in EventsAPI json? True **** 
sha:  c301e5404dcbd7a693473f9579b12b968c622853
html_url:  https://github.com/2dot71mily/repo_baby/commit/c301e5404dcbd7a693473f9579b12b968c622853
From c301e5404dcbd7a693473f9579b12b968c622853 Mon Sep 17 00:00:00 2001
From: emily <emily.mcmilin@gmail.com>
Date: Thu, 22 Aug 2024 23:08:16 -0700
Subject: [PATCH] add math

---
 math_operations.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 math_operations.py

diff --git a/math_operations.py b/math_operations.py
new file mode 100644
index 0000000..f7f69b3
--- /dev/nul
<truncated>

Output written to repo_baby_pr_1_c301e5404dcbd7a693473f9579b12b968c622853.patch


 ****** commit checkrun? ****** 
{
  "total_count": 0,
  "check_runs": []
}

Output written to repo_baby_pull_c301e5404dcbd7a693473f9579b12b968c622853_checkruns.json


 ****** commit patch ******