# Pull request scraper

## GitHub credentials
A personal access token is required to benefit from the higher API rate limits that GitHub grants to authenticated requests.

In [1]:
from github import RateLimitExceededException, Github

# Providing access token
# The personal access token is read from the GITHUB_TOKEN environment variable,
# falling back to an interactive prompt, so the secret never has to be typed
# into (and accidentally committed with) the notebook.
import os
from getpass import getpass

access_token = os.environ.get("GITHUB_TOKEN") or getpass("GitHub personal access token: ")
g = Github(login_or_token=access_token)

# Confirm your login is successful
user = g.get_user()
print(f"Authenticated as: {user.login}")

Request GET /user failed with 403: Forbidden
Setting next backoff to 82.925177s


Authenticated as: AbelvdTil


## Files

In [3]:
import os
 
# Input files read by this notebook.
STEP4_TFCOMMITS = os.path.join("data", "previous-study", "step4-tf-commits.json")
COMMIT_LABELS = os.path.join("data", "process-labeled-commits", "full-commit-labels.json")

# Files of the pull request scraping steps; they all share one directory.
_PR_SCRAPING_DIR = os.path.join("data", "pullrequest-scraping")
STEP5_TF_REPOS_WITH_PR = os.path.join(_PR_SCRAPING_DIR, "step5-tf-repos-with-pr.json")
STEP6_TF_REPOS_COMMITS = os.path.join(_PR_SCRAPING_DIR, "step6-tf-repos-commits.json")
STEP6A_TF_REPOS_RELEVANT_COMMITS = os.path.join(_PR_SCRAPING_DIR, "step6a-tf-repos-relevant-commits.json")
STEP7_TF_REPOS_WITH_TF_PR = os.path.join(_PR_SCRAPING_DIR, "step7-tf-repos-with-tf-pr.json")
STEP8_TF_KEYWORD_PR = os.path.join(_PR_SCRAPING_DIR, "step8-tf-keyword-pr.json")
STEP9_TF_PR_DATASET = os.path.join(_PR_SCRAPING_DIR, "step9-tf-pr-dataset.json")

## File separator and combinator

Files over 100 MB cannot be stored on GitHub, so large files need to be separated into smaller ones.
Any step5 or step7 file can be split into multiple smaller parts and later be combined again.

The repositories are split at the pull request level: the first pull request is stored in part 1, the second in part 2, and so on. This ensures that the pull requests are distributed fairly evenly over the parts.

Each pull request is accompanied by the URL of its repository, which makes it possible to reconstruct the original file.

In [4]:
import json
import math

# Path of the JSON file that the separator cell splits into parts and that the
# combinator cell writes back from those parts.
FILE_TO_SEPERATE = STEP5_TF_REPOS_WITH_PR

# Number of "-part-N.json" files that are written by the separator and read by the combinator.
nr_parts = 10

### File separator

In [46]:
# Split FILE_TO_SEPERATE into nr_parts smaller JSON files.
# Every pull request is written together with the url of its repository.

# One (initially empty) bucket per output part.
parts_data = [[] for _ in range(nr_parts)]

# A context manager makes sure the input file is closed once it has been parsed.
with open(FILE_TO_SEPERATE) as file:
    seperator_data = json.load(file)

# Deal the pull requests out round-robin: the first goes to part 1, the second to part 2, ...
count = 0
for repository in seperator_data:
    for pull_request in repository["pull_requests"]:
        parts_data[count % nr_parts].append({"repo_url": repository["url"], "pull_request": pull_request})
        count += 1

# NOTE: the base name is everything before the FIRST "." of the path, so this
# only behaves as intended while no directory name in the path contains a dot.
base_name = FILE_TO_SEPERATE.split(".")[0]
for i in range(nr_parts):
    with open(base_name + "-part-" + str(i+1) + ".json", "w") as outfile:
        json.dump(parts_data[i], outfile)

### File combinator

In [5]:
# Rebuild FILE_TO_SEPERATE from its "-part-N.json" files.

# NOTE: the base name is everything before the FIRST "." of the path, which is
# the same naming rule the separator cell uses when it writes the parts.
base_name = FILE_TO_SEPERATE.split(".")[0]

# Load every part; the context manager closes each part file after parsing.
all_parts = []
for part in range(nr_parts):
    with open(base_name + "-part-" + str(part+1) + ".json", "r") as part_file:
        all_parts.append(json.load(part_file))

url_dict = {}          # repo url -> index of that repository in combinator_data
combinator_data = []

# The separator deals pull requests out round-robin (1st -> part 1, 2nd -> part 2, ...).
# Reading position 0 of every part, then position 1 of every part, and so on
# therefore visits the pull requests in their original order, so both the
# repository order and the per-repository pull request order are restored.
longest_part = max((len(part_data) for part_data in all_parts), default=0)
for position in range(longest_part):
    for part_data in all_parts:
        if position >= len(part_data):
            # The last round may not have filled every part.
            continue

        nugget = part_data[position]

        # find repo using url
        index = url_dict.get(nugget["repo_url"])

        if index is None:
            repo = {"url": nugget["repo_url"], "pull_requests": []}
            url_dict[nugget["repo_url"]] = len(combinator_data)
            combinator_data.append(repo)
        else:
            repo = combinator_data[index]

        repo["pull_requests"].append(nugget["pull_request"])

with open(FILE_TO_SEPERATE, "w") as outfile:
    json.dump(combinator_data, outfile)

## STEP 5: Pull request scraping script

For each repository, get all pull request data. This includes PR Title, description, (review) comments and commit hashes. Exclude any repositories that do not have pull requests.

In [33]:
repo_url = "https://github.com/sailthru/terraform-kubernetes-ambassador.git"

# Turn the clone url into "<owner>/<name>" and fetch the repository object.
split_list = repo_url.split("/")
actual_url = "/".join(split_list[3:5]).split('.git')[0]
repo = g.get_repo(actual_url)

# Pull request that is inspected by this cell.
pr = repo.get_pull(7)

# Get required info for pull requests
pull_requests_dict = []

# Gather the non-blank bodies of the reviews, the review comments and the
# issue comments of the pull request, in that order.
comments = []
for fetch_items in (pr.get_reviews, pr.get_review_comments, pr.get_issue_comments):
    comments.extend(item.body for item in fetch_items() if item.body.strip() != "")

print(comments)



['nice...', "I think we'll need to ignore the replica count on the deployment, as this is something the pod autoscaler will adjust. E.g:\r\n\r\n```\r\nlifecycle {\r\n  ignore_changes = [spec.0.replicas]\r\n}\r\n```\r\n\r\nUnfortunately that probably means we'd have to either always enable the autoscaler, or create a separate version/copy of the deployment which has the ignore_changes. Keen for feedback on this!", "I'm thinking I'll just always enable autoscaling and set the default min/max pods to 1/1 - this should make it the same as the existing behaviour.", 'Not sure I understand, are we not just turning on autoscaling for ambassador? I think it would feel cleaner to have it enabled with min/max set to 1/5.\r\n\r\nYes will probably have to add the ignore replicas (like kinesis-autoscaler?)', "`ignore_changes` can't be set dynamically in terraform, so it's not very easy to get the `autoscaling_enable` var to work without creating a whole separate deployment with the `ignore_changes` 

In [8]:
# SETTINGS
# CheckForApiLimit queries the remaining rate limit once every this many calls.
check_limit_every_x_calls = 5
# Sleep until the rate limit resets once this many (or fewer) core calls remain.
api_limit_buffer = 10
# CheckForApiLimit prints a progress line once every this many calls.
api_calls_per_debug = 500

# INITIALIZATION

import json
import datetime
import time
import os

# INITIALIZATION
# Repositories to scrape; the context manager closes the file after parsing.
with open(STEP4_TFCOMMITS) as terraform_output:
    step4_output = json.load(terraform_output)

# Retrieve data from previous run
# Start with an empty result list when no previous output file exists.
try:
    with open(os.path.join("data", "pullrequest-scraping", "missing-review-comments.json")) as previous_run:
        repoData_dict = json.load(previous_run)
except FileNotFoundError:
    repoData_dict = []

# Progress / throttling counters shared with CheckForApiLimit and PrintDebug.
iteration = 0
calls_till_next_debug = 0
calls_till_limit_checkup = 0

# Check for api limits, also periodically calls print debug.
def CheckForApiLimit():
    """Throttle the scraper and periodically print a progress line.

    Once every `check_limit_every_x_calls` calls the core rate limit is
    queried through `g`; when `api_limit_buffer` or fewer calls remain, the
    function sleeps until one second after the reported reset time.
    Once every `api_calls_per_debug` calls `PrintDebug()` is invoked.

    Updates the module-level counters `calls_till_limit_checkup` and
    `calls_till_next_debug`.
    """
    # Only the two counters are rebound here; the settings are merely read.
    global calls_till_limit_checkup
    global calls_till_next_debug

    # check for limit
    if calls_till_limit_checkup == 0:
        core_limit = g.get_rate_limit().core

        # sleep when exceeded api core limit
        if core_limit.remaining <= api_limit_buffer:
            # Clamp at zero: if the reset moment has already passed the
            # difference is negative and time.sleep() would raise ValueError.
            time_to_sleep = max(0, core_limit.raw_data['reset'] - time.time() + 1)
            print("Rate limit exceeded, sleeping for", time_to_sleep, "seconds.", "Actual remaining calls", core_limit.remaining)
            time.sleep(time_to_sleep)

        calls_till_limit_checkup = check_limit_every_x_calls

    calls_till_limit_checkup -= 1

    # check for debug
    if calls_till_next_debug == 0:
        PrintDebug()
        calls_till_next_debug = api_calls_per_debug

    calls_till_next_debug -= 1

# Prints debug message
def PrintDebug():
    """Print the current time followed by the module-level `iteration` and `repo_url`."""
    timestamp = datetime.datetime.now().strftime("%H:%M:%S")
    print(timestamp, ":", "current iteration:", iteration, "url:", repo_url)

# Pull request scraping script
# Urls that already have an entry in the results; a set keeps the
# "already scraped" test constant-time instead of rescanning the whole list.
scraped_urls = {d["url"] for d in repoData_dict}
output_path = os.path.join("data", "pullrequest-scraping", "missing-review-comments.json")

for rp in step4_output["repositories"]:
    try:
        iteration += 1

        repo_url = rp["name"]

        # skip already scraped repositories
        if repo_url in scraped_urls:
            continue

        # Get the repo object from the url (e.g. "https://github.com/<owner>/<name>.git").
        # Only a trailing ".git" is removed: cutting at the first ".git" would
        # truncate repository names such as "<user>.github.io".
        owner, repo_name = repo_url.split("/")[3:5]
        if repo_name.endswith(".git"):
            repo_name = repo_name[:-len(".git")]
        actual_url = owner + "/" + repo_name
        repo = g.get_repo(actual_url)

        # Get required info for pull requests
        pull_requests_dict = []
        pull_requests = repo.get_pulls(state="closed")

        for pr in pull_requests:

            # retrieve all review comments, not required if there are none.
            comments = []

            if pr.review_comments > 0:
                for review_comment in pr.get_review_comments():
                    if review_comment.body.strip() != "":
                        comments.append(review_comment.body)
                CheckForApiLimit()

            # Only pull requests with at least one non-blank review comment are kept.
            if comments:
                pull_requests_dict.append({"url": pr.html_url, "comments": comments})

            CheckForApiLimit()

        repoData_dict.append({"url": repo_url, "pull_requests": pull_requests_dict})
        scraped_urls.add(repo_url)

        # Rewrite the output after every repository so that a later run can
        # skip the repositories that are already stored.
        with open(output_path, "w") as outfile:
            json.dump(repoData_dict, outfile)
    except Exception as e:
        # Best effort: report the failure and continue with the next repository.
        print("exception:", e)

exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message

Following Github server redirection from /repos/GlueOps/terraform-gcp-organization-bootstrap to /repositories/479564327


22:47:54 : current iteration: 428 url: https://github.com/GlueOps/terraform-gcp-organization-bootstrap.git
22:54:20 : current iteration: 440 url: https://github.com/terraform-google-modules/terraform-example-foundation.git
23:00:41 : current iteration: 440 url: https://github.com/terraform-google-modules/terraform-example-foundation.git
23:07:09 : current iteration: 449 url: https://github.com/UKHomeOffice/dq-tf-infra.git
23:13:33 : current iteration: 456 url: https://github.com/UKHomeOffice/dq-tf-apps.git
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
23:20:01 : current iteration: 474 url: https://github.com/alphagov/tech-ops.git
23:26:17 : current iteration: 479 url: https://github.com/gpii-ops/gpii-infra.git
23:32:30 : current iteration: 479 url: https://github.com/gpii-ops/gpii-infra.git
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}


Following Github server redirection from /repos/rajesh-nitc/gcp-foundation to /repositories/367584349


00:10:53 : current iteration: 531 url: https://github.com/datarootsio/terraform-module-azure-datalake.git
00:17:16 : current iteration: 534 url: https://github.com/ministryofjustice/hmpps-env-configs.git
00:23:31 : current iteration: 534 url: https://github.com/ministryofjustice/hmpps-env-configs.git
00:29:44 : current iteration: 534 url: https://github.com/ministryofjustice/hmpps-env-configs.git
00:35:57 : current iteration: 534 url: https://github.com/ministryofjustice/hmpps-env-configs.git
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
00:42:20 : current iteration: 544 url: https://github.com/hmcts/azure-platform-terraform.git
00:48:38 : current iteration: 544 url: https://github.com/hmcts/azure-platform-terraform.git
00:54:56 : current iteration: 544 url: https://github.com/hmcts/azure-platform-terraform.git
01:01:06 : current iteration: 544 url: https://github.com/hmcts/azure-platform-terraform.git
01:07:14

Following Github server redirection from /repos/ryanlg/ryhino-public to /repositories/421638252


01:13:34 : current iteration: 549 url: https://github.com/aztfmod/terraform-azurerm-caf.git
01:19:52 : current iteration: 549 url: https://github.com/aztfmod/terraform-azurerm-caf.git
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
01:26:17 : current iteration: 559 url: https://github.com/cloudposse/terraform-spacelift-cloud-infrastructure-automation.git
01:32:33 : current iteration: 562 url: https://github.com/mattermost/mattermost-cloud-monitoring.git
01:38:55 : current iteration: 562 url: https://github.com/mattermost/mattermost-cloud-monitoring.git


Following Github server redirection from /repos/basisai/terraform-modules-gcp to /repositories/167327420


01:45:24 : current iteration: 576 url: https://github.com/kinvolk-archives/lokomotive-kubernetes.git
01:51:47 : current iteration: 586 url: https://github.com/dwp/dataworks-github-config.git
Rate limit exceeded, sleeping for 3.3338563442230225 seconds. Actual remaining calls 9
01:57:59 : current iteration: 589 url: https://github.com/ministryofjustice/cloud-platform-environments.git
02:04:13 : current iteration: 589 url: https://github.com/ministryofjustice/cloud-platform-environments.git
02:10:26 : current iteration: 589 url: https://github.com/ministryofjustice/cloud-platform-environments.git
02:16:39 : current iteration: 589 url: https://github.com/ministryofjustice/cloud-platform-environments.git
02:22:52 : current iteration: 589 url: https://github.com/ministryofjustice/cloud-platform-environments.git
02:29:07 : current iteration: 589 url: https://github.com/ministryofjustice/cloud-platform-environments.git
02:35:21 : current iteration: 589 url: https://github.com/ministryofjustic

Following Github server redirection from /repos/Lemax-Dev/infrastructure-repo to /repositories/230511040


06:51:17 : current iteration: 596 url: https://github.com/alphagov/govwifi-terraform.git
06:57:39 : current iteration: 596 url: https://github.com/alphagov/govwifi-terraform.git
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}


Following Github server redirection from /repos/globeandmail/aws-dynamodb to /repositories/231636188
Following Github server redirection from /repos/ibm-hcbt/acct-config-iam to /repositories/302706624


exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
07:04:26 : current iteration: 630 url: https://github.com/fluent-labs/infrastructure.git


Following Github server redirection from /repos/pastis-hosting/net.pastis-hosting.k8s to /repositories/434807874


exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
07:10:52 : current iteration: 638 url: https://github.com/nationalarchives/tdr-terraform-backend.git
07:17:08 : current iteration: 640 url: https://github.com/nationalarchives/tdr-terraform-environments.git
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}


Following Github server redirection from /repos/matthewbentley/blog-terraform to /repositories/113337906
Following Github server redirection from /repos/palogitjl/terragoat2 to /repositories/441027419


exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
07:23:56 : current iteration: 676 url: https://github.com/ministryofjustice/cloud-platform-terraform-concourse.git


Following Github server redirection from /repos/pagopa/userregistry-devops to /repositories/429011484


07:30:35 : current iteration: 684 url: https://github.com/mdn/infra.git
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}


Following Github server redirection from /repos/terraform-alicloud-modules/terraform-alicloud-ecp to /repositories/465308959


07:37:15 : current iteration: 698 url: https://github.com/nationalarchives/tdr-jenkins.git
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
07:43:45 : current iteration: 706 url: https://github.com/binbashar/le-tf-infra-aws.git
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
07:50:24 : current iteration: 711 url: https://github.com/PaloAltoNetworks/terraform-aws-vmseries-modules.git
07:57:04 : current iteration: 722 url: https://github.com/chris-qa-org/terraform-aws-organzation-and-sso.git


Following Github server redirection from /repos/monish-advani/terragoat-test to /repositories/304530642


exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
08:03:45 : current iteration: 740 url: https://github.com/jenkins-x/terraform-aws-eks-jx.git
08:10:37 : current iteration: 743 url: https://github.com/bridgecrewio/terragoat.git
08:17:10 : current iteration: 744 url: https://github.com/Civil-Service-Human-Resources/lpg-terraform-paas.git
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
08:23:39 : current iteration: 755 url: https://github.com/langri-sha/langri-sha.com.git
08:30:02 : current iteration: 758 url: https://github.com/commitdev/zero-aws-eks-stack.git
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
08:36:28 : current iteration: 763 url: https

Following Github server redirection from /repos/Quansight/qhub to /repositories/252841745


09:15:53 : current iteration: 776 url: https://github.com/Quansight/qhub.git
09:22:22 : current iteration: 776 url: https://github.com/Quansight/qhub.git


Following Github server redirection from /repos/OmniTeqSource/terraform-aws-iam to /repositories/358685228


09:29:02 : current iteration: 782 url: https://github.com/cloudposse/terraform-aws-s3-bucket.git
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
09:35:49 : current iteration: 796 url: https://github.com/terraform-google-modules/terraform-google-log-export.git
09:42:38 : current iteration: 816 url: https://github.com/dwp/aws-analytical-env.git
09:49:09 : current iteration: 816 url: https://github.com/dwp/aws-analytical-env.git
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}


Following Github server redirection from /repos/traveloka/terraform-aws-common-iam-roles to /repositories/128900479


exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
09:56:04 : current iteration: 844 url: https://github.com/Young-ook/terraform-aws-spinnaker.git
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
10:02:46 : current iteration: 862 url: https://github.com/alphagov/verify-infrastructure.git
10:09:16 : current iteration: 862 url: https://github.com/alphagov/verify-infrastructure.git
10:15:38 : current iteration: 864 url: https://github.com/GoogleCloudPlatform/cloud-foundation-fabric.git
10:22:00 : current iteration: 864 url: https://github.com/GoogleCloudPlatform/cloud-foundation-fabric.git
10:28:25 : current iteration: 864 url: https://github.com/GoogleCloudPlatform/cloud-foundation-fabric.git
10:34:46 : current iteration: 864 url: https://github.com/GoogleCloudPlatform/cloud-foundation-fabric.git
10:41:08 : current iteration: 864 url: https://git

Following Github server redirection from /repos/ckilpatrick20/github-actions to /repositories/265605083


12:06:48 : current iteration: 877 url: https://github.com/wellcometrust/terraform-modules.git
12:13:13 : current iteration: 881 url: https://github.com/minamijoyo/tfupdate-circleci-example.git
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}


Following Github server redirection from /repos/GSA/datagov-brokerpak-eks to /repositories/305896342


12:19:49 : current iteration: 901 url: https://github.com/roberthstrand/demo-github-terraform.git
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}


Following Github server redirection from /repos/palogitjl/terragoat1 to /repositories/441021321


exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
12:26:30 : current iteration: 916 url: https://github.com/segmentio/stack.git
12:33:11 : current iteration: 928 url: https://github.com/cloudposse/terraform-aws-eks-node-group.git
12:39:58 : current iteration: 940 url: https://github.com/cloudposse/terraform-aws-ecs-alb-service-task.git
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
12:46:58 : current iteration: 965 url: https://github.com/maddevsio/aws-eks-base.git
12:53:37 : current iteration: 974 url: https://github.com/ministryofjustice/analytics-platform-ops.git
13:00:04 : current iteration: 979 url: https://github.com/john-hurringjr/test-environment.git


Following Github server redirection from /repos/bphanbc/terragoat to /repositories/477081171


13:06:51 : current iteration: 999 url: https://github.com/cds-snc/cloud-based-sensor.git
13:13:26 : current iteration: 1009 url: https://github.com/liatrio/lead-terraform.git
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
13:20:06 : current iteration: 1014 url: https://github.com/devopsacademyau/academy.git
13:26:35 : current iteration: 1014 url: https://github.com/devopsacademyau/academy.git
13:32:53 : current iteration: 1014 url: https://github.com/devopsacademyau/academy.git
13:39:15 : current iteration: 1014 url: https://github.com/devopsacademyau/academy.git
13:45:38 : current iteration: 1014 url: https://github.com/devopsacademyau/academy.git
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
13:52:14 : current iteration: 1026 url: https://github.com/broadinstitute/terraform-ap-modules.git
exception: 404 {"message": "Not Found", "docume

Following Github server redirection from /repos/dgorbov/terraform-s3-backend-setup to /repositories/230207730


exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
16:57:24 : current iteration: 1253 url: https://github.com/plus3it/terraform-aws-tardigrade-s3-bucket.git
17:03:58 : current iteration: 1256 url: https://github.com/ExpediaDotCom/haystack.git
17:10:29 : current iteration: 1259 url: https://github.com/jonpulsifer/terraform-modules.git
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}


In [7]:
# SETTINGS
# CheckForApiLimit queries the remaining rate limit once every this many calls.
check_limit_every_x_calls = 5
# Sleep until the rate limit resets once this many (or fewer) core calls remain.
api_limit_buffer = 10
# CheckForApiLimit prints a progress line once every this many calls.
api_calls_per_debug = 500

# INITIALIZATION

import json
import datetime
import time

# INITIALIZATION
# Repositories to scrape; the context manager closes the file after parsing.
with open(STEP4_TFCOMMITS) as terraform_output:
    step4_output = json.load(terraform_output)

# Retrieve data from previous run
# Start with an empty result list when no previous output file exists.
try:
    with open(STEP5_TF_REPOS_WITH_PR) as previous_run:
        repoData_dict = json.load(previous_run)
except FileNotFoundError:
    repoData_dict = []

# Progress / throttling counters shared with CheckForApiLimit and PrintDebug.
iteration = 0
calls_till_next_debug = 0
calls_till_limit_checkup = 0

# Check for api limits, also periodically calls print debug.
def CheckForApiLimit():
    """Throttle the scraper and periodically print a progress line.

    Once every `check_limit_every_x_calls` calls the core rate limit is
    queried through `g`; when `api_limit_buffer` or fewer calls remain, the
    function sleeps until one second after the reported reset time.
    Once every `api_calls_per_debug` calls `PrintDebug()` is invoked.

    Updates the module-level counters `calls_till_limit_checkup` and
    `calls_till_next_debug`.
    """
    # Only the two counters are rebound here; the settings are merely read.
    global calls_till_limit_checkup
    global calls_till_next_debug

    # check for limit
    if calls_till_limit_checkup == 0:
        core_limit = g.get_rate_limit().core

        # sleep when exceeded api core limit
        if core_limit.remaining <= api_limit_buffer:
            # Clamp at zero: if the reset moment has already passed the
            # difference is negative and time.sleep() would raise ValueError.
            time_to_sleep = max(0, core_limit.raw_data['reset'] - time.time() + 1)
            print("Rate limit exceeded, sleeping for", time_to_sleep, "seconds.", "Actual remaining calls", core_limit.remaining)
            time.sleep(time_to_sleep)

        calls_till_limit_checkup = check_limit_every_x_calls

    calls_till_limit_checkup -= 1

    # check for debug
    if calls_till_next_debug == 0:
        PrintDebug()
        calls_till_next_debug = api_calls_per_debug

    calls_till_next_debug -= 1

# Prints debug message
def PrintDebug():
    """Print the current time followed by the module-level `iteration` and `repo_url`."""
    timestamp = datetime.datetime.now().strftime("%H:%M:%S")
    print(timestamp, ":", "current iteration:", iteration, "url:", repo_url)

# Pull request scraping script
# Urls that already have an entry in the results; a set keeps the
# "already scraped" test constant-time instead of rescanning the whole list.
scraped_urls = {d["url"] for d in repoData_dict}

for rp in step4_output["repositories"]:
    try:
        iteration += 1

        repo_url = rp["name"]

        # skip already scraped repositories
        if repo_url in scraped_urls:
            continue

        # Get the repo object from the url (e.g. "https://github.com/<owner>/<name>.git").
        # Only a trailing ".git" is removed: cutting at the first ".git" would
        # truncate repository names such as "<user>.github.io".
        owner, repo_name = repo_url.split("/")[3:5]
        if repo_name.endswith(".git"):
            repo_name = repo_name[:-len(".git")]
        actual_url = owner + "/" + repo_name
        repo = g.get_repo(actual_url)

        # Get required info for pull requests
        pull_requests_dict = []
        pull_requests = repo.get_pulls(state="closed")

        # Repositories without closed pull requests are not stored at all.
        if pull_requests.totalCount > 0:
            for pr in pull_requests:

                # Collect the non-blank review bodies, review comments and
                # issue comments; the latter two are only requested when the
                # pull request reports a non-zero count for them.
                comments = []

                for review in pr.get_reviews():
                    if review.body.strip() != "":
                        comments.append(review.body)
                CheckForApiLimit()

                if pr.review_comments > 0:
                    for review_comment in pr.get_review_comments():
                        if review_comment.body.strip() != "":
                            comments.append(review_comment.body)
                    CheckForApiLimit()

                if pr.comments > 0:
                    for comment in pr.get_issue_comments():
                        if comment.body.strip() != "":
                            comments.append(comment.body)
                    CheckForApiLimit()

                # retrieve all connected commits.
                commits = [commit.sha for commit in pr.get_commits()]
                CheckForApiLimit()

                pull_requests_dict.append({"url": pr.html_url, "title": pr.title, "body": pr.body, "comments": comments, "commits": commits})

            CheckForApiLimit()
            repoData_dict.append({"url": repo_url, "pull_requests": pull_requests_dict})
            scraped_urls.add(repo_url)

            # Rewrite the output after every repository so that a later run can
            # skip the repositories that are already stored.
            with open(STEP5_TF_REPOS_WITH_PR, "w") as outfile:
                json.dump(repoData_dict, outfile)
    except Exception as e:
        # Best effort: report the failure and continue with the next repository.
        print("exception:", e)

exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message

Following Github server redirection from /repos/andreas-prinz/gcp-terraform-google-lb to /repositories/425986937


exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}


Following Github server redirection from /repos/lwilliams1990/deepfence-threatmapper-lab to /repositories/270369845


exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message

Following Github server redirection from /repos/kmarilleau/a-cloud-guru-gcp-cloud-engineer-terraform to /repositories/323167041


exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentatio

Following Github server redirection from /repos/ryanlg/ryhino-public to /repositories/421638252


exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}


Following Github server redirection from /repos/Lemax-Dev/infrastructure-repo to /repositories/230511040


exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}


Following Github server redirection from /repos/globeandmail/aws-dynamodb to /repositories/231636188


exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}


Following Github server redirection from /repos/matthewbentley/blog-terraform to /repositories/113337906


exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}


Following Github server redirection from /repos/monish-advani/terragoat-test to /repositories/304530642


exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message

Following Github server redirection from /repos/dgorbov/terraform-s3-backend-setup to /repositories/230207730


exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}


## STEP 6: Find tf commits for tf repos with pr's 

For remaining repositories from step 5, collect all commits that modify a terraform file.

In [14]:
from pydriller import Repository

import json

# Load the repositories (with their pull requests) produced by step 5.
# The context manager closes the file as soon as the JSON is parsed.
with open(STEP5_TF_REPOS_WITH_PR) as step5:
    tf_repositories = json.load(step5)

# A modified file counts as Terraform when its filename CONTAINS one of these.
# NOTE(review): this is a substring test, so names such as "prod.tfvars" or
# "main.tf.bak" match too, and '.tf.json' is already covered by '.tf'.
# Confirm that this is the intended definition of "a terraform file".
terraform_keywords = ['.tf', '.tf.json']

iteration = 0

# Commit scraping script: for every repository, keep the commits that touch
# at least one Terraform file.
repo_dic = []
for repository in tf_repositories:
    try:
        # Checkpoint the results every 50 repositories so that an interrupted
        # run does not lose everything collected so far.
        if (iteration % 50 == 0):
            print("at iteration", iteration)
            with open(STEP6_TF_REPOS_COMMITS, "w") as outfile:
                json.dump(repo_dic, outfile)

        iteration += 1

        repo_url = repository["url"]
        # Only strip a trailing ".git". Splitting on the first ".git" anywhere
        # would truncate URLs such as ".../user/user.github.io".
        if repo_url.endswith(".git"):
            web_url = repo_url[:-len(".git")]
        else:
            web_url = repo_url

        # Get each commit
        commit_dic = []
        for commit in Repository(repo_url).traverse_commits():

            # any() stops at the first Terraform file instead of scanning the
            # remaining modified files of the commit.
            modified_terraform = any(
                key in file.filename
                for file in commit.modified_files
                for key in terraform_keywords
            )

            if modified_terraform:
                commit_dic.append({"hash": commit.hash,
                                   "url": web_url + "/commit/" + commit.hash,
                                   "date": str(commit.author_date),
                                   "body": commit.msg})

        repo_dic.append({"url": repo_url, "commits": commit_dic})

    except Exception as e:
        # Best effort: a repository that cannot be traversed is reported and
        # skipped, so it will be missing from the output file.
        print("exception:", e)

# Final write with the complete result.
with open(STEP6_TF_REPOS_COMMITS, "w") as outfile:
    json.dump(repo_dic, outfile)

at iteration 0
at iteration 50
at iteration 100
at iteration 150
at iteration 200
at iteration 250
at iteration 300
at iteration 350
at iteration 400
at iteration 450
at iteration 500
at iteration 550
at iteration 600


## STEP 6a: exclude unrelated tf commits

In [16]:
import json

# Load the Terraform commits per repository collected in step 6.
with open(STEP6_TF_REPOS_COMMITS) as step6:
    repository_commits = json.load(step6)

# Load the manual commit labels, keyed by commit hash.
with open(COMMIT_LABELS) as labels_file:
    commit_labels = json.load(labels_file)

# Drop every commit whose label contains "unrelated". Commits without a label
# are kept.
# NOTE(review): `"unrelated" not in label` works for both a string label and
# a list of labels; the label schema is not visible here — confirm.
for repository in repository_commits:
    commits = []
    for commit in repository["commits"]:
        label = commit_labels.get(commit["hash"])

        if label is None or "unrelated" not in label:
            commits.append(commit)
    repository["commits"] = commits

with open(STEP6A_TF_REPOS_RELEVANT_COMMITS, "w") as outfile:
    json.dump(repository_commits, outfile)

## STEP 7: filter out pull requests without relevant tf commit

Removes any pull request that does not include a commit from the previous step. For the remaining pull requests, it combines the two datasets into one.

In [41]:
import json

# Repositories with their pull requests (step 5).
with open(STEP5_TF_REPOS_WITH_PR) as step5:
    repository_input = json.load(step5)

# Relevant Terraform commits per repository (step 6a).
with open(STEP6A_TF_REPOS_RELEVANT_COMMITS) as step6a:
    commit_input = json.load(step6a)

# Not used by this step; kept so the cell still defines the same names.
iteration = 0

# Index the step 6a data once: repository url -> {commit hash -> commit}.
# The first occurrence of a url / hash wins, which matches what a first-match
# linear search over the lists would return.
commits_by_repo = {}
for repo in commit_input:
    if repo["url"] not in commits_by_repo:
        commits_by_hash = {}
        for commit in repo["commits"]:
            commits_by_hash.setdefault(commit["hash"], commit)
        commits_by_repo[repo["url"]] = commits_by_hash

output_dict = []

# for each repository
for repository in repository_input:
    # Find the commits for this repo from step 6a. Step 6 skips repositories
    # that raised an exception, so a repository may be absent here; treat that
    # as "no relevant commits" instead of failing with StopIteration.
    commits_by_hash = commits_by_repo.get(repository["url"], {})

    pr_dict = []
    # for each pull request
    for pull_request in repository["pull_requests"]:
        # Keep only the commits of this pull request that are known from
        # step 6a, preserving the pull request's commit order.
        commit_dict = [commits_by_hash[commit_hash]
                       for commit_hash in pull_request["commits"]
                       if commit_hash in commits_by_hash]

        # Remember how many commits the pull request had before filtering,
        # then replace the hash list by the full commit records.
        pull_request["total_commits"] = len(pull_request["commits"])
        pull_request["commits"] = commit_dict

        if commit_dict:
            pr_dict.append(pull_request)

    # Repositories without any remaining pull request are dropped.
    if pr_dict:
        output_dict.append({"url": repository["url"], "pull_requests": pr_dict})

with open(STEP7_TF_REPOS_WITH_TF_PR, "w") as outfile:
    json.dump(output_dict, outfile)

## STEP 8: list all tf pull requests with a keyword

In [42]:
import json

# Keyword stems; a text matches when it contains any of them as a substring
# (case-insensitive), e.g. "expens" matches "expensive" and "expenses".
cost_keywords = ["cheap", "expens", "cost", "efficient", "bill", "pay"]

with open(STEP7_TF_REPOS_WITH_TF_PR) as step7:
    repo_input = json.load(step7)


def _mentions_cost(text):
    """Return True when `text` is not None and contains a cost keyword."""
    return text is not None and any(key in text.lower() for key in cost_keywords)


pullrequest_dict_output = []
for repository in repo_input:
    for pr in repository["pull_requests"]:

        # Where does a keyword occur? Missing (None) texts never match.
        title = _mentions_cost(pr["title"])
        body = _mentions_cost(pr["body"])
        comment = any(_mentions_cost(text) for text in pr["comments"])
        commit = any(_mentions_cost(tf_commit["body"]) for tf_commit in pr["commits"])

        # Space separated list of the matching locations, e.g. "title commit".
        reason = " ".join(location
                          for location, matched in (("title", title),
                                                    ("body", body),
                                                    ("comment", comment),
                                                    ("commit", commit))
                          if matched)

        # An empty reason means no location matched; such PRs are left out.
        if reason:
            pullrequest_dict_output.append({"reason": reason, "pull_request": pr})

with open(STEP8_TF_KEYWORD_PR, "w") as outfile:
    json.dump(pullrequest_dict_output, outfile)

## STEP 9: Parse to dataset format

In [44]:
import json

# Keyword pull requests together with the reason they matched (step 8).
with open(STEP8_TF_KEYWORD_PR) as step8:
    pull_request_reasons = json.load(step8)

# Convert every pull request to the dataset entry format. Commits are reduced
# to their hashes.
# NOTE(review): "codes" is written as an empty list; presumably it is filled
# in later during labelling — confirm.
pr_output = []
for pull_request_reason in pull_request_reasons:
    pull_request = pull_request_reason["pull_request"]
    pr_output.append(
        {
            "type": "pull_request",
            "url": pull_request["url"],
            "content": {
                "title": pull_request["title"],
                "body": pull_request["body"],
                "comments": pull_request["comments"],
                "commits": [commit["hash"] for commit in pull_request["commits"]]
                },
            "codes": []
        })

with open(STEP9_TF_PR_DATASET, "w") as outfile:
    json.dump(pr_output, outfile)

## STEP 10: results

In [43]:
import json

# Same keyword stems as used in step 8 (substring, case-insensitive match).
cost_keywords = ["cheap", "expens", "cost", "efficient", "bill", "pay"]

# Load the outputs of the individual steps; every file is closed right after
# it has been parsed.
with open(STEP4_TFCOMMITS) as step4:
    step4_data = json.load(step4)

with open(STEP5_TF_REPOS_WITH_PR) as step5:
    step5_data = json.load(step5)

with open(STEP7_TF_REPOS_WITH_TF_PR) as step7:
    repo_input = json.load(step7)

with open(STEP8_TF_KEYWORD_PR) as step8:
    pr_reason_input = json.load(step8)


# GENERAL REPOSITORY DATA

print("Total repositories:" , step4_data["no_of_repos"])
print("Of those that exist and have pull request(s):", len(step5_data))
print("Of those that have relevant TF commits:", len(repo_input))

print("")

# GENERAL PULL REQUEST DATA
print("Total TF PR's with a keyword:", len(pr_reason_input))

print("")

# For every location: number of PRs whose reason mentions it, and number of
# PRs where it is the only reason.
reasons = [pr["reason"] for pr in pr_reason_input]

print("PR with keyword in:\t", "Only in:")
for row_label, location in (("Title:\t\t", "title"),
                            ("Description:\t", "body"),
                            ("Comment:\t", "comment"),
                            ("commit message*:", "commit")):
    print(row_label,
          sum(1 for reason in reasons if location in reason),
          "\t",
          sum(1 for reason in reasons if location == reason))

print("")

print("*commits labeled as unrelevant have already been removed from the dataset, while the same is not true for the other locations.")

print("")

# GENERAL COMMIT DATA

# NOTE(review): "total_commits" is the number of commits a pull request had
# before step 7 filtered them, so the first figure counts all commits of the
# TF PRs, not only relevant ones. The next two figures are computed over the
# keyword PRs from step 8 only, which is a different population than the
# first figure. Confirm that the labels describe what is intended.
keyword_pr_commits = [commit
                      for pr_reason in pr_reason_input
                      for commit in pr_reason["pull_request"]["commits"]]

print("Total amount of relevant commits in TF PR's:", sum([pr["total_commits"] for repo in repo_input for pr in repo["pull_requests"]]))
print("Of those that modify a TF file:", len(keyword_pr_commits))
# A missing commit message is skipped, in line with the check in step 8.
print("Of those that have a keyword:", sum(1 for commit in keyword_pr_commits
                                           if commit["body"] is not None
                                           and any(key in commit["body"].lower() for key in cost_keywords)))

print("")

# Pull requests that reach 250 commits, the limit named in the message below.
count = sum(1
            for repo in repo_input
            for pr in repo["pull_requests"]
            if pr["total_commits"] >= 250)
print("Amount of PR's with more than 250 commits (limit):", count)
    


Total repositories: 1278
Of those that exist and have pull request(s): 610
Of those that have relevant TF commits: 469

Total TF PR's with a keyword: 814

PR with keyword in:	 Only in:
Title:		 111 	 30
Description:	 363 	 214
Comment:	 354 	 322
commit message*: 194 	 63

*commits labeled as unrelevant have already been removed from the dataset, while the same is not true for the other locations.

Total amount of relevant commits in TF PR's: 130720
Of those that modify a TF file: 3266
Of those that have a keyword: 203

Amount of PR's with more than 250 commits (limit): 34
