# Pull request scraper

## GitHub credentials
A personal access token is required to benefit from GitHub's higher API rate limits for authenticated requests.

In [9]:
from github import RateLimitExceededException, Github

import os

# Providing access token
# The token is read from the GITHUB_TOKEN environment variable so that it never
# has to be written into (and accidentally committed with) the notebook.
# Falls back to an empty string, which is the value this cell used before.
access_token = os.environ.get("GITHUB_TOKEN", "")
g = Github(login_or_token=access_token)

# Confirm your login is successful
user = g.get_user()
print(f"Authenticated as: {user.login}")

Authenticated as: AbelvdTil


## Files

In [12]:
import os
 
# Locations of the JSON file read by this notebook (step 4) and of the JSON
# file it writes (step 5); both live in the same relative data directory.
_DATA_DIR = "data"
STEP4_TFCOMMITS = os.path.join(_DATA_DIR, "step4-tf-commits.json")
STEP5_TF_PULLREQUESTS = os.path.join(_DATA_DIR, "step5-tf-pullrequests.json")

## Scraping script

Also includes settings, initialization and helper functions.

In [14]:
# SETTINGS
# Delay between API calls in milliseconds, judging by the name.
# NOTE(review): no code visible in this notebook reads this value, so it
# currently has no effect — confirm whether a per-call sleep was intended.
ms_time_between_api_calls = 100
# Used by CheckForApiLimit twice: as the number of calls between two rate-limit
# lookups, and as the "remaining calls" threshold at or below which it sleeps.
check_limit_every_x_calls = 100
# Number of CheckForApiLimit calls between two PrintDebug progress lines.
api_calls_per_debug = 1000

# IMPORTS

import json
import datetime
import time
import pytz
from pydriller import Repository


# INITIALIZATION
# Load the step 4 output that lists the repositories to scrape. The file is
# opened with a context manager so the handle is closed right after parsing.
with open(STEP4_TFCOMMITS) as terraform_output:
    step4_output = json.load(terraform_output)

# Retrieve data from previous run
# If the step 5 file does not exist yet there is nothing to resume from and we
# start with an empty result list. The handle is closed before the scraping
# loop rewrites the same file.
try:
    with open(STEP5_TF_PULLREQUESTS) as previous_run:
        repoData_dict = json.load(previous_run)
except FileNotFoundError:
    repoData_dict = []

# Counters shared with CheckForApiLimit / PrintDebug through module globals.
iteration = 0
calls_till_next_debug = 0
calls_till_limit_checkup = 0

# Check for api limits, also periodically calls print debug.
def CheckForApiLimit():
    """Pause when the GitHub core rate limit is nearly used up and print periodic progress.

    Meant to be called after API requests made by the scraper. Every
    ``check_limit_every_x_calls`` invocations the core rate limit is looked up;
    if no more than ``check_limit_every_x_calls`` calls remain, execution
    sleeps until the limit resets. Every ``api_calls_per_debug`` invocations
    ``PrintDebug`` is called.

    Reads the globals ``g``, ``check_limit_every_x_calls`` and
    ``api_calls_per_debug``; updates the global counters
    ``calls_till_limit_checkup`` and ``calls_till_next_debug``.
    """
    global calls_till_limit_checkup
    global calls_till_next_debug

    # check for limit
    # `<= 0` rather than `== 0`: if an interval is configured as 0 the counter
    # drops below zero and an equality test would never fire again.
    if calls_till_limit_checkup <= 0:
        core_limit = g.get_rate_limit().core

        # sleep when exceeded api core limit
        if core_limit.remaining <= check_limit_every_x_calls:
            # raw_data['reset'] is treated as a Unix timestamp (it is compared
            # against time.time()). Clamp at zero: time.sleep() raises
            # ValueError for negative durations, which would happen when the
            # reset moment has already passed.
            time_to_sleep = max(0, core_limit.raw_data['reset'] - time.time() + 1)
            print("Rate limit exceeded, sleeping for", time_to_sleep, "seconds.", "Actual remaining calls", core_limit.remaining)
            time.sleep(time_to_sleep)

        calls_till_limit_checkup = check_limit_every_x_calls

    calls_till_limit_checkup -= 1

    # check for debug
    if calls_till_next_debug <= 0:
        PrintDebug()
        calls_till_next_debug = api_calls_per_debug

    calls_till_next_debug -= 1

# Prints debug message
def PrintDebug():
    """Print one progress line: wall-clock time, current iteration and repository url.

    Reads the module-level ``iteration`` and ``repo_url`` values; both are only
    read here, so no ``global`` declaration is needed.
    """
    timestamp = datetime.datetime.now().strftime("%H:%M:%S")
    print(timestamp, ":", "current iteration:", iteration, "url:", repo_url)

# Pull request scraping script
#
# For every repository listed in the step 4 output, collect the closed pull
# requests (url, title, body, non-empty review bodies, commit SHAs) and append
# them to repoData_dict. The result file is rewritten after each repository so
# an interrupted run can be resumed.

# Upper bound on how many entries of step4_output["repositories"] are visited
# in one run; entries skipped as "already scraped" count towards it as well.
# Set to None to visit all entries.
max_repositories = 10

for rp in step4_output["repositories"]:
    try:
        iteration += 1

        if max_repositories is not None and iteration > max_repositories:
            break

        repo_url = rp["name"]

        # skip already scraped repositories
        if any(d["url"] == repo_url for d in repoData_dict):
            continue

        # Get the repo object from the url
        # The url is expected to look like https://<host>/<owner>/<repo>[.git],
        # so parts 3 and 4 are owner and repository name. Only a *trailing*
        # ".git" is removed: splitting on the first ".git" anywhere would cut
        # names such as "user.github.io" down to "user".
        split_list = repo_url.split("/")
        owner_name = split_list[3]
        repo_name = split_list[4]
        if repo_name.endswith(".git"):
            repo_name = repo_name[:-len(".git")]
        actual_url = owner_name + '/' + repo_name
        repo = g.get_repo(actual_url)

        # Get required info for pull requests
        pull_requests_dict = []
        for pr in repo.get_pulls(state="closed"):

            # retrieve all review comments, not required if there are none.
            # NOTE(review): the gate uses pr.review_comments while the collected
            # text comes from pr.get_reviews(); confirm these always coincide.
            comments = []
            if pr.review_comments > 0:
                for review in pr.get_reviews():
                    # Guard against a missing body: calling .strip() on None
                    # would raise and make the outer handler drop the whole
                    # repository.
                    review_body = review.body or ""
                    if review_body.strip() != "":
                        comments.append(review_body)
                CheckForApiLimit()

            # retrieve all connected commits.
            commits = [commit.sha for commit in pr.get_commits()]
            CheckForApiLimit()

            pull_requests_dict.append({"url": pr.html_url, "title": pr.title, "body": pr.body, "comments": comments, "commits": commits})

        CheckForApiLimit()
        repoData_dict.append({"url": repo_url, "pull_requests": pull_requests_dict})

        # Persist after every repository so progress survives an interruption.
        with open(STEP5_TF_PULLREQUESTS, "w") as outfile:
            json.dump(repoData_dict, outfile)
    except Exception as e:
        # Best effort: report the problem and move on to the next repository.
        # The failed repository is not recorded, so a later run retries it.
        print("exception:", e)

12:44:46 : current iteration: 1 url: https://github.com/tkhoa2711/terraform-digitalocean.git
exception: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}


## File separator

The output of step 5 will likely be too large for GitHub; make sure each part file is under 100 MB.

In [27]:
import json
import math

# Number of part files the step 5 output is split into; the combinator cell
# reads back exactly this many "-part-<n>.json" files.
nr_parts = 5

In [28]:
# Split the step 5 output into exactly `nr_parts` files named
# "<step5 path without extension>-part-<n>.json" (n starting at 1).
with open(STEP5_TF_PULLREQUESTS) as results:
    repoData_dict = json.load(results)

# Largest number of repositories any single part may hold.
size_per_part = math.ceil(len(repoData_dict) / nr_parts)

# Slice by index instead of tracking a running counter: this always writes
# nr_parts files (trailing ones may hold an empty list), so the combinator,
# which reads nr_parts files, never hits a missing or stale part.
for current_part in range(nr_parts):
    start = current_part * size_per_part
    part_data = repoData_dict[start:start + size_per_part]
    with open(STEP5_TF_PULLREQUESTS.split(".")[0] + "-part-" + str(current_part+1) + ".json", "w") as outfile:
        json.dump(part_data, outfile)

## File combinator

Combine the separate parts back together.

In [25]:
# Read the `nr_parts` part files in order and concatenate their entries back
# into a single list, then write that list to the step 5 output path.
repoData_dict = []
for current_part in range(nr_parts):
    part_path = STEP5_TF_PULLREQUESTS.split(".")[0] + "-part-" + str(current_part+1) + ".json"
    # Context manager so each part file is closed as soon as it is parsed.
    with open(part_path, "r") as part_file:
        part_data = json.load(part_file)

    repoData_dict.extend(part_data)

with open(STEP5_TF_PULLREQUESTS, "w") as outfile:
    json.dump(repoData_dict, outfile)

## Reduce to repositories with pull requests

In [26]:
# Rewrite the step 5 file so every entry carries only "url" and "pull_requests".
# The input handle is closed before the same path is reopened for writing.
with open(STEP5_TF_PULLREQUESTS) as step5:
    step5_dict = json.load(step5)

# NOTE(review): the section heading says "repositories with pull requests", but
# no entry is filtered out here — every repository is kept, only its keys are
# narrowed. Confirm whether repositories with an empty list should be dropped.
output_dict = [
    {"url": rp["url"], "pull_requests": rp["pull_requests"]}
    for rp in step5_dict
]

with open(STEP5_TF_PULLREQUESTS, "w") as outfile:
    json.dump(output_dict, outfile)
