In [1]:
import random
import requests
import json
from tqdm import tqdm
import numpy as np
import os

In [None]:
def pull_request_url(owner, repo):
    """Return the GitHub REST API URL that lists pull requests of `owner/repo`."""
    return f"https://api.github.com/repos/{owner}/{repo}/pulls"

# Replace with valid GitHub tokens (do not commit real tokens with the notebook).
github_tokens = [
    "token_1",
    "token_2",
    "token_3"
]

# Index of the token used by the most recent request; -1 means "none used yet".
token_index = -1
def request_headers():
    """Return an Authorization header, rotating through `github_tokens`.

    Each call advances `token_index` by one and wraps around at the end of the
    list, so consecutive requests are spread across all configured tokens.

    Returns:
        dict: ``{"Authorization": "token <token>"}`` for the selected token.
    """
    global token_index
    # Wrap on the real list length instead of a hard-coded 3 so that adding or
    # removing tokens neither skips tokens nor indexes past the end of the list.
    token_index = (token_index + 1) % len(github_tokens)
    return {"Authorization": f"token {github_tokens[token_index]}"}

# Module-level page counter; the functions shown in this notebook shadow it
# with their own parameter/local of the same name.
page = 1


def request_params(page):
    """Build the pagination query parameters for the given page (100 items per page)."""
    return dict(page=page, per_page=100)

In [3]:
# Read the project index from disk into sstub_repos.
# The utf-8-sig codec drops a leading byte-order mark if the file has one.
with open('sstub_projects.json', mode='r', encoding='utf-8-sig') as file:
    sstub_repos = json.loads(file.read())

In [4]:
def _fetch_commit_shas(commits_url):
    """Return the SHAs of all commits behind `commits_url`, or None if a request fails."""
    shas = []
    page = 1
    while True:
        params = request_params(page)
        response = requests.get(commits_url, headers=request_headers(), params=params)
        if response.status_code != 200:
            print(f"Failed to fetch commits: {response.status_code}")
            return None

        commits = response.json()
        shas.extend(commit['sha'] for commit in commits)

        # A page shorter than the requested size is the last one.
        if len(commits) < params["per_page"]:
            return shas
        page += 1


def augment_data(url):
    """Collect size and commit information for the merged pull requests of a repo.

    Args:
        url: Pull-request listing URL of a repository, as produced by
            `pull_request_url(owner, repo)`.

    Returns:
        list[dict]: One record per merged pull request with the keys 'url',
        'commitSHAs', 'linesAdded', 'linesRemoved', 'linesChanged',
        'filesChanged' and 'sstubs' ('sstubs' is left empty here).
        If a detail or commit request fails, only the records that were fully
        populated before the failure are returned; the result is always a list.
    """
    # Phase 1: page through the closed pull requests and keep the merged ones.
    page = 1
    augmented_prs = []
    while True:
        print("page " + str(page))
        # Get Pull Requests 100 at a time; the state filter travels with the
        # other query parameters instead of being glued onto the URL.
        params = {**request_params(page), "state": "closed"}
        response = requests.get(url, headers=request_headers(), params=params)
        if response.status_code != 200:
            print(f"Failed to fetch PRs: {response.status_code}")
            break

        pull_requests = response.json()
        if not pull_requests:
            break

        # Closed-but-unmerged PRs have a falsy 'merged_at' and are skipped.
        augmented_prs.extend({
            'url': pr['url'],
            'commitSHAs': [],
            'linesAdded': -1,
            'linesRemoved': -1,
            'linesChanged': -1,
            'filesChanged': -1,
            'sstubs': []
        } for pr in pull_requests if pr['merged_at'])

        page += 1

    # Phase 2: fill in line/file statistics and commit SHAs for every record.
    for i, pr in enumerate(tqdm(augmented_prs)):

        # Get pull request data from pr_url
        response = requests.get(pr['url'], headers=request_headers())
        if response.status_code != 200:
            print(f"Failed to fetch PRs: {response.status_code}")
            # Keep what is complete so far rather than discarding everything.
            return augmented_prs[:i]

        pull_request = response.json()

        pr['linesAdded'] = pull_request['additions']
        pr['linesRemoved'] = pull_request['deletions']
        pr['linesChanged'] = pull_request['additions'] + pull_request['deletions']
        pr['filesChanged'] = pull_request['changed_files']

        # If only one commit, take it from the PR head and save a request.
        if pull_request['commits'] == 1:
            pr['commitSHAs'].append(pull_request['head']['sha'])
            continue

        # Else request the (paginated) commit list.
        shas = _fetch_commit_shas(pull_request['commits_url'])
        if shas is None:
            return augmented_prs[:i]
        pr['commitSHAs'].extend(shas)

    return augmented_prs

In [5]:
# Resume from a previously saved dataset when one can be read;
# a missing or unparsable file means starting from an empty list.
try:
    with open("augmented_dataset.json", mode="r") as file:
        augmented_data = json.loads(file.read())
except (json.JSONDecodeError, FileNotFoundError):
    augmented_data = []

In [9]:
# Show how many records augmented_data currently holds.
print(len(augmented_data))

34062


In [16]:
# Show the first GitHub URL of the entry at position 9 in sstub_repos.
print(list(sstub_repos.values())[9]['github'][0])

https://github.com/alibaba/dubbo


In [13]:
# Augment a single repository: the entry at position 8 in sstub_repos.
repo_url = sstub_repos[list(sstub_repos.keys())[8]]['github'][0]
# Drop a trailing slash first so the last two path segments are owner and repo.
owner, repo = repo_url.rstrip('/').split('/')[-2:]
# Guard against a None result (aborted fetch) so the concatenation cannot
# raise TypeError; in that case no records are added.
augmented_data = augmented_data + (augment_data(pull_request_url(owner, repo)) or [])

page 1
page 2


100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:03<00:00,  1.85it/s]


In [14]:
# Show the last record in augmented_data.
print(augmented_data[-1])

{'url': 'https://api.github.com/repos/Trinea/android-common/pulls/1', 'commitSHAs': ['e566465fa6dcd73078ab10c0e105a30629cd4952', '8e9d39ae42e067d1c5a26875bfe33d0b2dee7a28', '200563eec362efcea3a84302e8e0632598a568ef', '86d391c0eed9f2086d3047aa93e9af9b58f9f532', '71d9d350cbdc61a25a338839d8ba73f732090854', '2e8d85853eee1666603421aecbdba2839ed30a13', '7415b447e6a6dae13dc060da6190d072c5aee71f'], 'linesAdded': 282, 'linesRemoved': 0, 'linesChanged': 282, 'filesChanged': 2, 'sstubs': []}


In [15]:
# Persist every collected record as JSON indented by four spaces,
# overwriting any existing augmented_dataset.json.
with open("augmented_dataset.json", mode="w") as file:
    json.dump(obj=augmented_data, fp=file, indent=4)

In [23]:
# Augment every repository in sstub_repos and accumulate the records.
# NOTE: re-running this cell appends the same repositories again.
for repo_name in tqdm(sstub_repos):
    repo_url = sstub_repos[repo_name]['github'][0]
    # Drop a trailing slash first so the last two path segments are owner and repo.
    owner, repo = repo_url.rstrip('/').split('/')[-2:]
    # Append instead of rebinding augmented_data, which would keep only the
    # last repository's records; a None result (aborted fetch) adds nothing.
    augmented_data.extend(augment_data(pull_request_url(owner, repo)) or [])

100%|███████████████████████████████████████████████████████████████████████████████| 82/82 [00:00<00:00, 81539.34it/s]

https://api.github.com/repos/checkstyle/checkstyle/pulls
https://api.github.com/repos/knightliao/disconf/pulls
https://api.github.com/repos/NLPchina/ansj_seg/pulls
https://api.github.com/repos/wildfly/wildfly/pulls
https://api.github.com/repos/antlr/antlr4/pulls
https://api.github.com/repos/alibaba/druid/pulls
https://api.github.com/repos/hankcs/HanLP/pulls
https://api.github.com/repos/facebook/presto/pulls
https://api.github.com/repos/Trinea/android-common/pulls
https://api.github.com/repos/alibaba/dubbo/pulls
https://api.github.com/repos/springside/springside4/pulls
https://api.github.com/repos/cucumber/cucumber-jvm/pulls
https://api.github.com/repos/square/retrofit/pulls
https://api.github.com/repos/JakeWharton/NineOldAndroids/pulls
https://api.github.com/repos/google/guava/pulls
https://api.github.com/repos/pxb1988/dex2jar/pulls
https://api.github.com/repos/yasserg/crawler4j/pulls
https://api.github.com/repos/clojure/clojure/pulls
https://api.github.com/repos/brianfrankcooper/YCSB/


