In [24]:
import json
import scipy.stats as stats
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

In [15]:
dtype_spec = {
    'fixCommitSHA1': str,
    'introducingCommitSHA': str,
    'projectName': str,
    'bugFilePath': str,
    'bugLineNum': 'Int64',  # Allows for missing integers (nullable type)
    'bugType': str,
    'reviewer_count': 'Int64',
    'introducingCommitHasPR': bool,
    'sstub_introduced': 'Int64'
}

df = pd.read_csv("rq1_dataset.csv", dtype=dtype_spec).to_dict(orient='records')

In [16]:
filtered_df = [
    row for row in df
    if row['introducingCommitHasPR'] is True and row['sstub_introduced'] == 1
]

In [18]:
print(len(df))
print(len(filtered_df))

24920
1675


In [30]:
# Load Dataset
with open(f'augmented_dataset.json', 'r', encoding='utf-8-sig') as file:
    dataset = json.load(file)

In [20]:
print(type(df))
print(type(dataset))

<class 'list'>
<class 'list'>


In [23]:
print(filtered_df[0]["introducingCommitSHA"])
print(dataset[0])

7eabea151cf
{'url': 'https://api.github.com/repos/checkstyle/checkstyle/pulls/16617', 'commitSHAs': ['bcbdf0051fddb89969dbac50757e933cb38125a4'], 'linesAdded': 1, 'linesRemoved': 1, 'linesChanged': 2, 'filesChanged': 1, 'sstubs': []}


In [31]:
for pr in tqdm(dataset):
    for sha in pr["commitSHAs"]:
        for sstub in filtered_df:
            if sha.startswith(sstub["introducingCommitSHA"]):
                pr["sstubs"].append({
                    "sha": sha,
                    "bugType": sstub["bugType"]
                })
                print("found one")

 22%|████████████████▋                                                           | 7504/34069 [00:08<00:44, 590.56it/s]

found one
found one
found one
found one


 23%|█████████████████▍                                                          | 7792/34069 [00:08<00:37, 703.90it/s]

found one
found one


 39%|█████████████████████████████▌                                             | 13438/34069 [00:18<00:29, 697.29it/s]

found one
found one


 41%|██████████████████████████████▊                                            | 13992/34069 [00:19<00:27, 732.96it/s]

found one


 43%|████████████████████████████████▏                                          | 14634/34069 [00:19<00:19, 980.54it/s]

found one


 44%|████████████████████████████████▊                                          | 14933/34069 [00:20<00:26, 732.75it/s]

found one
found one
found one
found one
found one
found one
found one
found one


 44%|█████████████████████████████████▏                                         | 15081/34069 [00:20<00:37, 501.46it/s]

found one


 45%|█████████████████████████████████▍                                         | 15210/34069 [00:20<00:35, 538.35it/s]

found one
found one
found one
found one


 45%|██████████████████████████████████                                         | 15455/34069 [00:21<00:39, 475.65it/s]

found one
found one
found one
found one


 46%|██████████████████████████████████▍                                        | 15626/34069 [00:22<00:43, 424.76it/s]

found one
found one


 46%|██████████████████████████████████▊                                        | 15837/34069 [00:22<00:31, 575.36it/s]

found one
found one
found one
found one
found one
found one
found one


 47%|███████████████████████████████████▎                                       | 16048/34069 [00:22<00:22, 794.76it/s]

found one
found one
found one


 47%|███████████████████████████████████▌                                       | 16132/34069 [00:22<00:25, 698.05it/s]

found one
found one
found one


 48%|███████████████████████████████████▉                                       | 16336/34069 [00:23<00:31, 567.39it/s]

found one
found one
found one


 49%|████████████████████████████████████▌                                      | 16604/34069 [00:23<00:23, 757.50it/s]

found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one


 49%|████████████████████████████████████▊                                      | 16737/34069 [00:23<00:19, 888.78it/s]

found one
found one
found one
found one
found one
found one
found one
found one


 50%|█████████████████████████████████████▌                                     | 17071/34069 [00:24<00:19, 856.58it/s]

found one
found one
found one
found one
found one
found one
found one
found one
found one


 52%|██████████████████████████████████████▋                                    | 17554/34069 [00:24<00:20, 791.61it/s]

found one
found one


 53%|███████████████████████████████████████▌                                   | 17981/34069 [00:25<00:18, 890.83it/s]

found one
found one
found one
found one
found one
found one


 54%|████████████████████████████████████████▌                                  | 18425/34069 [00:25<00:21, 741.64it/s]

found one
found one
found one
found one
found one
found one


 54%|████████████████████████████████████████▋                                  | 18508/34069 [00:26<00:27, 568.02it/s]

found one
found one
found one
found one
found one


 55%|█████████████████████████████████████████▎                                 | 18742/34069 [00:26<00:34, 447.72it/s]

found one


 56%|█████████████████████████████████████████▋                                 | 18931/34069 [00:27<00:39, 381.87it/s]

found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one


 61%|█████████████████████████████████████████████▍                             | 20633/34069 [00:32<00:34, 395.06it/s]

found one
found one


 63%|██████████████████████████████████████████████▉                            | 21346/34069 [00:34<00:27, 468.53it/s]

found one
found one
found one
found one
found one
found one
found one
found one
found one
found one


 63%|███████████████████████████████████████████████▌                           | 21611/34069 [00:34<00:19, 647.51it/s]

found one
found one
found one
found one
found one


 64%|███████████████████████████████████████████████▊                           | 21736/34069 [00:35<00:28, 437.71it/s]

found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one


 64%|███████████████████████████████████████████████▉                           | 21786/34069 [00:35<00:30, 396.98it/s]

found one
found one


 84%|███████████████████████████████████████████████████████████████            | 28648/34069 [00:43<00:07, 691.77it/s]

found one
found one
found one
found one
found one


 85%|███████████████████████████████████████████████████████████████▋           | 28924/34069 [00:44<00:07, 675.58it/s]

found one


 86%|████████████████████████████████████████████████████████████████▍          | 29248/34069 [00:44<00:06, 734.26it/s]

found one
found one


 87%|█████████████████████████████████████████████████████████████████▌         | 29797/34069 [00:45<00:05, 768.78it/s]

found one
found one
found one
found one
found one
found one


 88%|██████████████████████████████████████████████████████████████████▏        | 30042/34069 [00:45<00:06, 667.59it/s]

found one
found one


 90%|███████████████████████████████████████████████████████████████████▏       | 30546/34069 [00:46<00:05, 618.19it/s]

found one
found one
found one


 90%|███████████████████████████████████████████████████████████████████▊       | 30819/34069 [00:47<00:05, 555.00it/s]

found one


 91%|███████████████████████████████████████████████████████████████████▉       | 30878/34069 [00:47<00:07, 441.90it/s]

found one


 91%|████████████████████████████████████████████████████████████████████▍      | 31092/34069 [00:47<00:05, 549.26it/s]

found one


 92%|████████████████████████████████████████████████████████████████████▊      | 31238/34069 [00:47<00:04, 624.50it/s]

found one
found one
found one
found one


 92%|█████████████████████████████████████████████████████████████████████      | 31377/34069 [00:48<00:04, 607.49it/s]

found one
found one
found one


 92%|█████████████████████████████████████████████████████████████████████▎     | 31496/34069 [00:48<00:05, 481.33it/s]

found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one


 93%|█████████████████████████████████████████████████████████████████████▌     | 31597/34069 [00:48<00:04, 592.72it/s]

found one
found one
found one
found one
found one
found one


 93%|█████████████████████████████████████████████████████████████████████▉     | 31756/34069 [00:48<00:03, 643.42it/s]

found one
found one
found one


 94%|██████████████████████████████████████████████████████████████████████▏    | 31887/34069 [00:48<00:03, 599.66it/s]

found one
found one
found one
found one
found one
found one
found one
found one
found one
found one


 94%|██████████████████████████████████████████████████████████████████████▋    | 32130/34069 [00:49<00:02, 677.64it/s]

found one
found one
found one
found one
found one
found one
found one


 95%|███████████████████████████████████████████████████████████████████████▏   | 32341/34069 [00:49<00:02, 841.10it/s]

found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one


 96%|███████████████████████████████████████████████████████████████████████▋   | 32590/34069 [00:49<00:01, 889.13it/s]

found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one


 96%|████████████████████████████████████████████████████████████████████████▎  | 32825/34069 [00:49<00:01, 945.63it/s]

found one
found one
found one
found one
found one
found one
found one
found one
found one
found one


 97%|████████████████████████████████████████████████████████████████████████▋  | 33009/34069 [00:50<00:01, 798.36it/s]

found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one
found one


 97%|████████████████████████████████████████████████████████████████████████▉  | 33159/34069 [00:50<00:01, 474.28it/s]

found one
found one
found one


 98%|█████████████████████████████████████████████████████████████████████████▏ | 33268/34069 [00:51<00:01, 400.75it/s]

found one
found one
found one
found one
found one
found one
found one
found one
found one
found one


 98%|█████████████████████████████████████████████████████████████████████████▎ | 33316/34069 [00:51<00:01, 385.96it/s]

found one


 98%|█████████████████████████████████████████████████████████████████████████▋ | 33446/34069 [00:51<00:01, 342.58it/s]

found one
found one
found one
found one
found one
found one


 99%|█████████████████████████████████████████████████████████████████████████▉ | 33597/34069 [00:52<00:01, 334.22it/s]

found one


 99%|██████████████████████████████████████████████████████████████████████████▎| 33765/34069 [00:52<00:00, 336.61it/s]

found one
found one
found one


100%|███████████████████████████████████████████████████████████████████████████| 34069/34069 [00:53<00:00, 633.28it/s]


In [32]:
with open("updated_dataset.json", "w") as file:
    json.dump(dataset, file, indent=4)

In [29]:
print(dataset[7559])

{'url': 'https://api.github.com/repos/NLPchina/ansj_seg/pulls/692', 'commitSHAs': ['f4c46143d0849434e741b66eebfe54eeeb70255c', 'd04bcfe283dd347f6cd5fab9482308e3a7f1d4f6'], 'linesAdded': 3, 'linesRemoved': 2, 'linesChanged': 5, 'filesChanged': 3, 'sstubs': []}
