Import Dependencies

In [1]:
import requests
import time
import json
from unidiff import PatchSet
from phabricator import Phabricator
from datetime import datetime, timedelta
from pprint import pprint
import pandas as pd
import ast
from dotenv import load_dotenv
import os

Set API token and repo URL

In [7]:
load_dotenv(dotenv_path="../../secrets/.env")

API_TOKEN = os.getenv("CONDUIT_API_TOKEN")

BASE_URL = "https://phabricator.services.mozilla.com/api/"

Initialize Phabricator client

In [8]:
phab = Phabricator(token=API_TOKEN, host=BASE_URL)
phab.update_interfaces()
phab.user.whoami()

<Result: {'phid': 'PHID-USER-tkqragx363gqgxck66he', 'userName': 'alal', 'realName': 'Ali', 'image': 'https://mozphab-phabhost-cdn.devsvcprod.mozaws.net/file/data/mphx2gyvrtdt6l3r4hqk/PHID-FILE-i4wttqib4xr2hi2b7nz4/8c9c85-alphanumeric_lato-white_A.png-255%2C255%2C255%2C0.7.png', 'uri': 'https://phabricator.services.mozilla.com/p/alal/', 'roles': ['verified', 'approved', 'activated'], 'primaryEmail': 'a.s.salehi.95@gmail.com'}>

Get all the repositories

In [6]:
operation = "diffusion.repository.search"
API_URL = f"{BASE_URL}/{operation}"

payload = {
    "api.token": API_TOKEN,
}

requests.post(API_URL, data=payload).json()

{'result': None,
 'error_code': 'ERR-INVALID-SESSION',
 'error_info': 'Session key is not present.'}

Get revision for a bug

In [None]:
operation = "differential.revision.search"
API_URL = f"{BASE_URL}/{operation}"

payload = {
    "api.token": API_TOKEN,
    "constraints[query]": "1931487"
}

requests.post(API_URL, data=payload).json()

Get the diff for a revision

In [9]:
operation = "differential.diff.search"
API_URL = f"{BASE_URL}/{operation}"

payload = {
    "api.token": API_TOKEN,
    "constraints[revisionPHIDs][0]": "PHID-DREV-742ecttyirpscb3cma4q"
}

requests.post(API_URL, data=payload).json()

{'result': {'data': [{'id': 946314,
    'type': 'DIFF',
    'phid': 'PHID-DIFF-xh5y22yks5g64gehbgyl',
    'fields': {'revisionPHID': 'PHID-DREV-742ecttyirpscb3cma4q',
     'authorPHID': 'PHID-USER-autti2wxdumepz7idru4',
     'repositoryPHID': 'PHID-REPO-saax4qdxlbbhahhp2kg5',
     'refs': [{'type': 'base',
       'identifier': '00a3e992ba4fe62cba6bcf381b98128da0bc581a'}],
     'dateCreated': 1731794384,
     'dateModified': 1731794649,
     'policy': {'view': 'public'}},
    'attachments': {}},
   {'id': 946305,
    'type': 'DIFF',
    'phid': 'PHID-DIFF-fmurav5p5edly32whqho',
    'fields': {'revisionPHID': 'PHID-DREV-742ecttyirpscb3cma4q',
     'authorPHID': 'PHID-USER-autti2wxdumepz7idru4',
     'repositoryPHID': 'PHID-REPO-saax4qdxlbbhahhp2kg5',
     'refs': [{'type': 'branch', 'name': 'default'},
      {'type': 'base',
       'identifier': '6c508a387477e3b72db913a9e1761e9a433d06a2'}],
     'dateCreated': 1731773378,
     'dateModified': 1731773380,
     'policy': {'view': 'public'}

Get raw diff and format it

In [None]:
operation = "differential.getrawdiff"
API_URL = f"{BASE_URL}/{operation}"

payload = {
    "api.token": API_TOKEN,
    "diffID": "946305"
}

diff = requests.post(API_URL, data=payload).json()

response = phab.differential.getrawdiff(diffID='946305')

print(response.response)

patch = PatchSet(diff["result"])

for patched_file in patch:
    print(f"ðŸ“„ {patched_file.path}")
    for hunk in patched_file:
        print(f"  ðŸ”¢ Hunk: {hunk.source_start}:{hunk.source_length} â†’ {hunk.target_start}:{hunk.target_length}")
        for line in hunk:
            if line.is_added:
                print(f"    âž• {line.value.strip()}")
            elif line.is_removed:
                print(f"    âž– {line.value.strip()}")


Get commmit message for a diff

In [11]:
response = phab.diffusion.commit.search(
    constraints={
        'ids': [946314]
    }
)

pprint(response.response)

{'cursor': {'after': None, 'before': None, 'limit': 100, 'order': None},
 'data': [{'attachments': {},
           'fields': {'auditStatus': {'closed': True,
                                      'color.ansi': None,
                                      'name': 'No Audits',
                                      'value': 'none'},
                      'author': {'email': 'philringnalda@gmail.com',
                                 'epoch': None,
                                 'identityPHID': 'PHID-RIDT-kpxu5wfnlpt3cu33kmi3',
                                 'name': 'Phil Ringnalda',
                                 'raw': 'Phil Ringnalda '
                                        '<philringnalda@gmail.com>',
                                 'userPHID': 'PHID-USER-sbnc5ih3p7zgvgpizobs'},
                      'committer': {'email': None,
                                    'epoch': 1504763335,
                                    'identityPHID': None,
                                    'n

- Get all the differential revisions (DREV) from last year until now
- For each bug id from bugs_with_added_regressor_info.csv fetch its corresponding differential revision
- For each DREV, get its diff and attach it as a new column to its corresponding bug id from the bugs_with_added_regressor_info.csv

In [None]:
MOZILLA_CENTRAL_PHID = "PHID-REPO-saax4qdxlbbhahhp2kg5"
LIMIT = 100 # page size

# Time range: last 7 days
now = datetime.now()
start_time = now - timedelta(days=365)
start_epoch = int(start_time.timestamp())

all_revisions_list = []
after_cursor = None

while True:

    response = phab.differential.revision.search(
        constraints={
            'createdStart':start_epoch,
            'repositoryPHIDs':[MOZILLA_CENTRAL_PHID]
        },
        order= 'oldest',
        limit= LIMIT,
        after= after_cursor
    )

    data = response['data']
    all_revisions_list.extend(data)

    cursor = response['cursor']
    if cursor['after'] is None:
        break
    after_cursor = cursor['after']


# for rev in all_revisions_list:
#     created_time = datetime.fromtimestamp(rev['fields']['dateCreated']).strftime('%Y-%m-%d %H:%M:%S')
#     print(f"ID: {rev['id']}, PHID: {rev['phid']}, Title: {rev['fields']['title']}, Created: {created_time}")

all_revisions_df = pd.DataFrame(all_revisions_list)
all_revisions_df.to_csv("../datasets/all_differential_revisions.csv", index=False)

In [None]:
all_revisions_df = pd.read_csv("../datasets/all_differential_revisions.csv")
all_revisions_df['fields'] = all_revisions_df['fields'].apply(ast.literal_eval)
all_revisions_list = all_revisions_df.to_dict(orient='records')


bugs_df = pd.read_csv("../datasets/bugs_with_added_regressor_info.csv")
bugs_df['regressed_perf_tests'] = bugs_df['regressed_perf_tests'].apply(ast.literal_eval)
bugs_list = bugs_df.to_dict(orient='records')


# transform to dict for faster lookup
all_revisions_dict = {}

for drev in all_revisions_list:
    drev_bugzilla_bug_id = drev['fields']['bugzilla.bug-id']

    if drev_bugzilla_bug_id:
        all_revisions_dict[int(drev_bugzilla_bug_id)] = drev

bugs_with_drev_list = []

for bug in bugs_list:
    bug_with_drev = bug

    bugzilla_bug_id = bug['bug_id']
    drev_for_bug = all_revisions_dict.get(bugzilla_bug_id)

    bug_with_drev['drev_id'] = None
    bug_with_drev['diff_id'] = None

    if drev_for_bug:
        bug_with_drev['drev_id'] = drev_for_bug['id']
        bug_with_drev['diff_id'] = drev_for_bug['fields']['diffID']

    bugs_with_drev_list.append(bug_with_drev)

bugs_with_drev_df = pd.DataFrame(bugs_with_drev_list)
bugs_with_drev_df.to_csv("../datasets/bugs_with_drev.csv", index=False)


bugs_with_diff_list = []

for bug in bugs_with_drev_list:
    bug_diff_id = bug.get('diff_id')

    if bug_diff_id:

        bug_with_diff = bug

        raw_diff = phab.differential.getrawdiff(diffID = str(bug_diff_id)).response

        bug['raw_diff'] = raw_diff
        bugs_with_diff_list.append(bug_with_diff)


bugs_with_drev_df = pd.DataFrame(bugs_with_diff_list)

bugs_with_drev_df.to_json("../datasets/bugs_with_diff.jsonl", orient="records", lines=True)