# Mozilla Treeherder API

Import dependencies

In [None]:
import requests
from pprint import pprint
import datetime
from thclient import TreeherderClient
import pandas as pd
import numpy as np
from collections import Counter
from datetime import datetime
from dateutil.relativedelta import relativedelta
import time
import ast

Initialize the Treeherder Client

In [None]:
# Create the Treeherder API client; the cells below issue all their requests through it.
client = TreeherderClient()

Get a single job

In [None]:
# Filter for a single job, addressed by its Treeherder job id.
job_params = dict(id=492088703)

# Last expression of the cell, so the notebook displays the returned job list.
client.get_jobs("mozilla-central", **job_params)

Get performance summaries for a specific period and specific infrastructure

In [None]:
# Query for the performance summary of one signature on mozilla-central.
performance_summary_params = dict(
    repository="mozilla-central",
    signature=308858,
    interval=2592000,  # 30 * 24 * 60 * 60 -- presumably seconds, i.e. 30 days
    all_data=True,
    replicates=False,
)

data_list = client._get_json("performance/summary", **performance_summary_params)

# Only the first summary of the response is inspected.
data_dict = data_list[0]
for key, value in data_dict.items():
    print(key, " -> ", value)

# Keep the individual data points around when the summary carries them.
if "data" in data_dict:
    jobs_list = data_dict["data"]

Get all performance test frameworks

In [None]:
# Fetch the "performance/framework" endpoint through the client's underscore-prefixed
# `_get_json` helper (private by naming convention -- NOTE(review): may change between
# thclient releases). As the cell's last expression, the result is displayed.
client._get_json("performance/framework")

Get performance test framework by id

In [None]:
# Id of the framework to look up; it is appended to the endpoint path.
performance_framework_id = 1
performance_framework_endpoint = "performance/framework/{}".format(performance_framework_id)

# Last expression of the cell, so the notebook displays the response.
client._get_json(performance_framework_endpoint)

Get all repositories

In [None]:
# Fetch the repository list and print each repository's name on its own line.
repos_list = client._get_json("repository")
for repo_name in (repo["name"] for repo in repos_list):
    print(repo_name)

Get all machine platforms

In [None]:
# Fetch the "machineplatforms" endpoint; as the cell's last expression the
# response is displayed by the notebook.
client._get_json("machineplatforms")

Get all machine platforms for a particular branch/project that has performance test data

In [None]:
# Fetch "performance/platforms"; "autoland" is passed as the second positional
# argument -- presumably the project (branch) whose platforms are listed.
client._get_json("performance/platforms", "autoland")

Get performance test signatures for a given project

In [None]:
# No filters are active by default; uncomment one of the entries below to
# narrow the signature listing.
signature_summary_params = {
    # "framework":1,
    # "platform":"linux1804-64-shippable-qr",
    # "id": 308858 # mozilla central
    # "id": 307933 # same test for autoland
}

data_dict = client._get_json(
    "performance/signatures",
    "autoland",
    **signature_summary_params,
)

# Dump every entry of the response mapping.
for key, value in data_dict.items():
    print(key, " -> ", value)

Get all the jobs/performance test times for a certain signature for the last 30 days for mozilla central

In [None]:
# Print submit/start/end timing for every job behind one mozilla-central
# performance signature.
performance_summary_params = {
    "repository": "mozilla-central",
    "signature": 308858,
    "interval": 2592000,  # 30 * 24 * 60 * 60 -- presumably seconds, i.e. 30 days
    "all_data": True,
    "replicates": False,
}

data_list = client._get_json("performance/summary", **performance_summary_params)
data_dict = data_list[0]
jobs_list = data_dict['data']

# Submit time of the previously printed job; None until the first job is printed.
prev_submit_time = None

for job in jobs_list:
    job_id = job.get("job_id")
    if not job_id:
        # Data points without a job id cannot be looked up. Passing an empty id
        # would presumably drop the filter and return an unrelated job.
        continue

    job_params = {"id": job_id}
    single_job_list = client.get_jobs("mozilla-central", **job_params)
    if not single_job_list:
        # The job is no longer available; nothing to report for it.
        continue

    job_dict = single_job_list[0]
    # fromtimestamp() without tz yields naive datetimes in the local timezone.
    submit_time = datetime.fromtimestamp(job_dict['submit_timestamp'])
    start_time = datetime.fromtimestamp(job_dict['start_timestamp'])
    end_time = datetime.fromtimestamp(job_dict['end_timestamp'])
    duration = end_time - start_time
    # "wait time" runs from submission to completion, so it includes `duration`.
    wait_time = end_time - submit_time

    # The first printed job has no predecessor, so its difference is zero.
    if prev_submit_time is None:
        prev_submit_time = submit_time
    submit_time_diff = submit_time - prev_submit_time
    prev_submit_time = submit_time

    print("id: ", job_dict['id'],
          " -> ",
          "   submit time: ", submit_time,
          "   start time: ", start_time,
          "   end time: ", end_time,
          "   duration: ", duration,
          "   wait time: ", wait_time,
          "   submit time difference: ", submit_time_diff)


Get all the jobs/performance test times for a certain signature for the last 30 days for autoland

In [None]:
# Print submit/start/end timing for every job behind one autoland
# performance signature.
performance_summary_params = {
    "repository": "autoland",
    "signature": 307933,
    "interval": 2592000,  # 30 * 24 * 60 * 60 -- presumably seconds, i.e. 30 days
    "all_data": True,
    "replicates": False,
}

data_list = client._get_json("performance/summary", **performance_summary_params)
data_dict = data_list[0]
jobs_list = data_dict['data']

# Submit time of the previously printed job; None until the first job is printed.
prev_submit_time = None

for job in jobs_list:
    job_id = job.get("job_id")
    if not job_id:
        # Data points without a job id cannot be looked up. Passing an empty id
        # would presumably drop the filter and return an unrelated job.
        continue

    job_params = {"id": job_id}
    single_job_list = client.get_jobs("autoland", **job_params)
    if not single_job_list:
        # The job is no longer available; nothing to report for it.
        continue

    job_dict = single_job_list[0]
    # fromtimestamp() without tz yields naive datetimes in the local timezone.
    submit_time = datetime.fromtimestamp(job_dict['submit_timestamp'])
    start_time = datetime.fromtimestamp(job_dict['start_timestamp'])
    end_time = datetime.fromtimestamp(job_dict['end_timestamp'])
    duration = end_time - start_time
    # "wait time" runs from submission to completion, so it includes `duration`.
    wait_time = end_time - submit_time

    # The first printed job has no predecessor, so its difference is zero.
    if prev_submit_time is None:
        prev_submit_time = submit_time
    submit_time_diff = submit_time - prev_submit_time
    prev_submit_time = submit_time

    print("id: ", job_dict['id'],
          " -> ",
          "   submit time: ", submit_time,
          "   start time: ", start_time,
          "   end time: ", end_time,
          "   duration: ", duration,
          "   wait time: ", wait_time,
          "   submit time difference: ", submit_time_diff)

Get all performance alerts

In [None]:
# Fetch the "performance/alert" endpoint with no filters; as the cell's last
# expression the response is displayed by the notebook.
client._get_json("performance/alert")

Extract all the alert summaries for a specific signature

In [None]:
# Restrict the alert summaries to one series signature and a time range.
alert_summary_params = dict(
    alerts__series_signature=5095204,  # autoland
    timerange=31536000,  # last year
)

# Last expression of the cell, so the notebook displays the response.
client._get_json("performance/alertsummary", **alert_summary_params)

Extract all the alert summaries for a specific signature.
Extract all the performance summaries, i.e. performance tests/jobs, for a specific signature.
Merge the two data frames on push_id so that we can determine whether a test has led to a performance alert.

In [None]:
# ### Creating jobs' dataframe

performance_summary_params = {
    "repository": "autoland",
    "signature": 5095204,
    "interval": 31536000,  # 365 * 24 * 60 * 60 -- presumably seconds, i.e. one year
    "all_data": True,
    "replicates": False,
}

performance_summaries_list = client._get_json("performance/summary", **performance_summary_params)
performance_summaries_dict = performance_summaries_list[0]
jobs_list = performance_summaries_dict['data']

# Only data points that carry a job id can be joined to a job later on.
jobs_with_id = [job for job in jobs_list if job["job_id"]]
job_ids_list = [job["job_id"] for job in jobs_with_id]
job_push_ids_list = [job["push_id"] for job in jobs_with_id]

jobs_df = pd.DataFrame({'job_id': job_ids_list, 'job_push_id': job_push_ids_list})

# ### Creating alerts' dataframe

alert_summary_params = {
    "alerts__series_signature": 5095204,  # autoland
    "timerange": 31536000,  # last year
}

alert_summaries_response_dict = client._get_json("performance/alertsummary", **alert_summary_params)
alert_summaries_list = alert_summaries_response_dict["results"]

alert_ids_list = []
alert_push_ids_list = []
alert_bug_ids_list = []

# One row per alert; push id and bug number come from the enclosing summary.
for alert_summary in alert_summaries_list:
    for alert in alert_summary["alerts"]:
        alert_ids_list.append(alert["id"])
        alert_push_ids_list.append(alert_summary["push_id"])
        alert_bug_ids_list.append(alert_summary.get("bug_number"))

bug_ids_list_without_none = [bug_id for bug_id in alert_bug_ids_list if bug_id is not None]

alerts_df = pd.DataFrame({
    'alert_id': alert_ids_list,
    'alert_push_id': alert_push_ids_list,
    'alert_bug_id': alert_bug_ids_list,
})

pprint(alerts_df)
pprint(bug_ids_list_without_none)

# Left merge on push id: every job is kept, and jobs whose push produced no
# alert get missing values in the alert columns.
jobs_perf_regression_info_df = pd.merge(
    jobs_df,
    alerts_df,
    left_on='job_push_id',
    right_on='alert_push_id',
    how='left',
)

# alert_push_id duplicates job_push_id after the merge, so drop it.
jobs_perf_regression_info_df = jobs_perf_regression_info_df.drop(columns=['alert_push_id'])

pprint(jobs_perf_regression_info_df)

# Submit time of the previously printed job; None until the first job is printed.
prev_submit_time = None

for _, row in jobs_perf_regression_info_df.iterrows():
    # iterrows() may hand back a numpy scalar or a plain Python number depending
    # on the row's dtype; int() handles both, whereas `.astype` only exists on
    # numpy scalars.
    job_params = {"id": int(row["job_id"])}

    single_job_list = client.get_jobs("autoland", **job_params)
    if not single_job_list:
        # The job is no longer available; nothing to report for it.
        continue

    job_dict = single_job_list[0]
    # fromtimestamp() without tz yields naive datetimes in the local timezone.
    submit_time = datetime.fromtimestamp(job_dict['submit_timestamp'])
    start_time = datetime.fromtimestamp(job_dict['start_timestamp'])
    end_time = datetime.fromtimestamp(job_dict['end_timestamp'])
    duration = end_time - start_time
    # "wait time" runs from submission to completion, so it includes `duration`.
    wait_time = end_time - submit_time

    # The first printed job has no predecessor, so its difference is zero.
    if prev_submit_time is None:
        prev_submit_time = submit_time
    submit_time_diff = submit_time - prev_submit_time
    prev_submit_time = submit_time

    print("id: ", job_dict['id'],
          " -> ",
          "   submit time: ", submit_time,
          "   wait time: ", wait_time,
          "   alert id: ", row['alert_id'],
          "   perf bug id: ", row['alert_bug_id'])


Categorize alerts based on their status

In [None]:
# Count the alerts of one signature by their individual triage status.
alert_summary_params = {
    "alerts__series_signature": 5095204,  # autoland
    "timerange": 31536000,  # last year
}

# Status code -> label for alert summaries. Not used by the counting below;
# kept as a reference table for the summary-level "status" field.
alert_summary_status_dict = {
    0: "untriaged",
    1: "downstream",
    2: "reassigned",
    3: "invalid",
    4: "improvement",
    5: "investigating",
    6: "wontfix",
    7: "fixed",
    8: "backedout",
}

# Status code -> label for individual alerts.
alert_status_dict = {
    0: "untriaged",
    1: "downstream",
    2: "reassigned",
    3: "invalid",
    4: "acknowledged",
}

alert_summaries_response_dict = client._get_json("performance/alertsummary", **alert_summary_params)
alert_summaries_list = alert_summaries_response_dict["results"]

alert_ids_list = []
alert_status_list = []

for alert_summary in alert_summaries_list:
    for alert in alert_summary["alerts"]:
        alert_ids_list.append(alert["id"])
        # Only the alert's own status is counted; the summary status is ignored.
        alert_status_list.append(alert_status_dict[alert["status"]])

status_counts = Counter(alert_status_list)

pprint(status_counts)

- Get all the perf alert summaries for the last year
- For each alert summary, get the perf tests that detected a regression
- Remove alert summaries that are not regressions or are invalid
- Get all the bugs from relevant alert summaries
- make a csv file from these bugs and name it regressions.csv

In [None]:
# Download alert summaries page by page until the last one on a page is older
# than TIMESPAN_IN_DAYS, then store the raw result as a CSV file.
from urllib.parse import parse_qs, urlparse

TIMESPAN_IN_DAYS = 365
COLUMNS = ["regression bug id"]

# Label -> status code for alert summaries.
ALERT_SUMMARY_STATUS_DICT = {
    "untriaged": 0,
    "downstream": 1,
    "reassigned": 2,
    "invalid": 3,
    "improvement": 4,
    "investigating": 5,
    "wontfix": 6,
    "fixed": 7,
    "backedout": 8,
}

# Summary statuses that are kept when the summaries are filtered later on.
INCLUDED_ALERT_SUMMARY_STATUSES = {
    ALERT_SUMMARY_STATUS_DICT['wontfix'],
    ALERT_SUMMARY_STATUS_DICT['fixed'],
    ALERT_SUMMARY_STATUS_DICT['backedout'],
}

alert_summary_params = {
    "page": 1
}

now = datetime.now()
threshold_time = now - relativedelta(days=TIMESPAN_IN_DAYS)

alert_push_time = now
uri = "performance/alertsummary"

alert_summaries_list = []

# get alert summaries
while alert_push_time >= threshold_time:
    alert_summaries_response_dict = client._get_json(uri, **alert_summary_params)
    page_results = alert_summaries_response_dict["results"]
    if not page_results:
        # An empty page has no last element to read a timestamp from.
        break

    # The whole page is kept, so the final page may also contain summaries
    # that are older than the threshold.
    alert_summaries_list.extend(page_results)
    alert_push_time = datetime.fromtimestamp(page_results[-1]['push_timestamp'])

    next_url = alert_summaries_response_dict.get('next')
    if not next_url:
        # No further page is advertised; everything available has been read.
        break

    # Read the page number from the parsed query string so that additional
    # query parameters after `page` cannot leak into the value.
    alert_summary_params['page'] = parse_qs(urlparse(next_url).query)['page'][0]

    # Pause between requests to go easy on the server.
    time.sleep(0.5)

# print("alert summaries:\n")
# pprint(alert_summaries_list)
# print("\n")

alert_summaries_df = pd.DataFrame(alert_summaries_list)
alert_summaries_df.to_csv("../datasets/alert_summaries.csv", index=False)


In [None]:
# Build regressions.csv from the stored alert summaries: keep the summaries
# with an included status, collect the tests that regressed, and write one
# row per summary that has a bug number.
alert_summaries_df = pd.read_csv("../datasets/alert_summaries.csv")

# The nested alert lists were written to the CSV as Python literals; parse
# them back into lists of dicts.
alert_summaries_df['alerts'] = alert_summaries_df['alerts'].apply(ast.literal_eval)
alert_summaries_df['related_alerts'] = alert_summaries_df['related_alerts'].apply(ast.literal_eval)
# Nullable integer dtype: missing bug numbers become pd.NA instead of NaN floats.
alert_summaries_df['bug_number'] = alert_summaries_df['bug_number'].astype('Int64')

alert_summaries_list = alert_summaries_df.to_dict(orient='records')

# filter alert summaries to only include regressions
# (INCLUDED_ALERT_SUMMARY_STATUSES comes from the cell that downloads the summaries)
filtered_alert_summaries_list = [
    alert_summary
    for alert_summary in alert_summaries_list
    if alert_summary['status'] in INCLUDED_ALERT_SUMMARY_STATUSES
]

# print("filtered_alert_summaries_list:\n")
# pprint(filtered_alert_summaries_list)
# print("\n")

# add relevant perf tests to alert summaries
alert_summaries_with_added_info_list = []

for alert_summary in filtered_alert_summaries_list:
    regression_tests_set = set()

    # Both the summary's own alerts and its related alerts are considered.
    single_alerts_list = [*alert_summary['alerts'], *alert_summary['related_alerts']]

    for alert in single_alerts_list:
        if not alert.get('is_regression'):
            continue

        series_signature = alert['series_signature']
        # Suite and test names share one set; empty or missing names are skipped.
        for test_name in (series_signature.get('suite'), series_signature.get('test')):
            if test_name:
                regression_tests_set.add(test_name)

    # Sorted so the CSV is reproducible between runs (set order is arbitrary).
    # Assumes suite/test names are all strings -- TODO confirm.
    alert_summary['tests_list'] = sorted(regression_tests_set)

    alert_summaries_with_added_info_list.append(alert_summary)

# print("alert_summaries_with_added_info_list:\n")
# pprint(alert_summaries_with_added_info_list)
# print("\n")


# extract needed columns
regression_bug_ids_list = []
alert_summary_ids_list = []
regression_tests_list = []

for alert_summary in alert_summaries_with_added_info_list:
    regression_bug_id = alert_summary.get('bug_number')
    # pd.NA has no truth value (bool(pd.NA) raises TypeError), so missing bug
    # numbers are detected with pd.isna() before the truthiness check.
    if pd.isna(regression_bug_id) or not regression_bug_id:
        continue

    regression_bug_ids_list.append(regression_bug_id)
    alert_summary_ids_list.append(alert_summary.get("id"))
    regression_tests_list.append(alert_summary.get("tests_list"))

# print("regression_bug_ids_list:\n")
# pprint(regression_bug_ids_list)
# print("\n")

regressions_df = pd.DataFrame({
    'regression_bug_id': regression_bug_ids_list,
    'reg_perf_tests_list': regression_tests_list,
    'perf_reg_alert_summary_id': alert_summary_ids_list,
})

regressions_df.to_csv('../datasets/regressions.csv', index=False)

