Lo primero es añadir al path la carpeta raíz del proyecto, para poder importar sus módulos aunque el notebook se ejecute desde su propia carpeta.

In [1]:
import sys

# Put the parent directory first on the import path so the project modules
# (imported in the next cell) resolve when the notebook runs from its own folder.
sys.path.insert(0, "..")

# Conexión y obtención general de reportes

In [2]:
import controllers as c
from models import Report

In [3]:
# Load the working set of reports through the project controller (limit=10000).
reports = c.get_reports(limit=10_000)

# Displayed as the cell result: how many reports came back plus a short preview.
(len(reports), reports[:5])

(10000,
 [<models.Report at 0x7f2a93192d60>,
  <models.Report at 0x7f2a93192d90>,
  <models.Report at 0x7f2a93192e20>,
  <models.Report at 0x7f2a93192eb0>,
  <models.Report at 0x7f2a93192df0>])

In [4]:
# Master reports are those that are not marked as a duplicate of another report.
masters = list(filter(lambda report: report.dupe_of is None, reports))

# Displayed as the cell result: count plus a short preview.
(len(masters), masters[:5])

(6699,
 [<models.Report at 0x7f2a93192d60>,
  <models.Report at 0x7f2a93192d90>,
  <models.Report at 0x7f2a93192e20>,
  <models.Report at 0x7f2a93192eb0>,
  <models.Report at 0x7f2a93192f70>])

In [5]:
# Duplicate reports are those whose dupe_of points at another report.
duplicates = list(filter(lambda report: report.dupe_of is not None, reports))

# Displayed as the cell result: count plus a short preview.
(len(duplicates), duplicates[:5])

(3301,
 [<models.Report at 0x7f2a93192df0>,
  <models.Report at 0x7f2a93192e50>,
  <models.Report at 0x7f2a93192fa0>,
  <models.Report at 0x7f2a93192f10>,
  <models.Report at 0x7f2a93192f40>])

# Búsqueda de pares duplicado-maestro

In [6]:
# Target dataset sizes used by the pairing cells below.
N_DUPLICATE_PAIRS = 1_000  # number of (master, duplicate) pairs to collect
N_MASTER_PAIRS = 4_000     # size of the master pool and number of (master, master) pairs

In [7]:
# Collect up to N_DUPLICATE_PAIRS (master, duplicate) pairs, using each master
# report at most once.
duplicate_pairs = []    # (master, duplicate) tuples
master_reports = []     # masters that already belong to a pair
duplicate_reports = []  # duplicates that already belong to a pair

fail = []               # duplicates whose master lookup raised

for report in duplicates:
    # Check the target before the next lookup; `>=` also stops correctly when
    # the target is 0.
    if len(duplicate_pairs) >= N_DUPLICATE_PAIRS:
        break

    # Only the lookup is guarded, so unrelated errors are not mistaken for a
    # missing master.
    try:
        master = c.get_report(report.dupe_of)  # raises ReportNotFound
    except Exception:  # ReportNotFound
        fail.append(report)
        continue

    # Duplicates whose master is already paired are skipped and not recorded
    # in any of the lists.
    if master not in master_reports:
        master_reports.append(master)
        duplicate_reports.append(report)
        duplicate_pairs.append((master, report))


In [8]:
# Summary of the pairing pass: duplicates that produced a pair versus those
# whose master lookup failed.
# NOTE(review): duplicates skipped because their master was already paired are
# in neither list, so they are not counted as "analyzed" here.
n_reports_analyzed = len(duplicate_pairs) + len(fail)

# Report 0.00% instead of raising ZeroDivisionError when nothing was analyzed.
pct_fail = len(fail) / n_reports_analyzed * 100 if n_reports_analyzed else 0.0
pct_paired = len(duplicate_pairs) / n_reports_analyzed * 100 if n_reports_analyzed else 0.0

print(f"[!] {len(fail)}/{n_reports_analyzed} reportes analizados sin un maestro en BD ({pct_fail:.2f}%)")
print(f"[+] {len(duplicate_pairs)}/{n_reports_analyzed} reportes analizados con un maestro en BD ({pct_paired:.2f}%)")
print(f"[+] {n_reports_analyzed}/{len(duplicates)} reportes duplicados analizados hasta completar los pares")

[!] 16/1016 reportes analizados sin un maestro en BD (1.57%)
[+] 1000/1016 reportes analizados con un maestro en BD (98.43%)
[+] 1016/3301 reportes duplicados analizados hasta completar los pares


## Comprobar duplicados sin maestro aparente

Lo más probable es que sean un duplicado de un duplicado.

De esta forma, el `dupe_of` del duplicado señala a otro reporte cuyo `dupe_of` señala al verdadero maestro (la cadena puede tener más de un eslabón intermedio).

In [9]:
from scraper.bugzilla_scraper import BugzillaBaseScraper

bugzilla = BugzillaBaseScraper()

# For every duplicate whose master lookup failed, query the scraper for the
# report its dupe_of points at.
# Each entry is (duplicate_without_master_in_db, scraped_master).
scraped_masters = []
for orphan in fail:
    scraped_masters.append((orphan, bugzilla.search_bug(orphan.dupe_of)))

In [10]:
# Follow one extra dupe_of hop: when the scraped intermediate report is itself
# a duplicate, look up the report it points at and pair that with the original
# duplicate.
# NOTE(review): new_pairs is not merged into duplicate_pairs in the cells shown.
new_pairs = []
for duplicate, master in scraped_masters:
    try:
        # Identity check is the correct way to test for None.
        if master['dupe_of'] is not None:
            master_db = c.get_report(master['dupe_of'])
            new_pairs.append((master_db, duplicate))
    except Exception:  # ReportNotFound
        print(f"[!] El reporte maestro {master['id']} del duplicado {duplicate.report_id} no está en Base de datos y debería scrapearse")

len_new_pairs = len(new_pairs)
# An explicit empty check replaces the bare `except:` that hid the division
# by zero (and would have hidden any other error as well).
if scraped_masters:
    per_new_pairs = f"{((len_new_pairs / len(scraped_masters)) * 100):.2f}"
else:
    per_new_pairs = "0.00"
print(f"{len_new_pairs} reportes con un maestro en BD ({per_new_pairs}% de los reportes a analizar)")

16 reportes con un maestro en BD (100.00% de los reportes a analizar)


# Generar pares maestro-maestro

In [11]:
from random import choice, sample

# Top up master_reports with randomly chosen masters until it holds
# N_MASTER_PAIRS reports.
# One pass over a shuffled copy of `masters` gives the same kind of result as
# repeated choice-and-reject, but always terminates: if `masters` cannot
# supply enough new reports the pool simply ends up smaller than the target.
for report in sample(masters, len(masters)):
    if len(master_reports) >= N_MASTER_PAIRS:
        break
    if report not in master_reports:
        master_reports.append(report)

In [12]:
# Draw random ordered (master, master) pairs without repeating a pair and
# without pairing a report with itself.
# NOTE(review): (a, b) and (b, a) count as different pairs here — confirm this
# is intended for the dataset.
master_pairs = []

# n reports yield at most n * (n - 1) ordered pairs of distinct reports; cap
# the target so the loop cannot spin forever (or fail on an empty pool) when
# master_reports is too small.
n_master_reports = len(master_reports)
target_master_pairs = min(N_MASTER_PAIRS, n_master_reports * (n_master_reports - 1))

while len(master_pairs) < target_master_pairs:
    pair = (choice(master_reports), choice(master_reports))
    # Cheap inequality test first, then the linear scan of the accepted pairs.
    if pair[0] != pair[1] and pair not in master_pairs:
        master_pairs.append(pair)

In [13]:
from random import randint

# Spot-check: print one randomly selected master-master pair and the total
# number of pairs generated.
sample_index = randint(0, len(master_pairs) - 1)
first_report, second_report = master_pairs[sample_index]
print(first_report)
print(second_report)
print(len(master_pairs))

Report 1784648 (dupe of None) - Intermittent browser/components/urlbar/tests/browser/browser_searchMode_engineRemoval.js | single tracking bug
Report 1776019 (dupe of None) - Intermittent browser/base/content/test/popups/browser_popupUI.js | single tracking bug
4000


# Resultados

In [14]:
# Final summary: report counts first, then pair counts, separated by a blank line.
summary_lines = [
    f"Se tienen {len(masters)} reportes maestros",
    f"Se tienen {len(duplicates)} reportes duplicados",
    f"Total reportes en BD: {len(reports)}",
    "",
    f"Se obtuvieron {len(master_pairs)} pares maestro-maestro",
    f"Se obtuvieron {len(duplicate_pairs)} pares maestro-duplicado",
    f"Total pares: {len(master_pairs)+len(duplicate_pairs)}",
]
print("\n".join(summary_lines))

Se tienen 6699 reportes maestros
Se tienen 3301 reportes duplicados
Total reportes en BD: 10000

Se obtuvieron 4000 pares maestro-maestro
Se obtuvieron 1000 pares maestro-duplicado
Total pares: 5000


In [15]:
# Show both reports of the first master-master pair, one per line.
print(*master_pairs[0], sep="\n")

Report 1708492 (dupe of None) - Update FxA avatar icons
Report 1706232 (dupe of None) - Add "Send Tab to Device" toolbar item


In [16]:
# Show the first master-duplicate pair: the master first, then its duplicate.
first_master, first_duplicate = duplicate_pairs[0]
print(first_master)
print(first_duplicate)

Report 1693784 (dupe of None) - find toolbar highlight on scrollbar doesn't work on PDFs
Report 1786841 (dupe of 1693784) - PDF.js Reader - missing search scroll hit marks
