# Conexión y obtención general de reportes

In [1]:
from pymongo import MongoClient
from models import Report
# Connect to the MongoDB server on localhost and keep handles to the
# bug-report database and the collection that stores the reports.
client = MongoClient(host="localhost", port=27017)
db = client.get_database('bug_reports_db')
col = db.get_collection('bug_reports')

In [2]:
# Fetch every document in the collection and wrap each one in a Report.
col_reports = col.find()
reports = []
for document in col_reports:
    reports.append(Report(**document))

# Display the total count and a five-item sample rather than the whole list.
len(reports), reports[:5]

(4732,
 [<models.Report at 0x7f7b3865ae20>,
  <models.Report at 0x7f7b38261b50>,
  <models.Report at 0x7f7b38261e50>,
  <models.Report at 0x7f7b38261e20>,
  <models.Report at 0x7f7b38261e80>])

In [3]:
# Masters: the reports whose dupe_of is None, i.e. not marked as a
# duplicate of any other report.
col_masters = col.find({'dupe_of': None})
masters = []
for document in col_masters:
    masters.append(Report(**document))

# Display the total count and a five-item sample.
len(masters), masters[:5]

(3724,
 [<models.Report at 0x7f7af4659520>,
  <models.Report at 0x7f7af46593a0>,
  <models.Report at 0x7f7af4659370>,
  <models.Report at 0x7f7af4659340>,
  <models.Report at 0x7f7af4659400>])

In [4]:
# Duplicates: the reports whose dupe_of is set to something other than None.
col_duplicates = col.find({'dupe_of': {'$ne': None}})
duplicates = []
for document in col_duplicates:
    duplicates.append(Report(**document))

# Display the total count and a five-item sample.
len(duplicates), duplicates[:5]

(1008,
 [<models.Report at 0x7f7aef469640>,
  <models.Report at 0x7f7aef469880>,
  <models.Report at 0x7f7aef469520>,
  <models.Report at 0x7f7aef469970>,
  <models.Report at 0x7f7aef469850>])

# Generar pares maestro-maestro

In [5]:
from random import Random, choice  # `choice` stays importable in case other cells rely on it

N_MASTER_PAIRS = 4000  # how many master-master pairs to sample
PAIR_SEED = 42  # fixed seed so Restart & Run All reproduces the same pairs

# Pairs are ordered and never pair a report with itself, so at most
# n * (n - 1) distinct pairs exist. Fail fast instead of looping forever
# when the collection is too small to supply the requested amount.
max_distinct_pairs = len(masters) * (len(masters) - 1)
if max_distinct_pairs < N_MASTER_PAIRS:
    raise ValueError(
        f"Solo existen {max_distinct_pairs} pares maestro-maestro distintos; "
        f"se pidieron {N_MASTER_PAIRS}"
    )

rng = Random(PAIR_SEED)  # private generator: the global random state is left untouched
master_pairs = []
# Keys of the pairs already accepted; a set lookup replaces the linear scan
# of master_pairs on every draw.
# Assumes report_id uniquely identifies a report — TODO confirm against models.Report.
seen_pair_ids = set()
while len(master_pairs) < N_MASTER_PAIRS:
    first, second = rng.choice(masters), rng.choice(masters)
    pair_key = (first.report_id, second.report_id)
    # Reject self-pairs and ordered pairs that were already drawn.
    if first.report_id != second.report_id and pair_key not in seen_pair_ids:
        seen_pair_ids.add(pair_key)
        master_pairs.append((first, second))

In [6]:
from random import randint
# Spot check: print one randomly chosen master-master pair, then the pair count.
i = randint(0, len(master_pairs)-1)
first_master, second_master = master_pairs[i]
print(first_master, second_master, len(master_pairs), sep="\n")

Report 1671364 (dupe of None) - Set "Sponsored" label for sponsored top sites in the address bar
Report 1751283 (dupe of None) - Fix computing of age of last publication for Telemetry
4000


# Búsqueda de pares duplicado-maestro

In [7]:
# Split the duplicates in two groups:
#   duplicate_pairs -> (master, duplicate) for every duplicate whose dupe_of
#                      could be loaded through Report.get
#   fail            -> duplicates for which that lookup raised
fail = []
duplicate_pairs = []
for report in duplicates:
    try:
        master = Report.get(report.dupe_of)
    except Exception:
        # The original inline note says Report.get raises ReportNotFound here.
        # NOTE(review): Exception is broader than that, so any other error
        # is also counted as "master not found" — consider narrowing.
        fail.append(report)
    else:
        duplicate_pairs.append((master, report))

In [8]:
# Summary of the lookup: unresolved duplicates, resolved duplicates, and a
# sanity line showing that both groups add up to the number of duplicates.
print(
    f"[!] {len(fail)} reportes sin un maestro en BD ({len(fail) / len(duplicates) * 100:.2f}%)",
    f"[+] {len(duplicate_pairs)} reportes con un maestro en BD ({len(duplicate_pairs) / len(duplicates) * 100:.2f}%)",
    f"[+] {len(duplicate_pairs)+len(fail)}/{len(duplicates)} reportes duplicados analizados",
    sep="\n",
)

[!] 41 reportes sin un maestro en BD (4.07%)
[+] 967 reportes con un maestro en BD (95.93%)
[+] 1008/1008 reportes duplicados analizados


## Comprobar duplicados sin maestro aparente

Lo más probable es que cada uno sea un duplicado de otro duplicado.

Es decir, su `dupe_of` apunta a otro reporte que también es un duplicado, y el `dupe_of` de este último apunta al verdadero maestro (la cadena puede tener más de dos eslabones).

In [9]:
from scraper.bugzilla_scraper import BugzillaBaseScraper

bugzilla = BugzillaBaseScraper()

# One lookup per unresolved duplicate. Each entry is
# (duplicate without a master in the DB, report returned for its dupe_of id).
scraped_masters = [
    (report, bugzilla.search_bug(report.dupe_of))
    for report in fail
]



In [10]:
# Second hop: the report referenced by each unresolved duplicate was fetched
# in the previous cell; follow *its* dupe_of to look for a master in the DB.
# NOTE(review): only one extra hop is followed; longer chains end up in the
# "should be scraped" warnings below.
new_pairs = []
for duplicate, master in scraped_masters:
    true_master_id = master['dupe_of']
    if true_master_id is None:
        # The fetched report is not a duplicate itself, so it is the real
        # master and it is simply missing from the DB. This case used to be
        # dropped without any message.
        print(f"[!] El reporte maestro {master['id']} del duplicado {duplicate.report_id} no está en Base de datos y debería scrapearse")
        continue
    try:
        # Only the DB lookup is guarded, so unrelated errors are not masked.
        master_db = Report.get(true_master_id)
    except Exception:  # ReportNotFound, per the original inline note
        # Report the id whose lookup actually failed.
        print(f"[!] El reporte maestro {true_master_id} del duplicado {duplicate.report_id} no está en Base de datos y debería scrapearse")
        continue
    new_pairs.append((master_db, duplicate))

# Guard the percentage: there is nothing to divide by when every duplicate
# was already resolved in the first pass.
if scraped_masters:
    print(f"{len(new_pairs)} reportes con un maestro en BD ({len(new_pairs) / len(scraped_masters) * 100:.2f}% de los reportes a analizar)")
else:
    print("No había reportes sin maestro que analizar")

41 reportes con un maestro en BD (100.00% de los reportes a analizar)


In [11]:
# Merge the recovered pairs into duplicate_pairs. Duplicates that are already
# paired are skipped so the cell is idempotent: a plain extend() appends the
# same pairs again on every re-run and inflates the count.
# Assumes report_id uniquely identifies a report — TODO confirm against models.Report.
already_paired_ids = {dup.report_id for _, dup in duplicate_pairs}
duplicate_pairs.extend(
    pair for pair in new_pairs if pair[1].report_id not in already_paired_ids
)
print(f"Pares de maestro-duplicado: {len(duplicate_pairs)}")

Pares de maestro-duplicado: 1008


# Resultados

In [12]:
# Final tally: report counts first, then a blank line, then pair counts.
print(
    f"Se tienen {len(masters)} reportes maestros",
    f"Se tienen {len(duplicates)} reportes duplicados",
    f"Total reportes en BD: {len(reports)}",
    "",
    f"Se obtuvieron {len(master_pairs)} pares maestro-maestro",
    f"Se obtuvieron {len(duplicate_pairs)} pares maestro-duplicado",
    f"Total pares: {len(master_pairs)+len(duplicate_pairs)}",
    sep="\n",
)

Se tienen 3724 reportes maestros
Se tienen 1008 reportes duplicados
Total reportes en BD: 4732

Se obtuvieron 4000 pares maestro-maestro
Se obtuvieron 1008 pares maestro-duplicado
Total pares: 5008


In [15]:
# Show both reports of the first master-master pair, one per line.
left_report, right_report = master_pairs[0]
print(left_report, right_report, sep="\n")

Report 1759231 (dupe of None) - Downloads panel opens when extensions (e.g. Simple Tab Groups) automatically start and delete downloads, without user interaction. (investigate ways to prevent this)
Report 1702526 (dupe of None) - Windows build docs shouldn't specify using curl as it doesn't exist in mozillabuild


In [18]:
# Show the first master-duplicate pair: the master first, then its duplicate.
master_report, duplicate_report = duplicate_pairs[0]
print(master_report, duplicate_report, sep="\n")

Report 1707462 (dupe of None) - Default OS downloads directory is listed as "Downloads" in preferences/settings even when that is not the name of the folder, which is confusing
Report 1762510 (dupe of 1707462) - firefox uses home dir for downloads instead of downloads directory
