<a href="https://colab.research.google.com/github/Ak4nksha/duplicate-bug-detector/blob/main/notebooks/02_ir_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q scikit-learn rank-bm25

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi


In [2]:
# Colab-only step: mount Google Drive at /content/drive so that files stored
# there can be read with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Location of the cleaned train/test splits on the mounted Drive.
DATA_ROOT = "/content/drive/MyDrive/DuplicateBugsDetector/cleaned_files"

# Read both splits with the same reader; train first, test second.
train_df, test_df = map(
    pd.read_csv,
    (f"{DATA_ROOT}/train.csv", f"{DATA_ROOT}/test.csv"),
)

# Quick sanity check: split sizes, then a peek at the first two training rows.
print(train_df.shape, test_df.shape)
train_df.head(2)

(26183, 13) (6547, 13)


Unnamed: 0,project,issue_id,summary,description,status,priority,resolution,created,resolved,text,len_tokens,dup_group,is_duplicate
0,firefox,1606532,Address bar doesn't elide origins correctly,User Agent: Mozilla/5.0 (Windows NT 10.0; Win6...,resolved,unspecified,duplicate,2020-01-01 05:10:54+00:00,2023-06-06 00:44:25+00:00,Address bar doesn't elide origins correctly\nU...,88,1942560.0,True
1,firefox,1606566,"""TypeError: info.PDFFormatVersion is undefined...",When the PDF version cannot be extracted from ...,verified,unspecified,fixed,2020-01-01 18:26:12+00:00,2020-02-06 10:01:56+00:00,"""TypeError: info.PDFFormatVersion is undefined...",82,,False


### TF-IDF baseline

In [11]:
# Word uni-/bi-gram TF-IDF fitted on the training text only.
# norm="l2" is scikit-learn's default; it is written out because the retrieval
# step treats a plain dot product between rows as cosine similarity.
tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    ngram_range=(1, 2),
    min_df=5,      # drop terms seen in fewer than 5 documents
    max_df=0.9,    # drop terms seen in more than 90% of documents
    norm="l2",
)

# Per-candidate lookups, aligned row-for-row with X_train.
train_group = train_df["dup_group"]
train_ids = train_df["issue_id"].astype(str).to_numpy()

# Missing text is indexed as an empty document.
X_train = tfidf.fit_transform(train_df["text"].fillna(""))
print(X_train.shape)

(26183, 84564)


In [5]:
# evaluation helper functions

def recall_at_k(ranks_arr, k):
    """Fraction of queries whose first relevant document is ranked in the top k.

    Parameters
    ----------
    ranks_arr : array-like
        0-based rank of the first relevant document per query. A miss may be
        given as NaN or None. Plain Python lists are accepted.
    k : int
        Cut-off; a query is a hit when its rank is strictly less than ``k``.

    Returns
    -------
    numpy.float64
        Mean hit indicator over all queries (misses count as 0).
    """
    # Coerce to float so that None becomes NaN and list input works;
    # NaN < k evaluates to False, so misses never count as hits.
    ranks = np.asarray(ranks_arr, dtype=float)
    return np.mean(ranks < k)

def mrr(ranks_arr):
    """Mean reciprocal rank of the first relevant document.

    Parameters
    ----------
    ranks_arr : array-like
        0-based rank of the first relevant document per query. A miss may be
        given as NaN or None and contributes a reciprocal rank of 0.

    Returns
    -------
    numpy.float64
        Mean of ``1 / (rank + 1)`` over all queries, misses included as 0.
    """
    ranks = np.asarray(ranks_arr, dtype=float)
    # Allocate the accumulator as float explicitly: inheriting an integer
    # dtype from the input would truncate every reciprocal below 1 to 0.
    reciprocal = np.zeros(ranks.shape, dtype=float)
    found = ~np.isnan(ranks)
    reciprocal[found] = 1.0 / (ranks[found] + 1.0)
    return np.mean(reciprocal)

def first_relevant_rank(sim_scores, candidate_groups, query_group):
    """Return the 0-based rank of the best-scoring candidate in the query's group.

    Parameters
    ----------
    sim_scores : array-like of float
        Similarity of the query to each candidate (higher is better).
    candidate_groups : pandas.Series or array-like
        Group label per candidate, positionally aligned with ``sim_scores``.
    query_group : scalar
        Group label of the query.

    Returns
    -------
    int or None
        Position, in descending-score order, of the first candidate whose
        group equals ``query_group``; ``None`` when no candidate matches
        (NaN labels never match).
    """
    # Candidate positions sorted by score, best first.
    order = np.argsort(-np.asarray(sim_scores))
    # Compare all labels at once instead of one `.iloc` lookup per candidate;
    # this matters most on misses, which previously scanned every candidate.
    groups_in_rank_order = np.asarray(candidate_groups)[order]
    hit_positions = np.flatnonzero(groups_in_rank_order == query_group)
    return int(hit_positions[0]) if hit_positions.size else None


In [6]:

# Only test issues that carry a duplicate-group label can be scored,
# so rows without one are dropped from the query set.
qset = test_df.dropna(subset=["dup_group"]).copy()
print("Queries with ground truth duplicates:", len(qset))

# Rank of the first relevant hit per query: one list per project, one pooled.
ranks_by_project = {project: [] for project in qset["project"].unique()}
ranks_overall = []

Queries with ground truth duplicates: 1105


In [13]:
# retrieval loop (cosine via dot product)

# Start from empty rank containers on every run of this cell. Appending to
# lists created in a different cell makes the loop non-idempotent: re-running
# it (for example after refitting the vectorizer) would mix ranks from the
# earlier run into the metrics.
ranks_overall = []
ranks_by_project = {p: [] for p in qset["project"].unique()}

# Vectorize every query in a single call; non-string text (e.g. NaN) is
# treated as an empty document.
query_texts = [t if isinstance(t, str) else "" for t in qset["text"]]
Q_test = tfidf.transform(query_texts)

for i, (project, query_group) in enumerate(zip(qset["project"], qset["dup_group"])):
    # TF-IDF rows are L2-normalized (unit length), so the dot product of two
    # rows is their cosine similarity.
    similarity_scores = X_train.dot(Q_test[i].T).toarray().ravel()

    r = first_relevant_rank(similarity_scores, train_group, query_group)
    ranks_overall.append(r)
    ranks_by_project[project].append(r)


#report metrics
def summarize(ranks, label):
    """Compute (R@1, R@5, R@10, MRR) for one collection of first-hit ranks.

    ``ranks`` holds one 0-based rank per query, with ``None`` for queries that
    had no relevant candidate. ``label`` names the slice being summarized; it
    is accepted for the caller's convenience and does not affect the result.
    """
    # dtype=float turns None entries into NaN, which the metric helpers
    # treat as misses.
    ranks_arr = np.array(ranks, dtype=float)
    recalls = tuple(recall_at_k(ranks_arr, cutoff) for cutoff in (1, 5, 10))
    return (*recalls, mrr(ranks_arr))

print("\nTF-IDF Retrieval Metrics:")
overall_metrics = summarize(ranks_overall, "OVERALL")
print()
project_metrics = {
    name: summarize(project_ranks, name)
    for name, project_ranks in ranks_by_project.items()
}

# One row per scope: the pooled result first, then each project.
all_metrics = pd.DataFrame(
    [overall_metrics, *project_metrics.values()],
    index=["OVERALL", *project_metrics],
    columns=["R@1", "R@5", "R@10", "MRR"],
)
print(all_metrics)


TF-IDF Retrieval Metrics:

              R@1       R@5      R@10       MRR
OVERALL  0.031483  0.057581  0.080365  0.045221
firefox  0.024957  0.049484  0.067126  0.037174
hadoop   0.200000  0.280000  0.440000  0.265634
hbase    0.200000  0.250000  0.400000  0.237232


In [8]:
# qualitative examples

def topk_ids(sim_scores, k=5):
    """Return the indices of the ``k`` highest scores, best first.

    When ``k`` is at least the number of scores, every index is returned in
    descending-score order.
    """
    negated = -sim_scores
    if len(sim_scores) <= k:
        return np.argsort(negated)
    # Select the k best without a full sort, then order just those k.
    shortlist = np.argpartition(negated, k)[:k]
    return shortlist[np.argsort(negated[shortlist])]

def _one_line(text, limit):
    """Return ``text`` cut to ``limit`` characters with newlines flattened to spaces.

    Non-string values (such as NaN for a missing summary) yield an empty string.
    """
    if not isinstance(text, str):
        return ""
    return text[:limit].replace("\n", " ")


def show_example(hit=True, k=5, random_state=0):
    """Print one query and its top-``k`` TF-IDF neighbours from the training set.

    Queries from ``qset`` are visited in a shuffled order and the first one
    whose top-``k`` list does (``hit=True``) or does not (``hit=False``)
    contain a candidate from the query's duplicate group is printed. Nothing
    is printed when no query satisfies the condition.

    Parameters
    ----------
    hit : bool
        Whether to show a query with a match in its top-``k`` or one without.
    k : int
        Number of neighbours to list.
    random_state : int
        Seed for the shuffle, so the chosen example is reproducible.

    Returns
    -------
    None
    """
    for _, row in qset.sample(frac=1, random_state=random_state).iterrows():
        query_text = row["text"] if isinstance(row["text"], str) else ""
        qvec = tfidf.transform([query_text])
        similarity_scores = X_train.dot(qvec.T).toarray().ravel()
        order_k = topk_ids(similarity_scores, k)

        # Per-neighbour match flags, reused for both selection and display.
        matched = [train_group.iloc[i] == row["dup_group"] for i in order_k]
        if any(matched) != hit:
            continue

        print("\n" + ("HIT" if hit else "MISS"), f"(project={row['project']}, issue={row['issue_id']})")
        print("Q:", _one_line(row["summary"], 180))
        for rank, (i, is_match) in enumerate(zip(order_k, matched), 1):
            candidate = train_df.iloc[i]
            # Built outside the f-string: a backslash inside an f-string
            # expression is a SyntaxError before Python 3.12.
            summary = _one_line(candidate["summary"], 120)
            marker = "(->> Matched)" if is_match else ""
            print(f"  {rank:>2}. [{candidate['project']}] id={train_ids[i]}  {summary}  {marker}")
        return

In [9]:
# Show one query with a match in its top 5, then one without.
for want_hit in (True, False):
    show_example(hit=want_hit, k=5)


HIT (project=firefox, issue=1909127)
Q: Copy and paste of context menu are sometimes disabled
   1. [firefox] id=1687750  Firefox 84.0.2 will not copy then paste every time.  
   2. [firefox] id=1761544  Removal of Cut/Copy/Paste items from the main hamburger menu breaks functionality on some webpages (on non-macOS)  
   3. [firefox] id=1635106  Context menu on web page isn't recognising elements as editable so cannot paste / have spell suggestions  
   4. [firefox] id=1866650  Copy context menu button disabled  (->> Matched)
   5. [firefox] id=1684312  Error writing selection data: Error writing to file descriptor: Broken pipe  

MISS (project=firefox, issue=1945049)
Q: The Review Checker Panel from the Firefox Sidebar opens automatically when scrolling the same pdp it was closed on
   1. [firefox] id=1858909  Show ads in review checker Setting option is no longer displayed if the Review checker is turned off while checking a pr  
   2. [firefox] id=1868377  Turning the Review Checke