<a href="https://colab.research.google.com/github/Ak4nksha/duplicate-bug-detector/blob/main/notebooks/03_model_comparisons.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q rank-bm25 sentence-transformers

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util
from rank_bm25 import BM25Okapi

In [3]:
# Mount Google Drive into the Colab runtime; the data paths used by the
# loading cell below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Folder on the mounted Drive that holds the cleaned train/test CSV splits.
DATA_ROOT = "/content/drive/MyDrive/DuplicateBugsDetector/cleaned_files"

# Read both splits with the same call; the file name is derived from the split name.
train_df, test_df = (
    pd.read_csv(f"{DATA_ROOT}/{split}.csv") for split in ("train", "test")
)

# Quick sanity check: row/column counts, then a peek at the first two training rows.
print(train_df.shape, test_df.shape)
train_df.head(2)

(26183, 13) (6547, 13)


Unnamed: 0,project,issue_id,summary,description,status,priority,resolution,created,resolved,text,len_tokens,dup_group,is_duplicate
0,firefox,1606532,Address bar doesn't elide origins correctly,User Agent: Mozilla/5.0 (Windows NT 10.0; Win6...,resolved,unspecified,duplicate,2020-01-01 05:10:54+00:00,2023-06-06 00:44:25+00:00,Address bar doesn't elide origins correctly\nU...,88,1942560.0,True
1,firefox,1606566,"""TypeError: info.PDFFormatVersion is undefined...",When the PDF version cannot be extracted from ...,verified,unspecified,fixed,2020-01-01 18:26:12+00:00,2020-02-06 10:01:56+00:00,"""TypeError: info.PDFFormatVersion is undefined...",82,,False


In [5]:
# evaluation helper functions

def first_relevant_rank(scores, train_group, target_group):
    """Return the 1-based rank of the best-scoring item that belongs to ``target_group``.

    Items are ranked by descending ``scores``. ``train_group`` is aligned with
    ``scores`` by *position* (element ``i`` of both describes the same item).

    Parameters
    ----------
    scores : array-like of float
        One score per candidate item; higher means more relevant.
    train_group : array-like
        Group label of each candidate item (NumPy array, list or pandas Series).
    target_group : scalar
        Group label the query belongs to.

    Returns
    -------
    int or float
        Rank (starting at 1) of the first candidate whose group equals
        ``target_group``, or ``np.inf`` when no candidate has that group.
    """
    # Convert once to a plain array: indexing a pandas Series with ``[idx]``
    # is label-based and breaks (or silently mis-aligns) on a non-default index.
    groups_by_position = np.asarray(train_group)

    # Same ordering as before: candidates sorted by descending score.
    order = np.argsort(-np.asarray(scores))

    # Positions (within the ranking) of every candidate in the target group;
    # this replaces the per-item Python loop with one vectorised comparison.
    hit_positions = np.flatnonzero(groups_by_position[order] == target_group)
    if hit_positions.size == 0:
        return np.inf
    return int(hit_positions[0]) + 1

def recall_at_k(ranks, k):
    """Fraction of queries whose first relevant item is ranked at position ``k`` or better.

    ``ranks`` holds one rank per query; a rank of ``np.inf`` (no relevant item
    found) never counts as a hit.
    """
    within_top_k = np.asarray(ranks) <= k
    return np.mean(within_top_k)

def mrr(ranks):
    """Mean reciprocal rank over all queries.

    ``ranks`` holds one rank per query; a rank of ``np.inf`` contributes a
    reciprocal of 0 to the mean.
    """
    reciprocal_ranks = 1 / np.asarray(ranks)
    return np.mean(reciprocal_ranks)

In [6]:
# TF-IDF baseline for comparison

# Word uni- and bi-grams, lower-cased, English stop words removed.
# min_df=5 drops terms seen in fewer than 5 documents; max_df=0.9 drops terms
# seen in more than 90% of documents.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.9,
    lowercase=True,
    stop_words="english",
)
# Fit the vocabulary on the training texts; missing texts become empty strings.
X_train = tfidf.fit_transform(train_df["text"].fillna(""))

train_ids = train_df["issue_id"].astype(str).to_numpy()
# Stored as a plain NumPy array so that `train_group[idx]` inside
# first_relevant_rank is a positional lookup (a pandas Series would index by
# label) and so the name has the same type in every cell that uses it.
train_group = train_df["dup_group"].to_numpy()
#print(tfidf.get_params())
print(X_train.shape)

(26183, 84564)


In [7]:
# Only test issues that carry a duplicate-group label can be scored as queries.
qset = (
    test_df[test_df["dup_group"].notna()]
    .copy()
    .reset_index(drop=True)   # row i of qset lines up with row i of X_test
)

X_test = tfidf.transform(qset["text"].fillna(""))

ranks_overall = []
ranks_by_project = {}
for project_name in qset["project"].unique():
    ranks_by_project[project_name] = []

print("Running TF–IDF retrieval for", len(qset), "queries...")

for position, (project, dup_group) in enumerate(zip(qset["project"], qset["dup_group"])):
    # Dot product of the query's TF-IDF row with every training row.
    similarities = X_test[position].dot(X_train.T).toarray().ravel()

    rank = first_relevant_rank(
        similarities,
        train_group=train_group,
        target_group=dup_group,
    )

    ranks_overall.append(rank)
    ranks_by_project[project].append(rank)


Running TF–IDF retrieval for 1105 queries...


In [8]:
# Summaries for baseline

def summarize(ranks):
    """Collect the retrieval metrics for one list of per-query ranks.

    Returns a dict with recall at 1, 5 and 10 (keys ``"R@1"``, ``"R@5"``,
    ``"R@10"``) followed by the mean reciprocal rank (key ``"MRR"``).
    """
    ranks = np.array(ranks)
    metrics = {f"R@{k}": recall_at_k(ranks, k) for k in (1, 5, 10)}
    metrics["MRR"] = mrr(ranks)
    return metrics

# Aggregate the TF-IDF ranks: one row for all queries, then one row per project.
overall = summarize(ranks_overall)

project_stats = {}
for project_name, project_ranks in ranks_by_project.items():
    project_stats[project_name] = summarize(project_ranks)

tfidf_metrics = pd.DataFrame(
    [overall, *project_stats.values()],
    index=["OVERALL", *project_stats],
    columns=["R@1", "R@5", "R@10", "MRR"],
)

print("\nTF–IDF Retrieval Metrics:")
print(tfidf_metrics)


TF–IDF Retrieval Metrics:
              R@1       R@5      R@10       MRR
OVERALL  0.028959  0.054299  0.076018  0.042245
firefox  0.021698  0.045283  0.061321  0.033298
hadoop   0.200000  0.280000  0.440000  0.265634
hbase    0.200000  0.250000  0.400000  0.237232


### BM25 model

In [9]:
# NOTE(review): this entire cell is commented out and does nothing when run.
# It holds a BM25 retrieval run built on rank_bm25's BM25Okapi; the bm25s-based
# cells further down appear to replace it — confirm, then delete this cell.
# from collections import defaultdict

# def simple_tokenize(text):
#     if not isinstance(text, str):
#         return []
#     return text.lower().split()

# #corpus for BM25
# train_corpus = [simple_tokenize(doc) for doc in train_df["text"].fillna("")]
# bm25 = BM25Okapi(train_corpus)

# qset = test_df[test_df["dup_group"].notna()].copy()
# q_tokens = [simple_tokenize(t) for t in qset["text"].fillna("")]

# ranks_overall_bm25 = []
# ranks_by_project_bm25 = {p: [] for p in qset["project"].unique()}

# print("Running BM25 retrieval for", len(qset), "queries...")

# for (idx, row), query_tokens in zip(qset.iterrows(), q_tokens):
#     scores = bm25.get_scores(query_tokens)       # len = n_train
#     r = first_relevant_rank(
#         np.array(scores),
#         train_group=train_group,
#         target_group=row["dup_group"],
#     )
#     ranks_overall_bm25.append(r)
#     ranks_by_project_bm25[row["project"]].append(r)


In [10]:
# NOTE(review): this entire cell is commented out and does nothing when run.
# It summarises the ranks of the commented-out BM25Okapi cell above and looks
# obsolete next to the BM25S metrics computed later — confirm, then delete.
# # Summaries for bm25

# overall_bm25 = summarize(ranks_overall_bm25)
# project_bm25 = {p: summarize(r) for p, r in ranks_by_project_bm25.items()}

# bm25_metrics = pd.DataFrame(
#     [overall_bm25] + list(project_bm25.values()),
#     index=["OVERALL"] + list(project_bm25.keys()),
#     columns=["R@1", "R@5", "R@10", "MRR"],
# )

# print("\nBM25 Retrieval Metrics:")
# print(bm25_metrics)


In [11]:
!pip install -q "bm25s[full]"

import bm25s
import numpy as np

# Training texts as plain strings; missing texts become empty documents.
corpus = (
    train_df["text"]
    .fillna("")
    .astype(str)
    .tolist()
)
print("Number of docs in train_df:", len(corpus))

# Lower-case, whitespace-split tokenisation; queries must be tokenised the same way.
corpus_tokens = [lowered.split() for lowered in map(str.lower, corpus)]
print("Number of tokenized docs:", len(corpus_tokens))

# Default bm25s retriever (Lucene-style BM25 by default), indexed on the token lists.
bm25_retriever = bm25s.BM25()
bm25_retriever.index(corpus_tokens)

print("BM25S index built on", len(corpus_tokens), "documents.")


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.2/85.2 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m745.3/745.3 kB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pytrec-eval (setup.py) ... [?25l[?25hdone
Number of docs in train_df: 26183


DEBUG:bm25s:Building index from tokens


Number of tokenized docs: 26183


BM25S Create Vocab:   0%|          | 0/26183 [00:00<?, ?it/s]

BM25S Convert tokens to indices:   0%|          | 0/26183 [00:00<?, ?it/s]

BM25S Count Tokens:   0%|          | 0/26183 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/26183 [00:00<?, ?it/s]

BM25S index built on 26183 documents.


In [12]:
# Rank of the first true duplicate for every query, overall and per project.
ranks_overall_bm25 = []
ranks_by_project_bm25 = {name: [] for name in qset["project"].unique()}

print("Running BM25S retrieval for", len(qset), "queries...")

for raw_text, project, dup_group in zip(qset["text"], qset["project"], qset["dup_group"]):
    # Non-string texts (e.g. NaN) are treated as empty queries.
    text = raw_text if isinstance(raw_text, str) else ""
    # Same lower-case / whitespace tokenisation as the indexed corpus.
    tokens = text.lower().split()

    bm25_scores = np.array(bm25_retriever.get_scores(tokens))  # shape: (n_train,)

    rank = first_relevant_rank(
        bm25_scores,
        train_group=train_group,
        target_group=dup_group,
    )

    ranks_overall_bm25.append(rank)
    ranks_by_project_bm25[project].append(rank)

# Summaries for BM25: one row for all queries, then one row per project.
overall_bm25 = summarize(ranks_overall_bm25)
project_bm25 = {
    name: summarize(project_ranks)
    for name, project_ranks in ranks_by_project_bm25.items()
}

bm25s_metrics = pd.DataFrame(
    [overall_bm25, *project_bm25.values()],
    index=["OVERALL", *project_bm25],
    columns=["R@1", "R@5", "R@10", "MRR"],
)

print("\nBM25S Retrieval Metrics:")
print(bm25s_metrics)

Running BM25S retrieval for 1105 queries...

BM25S Retrieval Metrics:
              R@1       R@5      R@10       MRR
OVERALL  0.033484  0.064253  0.076018  0.048071
firefox  0.026415  0.053774  0.063208  0.038689
hadoop   0.160000  0.360000  0.400000  0.264595
hbase    0.250000  0.250000  0.350000  0.274691


### SBERT Model

In [13]:
# SBERT model – building embeddings for all train documents

!pip install -q sentence-transformers

from sentence_transformers import SentenceTransformer
import torch
import numpy as np

In [14]:
# Run the encoder on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

Using device: cuda


In [15]:
# Load the pretrained MiniLM sentence encoder onto the selected device.
sbert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)

# Texts that make up the retrieval index; missing texts become empty strings.
train_texts = (
    train_df["text"]
    .fillna("")
    .astype(str)
    .tolist()
)

# Unit-length embeddings, so a plain dot product later equals cosine similarity.
train_encode_kwargs = dict(
    batch_size=64,
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True,
)
train_embs = sbert_model.encode(train_texts, **train_encode_kwargs)

print("Train SBERT embeddings shape:", train_embs.shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/410 [00:00<?, ?it/s]

Train SBERT embeddings shape: (26183, 384)


In [16]:
# SBERT retrieval and evaluation

# qset already contains only the queries that have a dup_group.
print("SBERT: queries with ground truth duplicates:", len(qset))

query_texts = (
    qset["text"]
    .fillna("")
    .astype(str)
    .tolist()
)

# Encode the queries with the same settings as the train index (unit vectors).
query_embs = sbert_model.encode(
    query_texts,
    batch_size=64,
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True,
)

ranks_overall_sbert = []
ranks_by_project_sbert = {name: [] for name in qset["project"].unique()}

train_group = train_df["dup_group"].to_numpy()

for query_emb, project, dup_group in zip(query_embs, qset["project"], qset["dup_group"]):
    # Cosine similarity to every training embedding (dot product of unit vectors).
    similarities = train_embs @ query_emb

    rank = first_relevant_rank(
        similarities,
        train_group=train_group,
        target_group=dup_group,
    )

    ranks_overall_sbert.append(rank)
    ranks_by_project_sbert[project].append(rank)


# One row for all queries, then one row per project.
overall_sbert = summarize(ranks_overall_sbert)

project_sbert = {
    name: summarize(project_ranks)
    for name, project_ranks in ranks_by_project_sbert.items()
}

sbert_metrics = pd.DataFrame(
    [overall_sbert, *project_sbert.values()],
    index=["OVERALL", *project_sbert],
    columns=["R@1", "R@5", "R@10", "MRR"],
)

print("\nSBERT Retrieval Metrics:")
print(sbert_metrics)


SBERT: queries with ground truth duplicates: 1105


Batches:   0%|          | 0/18 [00:00<?, ?it/s]


SBERT Retrieval Metrics:
              R@1       R@5      R@10       MRR
OVERALL  0.057919  0.094118  0.104977  0.074731
firefox  0.043396  0.074528  0.083019  0.058229
hadoop   0.360000  0.600000  0.720000  0.462147
hbase    0.450000  0.500000  0.500000  0.465047


In [17]:
# Combined comparison table: the OVERALL row of each model, side by side.

overall_tfidf = tfidf_metrics.loc["OVERALL"]
overall_bm25 = bm25s_metrics.loc["OVERALL"]
overall_sbert = sbert_metrics.loc["OVERALL"]

comparison_df = (
    pd.DataFrame(
        {
            "TF-IDF": overall_tfidf,
            "BM25S": overall_bm25,
            "SBERT": overall_sbert,
        }
    )
    .transpose()   # models become rows, metrics become columns
    .round(4)
)

print("\n Combined Retrieval Performance (Overall Metrics):")
display(comparison_df)



🔎 Combined Retrieval Performance (Overall Metrics):


Unnamed: 0,R@1,R@5,R@10,MRR
TF-IDF,0.029,0.0543,0.076,0.0422
BM25S,0.0335,0.0643,0.076,0.0481
SBERT,0.0579,0.0941,0.105,0.0747
