<a href="https://colab.research.google.com/github/Ak4nksha/duplicate-bug-detector/blob/main/notebooks/03_model_comparisons.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
os.environ["WANDB_MODE"] = "disabled"
os.environ["WANDB_DISABLED"] = "true"
!pip install -q sentence-transformers
!pip install -q rank-bm25 sentence-transformers

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util
from rank_bm25 import BM25Okapi

In [None]:
# Mount Google Drive (Colab only); the dataset paths used below live under
# /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Folder on the mounted Drive that holds the cleaned train/test CSV splits.
DATA_ROOT = "/content/drive/MyDrive/DuplicateBugsDetector/cleaned_files"

# Both splits are read the same way; the split name is also the file stem.
train_df, test_df = (
    pd.read_csv(DATA_ROOT + "/" + split_name + ".csv")
    for split_name in ("train", "test")
)

print(train_df.shape, test_df.shape)
train_df.head(2)

In [None]:
# evaluation helper functions

def first_relevant_rank(scores, train_group, target_group):
    """Return the 1-based rank of the best-scored document in the target group.

    Documents are ranked by descending ``scores``; the result is the position
    of the first one whose group label equals ``target_group``.

    Args:
        scores: 1-D array-like of retrieval scores, one per indexed document.
        train_group: Group labels aligned positionally with ``scores``. May be
            a NumPy array, a list or a pandas Series; a Series is read by
            position, so a non-default index cannot misalign the lookup.
        target_group: Group label of the query.

    Returns:
        The rank as an ``int`` (1 = top hit), or ``np.inf`` when no document
        carries ``target_group``.
    """
    sorted_idx = np.argsort(-np.asarray(scores))
    # Positional view of the labels: ``series[idx]`` is a label lookup, which
    # is slow per element and wrong (or a KeyError) for a non-RangeIndex.
    labels = np.asarray(train_group)
    hit_positions = np.flatnonzero(labels[sorted_idx] == target_group)
    if hit_positions.size == 0:
        return np.inf
    return int(hit_positions[0]) + 1

def recall_at_k(ranks, k):
    """Share of queries whose first relevant hit falls within the top ``k``."""
    within_top_k = np.array(ranks) <= k
    return np.mean(within_top_k)

def mrr(ranks):
    """Mean reciprocal rank; a rank of ``np.inf`` (no hit) contributes 0."""
    rank_values = np.array(ranks)
    reciprocal_ranks = 1 / rank_values
    return np.mean(reciprocal_ranks)

In [None]:
## TF-IDF baseline for comparison

# Unigram + bigram TF-IDF over lowercased text. Terms that occur in fewer than
# 5 documents or in more than 90% of documents are dropped, as are English
# stop words.
tfidf = TfidfVectorizer(
    ngram_range=(1,2),
    min_df=5,
    max_df=0.9,
    lowercase=True,
    stop_words="english"
)
X_train = tfidf.fit_transform(train_df["text"].fillna(""))

train_ids   = train_df["issue_id"].astype(str).to_numpy()
# Plain NumPy array, matching what the SBERT cells build, so that
# first_relevant_rank reads labels by position rather than through a pandas
# label lookup per candidate.
train_group = train_df["dup_group"].to_numpy()
#print(tfidf.get_params())
print(X_train.shape)

In [None]:
# Only test issues that carry a duplicate-group label can be scored.
has_group = test_df["dup_group"].notna()
qset = test_df[has_group].copy().reset_index(drop=True)

X_test = tfidf.transform(qset["text"].fillna(""))

ranks_overall = []
ranks_by_project = {}
for project_name in qset["project"].unique():
    ranks_by_project[project_name] = []

print("Running TF–IDF retrieval for", len(qset), "queries...")

# qset's index was reset above, so the iterrows label `i` is also the row
# position inside X_test.
for i, row in qset.iterrows():
    # Sparse dot product of this query against every training document.
    similarity = X_test[i].dot(X_train.T)
    scores = similarity.toarray().ravel()

    r = first_relevant_rank(
        scores,
        train_group=train_group,
        target_group=row["dup_group"],
    )

    ranks_overall.append(r)
    ranks_by_project[row["project"]].append(r)


In [None]:
# Summaries for baseline

def summarize(ranks):
    """Compute Recall@1/5/10 and MRR for a sequence of first-hit ranks.

    Returns a dict with the keys "R@1", "R@5", "R@10" and "MRR", in that order.
    """
    ranks = np.array(ranks)
    metrics = {}
    for label, cutoff in (("R@1", 1), ("R@5", 5), ("R@10", 10)):
        metrics[label] = recall_at_k(ranks, cutoff)
    metrics["MRR"] = mrr(ranks)
    return metrics

# One metrics row for all queries, followed by one row per project.
overall = summarize(ranks_overall)
project_stats = {}
for project_name, project_ranks in ranks_by_project.items():
    project_stats[project_name] = summarize(project_ranks)

metric_rows = [overall, *project_stats.values()]
row_labels = ["OVERALL", *project_stats.keys()]
tfidf_metrics = pd.DataFrame(
    metric_rows,
    index=row_labels,
    columns=["R@1", "R@5", "R@10", "MRR"],
)

print("\nTF–IDF Retrieval Metrics:")
print(tfidf_metrics)

### BM25 model

In [None]:
# from collections import defaultdict

# def simple_tokenize(text):
#     if not isinstance(text, str):
#         return []
#     return text.lower().split()

# #corpus for BM25
# train_corpus = [simple_tokenize(doc) for doc in train_df["text"].fillna("")]
# bm25 = BM25Okapi(train_corpus)

# qset = test_df[test_df["dup_group"].notna()].copy()
# q_tokens = [simple_tokenize(t) for t in qset["text"].fillna("")]

# ranks_overall_bm25 = []
# ranks_by_project_bm25 = {p: [] for p in qset["project"].unique()}

# print("Running BM25 retrieval for", len(qset), "queries...")

# for (idx, row), query_tokens in zip(qset.iterrows(), q_tokens):
#     scores = bm25.get_scores(query_tokens)       # len = n_train
#     r = first_relevant_rank(
#         np.array(scores),
#         train_group=train_group,
#         target_group=row["dup_group"],
#     )
#     ranks_overall_bm25.append(r)
#     ranks_by_project_bm25[row["project"]].append(r)


In [None]:
# # Summaries for bm25

# overall_bm25 = summarize(ranks_overall_bm25)
# project_bm25 = {p: summarize(r) for p, r in ranks_by_project_bm25.items()}

# bm25_metrics = pd.DataFrame(
#     [overall_bm25] + list(project_bm25.values()),
#     index=["OVERALL"] + list(project_bm25.keys()),
#     columns=["R@1", "R@5", "R@10", "MRR"],
# )

# print("\nBM25 Retrieval Metrics:")
# print(bm25_metrics)


In [None]:
!pip install -q "bm25s[full]"

import bm25s
import numpy as np

# Training texts as plain strings; missing values become empty documents.
text_column = train_df["text"].fillna("").astype(str)
corpus = text_column.tolist()
print("Number of docs in train_df:", len(corpus))

# Lowercase + whitespace tokenisation of every document.
corpus_tokens = []
for doc in corpus:
    corpus_tokens.append(doc.lower().split())
print("Number of tokenized docs:", len(corpus_tokens))

# Lucene-style BM25 by default; index() consumes the tokenized documents.
bm25_retriever = bm25s.BM25()
bm25_retriever.index(corpus_tokens)

print("BM25S index built on", len(corpus_tokens), "documents.")


In [None]:
ranks_overall_bm25 = []
ranks_by_project_bm25 = {}
for project_name in qset["project"].unique():
    ranks_by_project_bm25[project_name] = []

print("Running BM25S retrieval for", len(qset), "queries...")

for _, row in qset.iterrows():
    # Anything that is not a string (e.g. NaN) is scored as an empty query.
    query_text = row["text"]
    if not isinstance(query_text, str):
        query_text = ""
    query_tokens = query_text.lower().split()

    # One BM25 score per training document.
    raw_scores = bm25_retriever.get_scores(query_tokens)
    scores = np.array(raw_scores)

    r = first_relevant_rank(
        scores,
        train_group=train_group,
        target_group=row["dup_group"],
    )

    ranks_overall_bm25.append(r)
    ranks_by_project_bm25[row["project"]].append(r)

# Summaries for BM25: overall first, then per project.
overall_bm25 = summarize(ranks_overall_bm25)
project_bm25 = {
    project_name: summarize(project_ranks)
    for project_name, project_ranks in ranks_by_project_bm25.items()
}

bm25s_metrics = pd.DataFrame(
    [overall_bm25, *project_bm25.values()],
    index=["OVERALL", *project_bm25.keys()],
    columns=["R@1", "R@5", "R@10", "MRR"],
)

print("\nBM25S Retrieval Metrics:")
print(bm25s_metrics)

### SBERT Model

In [None]:
# SBERT model – building embeddings for all train documents

!pip install -q sentence-transformers

from sentence_transformers import SentenceTransformer
import torch
import numpy as np

In [None]:
# Prefer the GPU when one is visible to torch; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

In [None]:
# Pretrained MiniLM sentence encoder, used as-is for the base SBERT run.
sbert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)

# texts for the index
train_texts = train_df["text"].fillna("").astype(str).tolist()

# Unit-length NumPy embeddings, so cosine similarity is a plain dot product.
train_encode_options = dict(
    batch_size=64,
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True,
)
train_embs = sbert_model.encode(train_texts, **train_encode_options)

print("Train SBERT embeddings shape:", train_embs.shape)


In [None]:
# SBERT retrieval and evaluation

# qset already holds only the queries that have a dup_group
print("SBERT: queries with ground truth duplicates:", len(qset))

query_texts = qset["text"].fillna("").astype(str).tolist()

# Encode the queries as unit vectors, the same way the train index was built.
query_encode_options = dict(
    batch_size=64,
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True,
)
query_embs = sbert_model.encode(query_texts, **query_encode_options)

ranks_overall_sbert = []
ranks_by_project_sbert = {}
for project_name in qset["project"].unique():
    ranks_by_project_sbert[project_name] = []

train_group = train_df["dup_group"].to_numpy()

# query_embs rows are in qset row order, so the two can be walked together.
for q_emb, (_, row) in zip(query_embs, qset.iterrows()):
    scores = train_embs @ q_emb    # cosine via dot product

    r = first_relevant_rank(
        scores,
        train_group=train_group,
        target_group=row["dup_group"],
    )

    ranks_overall_sbert.append(r)
    ranks_by_project_sbert[row["project"]].append(r)


# Overall metrics first, then one entry per project.
overall_sbert = summarize(ranks_overall_sbert)
project_sbert = {
    project_name: summarize(project_ranks)
    for project_name, project_ranks in ranks_by_project_sbert.items()
}

sbert_metrics = pd.DataFrame(
    [overall_sbert, *project_sbert.values()],
    index=["OVERALL", *project_sbert.keys()],
    columns=["R@1", "R@5", "R@10", "MRR"],
)

print("\nSBERT Retrieval Metrics:")
print(sbert_metrics)


In [None]:
# Combined Comparison Table

# Pull the OVERALL row out of each model's metrics table.
overall_by_model = {}
for model_label, metrics_table in (
    ("TF-IDF", tfidf_metrics),
    ("BM25S", bm25s_metrics),
    ("SBERT", sbert_metrics),
):
    overall_by_model[model_label] = metrics_table.loc["OVERALL"]

overall_tfidf = overall_by_model["TF-IDF"]
overall_bm25  = overall_by_model["BM25S"]
overall_sbert = overall_by_model["SBERT"]

# Transpose so that each model is a row and each metric a column.
comparison_df = pd.DataFrame(overall_by_model).T
comparison_df = comparison_df.round(4)

print("\n Combined Retrieval Performance (Overall Metrics):")
display(comparison_df)


### Fine-Tuned SBERT

In [None]:
from sentence_transformers import InputExample, losses
from torch.utils.data import DataLoader
import random

In [None]:
# Dedicated, seeded RNG so the sampled positive pairs are identical on every
# run (the module-level random.shuffle used before was never seeded).
PAIR_SEED = 42
pair_rng = random.Random(PAIR_SEED)

train_examples = []
grouped = train_df.dropna(subset=["dup_group"]).groupby("dup_group")

# Building positive pairs from duplicate groups
for group_id, group in grouped:
    texts = group["text"].fillna("").astype(str).tolist()
    if len(texts) < 2:
        continue  # need at least a pair

    pair_rng.shuffle(texts)

    # Chain of consecutive pairs inside the group: n texts -> n - 1 positives.
    for left_text, right_text in zip(texts, texts[1:]):
        train_examples.append(InputExample(texts=[left_text, right_text]))

    # Wrap-around pair (last with first) adds one more positive. Skipped for
    # groups of two, where it would only repeat the single pair in reverse.
    if len(texts) > 2:
        train_examples.append(InputExample(texts=[texts[-1], texts[0]]))

print("Number of training pairs:", len(train_examples))

In [None]:
# Fresh copy of the same pretrained checkpoint the base SBERT run uses.
tuned_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)

train_loss = losses.MultipleNegativesRankingLoss(model=tuned_model)

# Warm-up covers 10% of the batches in one epoch.
num_batches = len(train_dataloader)
tuned_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=2,
    warmup_steps=int(num_batches * 0.1),
    show_progress_bar=True,
)

# Re-encoding train and query texts with the tuned model
tuned_encode_options = dict(
    batch_size=64,
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True,
)

# Train texts are encoded first, then the query texts, with identical settings.
tuned_train_embs, tuned_query_embs = (
    tuned_model.encode(texts_to_encode, **tuned_encode_options)
    for texts_to_encode in (train_texts, query_texts)
)

print("Tuned SBERT embedding shapes:", tuned_train_embs.shape, tuned_query_embs.shape)

In [None]:
ranks_overall_tuned = []
ranks_by_project_tuned = {}
for project_name in qset["project"].unique():
    ranks_by_project_tuned[project_name] = []

train_group_arr = train_df["dup_group"].to_numpy()

# tuned_query_embs rows follow qset row order, so both are walked in lockstep.
for q_emb, (_, row) in zip(tuned_query_embs, qset.iterrows()):
    scores = tuned_train_embs @ q_emb   # cosine via dot product

    r = first_relevant_rank(
        scores,
        train_group=train_group_arr,
        target_group=row["dup_group"],
    )

    ranks_overall_tuned.append(r)
    ranks_by_project_tuned[row["project"]].append(r)

# Overall metrics first, then one entry per project.
overall_tuned = summarize(ranks_overall_tuned)
project_tuned = {
    project_name: summarize(project_ranks)
    for project_name, project_ranks in ranks_by_project_tuned.items()
}

tuned_sbert_metrics = pd.DataFrame(
    [overall_tuned, *project_tuned.values()],
    index=["OVERALL", *project_tuned.keys()],
    columns=["R@1", "R@5", "R@10", "MRR"],
)

print("\nFine-tuned SBERT Retrieval Metrics:")
print(tuned_sbert_metrics)


In [None]:
# OVERALL row of each model's metrics table, keyed by the display label.
overall_with_tuned = {}
for model_label, metrics_table in (
    ("TF-IDF", tfidf_metrics),
    ("BM25S", bm25s_metrics),
    ("SBERT (base)", sbert_metrics),
    ("SBERT (tuned)", tuned_sbert_metrics),
):
    overall_with_tuned[model_label] = metrics_table.loc["OVERALL"]

overall_tfidf = overall_with_tuned["TF-IDF"]
overall_bm25  = overall_with_tuned["BM25S"]
overall_sbert = overall_with_tuned["SBERT (base)"]
overall_tuned = overall_with_tuned["SBERT (tuned)"]

# Models as rows, metrics as columns, rounded to 4 decimals.
comparison_with_tuned = pd.DataFrame(overall_with_tuned).T.round(4)

print("\nCombined Retrieval Performance with Fine-tuned SBERT (Overall Metrics):")
display(comparison_with_tuned)

In [None]:
# Distribution of duplicate groups with ticket IDs

# Filter only rows that have a duplicate group ID
dup_df = train_df.dropna(subset=["dup_group"])[["issue_id", "dup_group"]]

# Count total tickets per group
group_sizes = dup_df.groupby("dup_group")["issue_id"].count().sort_values(ascending=False)

print("Number of duplicate groups:", len(group_sizes))
print("\nOverall distribution of group sizes:")
print(group_sizes.describe())

# Preview only the larger clusters. The cut-off is a single named constant so
# the filter and its printed label cannot disagree (the label previously said
# "3 or more" while the filter kept groups of 5 or more).
MIN_CLUSTER_SIZE = 5
multi_groups = group_sizes[group_sizes >= MIN_CLUSTER_SIZE]

print(f"\nGroups with {MIN_CLUSTER_SIZE} or more tickets (true duplicate clusters):")
print(multi_groups.head(20))   # show first 20 for preview

In [None]:

# For more detailed inspection: every duplicate group mapped to the list of
# its ticket IDs.
ids_by_group = dup_df.groupby("dup_group")["issue_id"]
group_to_ids = ids_by_group.apply(list)

# Size and ID list side by side, largest groups first.
size_and_ids = {
    "group_size": group_sizes,
    "ticket_ids": group_to_ids,
}
dup_dist_df = pd.DataFrame(size_and_ids)
dup_dist_df = dup_dist_df.sort_values("group_size", ascending=False)

print("\nFull duplicate-group distribution with ticket ID lists:")
display(dup_dist_df.head(20))

In [None]:

# Cast the id columns to pandas' nullable integer dtype so group ids print as
# integers instead of floats.
# NOTE(review): this overwrites columns of the shared train_df in place, so
# every cell that reads train_df after this one sees Int64 columns.
train_df["issue_id"] = train_df["issue_id"].astype("Int64")
train_df["dup_group"] = train_df["dup_group"].astype("Int64")

# Rebuild the per-group tables from the re-typed frame (rows without a
# dup_group are dropped).
dup_df = train_df.dropna(subset=["dup_group"])[["issue_id", "dup_group"]]

# Tickets per duplicate group, largest first.
group_sizes = (
    dup_df
    .groupby("dup_group")["issue_id"]
    .count()
    .sort_values(ascending=False)
)

# Duplicate group -> list of its ticket IDs.
group_to_ids = dup_df.groupby("dup_group")["issue_id"].apply(list)

print("Number of duplicate groups:", len(group_sizes))
print("\nOverall distribution of group sizes:")
print(group_sizes.describe())

# Distribution of duplicates per anchor ticket (group size - 1)

num_duplicates = group_sizes - 1
num_duplicates.name = "num_duplicates"

print("\nDistribution of number of duplicates per anchor ticket:")
print(num_duplicates.describe())

# How many groups have each duplicate count. The "0–24" in the label is
# hard-coded text, not derived from the data.
print("\nCounts for groups with 0–24 duplicates (excluding anchor):")
dup_hist = num_duplicates.value_counts().sort_index()
#dup_hist = dup_hist[dup_hist.index <= 10]
print(dup_hist)


# Hand-picked group id to inspect; the else branch covers datasets without it.
example_group = 1942560

if example_group in group_to_ids.index:
    print(f"\nDetails for duplicate group {example_group}:")
    print("All ticket IDs in this group:", group_to_ids.loc[example_group])
    print("Total tickets in group:", int(group_sizes.loc[example_group]))
    print("Number of duplicates (excluding anchor):", int(num_duplicates.loc[example_group]))
else:
    print(f"\nDuplicate group {example_group} not found in this dataset.")


In [None]:
# Plain-text preview of the first query rows.
print(qset.head())

In [None]:
# Look up each query's duplicate-group size in group_sizes. Every query in
# qset has a dup_group, so a missing size means the lookup found no entry for
# that group; those queries get size 0.
qset["dup_group_size"] = qset["dup_group"].map(group_sizes).fillna(0).astype(int)

# Show query duplicate-group size distribution
# print("Query group size distribution:")
# print(qset["dup_group_size"].describe())

# Count duplicates (size-1); queries whose group was not found show up as -1.
qset_num_duplicates = qset["dup_group_size"] - 1
qset_num_duplicates.name = "num_duplicates"
print("\nCounts of query duplicate-group sizes (0–24 duplicates):")
print(qset_num_duplicates.value_counts().sort_index())

# Threshold: groups with >=3 duplicates -> size >=4 total tickets.
# The printed labels are derived from the constant so they stay in sync with
# the masks if the threshold is changed.
SIZE_THRESHOLD = 4
mask_high = qset["dup_group_size"] >= SIZE_THRESHOLD
mask_low = qset["dup_group_size"] < SIZE_THRESHOLD

print(f"\nQueries in high-support groups (size >= {SIZE_THRESHOLD}):", mask_high.sum())
print(f"Queries in low-support groups  (size <  {SIZE_THRESHOLD}):", mask_low.sum())

# Convert masks to array indices
idx_high = np.where(mask_high.to_numpy())[0]
#idx_low = np.where(mask_low.to_numpy())[0]

def subset_ranks(ranks, idxs):
    """Return the entries of ``ranks`` at the positions in ``idxs``, as a list."""
    selected = []
    for position in idxs:
        selected.append(ranks[position])
    return selected

# Ranks restricted to the high-support queries, one list per model.
# (The low-support breakdown is not reported.)
tfidf_high = subset_ranks(ranks_overall, idx_high)
bm25_high = subset_ranks(ranks_overall_bm25, idx_high)
sbert_high = subset_ranks(ranks_overall_sbert, idx_high)
tuned_high = subset_ranks(ranks_overall_tuned, idx_high)

# (model label, ranks over all queries, ranks over high-support queries),
# listed in display order.
support_views = [
    ("TF-IDF", ranks_overall, tfidf_high),
    ("BM25S", ranks_overall_bm25, bm25_high),
    ("SBERT base", ranks_overall_sbert, sbert_high),
    ("SBERT tuned", ranks_overall_tuned, tuned_high),
]

# Build comparison table: two columns per model, transposed into rows below.
support_columns = {}
for model_label, all_ranks, high_ranks in support_views:
    support_columns[f"{model_label} (all)"] = pd.Series(summarize(all_ranks))
    support_columns[f"{model_label} (high)"] = pd.Series(summarize(high_ranks))

results_support = pd.DataFrame(support_columns).T.round(4)

results_support_pct = (results_support * 100).round(1)

print("Performance by duplicate-group support level in %:")
display(results_support_pct)

# print("\nPerformance by duplicate-group support level:")
# display(results_support)


In [None]:
def subset_ranks(ranks, idxs):
    """Pick the elements of ``ranks`` at the positions listed in ``idxs``."""
    return list(map(lambda position: ranks[position], idxs))

# Positions of the queries whose project is anything other than "firefox".
mask_no_firefox = qset["project"] != "firefox"
idx_no_firefox = np.where(mask_no_firefox.to_numpy())[0]

# Metrics over every query, one dict per model.
tfidf_all = summarize(ranks_overall)
bm25_all = summarize(ranks_overall_bm25)
sbert_all = summarize(ranks_overall_sbert)
tuned_all = summarize(ranks_overall_tuned)

# The same metrics with the Firefox queries left out.
tfidf_no_ff = summarize(subset_ranks(ranks_overall, idx_no_firefox))
bm25_no_ff = summarize(subset_ranks(ranks_overall_bm25, idx_no_firefox))
sbert_no_ff = summarize(subset_ranks(ranks_overall_sbert, idx_no_firefox))
tuned_no_ff = summarize(subset_ranks(ranks_overall_tuned, idx_no_firefox))

# (row label, metrics dict) pairs in display order.
project_rows = [
    ("TF-IDF (all projects)", tfidf_all),
    ("TF-IDF (no Firefox)", tfidf_no_ff),
    ("BM25S (all projects)", bm25_all),
    ("BM25S (no Firefox)", bm25_no_ff),
    ("SBERT base (all)", sbert_all),
    ("SBERT base (no Firefox)", sbert_no_ff),
    ("SBERT tuned (all)", tuned_all),
    ("SBERT tuned (no Firefox)", tuned_no_ff),
]
results_projects = pd.DataFrame(
    {row_label: pd.Series(stats) for row_label, stats in project_rows}
).T

results_projects_pct = (results_projects * 100).round(1)

print("\nPerformance with and without Firefox (%, 1 decimal):")
display(results_projects_pct)

### Win/loss summary (SBERT tuned vs BM25S, using existing ranks)

In [None]:
# High-support rank lists as float arrays
bm25_ranks = np.array(bm25_high, dtype=float)
sbert_base_ranks = np.array(sbert_high, dtype=float)
sbert_tuned_ranks = np.array(tuned_high, dtype=float)

# "No hit" (np.inf) is replaced by a very large rank, so two misses compare
# as a tie and a miss always loses to a real hit.
BIG = 1e9


def _cap_missing(rank_array):
    """Return ``rank_array`` with every non-finite entry replaced by BIG."""
    return np.where(np.isfinite(rank_array), rank_array, BIG)


bm25_eff = _cap_missing(bm25_ranks)
sbert_base_eff = _cap_missing(sbert_base_ranks)
sbert_tuned_eff = _cap_missing(sbert_tuned_ranks)

n_queries = len(bm25_ranks)

# Per-query outcome masks for tuned SBERT vs BM25 (a lower rank wins)
mask_sbert_tuned_better = sbert_tuned_eff < bm25_eff
mask_bm25_better = bm25_eff < sbert_tuned_eff
mask_tied = sbert_tuned_eff == bm25_eff

count_sbert_tuned_better = int(mask_sbert_tuned_better.sum())
count_bm25_better = int(mask_bm25_better.sum())
count_tied = int(mask_tied.sum())

# Row label -> number of queries with that outcome, in display order.
outcome_counts = {
    "SBERT tuned better": count_sbert_tuned_better,
    "BM25S better": count_bm25_better,
    "Tie / both similar": count_tied,
}

summary_winloss = pd.DataFrame(
    {
        "count": list(outcome_counts.values()),
        "percent": [
            100.0 * outcome_count / n_queries
            for outcome_count in outcome_counts.values()
        ],
    },
    index=list(outcome_counts.keys()),
).round(1)

print("Win/loss comparison between SBERT (tuned) and BM25S for Queries in high-support groups:")
display(summary_winloss)


In [None]:

# Outcome masks from the win/loss comparison, gathered under short keys.
winloss_masks = dict(
    sbert_tuned_better=mask_sbert_tuned_better,
    bm25_better=mask_bm25_better,
    tie=mask_tied,
)

In [None]:
# train_group_arr = train_df["dup_group"].to_numpy()
# train_issue_ids = train_df["issue_id"].to_numpy()
# train_texts = train_df["text"].fillna("").astype(str).tolist()


target_group_size = 10  # (= 1 anchor + 9 duplicates)
group_sizes = (
    train_df.dropna(subset=["dup_group"])
            .groupby("dup_group")["issue_id"]
            .count()
)

# Find all query indices in qset whose group size == target_group_size
q_group_sizes = qset["dup_group"].map(group_sizes)
candidate_indices = q_group_sizes[q_group_sizes == target_group_size].index.tolist()

# Fail with a readable message instead of min()'s bare "empty sequence" error.
if not candidate_indices:
    raise ValueError(
        f"No query in qset belongs to a duplicate group of size {target_group_size}."
    )

example_idx = min(candidate_indices)   # pick the first — deterministic

row = qset.loc[example_idx]

print("=== Selected Query Example ===")
print("Query index:", example_idx)
print("Project:", row["project"])
print("Duplicate group:", row["dup_group"])
print("Group size:", target_group_size)
print("\nQuery text:\n", row["text"][:400].replace("\n", " "))

# All true duplicate IDs in this group
true_dup_ids = train_df.loc[
    train_df["dup_group"] == row["dup_group"],
    "issue_id"
].tolist()

print("\nTrue duplicate issue IDs in this group:")
print(true_dup_ids)

dup_texts = train_df.loc[train_df["issue_id"].isin(true_dup_ids), "text"].fillna("").astype(str).tolist()
print("\nExample duplicate ticket texts:")
for i, t in enumerate(dup_texts[4:8]):   # show few examples
    # Newlines are flattened outside the f-string: a backslash inside an
    # f-string expression is a SyntaxError before Python 3.12.
    snippet = t[:300].replace("\n", " ")
    print(f"\nDuplicate {i+1}:\n{snippet}")

In [None]:
# Look up the example query's rank in each model's precomputed rank list.
rank_bm25, rank_base, rank_tuned = (
    model_ranks[example_idx]
    for model_ranks in (ranks_overall_bm25, ranks_overall_sbert, ranks_overall_tuned)
)

print("\n=== Precomputed Ranks (from earlier cells) ===")
print(f"BM25S rank:        {rank_bm25}")
print(f"SBERT base rank:   {rank_base}")
print(f"SBERT tuned rank:  {rank_tuned}")

# The lower rank wins; only tuned SBERT and BM25 are compared.
if rank_tuned < rank_bm25:
    winner_label = "SBERT tuned"
elif rank_bm25 < rank_tuned:
    winner_label = "BM25S"
else:
    winner_label = "Tie"
print(f"\nWinner for this query: {winner_label}")

In [None]:
# Recompute the full rankings of the example query for BM25 and SBERT-tuned

k = 10
query_text = row["text"]
print("Query text:\n", query_text)

# BM25 retrieval: lowercase + whitespace tokens, then best score first
tokens = query_text.lower().split()
bm25_scores = np.array(bm25_retriever.get_scores(tokens))
order_bm25 = (-bm25_scores).argsort()

# SBERT tuned retrieval: dot product against every tuned train embedding
q_emb = tuned_query_embs[example_idx]
sbert_scores = tuned_train_embs @ q_emb
order_sbert = (-sbert_scores).argsort()

# Plain Python lists, positionally aligned with the rows of train_df
issue_id_column = train_df["issue_id"].astype(int)
group_column = train_df["dup_group"].astype(float)
text_column = train_df["text"].fillna("").astype(str)
train_ids = issue_id_column.tolist()
train_groups = group_column.tolist()
train_texts = text_column.tolist()
true_group = float(row["dup_group"])

# Helper to print tables cleanly
def print_ranked(label, order, scores):
    """Print the top-``k`` retrieved training tickets for the example query.

    Reads the notebook globals ``k``, ``train_ids``, ``train_groups``,
    ``train_texts`` and ``true_group``.

    Args:
        label: Model name shown in the header line.
        order: Training-row positions sorted from best to worst score.
        scores: Scores behind ``order``. Accepted so both call sites look the
            same, but not used in the printed output.
    """
    print(f"\nTop {k} results — {label}")
    # Slicing instead of range(k) means an ``order`` shorter than k prints
    # what is available rather than raising IndexError.
    for position, doc in enumerate(order[:k], start=1):
        issue_id = train_ids[doc]
        is_dup = float(train_groups[doc]) == true_group
        mark = "<-- TRUE DUP" if is_dup else ""
        snippet = train_texts[doc][:150].replace("\n", " ")
        print(f"{position:2d}. id={issue_id}  {snippet}  {mark}")

# Determine first true-duplicate rank
def first_dup(order):
    """Return the 1-based rank of the first row in ``order`` that belongs to
    the query's duplicate group, or None when there is no such row.

    Reads the notebook globals ``train_groups`` and ``true_group``.
    """
    matching_ranks = (
        rank
        for rank, doc in enumerate(order, start=1)
        if float(train_groups[doc]) == true_group
    )
    return next(matching_ranks, None)

rank_bm25 = first_dup(order_bm25)
rank_tuned = first_dup(order_sbert)

# Print the top-k table and the first true-duplicate rank for each model
for model_label, model_order, model_scores, model_rank in (
    ("BM25", order_bm25, bm25_scores, rank_bm25),
    ("SBERT tuned", order_sbert, sbert_scores, rank_tuned),
):
    print_ranked(model_label, model_order, model_scores)
    print(f"\n{model_label} first true-duplicate rank = {model_rank}")

# Show the text of a few of the query's true duplicates
print("\n=== Example duplicate tickets' texts ===")
shown_dup_ids = true_dup_ids[4:8]       # just few duplicates
for tid in shown_dup_ids:
    doc = train_ids.index(tid)
    text_snip = train_texts[doc][:400].replace("\n", " ")
    print(f"\nDuplicate id={tid}:")
    print(text_snip)


In [None]:
# Project-wise comparison of BM25S and tuned SBERT

bm25_all = np.array(ranks_overall_bm25, dtype=float)
tuned_all = np.array(ranks_overall_tuned, dtype=float)

projects = sorted(qset["project"].unique().tolist())
rows = []

for p in projects:
    # Boolean selector for the queries that belong to this project.
    in_project = (qset["project"] == p).to_numpy()

    bm25_stats = summarize(bm25_all[in_project].tolist())
    tuned_stats = summarize(tuned_all[in_project].tolist())

    # The higher MRR wins; anything else (including equal values) is a tie.
    bm25_mrr = bm25_stats["MRR"]
    tuned_mrr = tuned_stats["MRR"]
    if tuned_mrr > bm25_mrr:
        winner = "SBERT tuned"
    elif bm25_mrr > tuned_mrr:
        winner = "BM25S"
    else:
        winner = "Tie"

    # Only R@10 and MRR are reported, as percentages.
    rows.append({
        "project": p,
        "BM25S_R@10 (%)": bm25_stats["R@10"] * 100,
        "BM25S_MRR (%)": bm25_mrr * 100,
        "SBERT_tuned_R@10 (%)": tuned_stats["R@10"] * 100,
        "SBERT_tuned_MRR (%)": tuned_mrr * 100,
        "Winner_by_MRR": winner,
    })

df_proj_pct = pd.DataFrame(rows).set_index("project").round(1)

print("\nPer-project comparison:")
display(df_proj_pct)
