In [2]:
pip install rank-bm25 sentence-transformers torch torch-geometric numpy scikit-learn

Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux201

##BM25 vs GNN

In [6]:
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings("ignore")  # Suppress warnings for cleaner output

# Step 1: Synthetic Corpus and Query
corpus = [
    "The theory of relativity was developed by Albert Einstein in 1915.",
    "Einstein's work on general relativity revolutionized physics.",
    "Quantum mechanics emerged in the early 20th century with contributions from Planck and Heisenberg.",
    "Special relativity describes the behavior of objects moving at high speeds.",
    "The history of physics includes major discoveries by Newton and Einstein."
]
query = "What is the theory of relativity?"

# Synthetic relevance labels (1 = relevant, 0 = less relevant)
relevance_labels = {0: 1, 3: 1, 1: 1, 4: 0, 2: 0}  # Docs 0, 3, 1 relevant; Docs 4, 2 less relevant

# Step 2: Initial Retrieval with BM25
tokenized_corpus = [doc.lower().split() for doc in corpus]
tokenized_query = query.lower().split()
bm25 = BM25Okapi(tokenized_corpus)
bm25_scores = bm25.get_scores(tokenized_query)
k = 4
top_k_indices = np.argsort(bm25_scores)[::-1][:k]
initial_ranking = [(idx, corpus[idx], bm25_scores[idx]) for idx in top_k_indices]
print("Initial BM25 Ranking:")
for idx, doc, score in initial_ranking:
    print(f"Doc {idx}: {doc} (Score: {score:.4f})")

# Step 3: Encoding Documents and Query
model = SentenceTransformer('all-MiniLM-L6-v2')
document_embeddings = model.encode(corpus, convert_to_tensor=True, show_progress_bar=False)
query_embedding = model.encode([query], convert_to_tensor=True, show_progress_bar=False)[0]
initial_scores = cosine_similarity(query_embedding.unsqueeze(0).cpu().numpy(),
                                  document_embeddings.cpu().numpy())[0]
print("\nInitial Cosine Similarity Scores:")
for idx, score in enumerate(initial_scores):
    print(f"Doc {idx}: {score:.4f}")

# Step 4: Graph Construction
document_embeddings_np = document_embeddings.cpu().numpy()
selected_embeddings_np = document_embeddings_np[top_k_indices].copy()
print("\nSelected embeddings strides:", selected_embeddings_np.strides)
similarity_matrix = cosine_similarity(selected_embeddings_np)
print("\nSimilarity Matrix:")
for i in range(k):
    print([f"{similarity_matrix[i, j]:.4f}" for j in range(k)])

# Use k-NN (2 neighbors) for edge construction
edge_index = []
edge_weight = []
added_pairs = set()
for i in range(k):
    sim_scores = similarity_matrix[i].copy()
    sim_scores[i] = -1  # Exclude self
    top_neighbors = np.argsort(sim_scores)[::-1][:2]  # Top 2 neighbors
    for neighbor in top_neighbors:
        pair = tuple(sorted([i, neighbor]))
        if pair not in added_pairs:
            edge_index.append([i, neighbor])
            edge_index.append([neighbor, i])
            edge_weight.append(similarity_matrix[i, neighbor])
            edge_weight.append(similarity_matrix[i, neighbor])
            added_pairs.add(pair)

if not edge_index:
    print("Warning: No edges formed. Using dummy edge.")
    edge_index = torch.tensor([[0, 0]], dtype=torch.long).t().contiguous()
    edge_weight = torch.tensor([1.0], dtype=torch.float)
else:
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    edge_weight = torch.tensor(edge_weight, dtype=torch.float)
print("\nEdges formed:", edge_index.t().tolist())

# Node features: Combine document embeddings with query relevance
node_features = []
for idx in top_k_indices:
    doc_query_feature = document_embeddings[idx] * query_embedding
    node_features.append(doc_query_feature.cpu().numpy())
node_features = torch.tensor(node_features, dtype=torch.float)

graph_data = Data(x=node_features, edge_index=edge_index, edge_attr=edge_weight)

# Step 5: Graph Neural Network for Re-ranking
class GNNReRanker(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(GNNReRanker, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.scorer = nn.Linear(hidden_dim + input_dim, 1)
        self.dropout = nn.Dropout(0.5)  # Increased dropout

    def forward(self, data):
        x, edge_index, edge_weight = data.x, data.edge_index, data.edge_attr
        x = self.conv1(x, edge_index, edge_weight)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.conv2(x, edge_index, edge_weight)
        x = F.relu(x)
        x = self.dropout(x)
        x = torch.cat([x, data.x], dim=-1)
        scores = self.scorer(x).squeeze(-1)
        return scores

# Initialize GNN and optimizer
input_dim = node_features.shape[1]
hidden_dim = 128
gnn_model = GNNReRanker(input_dim, hidden_dim)
optimizer = torch.optim.Adam(gnn_model.parameters(), lr=0.01, weight_decay=1e-4)  # L2 regularization

# Step 6: Train the GNN with early stopping
gnn_model.train()
best_loss = float('inf')
patience = 10
patience_counter = 0
for epoch in range(200):
    optimizer.zero_grad()
    scores = gnn_model(graph_data)
    loss = 0
    for i in range(k):
        for j in range(i + 1, k):
            idx_i, idx_j = top_k_indices[i], top_k_indices[j]
            if idx_i in relevance_labels and idx_j in relevance_labels:
                if relevance_labels[idx_i] > relevance_labels[idx_j]:
                    loss += F.relu(scores[j] - scores[i] + 0.1)
                elif relevance_labels[idx_j] > relevance_labels[idx_i]:
                    loss += F.relu(scores[i] - scores[j] + 0.1)
    loss.backward()
    optimizer.step()
    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")
    if loss.item() < best_loss:
        best_loss = loss.item()
        patience_counter = 0
    else:
        patience_counter += 1
    if patience_counter >= patience:
        print(f"Early stopping at epoch {epoch}")
        break

# Step 7: Inference with Trained GNN
gnn_model.eval()
with torch.no_grad():
    gnn_scores = gnn_model(graph_data)

# Step 8: Combine GNN Scores with Initial Scores
bm25_top_k = torch.tensor([bm25_scores[idx] for idx in top_k_indices], dtype=torch.float)
bm25_top_k = torch.sigmoid(bm25_top_k)  # Softer normalization
gnn_scores = torch.sigmoid(gnn_scores)  # Softer normalization
final_scores = 0.5 * bm25_top_k + 0.5 * gnn_scores  # Balanced weighting

# Step 9: Final Re-ranking
reranked_indices = torch.argsort(final_scores, descending=True)
print("\nFinal Re-ranked List:")
for rank, rerank_idx in enumerate(reranked_indices):
    orig_idx = top_k_indices[rerank_idx]
    print(f"Rank {rank+1}: Doc {orig_idx}: {corpus[orig_idx]} (Final Score: {final_scores[rerank_idx]:.4f})")

Initial BM25 Ranking:
Doc 0: The theory of relativity was developed by Albert Einstein in 1915. (Score: 1.5514)
Doc 4: The history of physics includes major discoveries by Newton and Einstein. (Score: 0.4619)
Doc 3: Special relativity describes the behavior of objects moving at high speeds. (Score: 0.4619)
Doc 2: Quantum mechanics emerged in the early 20th century with contributions from Planck and Heisenberg. (Score: 0.2055)

Initial Cosine Similarity Scores:
Doc 0: 0.7309
Doc 1: 0.5700
Doc 2: 0.2514
Doc 3: 0.5889
Doc 4: 0.4414

Selected embeddings strides: (1536, 4)

Similarity Matrix:
['1.0000', '0.5990', '0.4308', '0.3947']
['0.5990', '1.0000', '0.3185', '0.5143']
['0.4308', '0.3185', '1.0000', '0.1448']
['0.3947', '0.5143', '0.1448', '1.0000']

Edges formed: [[0, 1], [1, 0], [0, 2], [2, 0], [1, 3], [3, 1], [2, 1], [1, 2], [3, 0], [0, 3]]
Epoch 0, Loss: 0.3978
Epoch 20, Loss: 0.0620
Early stopping at epoch 35

Final Re-ranked List:
Rank 1: Doc 0: The theory of relativity was develo

##BM25 vs GNN+ GAR

In [9]:
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings("ignore")

# Step 1: Synthetic Corpus and Query
corpus = [
    "The theory of relativity was developed by Albert Einstein in 1915.",
    "Einstein's work on general relativity revolutionized physics.",
    "Quantum mechanics emerged in the early 20th century with contributions from Planck and Heisenberg.",
    "Special relativity describes the behavior of objects moving at high speeds.",
    "The history of physics includes major discoveries by Newton and Einstein."
]
query = "What is the theory of relativity?"
relevance_labels = {0: 1, 3: 1, 1: 1, 4: 0, 2: 0}
k = 4

# Step 2: Query Augmentation with T5
def generate_augmented_queries(query, model, tokenizer, device='cpu', num_contexts=3):
    input_text = f"Generate keywords for: {query}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
    outputs = model.generate(
        inputs["input_ids"],
        max_length=50,
        num_return_sequences=num_contexts,
        do_sample=True,
        top_k=50,
        top_p=0.95
    )
    augmented_queries = [query]
    for output in outputs:
        context = tokenizer.decode(output, skip_special_tokens=True)
        augmented_queries.append(f"{query} {context}")
    return augmented_queries

# Initialize T5 model
try:
    t5_model_name = "t5-base"
    t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
    t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name)
except Exception:
    t5_model_name = "t5-small"
    t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
    t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
t5_model = t5_model.to(device)
augmented_queries = generate_augmented_queries(query, t5_model, t5_tokenizer, device)
print("Augmented Queries:")
for aq in augmented_queries:
    print(f"- {aq}")

# Step 3: BM25 Retrieval with Augmented Queries and Hybrid Fallback
tokenized_corpus = [doc.lower().split() for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)
retrieved_indices = set()
bm25_scores = np.zeros(len(corpus))

for aq in augmented_queries:
    tokenized_aq = aq.lower().split()
    scores = bm25.get_scores(tokenized_aq)
    top_indices = np.argsort(scores)[::-1][:k]
    for idx in top_indices:
        retrieved_indices.add(idx)
        bm25_scores[idx] = max(bm25_scores[idx], scores[idx])

# Hybrid retrieval
model = SentenceTransformer('all-MiniLM-L6-v2')
document_embeddings = model.encode(corpus, convert_to_tensor=True, show_progress_bar=False)
query_embedding = model.encode([query], convert_to_tensor=True, show_progress_bar=False)[0]
cosine_scores = cosine_similarity(query_embedding.unsqueeze(0).cpu().numpy(),
                                 document_embeddings.cpu().numpy())[0]
print("\nRaw BM25 and Cosine Scores for All Documents:")
for idx in range(len(corpus)):
    print(f"Doc {idx}: BM25={bm25_scores[idx]:.4f}, Cosine={cosine_scores[idx]:.4f}")

# Normalize and combine scores
bm25_norm = (bm25_scores - bm25_scores.min()) / (bm25_scores.max() - bm25_scores.min() + 1e-10)
cosine_norm = (cosine_scores - cosine_scores.min()) / (cosine_scores.max() - cosine_scores.min() + 1e-10)
combined_scores = 0.5 * bm25_norm + 0.5 * cosine_norm
top_k_indices = np.argsort(combined_scores)[::-1][:k]
initial_ranking = [(idx, corpus[idx], combined_scores[idx], bm25_scores[idx], cosine_scores[idx]) for idx in top_k_indices]
print("\nInitial Combined Ranking (GAR + Cosine):")
for idx, doc, comb_score, bm25_score, cos_score in initial_ranking:
    print(f"Doc {idx}: {doc} (Combined: {comb_score:.4f}, BM25: {bm25_score:.4f}, Cosine: {cos_score:.4f})")

# Step 4: Graph Construction
document_embeddings_np = document_embeddings.cpu().numpy()
selected_embeddings_np = document_embeddings_np[top_k_indices].copy()
print("\nSelected embeddings strides:", selected_embeddings_np.strides)
similarity_matrix = cosine_similarity(selected_embeddings_np)
print("\nSimilarity Matrix:")
for i in range(k):
    print([f"{similarity_matrix[i, j]:.4f}" for j in range(k)])

# k-NN (2 neighbors) for edges
edge_index = []
edge_weight = []
added_pairs = set()
for i in range(k):
    sim_scores = similarity_matrix[i].copy()
    sim_scores[i] = -1
    top_neighbors = np.argsort(sim_scores)[::-1][:2]
    for neighbor in top_neighbors:
        pair = tuple(sorted([i, neighbor]))
        if pair not in added_pairs:
            edge_index.append([i, neighbor])
            edge_index.append([neighbor, i])
            edge_weight.append(similarity_matrix[i, neighbor])
            edge_weight.append(similarity_matrix[i, neighbor])
            added_pairs.add(pair)

if not edge_index:
    print("Warning: No edges formed. Using dummy edge.")
    edge_index = torch.tensor([[0, 0]], dtype=torch.long).t().contiguous()
    edge_weight = torch.tensor([1.0], dtype=torch.float)
else:
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    edge_weight = torch.tensor(edge_weight, dtype=torch.float)
print("\nEdges formed:", edge_index.t().tolist())

# Node features
node_features = []
for idx in top_k_indices:
    doc_query_feature = document_embeddings[idx] * query_embedding
    cosine_feature = cosine_scores[idx]
    feature = np.concatenate([doc_query_feature.cpu().numpy(), [cosine_feature]])
    node_features.append(feature)
node_features = torch.tensor(node_features, dtype=torch.float)
graph_data = Data(x=node_features, edge_index=edge_index, edge_attr=edge_weight)

# Step 5: GNN for Re-ranking
class GNNReRanker(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(GNNReRanker, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.scorer = nn.Linear(hidden_dim + input_dim, 1)
        self.dropout = nn.Dropout(0.6)

    def forward(self, data):
        x, edge_index, edge_weight = data.x, data.edge_index, data.edge_attr
        x = self.conv1(x, edge_index, edge_weight)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.conv2(x, edge_index, edge_weight)
        x = F.relu(x)
        x = self.dropout(x)
        x = torch.cat([x, data.x], dim=-1)
        scores = self.scorer(x).squeeze(-1)
        return scores

# Initialize GNN
input_dim = node_features.shape[1]
hidden_dim = 128
gnn_model = GNNReRanker(input_dim, hidden_dim)
optimizer = torch.optim.Adam(gnn_model.parameters(), lr=0.01, weight_decay=1e-3)

# Step 6: Train GNN
gnn_model.train()
best_loss = float('inf')
patience = 10
patience_counter = 0
min_loss_threshold = 1e-4
for epoch in range(200):
    optimizer.zero_grad()
    scores = gnn_model(graph_data)
    loss = 0
    for i in range(k):
        for j in range(i + 1, k):
            idx_i, idx_j = top_k_indices[i], top_k_indices[j]
            if idx_i in relevance_labels and idx_j in relevance_labels:
                if relevance_labels[idx_i] > relevance_labels[idx_j]:
                    loss += F.relu(scores[j] - scores[i] + 0.2)  # Increased margin
                elif relevance_labels[idx_j] > relevance_labels[idx_i]:
                    loss += F.relu(scores[i] - scores[j] + 0.2)
    loss.backward()
    optimizer.step()
    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")
    if loss.item() < best_loss and loss.item() > min_loss_threshold:
        best_loss = loss.item()
        patience_counter = 0
    else:
        patience_counter += 1
    if patience_counter >= patience:
        print(f"Early stopping at epoch {epoch}")
        break

# Step 7: Inference
gnn_model.eval()
with torch.no_grad():
    gnn_scores = gnn_model(graph_data)
print("\nRaw GNN Scores:")
for idx, score in zip(top_k_indices, gnn_scores):
    print(f"Doc {idx}: {score:.4f}")

# Step 8: Combine Scores
bm25_top_k = torch.tensor([bm25_scores[idx] for idx in top_k_indices], dtype=torch.float)
bm25_top_k = torch.sigmoid(bm25_top_k)
gnn_scores = torch.sigmoid(gnn_scores)
final_scores = 0.3 * bm25_top_k + 0.7 * gnn_scores
print("\nFinal Score Components:")
for idx, bm25_s, gnn_s, final_s in zip(top_k_indices, bm25_top_k, gnn_scores, final_scores):
    print(f"Doc {idx}: BM25_Sigmoid={bm25_s:.4f}, GNN_Sigmoid={gnn_s:.4f}, Final={final_s:.4f}")

# Step 9: Final Re-ranking
reranked_indices = torch.argsort(final_scores, descending=True)
print("\nFinal Re-ranked List (GAR + Cosine + GNN):")
for rank, rerank_idx in enumerate(reranked_indices):
    orig_idx = top_k_indices[rerank_idx]
    print(f"Rank {rank+1}: Doc {orig_idx}: {corpus[orig_idx]} (Final Score: {final_scores[rerank_idx]:.4f})")

Augmented Queries:
- What is the theory of relativity?
- What is the theory of relativity? for: General keywords for: What is the theory of relativity?
- What is the theory of relativity? for: for: Generate new keywords for: What is the theory of relativity?
- What is the theory of relativity? for: for: Generate keywords for: What is the theory of relativity?

Raw BM25 and Cosine Scores for All Documents:
Doc 0: BM25=3.1028, Cosine=0.7309
Doc 1: BM25=1.3053, Cosine=0.5700
Doc 2: BM25=0.4109, Cosine=0.2514
Doc 3: BM25=0.9238, Cosine=0.5889
Doc 4: BM25=0.9238, Cosine=0.4414

Initial Combined Ranking (GAR + Cosine):
Doc 0: The theory of relativity was developed by Albert Einstein in 1915. (Combined: 1.0000, BM25: 3.1028, Cosine: 0.7309)
Doc 1: Einstein's work on general relativity revolutionized physics. (Combined: 0.4983, BM25: 1.3053, Cosine: 0.5700)
Doc 3: Special relativity describes the behavior of objects moving at high speeds. (Combined: 0.4472, BM25: 0.9238, Cosine: 0.5889)
Doc 4:

**Note**: here better generative models like gpt 4o series can be used with much refined prompt to see better effect of GAR. Here my aim is to show you how to do it, rest it can always be refined or improved.

##BM25 vs GAR

In [11]:
# Required packages: pip install rank_bm25 sentence-transformers torch torch-geometric numpy sklearn
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings("ignore")

# Step 1: Synthetic Corpus and Query
corpus = [
    "The theory of relativity was developed by Albert Einstein in 1915.",
    "Einstein's work on general relativity revolutionized physics.",
    "Quantum mechanics emerged in the early 20th century with contributions from Planck and Heisenberg.",
    "Special relativity describes the behavior of objects moving at high speeds.",
    "The history of physics includes major discoveries by Newton and Einstein."
]
query = "What is the theory of relativity?"

# Synthetic relevance labels (1 = relevant, 0 = less relevant)
relevance_labels = {0: 1, 3: 1, 1: 1, 4: 0, 2: 0}  # Docs 0, 3, 1 relevant; Docs 4, 2 less relevant

# Step 2: Initial Retrieval with BM25
tokenized_corpus = [doc.lower().split() for doc in corpus]
tokenized_query = query.lower().split()
bm25 = BM25Okapi(tokenized_corpus)
bm25_scores = bm25.get_scores(tokenized_query)
initial_k = 3  # Initial top-k
top_k_indices = np.argsort(bm25_scores)[::-1][:initial_k]
initial_candidates = set(top_k_indices)
print("Initial BM25 Ranking:")
for idx in top_k_indices:
    print(f"Doc {idx}: {corpus[idx]} (Score: {bm25_scores[idx]:.4f})")

# Step 3: Encoding Documents and Query
model = SentenceTransformer('all-MiniLM-L6-v2')
document_embeddings = model.encode(corpus, convert_to_tensor=True, show_progress_bar=False)
query_embedding = model.encode([query], convert_to_tensor=True, show_progress_bar=False)[0]
document_embeddings_np = document_embeddings.cpu().numpy()

# Ensure both tensors are on the same device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
document_embeddings = document_embeddings.to(device)
query_embedding = query_embedding.to(device)

# Step 4: Pre-compute Corpus-Wide Similarity Graph
corpus_similarity = cosine_similarity(document_embeddings_np)
print("\nPre-computed Similarity Matrix:")
for i in range(len(corpus)):
    print([f"{corpus_similarity[i, j]:.4f}" for j in range(len(corpus))])

# Step 5: GAR Iterative Re-ranking and Expansion
max_candidates = 5  # Budget for total candidates
num_iterations = 2  # Number of expansion iterations
current_candidates = initial_candidates.copy()

for iteration in range(num_iterations):
    # Select embeddings for current candidates
    current_indices = list(current_candidates)
    current_embeddings = document_embeddings[current_indices].cpu().numpy()  # Move to CPU for similarity

    # Compute similarity within current candidates
    current_similarity = cosine_similarity(current_embeddings)
    edge_index = []
    edge_weight = []
    added_pairs = set()
    k_neighbors = 2  # Number of neighbors per node
    for i in range(len(current_indices)):
        sim_scores = current_similarity[i].copy()
        sim_scores[i] = -1  # Exclude self
        top_neighbors = np.argsort(sim_scores)[::-1][:k_neighbors]
        for neighbor in top_neighbors:
            pair = tuple(sorted([i, neighbor]))
            if pair not in added_pairs:
                edge_index.append([i, neighbor])
                edge_index.append([neighbor, i])
                edge_weight.append(current_similarity[i, neighbor])
                edge_weight.append(current_similarity[i, neighbor])
                added_pairs.add(pair)

    if not edge_index:
        edge_index = torch.tensor([[i, j] for i in range(len(current_indices)) for j in range(len(current_indices)) if i != j], dtype=torch.long).t().contiguous()
        edge_weight = torch.tensor([current_similarity[i, j] for i in range(len(current_indices)) for j in range(len(current_indices)) if i != j], dtype=torch.float)
    else:
        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        edge_weight = torch.tensor(edge_weight, dtype=torch.float)

    # Node features: Combine document and query embeddings
    node_features = torch.stack([document_embeddings[idx] * query_embedding for idx in current_indices], dim=0).to(device)
    graph_data = Data(x=node_features, edge_index=edge_index, edge_attr=edge_weight)

    # Step 6: GNN Re-ranking
    class GNNReRanker(nn.Module):
        def __init__(self, input_dim, hidden_dim):
            super(GNNReRanker, self).__init__()
            self.conv1 = GCNConv(input_dim, hidden_dim)
            self.conv2 = GCNConv(hidden_dim, hidden_dim)
            self.scorer = nn.Linear(hidden_dim + input_dim, 1)
            self.dropout = nn.Dropout(0.5)

        def forward(self, data):
            x, edge_index, edge_weight = data.x, data.edge_index, data.edge_attr
            x = F.relu(self.conv1(x, edge_index, edge_weight))
            x = self.dropout(x)
            x = F.relu(self.conv2(x, edge_index, edge_weight))
            x = self.dropout(x)
            x = torch.cat([x, data.x], dim=-1)
            scores = self.scorer(x).squeeze(-1)
            return scores

    input_dim = node_features.shape[1]
    hidden_dim = 128
    gnn_model = GNNReRanker(input_dim, hidden_dim).to(device)
    optimizer = torch.optim.Adam(gnn_model.parameters(), lr=0.01, weight_decay=1e-4)

    # Train GNN with early stopping
    gnn_model.train()
    best_loss = float('inf')
    patience = 10
    patience_counter = 0
    for epoch in range(100):
        optimizer.zero_grad()
        scores = gnn_model(graph_data)
        loss = 0
        for i in range(len(current_indices)):
            for j in range(i + 1, len(current_indices)):
                idx_i, idx_j = current_indices[i], current_indices[j]
                if idx_i in relevance_labels and idx_j in relevance_labels:
                    if relevance_labels[idx_i] > relevance_labels[idx_j]:
                        loss += F.relu(scores[j] - scores[i] + 0.1)
                    elif relevance_labels[idx_j] > relevance_labels[idx_i]:
                        loss += F.relu(scores[i] - scores[j] + 0.1)
        loss.backward()
        optimizer.step()
        if epoch % 20 == 0:
            print(f"Iter {iteration}, Epoch {epoch}, Loss: {loss.item():.4f}")
        if loss.item() < best_loss:
            best_loss = loss.item()
            patience_counter = 0
        else:
            patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch}")
            break

    # Inference
    gnn_model.eval()
    with torch.no_grad():
        gnn_scores = gnn_model(graph_data)

    # Combine with BM25 scores
    bm25_scores_current = torch.tensor([bm25_scores[idx] for idx in current_indices], dtype=torch.float).to(device)
    bm25_scores_current = torch.sigmoid(bm25_scores_current)
    gnn_scores = torch.sigmoid(gnn_scores)
    current_scores = 0.5 * bm25_scores_current + 0.5 * gnn_scores

    # Re-rank and expand
    reranked_order = torch.argsort(current_scores, descending=True)
    top_indices = [current_indices[i] for i in reranked_order[:2]]  # Top 2 for expansion

    # Expand candidates using corpus similarity
    new_candidates = set()
    for idx in top_indices:
        sim_scores = corpus_similarity[idx]
        neighbor_indices = np.argsort(sim_scores)[::-1][1:3]  # Top 2 neighbors excluding self
        for neighbor in neighbor_indices:
            if len(current_candidates) < max_candidates and neighbor not in current_candidates:
                new_candidates.add(neighbor)
    current_candidates.update(new_candidates)
    print(f"Iter {iteration + 1} Candidates: {sorted(list(current_candidates))}")

# Step 7: Final Ranking
final_indices = list(current_candidates)
final_embeddings = document_embeddings[final_indices].cpu().numpy()
final_similarity = cosine_similarity(final_embeddings)
edge_index = []
edge_weight = []
added_pairs = set()
for i in range(len(final_indices)):
    sim_scores = final_similarity[i].copy()
    sim_scores[i] = -1
    top_neighbors = np.argsort(sim_scores)[::-1][:2]
    for neighbor in top_neighbors:
        pair = tuple(sorted([i, neighbor]))
        if pair not in added_pairs:
            edge_index.append([i, neighbor])
            edge_index.append([neighbor, i])
            edge_weight.append(final_similarity[i, neighbor])
            edge_weight.append(final_similarity[i, neighbor])
            added_pairs.add(pair)

edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
edge_weight = torch.tensor(edge_weight, dtype=torch.float)
node_features = torch.stack([document_embeddings[idx] * query_embedding for idx in final_indices], dim=0).to(device)
final_graph = Data(x=node_features, edge_index=edge_index, edge_attr=edge_weight)

gnn_model.eval()
with torch.no_grad():
    final_gnn_scores = gnn_model(final_graph)

bm25_final = torch.tensor([bm25_scores[idx] for idx in final_indices], dtype=torch.float).to(device)
bm25_final = torch.sigmoid(bm25_final)
final_gnn_scores = torch.sigmoid(final_gnn_scores)
final_scores = 0.5 * bm25_final + 0.5 * final_gnn_scores

final_ranking = torch.argsort(final_scores, descending=True)
print("\nFinal GAR Re-ranked List:")
for rank, rerank_idx in enumerate(final_ranking):
    orig_idx = final_indices[rerank_idx]
    print(f"Rank {rank+1}: Doc {orig_idx}: {corpus[orig_idx]} (Final Score: {final_scores[rerank_idx]:.4f})")

Initial BM25 Ranking:
Doc 0: The theory of relativity was developed by Albert Einstein in 1915. (Score: 1.5514)
Doc 4: The history of physics includes major discoveries by Newton and Einstein. (Score: 0.4619)
Doc 3: Special relativity describes the behavior of objects moving at high speeds. (Score: 0.4619)

Pre-computed Similarity Matrix:
['1.0000', '0.6399', '0.3947', '0.4308', '0.5990']
['0.6399', '1.0000', '0.3441', '0.3860', '0.6101']
['0.3947', '0.3441', '1.0000', '0.1448', '0.5143']
['0.4308', '0.3860', '0.1448', '1.0000', '0.3185']
['0.5990', '0.6101', '0.5143', '0.3185', '1.0000']
Iter 0, Epoch 0, Loss: 0.1989
Iter 0, Epoch 20, Loss: 0.0725
Early stopping at epoch 25
Iter 1 Candidates: [np.int64(0), np.int64(1), np.int64(3), np.int64(4)]
Iter 1, Epoch 0, Loss: 0.3001
Iter 1, Epoch 20, Loss: 0.0099
Early stopping at epoch 20
Iter 2 Candidates: [np.int64(0), np.int64(1), np.int64(3), np.int64(4)]

Final GAR Re-ranked List:
Rank 1: Doc 0: The theory of relativity was developed by 

##BM25 vs GNN vs GAR + Cosine vs GAR + Cosine + GNN

In [13]:
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings("ignore")

# Step 1: Synthetic Corpus and Query
corpus = [
    "The theory of relativity was developed by Albert Einstein in 1915.",
    "Einstein's work on general relativity revolutionized physics.",
    "Quantum mechanics emerged in the early 20th century with contributions from Planck and Heisenberg.",
    "Special relativity describes the behavior of objects moving at high speeds.",
    "The history of physics includes major discoveries by Newton and Einstein."
]
query = "What is the theory of relativity?"
relevance_labels = {0: 1, 3: 1, 1: 1, 4: 0, 2: 0}
k = 4

# Step 2: Query Augmentation with T5
def generate_augmented_queries(query, model, tokenizer, device='cpu', num_contexts=3):
    input_text = f"Generate keywords for: {query}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
    outputs = model.generate(
        inputs["input_ids"],
        max_length=50,
        num_return_sequences=num_contexts,
        do_sample=True,
        top_k=50,
        top_p=0.95
    )
    augmented_queries = [query]
    for output in outputs:
        context = tokenizer.decode(output, skip_special_tokens=True)
        augmented_queries.append(f"{query} {context}")
    return augmented_queries

# Initialize T5 model
try:
    t5_model_name = "t5-base"
    t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
    t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name)
except Exception:
    t5_model_name = "t5-small"
    t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
    t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
t5_model = t5_model.to(device)
augmented_queries = generate_augmented_queries(query, t5_model, t5_tokenizer, device)
print("Augmented Queries:")
for aq in augmented_queries:
    print(f"- {aq}")

# Step 3: BM25 Retrieval with Augmented Queries and Hybrid Fallback
tokenized_corpus = [doc.lower().split() for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)
retrieved_indices = set()
bm25_scores = np.zeros(len(corpus))

for aq in augmented_queries:
    tokenized_aq = aq.lower().split()
    scores = bm25.get_scores(tokenized_aq)
    top_indices = np.argsort(scores)[::-1][:k]
    for idx in top_indices:
        retrieved_indices.add(idx)
        bm25_scores[idx] = max(bm25_scores[idx], scores[idx])

# Hybrid retrieval
model = SentenceTransformer('all-MiniLM-L6-v2')
document_embeddings = model.encode(corpus, convert_to_tensor=True, show_progress_bar=False)
query_embedding = model.encode([query], convert_to_tensor=True, show_progress_bar=False)[0]
cosine_scores = cosine_similarity(query_embedding.unsqueeze(0).cpu().numpy(),
                                 document_embeddings.cpu().numpy())[0]
print("\nRaw BM25 and Cosine Scores for All Documents:")
for idx in range(len(corpus)):
    print(f"Doc {idx}: BM25={bm25_scores[idx]:.4f}, Cosine={cosine_scores[idx]:.4f}")

# Normalize and combine scores
bm25_norm = (bm25_scores - bm25_scores.min()) / (bm25_scores.max() - bm25_scores.min() + 1e-10)
cosine_norm = (cosine_scores - cosine_scores.min()) / (cosine_scores.max() - cosine_scores.min() + 1e-10)
combined_scores = 0.5 * bm25_norm + 0.5 * cosine_norm
top_k_indices = np.argsort(combined_scores)[::-1][:k]
initial_ranking = [(idx, corpus[idx], combined_scores[idx], bm25_scores[idx], cosine_scores[idx]) for idx in top_k_indices]
print("\nInitial Combined Ranking (GAR + Cosine):")
for idx, doc, comb_score, bm25_score, cos_score in initial_ranking:
    print(f"Doc {idx}: {doc} (Combined: {comb_score:.4f}, BM25: {bm25_score:.4f}, Cosine: {cos_score:.4f})")

# Step 4: Graph Construction
document_embeddings_np = document_embeddings.cpu().numpy()
selected_embeddings_np = document_embeddings_np[top_k_indices].copy()
print("\nSelected embeddings strides:", selected_embeddings_np.strides)
similarity_matrix = cosine_similarity(selected_embeddings_np)
print("\nSimilarity Matrix:")
for i in range(k):
    print([f"{similarity_matrix[i, j]:.4f}" for j in range(k)])

# k-NN (2 neighbors) for edges
edge_index = []
edge_weight = []
added_pairs = set()
for i in range(k):
    sim_scores = similarity_matrix[i].copy()
    sim_scores[i] = -1
    top_neighbors = np.argsort(sim_scores)[::-1][:2]
    for neighbor in top_neighbors:
        pair = tuple(sorted([i, neighbor]))
        if pair not in added_pairs:
            edge_index.append([i, neighbor])
            edge_index.append([neighbor, i])
            edge_weight.append(similarity_matrix[i, neighbor])
            edge_weight.append(similarity_matrix[i, neighbor])
            added_pairs.add(pair)

if not edge_index:
    print("Warning: No edges formed. Using dummy edge.")
    edge_index = torch.tensor([[0, 0]], dtype=torch.long).t().contiguous()
    edge_weight = torch.tensor([1.0], dtype=torch.float)
else:
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    edge_weight = torch.tensor(edge_weight, dtype=torch.float)
print("\nEdges formed:", edge_index.t().tolist())

# Node features
node_features = []
for idx in top_k_indices:
    doc_query_feature = document_embeddings[idx] * query_embedding
    cosine_feature = cosine_scores[idx]
    feature = np.concatenate([doc_query_feature.cpu().numpy(), [cosine_feature]])
    node_features.append(feature)
node_features = torch.tensor(node_features, dtype=torch.float)
graph_data = Data(x=node_features, edge_index=edge_index, edge_attr=edge_weight)

# Step 5: GNN for Re-ranking
class GNNReRanker(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(GNNReRanker, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.scorer = nn.Linear(hidden_dim + input_dim, 1)
        self.dropout = nn.Dropout(0.6)

    def forward(self, data):
        x, edge_index, edge_weight = data.x, data.edge_index, data.edge_attr
        x = self.conv1(x, edge_index, edge_weight)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.conv2(x, edge_index, edge_weight)
        x = F.relu(x)
        x = self.dropout(x)
        x = torch.cat([x, data.x], dim=-1)
        scores = self.scorer(x).squeeze(-1)
        return scores

# Initialize GNN
input_dim = node_features.shape[1]
hidden_dim = 128
gnn_model = GNNReRanker(input_dim, hidden_dim)
optimizer = torch.optim.Adam(gnn_model.parameters(), lr=0.01, weight_decay=1e-3)

# Step 6: Train GNN
gnn_model.train()
best_loss = float('inf')
patience = 10
patience_counter = 0
min_loss_threshold = 1e-4
for epoch in range(200):
    optimizer.zero_grad()
    scores = gnn_model(graph_data)
    loss = 0
    for i in range(k):
        for j in range(i + 1, k):
            idx_i, idx_j = top_k_indices[i], top_k_indices[j]
            if idx_i in relevance_labels and idx_j in relevance_labels:
                if relevance_labels[idx_i] > relevance_labels[idx_j]:
                    loss += F.relu(scores[j] - scores[i] + 0.2)  # Increased margin
                elif relevance_labels[idx_j] > relevance_labels[idx_i]:
                    loss += F.relu(scores[i] - scores[j] + 0.2)
    loss.backward()
    optimizer.step()
    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")
    if loss.item() < best_loss and loss.item() > min_loss_threshold:
        best_loss = loss.item()
        patience_counter = 0
    else:
        patience_counter += 1
    if patience_counter >= patience:
        print(f"Early stopping at epoch {epoch}")
        break

# Step 7: Inference
gnn_model.eval()
with torch.no_grad():
    gnn_scores = gnn_model(graph_data)
print("\nRaw GNN Scores:")
for idx, score in zip(top_k_indices, gnn_scores):
    print(f"Doc {idx}: {score:.4f}")

# Step 8: Combine Scores
bm25_top_k = torch.tensor([bm25_scores[idx] for idx in top_k_indices], dtype=torch.float)
bm25_top_k = torch.sigmoid(bm25_top_k)
gnn_scores = torch.sigmoid(gnn_scores)
final_scores = 0.3 * bm25_top_k + 0.7 * gnn_scores
print("\nFinal Score Components:")
for idx, bm25_s, gnn_s, final_s in zip(top_k_indices, bm25_top_k, gnn_scores, final_scores):
    print(f"Doc {idx}: BM25_Sigmoid={bm25_s:.4f}, GNN_Sigmoid={gnn_s:.4f}, Final={final_s:.4f}")

# Step 9: Final Re-ranking
reranked_indices = torch.argsort(final_scores, descending=True)
print("\nFinal Re-ranked List (GAR + Cosine + GNN):")
for rank, rerank_idx in enumerate(reranked_indices):
    orig_idx = top_k_indices[rerank_idx]
    print(f"Rank {rank+1}: Doc {orig_idx}: {corpus[orig_idx]} (Final Score: {final_scores[rerank_idx]:.4f})")

Augmented Queries:
- What is the theory of relativity?
- What is the theory of relativity? for: for: What is the theory of relativity? Generate keyword generators for: What is the theory of relativity?
- What is the theory of relativity? for: What are the keywords for: What is the theory of relativity?
- What is the theory of relativity? for: I want to know a generic term for: What is the theory of relativity?

Raw BM25 and Cosine Scores for All Documents:
Doc 0: BM25=4.6543, Cosine=0.7309
Doc 1: BM25=0.0000, Cosine=0.5700
Doc 2: BM25=0.6164, Cosine=0.2514
Doc 3: BM25=1.3857, Cosine=0.5889
Doc 4: BM25=1.3857, Cosine=0.4414

Initial Combined Ranking (GAR + Cosine):
Doc 0: The theory of relativity was developed by Albert Einstein in 1915. (Combined: 1.0000, BM25: 4.6543, Cosine: 0.7309)
Doc 3: Special relativity describes the behavior of objects moving at high speeds. (Combined: 0.5008, BM25: 1.3857, Cosine: 0.5889)
Doc 4: The history of physics includes major discoveries by Newton and E