<a href="https://colab.research.google.com/github/07rudrajoshi-pixel/Rudra_Joshi-2025A2PS1480H-/blob/main/IEEE_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q sentence-transformers faiss-cpu ranx rank-bm25 fastapi uvicorn datasets tqdm

In [None]:
import os
import json
import random
from tqdm import tqdm
from datasets import load_dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Seed both Python's and NumPy's global RNGs so the sampling below is repeatable.
seed = 42
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(seed)

In [None]:
# Make sure the raw and processed data folders exist before anything is written.
for data_dir in ("data/raw", "data/processed"):
    os.makedirs(data_dir, exist_ok=True)

# LOADING DATA

In [None]:
from datasets import load_dataset

# Recreate the data folders (a no-op when they already exist), then load the
# MS MARCO v1.1 training split and keep a seeded random sample of 50,000 rows.
for folder in ("data/processed", "data/raw"):
    os.makedirs(folder, exist_ok=True)

dataset = (
    load_dataset("ms_marco", "v1.1", split="train")
    .shuffle(seed=42)
    .select(range(50_000))
)

In [None]:
import json
import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses
from tqdm import tqdm
import os

In [None]:
# Peek at the first two rows to inspect the raw record layout.
dataset[:2]

In [None]:
# Number of rows kept after sampling.
print(len(dataset))

In [None]:
# Print a short, hand-written description of the fields found in each row.
for schema_line in (
    "Each row has a format",
    "the format is ",
    "   'passages':",
    "   'passage_text'",
    "   'url'",
    "   'query:'",
    "   'query_id':",
    "   'query_type'",
):
    print(schema_line)

In [None]:
# Show the dataset's column schema.
dataset.features

In [None]:
# (rows, columns) of the sampled dataset.
dataset.shape

In [None]:
# Shuffles with seed 42 and keeps the first 50,000 rows.
# NOTE(review): the loading cell already applies the same shuffle/select, so
# this step shuffles the 50K subset a second time rather than sampling anew.
print("choosing random 50K rows to work on ")
dataset = dataset.shuffle(seed=42).select(range(50000))
print(dataset.shape)
print("50K rows choosen randomly")

In [None]:
# Flatten every passage of every row into one document record.  Ids are
# assigned in encounter order ("d0", "d1", ...), and each record remembers the
# query it came from plus whether the passage was marked as selected.
documents = []

for row in dataset:
    passage_info = row["passages"]
    labelled_passages = zip(passage_info["passage_text"], passage_info["is_selected"])
    for passage, is_rel in labelled_passages:
        documents.append({
            "doc_id": f"d{len(documents)}",
            "text": passage,
            "is_relevant": int(is_rel),
            "query": row["query"]
        })

# Running counter kept for parity with the number of documents created.
doc_id = len(documents)

with open("data/processed/documents.json", "w") as f:
    json.dump(documents, f, indent=2)

In [None]:
# Report how many passage-level documents were collected.
print("Total documents: ",len(documents))

In [None]:
# Writes the document list to disk.
# NOTE(review): the cell that builds `documents` already writes this same file,
# so this is a repeat of that write.
with open("data/processed/documents.json", "w") as f:
    json.dump(documents, f, indent=2)

In [None]:
# Narration: how positives and negatives are told apart via `is_selected`.
print("We actually need query, positive documents and negative documents")
print("positve documents have is_selected==1")
print("negative documents have is_selected==0")

In [None]:
print("Generating required triplet")

# One triplet per row that has at least one selected and one unselected
# passage: the first selected passage is the positive, and the negative is
# drawn at random from the unselected ones.
train_triplets = []
for row in tqdm(dataset):
    passage_texts = row["passages"]["passage_text"]
    selected_flags = row["passages"]["is_selected"]

    pos_pool = [text for text, flag in zip(passage_texts, selected_flags) if flag == 1]
    neg_pool = [text for text, flag in zip(passage_texts, selected_flags) if flag == 0]

    # Rows lacking either kind of passage cannot form a triplet.
    if not (pos_pool and neg_pool):
        continue

    train_triplets.append({
        "query": row["query"],
        "positive": pos_pool[0],
        "negative": random.choice(neg_pool),
    })

In [None]:
# Report how many rows yielded a usable triplet.
print("Triplets generated")
print(f"Training triplets: {len(train_triplets)}")

In [None]:
# Shuffle the triplets in place, then keep the first 90% for training and the
# remaining 10% for validation; both parts are written out as JSON.
random.shuffle(train_triplets)

split_idx = int(0.9 * len(train_triplets))
train_triplets_split, val_triplets_split = (
    train_triplets[:split_idx],
    train_triplets[split_idx:],
)

for out_path, subset in (
    ("data/processed/train_triplets.json", train_triplets_split),
    ("data/processed/val_triplets.json", val_triplets_split),
):
    with open(out_path, "w") as f:
        json.dump(subset, f, indent=2)

print("Train triplets:", len(train_triplets_split))
print("Val triplets:", len(val_triplets_split))

In [None]:
# Keep a copy of every triplet (training and validation together) on disk.
with open("data/processed/train_triplet.json", "w") as f:
    json.dump(train_triplets, f, indent=2)

# Derive the (query, positive) pairs from the train/validation split that was
# already made and saved for the triplets.  Reshuffling `train_triplets` here
# and splitting again would produce a different partition, so validation
# triplets could end up among the training pairs (and vice versa), and the
# cell would give a different result every time it is re-run.
train_examples = [[t["query"], t["positive"]] for t in train_triplets_split]
val_examples = [[t["query"], t["positive"]] for t in val_triplets_split]

# Same boundary as the triplet split: everything before it is training data.
split_idx = len(train_examples)

with open("data/processed/train_pairs.json", "w") as f:
    json.dump(train_examples, f, indent=2)

with open("data/processed/val_pairs.json", "w") as f:
    json.dump(val_examples, f, indent=2)

print(f"Generated {len(train_examples)} training pairs and {len(val_examples)} validation pairs.")

In [None]:

# Pick 100 rows to act as evaluation queries.
# NOTE(review): these rows are drawn from the same `dataset` that the training
# triplets are built from, so the evaluation queries are not held out from
# fine-tuning — confirm whether that overlap is intended.
test_size=100
test_dataset=dataset.shuffle(seed=42).select(range(test_size))

In [None]:
# Map a string query id ("0", "1", ...) to the query text of each test row.
queries = {
    str(position): test_row["query"]
    for position, test_row in enumerate(test_dataset)
}

In [None]:
import json
import os

# Persist the id -> query-text mapping so later sections can reload it from disk.
os.makedirs("data/processed", exist_ok=True)


with open("data/processed/queries.json", "w") as f:
    json.dump(queries, f, indent=2)

print("Queries saved to data/processed/queries.json")

Queries saved to data/processed/queries.json


In [None]:
print("Loading triplets")
def load_triplets(path):
    """Read a JSON list of triplet dicts and wrap each one in an ``InputExample``.

    Args:
        path: JSON file whose items carry "query", "positive" and "negative" keys.

    Returns:
        A list of ``InputExample`` objects whose ``texts`` are ordered
        [query, positive, negative].
    """
    with open(path, "r") as handle:
        records = json.load(handle)

    return [
        InputExample(texts=[rec["query"], rec["positive"], rec["negative"]])
        for rec in records
    ]

print("Triplets loaded")

Loading triplets
Triplets loaded


In [None]:
# Rebind train_examples / val_examples to InputExample triplets read back from
# disk (replacing whatever these names held before).
train_examples = load_triplets("data/processed/train_triplets.json")
val_examples   = load_triplets("data/processed/val_triplets.json")

print(f"Train triplets: {len(train_examples)}")
print(f"Validation triplets: {len(val_examples)}")

Train triplets: 43578
Validation triplets: 4842


In [None]:
# Writes the train/validation triplet splits to disk.
# NOTE(review): the splitting cell already writes these two files with the same
# contents, so this is a repeat of that write.
with open("data/processed/train_triplets.json", "w") as f:
    json.dump(train_triplets_split, f, indent=2)

with open("data/processed/val_triplets.json", "w") as f:
    json.dump(val_triplets_split, f, indent=2)

In [None]:
import json
import os


# Turn the saved document records into a plain {doc_id: text} corpus file.
output_corpus_path = "data/processed/corpus.json"

os.makedirs("data/processed", exist_ok=True)

with open("data/processed/documents.json", "r") as f:
    documents_list = json.load(f)

corpus = {record["doc_id"]: record["text"] for record in documents_list}

with open(output_corpus_path, "w") as f:
    json.dump(corpus, f, indent=2)

print(f"Corpus saved with {len(corpus)} documents to {output_corpus_path}")

Corpus saved with 286033 documents to data/processed/corpus.json


In [None]:
def clean(text):
    """Lower-case ``text``, turn newlines into spaces and trim outer whitespace."""
    return text.lower().replace("\n", " ").strip()
print("Data cleaned")

Data cleaned


# MODEL TRAINING

In [None]:
# Load the pretrained MiniLM sentence encoder and move it to the GPU when one
# is available, otherwise keep it on the CPU.
print("Intialsing Encoder Model ")
model_name = "sentence-transformers/all-MiniLM-L6-v2"

model = SentenceTransformer(model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

print("Model device:", device)



Intialsing Encoder Model 


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model device: cpu


In [None]:
# Batches of 16 for both splits; only the training loader is shuffled.
print("Dataloaders Creation")
train_dataloader = DataLoader(
    train_examples,
    shuffle=True,
    batch_size=16
)

val_dataloader = DataLoader(
    val_examples,
    shuffle=False,
    batch_size=16
)

Dataloaders Creation


In [None]:
# Triplet loss over cosine distance with a margin of 0.3, bound to `model`.
train_loss = losses.TripletLoss(
    model=model,
    distance_metric=losses.TripletDistanceMetric.COSINE,
    triplet_margin=0.3
)
print(f"Model Loaded on {model.device}")
print("Loss Function Initialised")

Model Loaded on cpu
Loss Function Initialised


# FINE TUNING

In [None]:
# Training schedule: 2 epochs, with warm-up covering 10% of all training steps
# (batches per epoch x epochs).
num_epochs = 2
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)

output_path = "models/fine_tuned_retriever"
os.makedirs(output_path, exist_ok=True)
print("Fine tuning started")
# presumably turns off Weights & Biases logging during training — confirm
os.environ["WANDB_DISABLED"] = "true"

Fine tuning started


In [None]:
# Fine-tune on the triplet loader with the schedule configured above and ask
# fit() to write the result to `output_path`.
print("Training the model")
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path=output_path,
    show_progress_bar=True
)
print("Model Trained")

Training the model


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



KeyboardInterrupt: 

In [None]:
# Explicit save to the same folder that was passed to fit() as output_path.
# NOTE(review): this saves `model` in whatever state it is in, even if the
# training cell was interrupted before finishing.
model.save(output_path)
print("Fine-tuned model saved at:", output_path)

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Fine-tuned model saved at: models/fine_tuned_retriever


In [None]:
# Hard stop when no CUDA device is present.  The assertion runs before
# `device` is reassigned, so on a CPU-only runtime this cell raises and
# `device` keeps the value it had before.
print('gpu check')
assert torch.cuda.is_available(), "GPU is required for fine-tuning"

device = "cuda"
print("Using GPU:", torch.cuda.get_device_name(0))

gpu check


AssertionError: GPU is required for fine-tuning

In [None]:
# Reload the saved triplet splits as plain dicts; `train_triplets` is rebound
# to the training split only.
TRAIN_TRIPLETS_PATH = "data/processed/train_triplets.json"
VAL_TRIPLETS_PATH = "data/processed/val_triplets.json"

with open(TRAIN_TRIPLETS_PATH) as f:
    train_triplets = json.load(f)

with open(VAL_TRIPLETS_PATH) as f:
    val_triplets = json.load(f)

print(f"Train triplets: {len(train_triplets)}")
print(f"Validation triplets: {len(val_triplets)}")

Train triplets: 43578
Validation triplets: 4842


In [None]:
# Wrap each training triplet as an InputExample with texts ordered
# [query, positive, negative].
train_examples = [
    InputExample(
        texts=[t["query"], t["positive"], t["negative"]]
    )
    for t in train_triplets
]
print("Triplets converted to SentenceTransfprmer format") # required for triplet loss

In [None]:
# Shuffled training loader with batches of 32; drop_last=True discards the
# final batch when it is smaller than BATCH_SIZE.
BATCH_SIZE = 32

train_dataloader = DataLoader(
    train_examples,
    shuffle=True,
    batch_size=BATCH_SIZE,
    drop_last=True
)

In [None]:
# Start again from the pretrained MiniLM checkpoint, placed on whatever
# `device` currently holds.
model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    device=device
)

print("Model loaded on:", model.device)
print("Model loaded")

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Model loaded on: cpu
Model loaded


In [None]:
# Triplet loss over cosine distance with a margin of 0.3, bound to the freshly
# loaded `model`.
train_loss = losses.TripletLoss(
    model=model,
    distance_metric=losses.TripletDistanceMetric.COSINE,
    triplet_margin=0.3
)
print("Triplet loss defined")

Triplet loss defined


In [None]:
# 2 epochs; warm-up spans 10% of the total number of training steps.
NUM_EPOCHS = 2

warmup_steps = int(
    len(train_dataloader) * NUM_EPOCHS * 0.1
)

# Folder the fine-tuned retriever is written to (and later loaded from).
OUTPUT_PATH = "models/fine_tuned_dense_retriever"
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [None]:
# Run fine-tuning with the triplet objective and let fit() write the model to
# OUTPUT_PATH.
print("Starting fine-tuning")

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=NUM_EPOCHS,
    warmup_steps=warmup_steps,
    output_path=OUTPUT_PATH,
    show_progress_bar=True
)

print("Fine-tuning completed")

In [None]:
# Explicit save of the current model state to OUTPUT_PATH.
model.save(OUTPUT_PATH)
print(f"Model saved to {OUTPUT_PATH}")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Model saved to models/fine_tuned_dense_retriever


# FAISS RETRIEVAL

In [None]:
import time
import faiss
import numpy as np
import torch

from sentence_transformers import SentenceTransformer

In [None]:
# Load the fine-tuned retriever from disk onto the GPU when available,
# otherwise the CPU, and switch it to evaluation mode.
print("Loading Model")
MODEL_PATH = "models/fine_tuned_dense_retriever"

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

model = SentenceTransformer(MODEL_PATH, device=device)
model.eval()#disables dropouts

In [None]:
# Status message only; the model itself is loaded in the previous cell.
print("Fine tuned model loaded")

In [None]:
import time
import faiss
import numpy as np
import torch
import os
import json
from sentence_transformers import SentenceTransformer


os.makedirs("data/embeddings", exist_ok=True)

# Load the {doc_id: text} corpus; `doc_ids` fixes the row order of the
# embedding matrix (row i belongs to doc_ids[i]).
with open("data/processed/corpus.json", "r") as f:
    corpus = json.load(f)

doc_ids = list(corpus.keys())
corpus_texts = [corpus[doc_id] for doc_id in doc_ids]

# Encoding the whole corpus is expensive, so the result is cached on disk and
# reused on later runs.
embeddings_path = "data/embeddings/corpus_embeddings.npy"
if not os.path.exists(embeddings_path):
    print("Generating corpus embeddings...")
    model.eval()
    with torch.no_grad():
        corpus_embeddings = model.encode(corpus_texts, convert_to_numpy=True, show_progress_bar=True)
    np.save(embeddings_path, corpus_embeddings)
    print(f"Corpus embeddings saved to {embeddings_path}")
else:
    print(f"Corpus embeddings already exist at {embeddings_path}. Loading them.")
    corpus_embeddings = np.load(embeddings_path)

# The cache is keyed on the file path only.  If it was produced from a
# different corpus, row i would no longer belong to doc_ids[i] and retrieval
# would silently return the wrong documents, so fail loudly instead.
if corpus_embeddings.shape[0] != len(doc_ids):
    raise ValueError(
        f"{embeddings_path} holds {corpus_embeddings.shape[0]} vectors but the corpus "
        f"has {len(doc_ids)} documents; delete the file to regenerate the embeddings."
    )

print("loaded corpus and embeddings")

Generating corpus embeddings...


KeyboardInterrupt: 

In [None]:
# L2-normalise the corpus vectors; the call's return value is not used, the
# array itself is modified.
print("Normalising Embedding -needed for cosine similarity ")
faiss.normalize_L2(corpus_embeddings)

embedding_dim = corpus_embeddings.shape[1]
print("Embedding dimension:", embedding_dim)

In [None]:
# Exact (flat) inner-product index over the normalised corpus vectors.
index = faiss.IndexFlatIP(embedding_dim)
index.add(corpus_embeddings)

print("FAISS index built")
print("Number of documents indexed:", index.ntotal)

In [None]:
# Reload the saved test queries and split them into parallel id / text lists
# (same order in both).
with open("data/processed/queries.json", "r") as f:
    queries = json.load(f)

query_ids = list(queries)
query_texts = list(queries.values())

In [None]:
import faiss
import numpy as np
import os

os.makedirs("faiss_index", exist_ok=True)

embeddings = np.load("data/embeddings/corpus_embeddings.npy")
print("Embeddings shape:", embeddings.shape)

# The .npy file is written before any normalisation happens, so the vectors
# read back here are raw.  An inner-product index only ranks by cosine
# similarity when its vectors have unit length; without this step the index
# (which replaces the normalised one built earlier and is what gets saved and
# searched) would score by raw dot product while the queries are normalised.
# FAISS needs a C-contiguous float32 array.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
faiss.normalize_L2(embeddings)

dim = embeddings.shape[1]

index = faiss.IndexFlatIP(dim)
index.add(embeddings)

print("Total vectors indexed:", index.ntotal)

# Persist the index so the evaluation section can load it from disk.
faiss.write_index(index, "faiss_index/index.faiss")
print("FAISS index saved to faiss_index/index.faiss")

In [None]:
def encode_queries(queries, batch_size=32):
    """Embed query strings with the global ``model`` and L2-normalise them.

    Batching and normalisation are delegated to ``model.encode`` (via its
    ``batch_size`` and ``normalize_embeddings`` options) instead of being
    hand-rolled, which matches how the evaluation section encodes queries.

    Args:
        queries: Sequence of query strings.
        batch_size: Number of queries encoded per forward pass.

    Returns:
        A C-contiguous float32 array with one unit-length row per query,
        ready to be passed to a FAISS inner-product search.
    """
    embeddings = model.encode(
        list(queries),
        batch_size=batch_size,
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=False,
    )
    # FAISS requires contiguous float32 input.
    return np.ascontiguousarray(embeddings, dtype=np.float32)


# Embed all test queries (rows follow the order of `query_texts`).
query_embeddings = encode_queries(query_texts)
print("Query embeddings generated")

In [None]:
# Retrieve the 10 best-scoring documents per query and time the search call.
TOP_K = 10

start_time = time.time()

# scores[i][j] / indices[i][j]: score and corpus row of the j-th hit for query i.
scores, indices = index.search(query_embeddings, TOP_K)

end_time = time.time()

print(f"FAISS search completed in {end_time - start_time:.3f} seconds")

In [None]:
# Convert the raw FAISS output into {query_id: [{"doc_id", "score", "rank"}, ...]}.
retrieval_results = {}

for q_idx, qid in enumerate(query_ids):
    retrieved_docs = []

    for doc_idx, raw_score in zip(indices[q_idx], scores[q_idx]):
        # FAISS pads the result with -1 when fewer than k hits exist; indexing
        # doc_ids with -1 would silently return the last document.
        if doc_idx < 0:
            continue

        retrieved_docs.append({
            "doc_id": doc_ids[doc_idx],
            "score": float(raw_score),
            # Ranks are 1-based and follow the order FAISS returned.
            "rank": len(retrieved_docs) + 1
        })

    retrieval_results[qid] = retrieved_docs

In [None]:
# Persist the ranked hits for every query as JSON.
os.makedirs("results", exist_ok=True)

with open("results/faiss_results.json", "w") as f:
    json.dump(retrieval_results, f, indent=2)

print("FAISS retrieval results saved")

# EVALUATION METRICS

In [None]:
# Loading corpus
with open("data/processed/corpus.json") as f:
    corpus = json.load(f)

doc_ids = list(corpus.keys())

# Loading test queries
queries_file_path = "data/processed/queries.json"
with open(queries_file_path) as f:
    queries = json.load(f)

with open("data/processed/documents.json", "r") as f:
    all_documents = json.load(f)

# Documents only store the query *text* they came from, so relevance has to be
# joined on text.  Several query ids may share the same text; keep all of them
# (a plain text -> id dict would keep only one and leave the others without
# any relevance judgements).
query_text_to_ids = {}
for qid, text in queries.items():
    query_text_to_ids.setdefault(text, []).append(qid)

# qrels: {query_id: {doc_id: 1}} for every passage marked relevant.
qrels = {}
for doc in all_documents:
    if doc["is_relevant"] != 1:
        continue
    for query_id in query_text_to_ids.get(doc["query"], ()):
        qrels.setdefault(query_id, {})[doc["doc_id"]] = 1

with open("data/processed/qrels.json", "w") as f:
    json.dump(qrels, f, indent=2)

print(f"Corpus size: {len(corpus)}")
print(f"Test queries: {len(queries)}")
print(f"QRELs loaded for {len(qrels)} queries")

In [None]:
# Load the saved FAISS index from disk for evaluation.
index = faiss.read_index("faiss_index/index.faiss")
print("FAISS index loaded")
print("Indexed vectors:", index.ntotal)

In [None]:
import torch
from sentence_transformers import SentenceTransformer

# Pick the device at run time: hard-coding "cuda" makes this cell fail on a
# CPU-only runtime, while the rest of the notebook already falls back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = SentenceTransformer(
    "models/fine_tuned_dense_retriever", # Corrected model path
    device=device
)

In [None]:
# Encode the test queries in dict order as unit-length vectors.
query_texts = list(queries.values())

encode_options = dict(
    batch_size=32,
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True,
)
query_embeddings = model.encode(query_texts, **encode_options)

In [None]:
TOP_K = 10

# Top-K search for every query; row i of `indices` belongs to the i-th query.
distances, indices = index.search(query_embeddings, TOP_K)

# Map corpus row numbers back to document ids.  FAISS pads with -1 when fewer
# than K hits exist; those are skipped because doc_ids[-1] would silently
# return the last document.
retrieved_docs = {
    qid: [doc_ids[idx] for idx in indices[i] if idx >= 0]
    for i, qid in enumerate(queries.keys())
}

In [None]:
def precision_at_k(retrieved, relevant, k):
    """Fraction of the top-``k`` cutoff taken up by distinct relevant documents.

    Args:
        retrieved: Ranked list of document ids, best first.
        relevant: Iterable of relevant document ids.
        k: Cutoff; also the denominator, even if fewer than ``k`` were retrieved.
    """
    top_k_ids = set(retrieved[:k])
    n_hits = len(top_k_ids.intersection(set(relevant)))
    return n_hits / k

def recall_at_k(retrieved, relevant, k):
    """Share of the relevant documents that appear in the top ``k`` results.

    Returns 0 when there are no relevant documents.
    """
    gold = set(relevant)
    if not gold:
        return 0
    found = gold.intersection(retrieved[:k])
    return len(found) / len(gold)

def reciprocal_rank(retrieved, relevant):
    """Return 1 / rank of the first relevant document, or 0 if none is retrieved."""
    gold = set(relevant)
    first_hit_rank = next(
        (rank for rank, doc_id in enumerate(retrieved, start=1) if doc_id in gold),
        None,
    )
    if first_hit_rank is None:
        return 0
    return 1 / first_hit_rank

def average_precision(retrieved, relevant):
    """Average precision of a ranked list.

    Precision is taken at every position holding a relevant document, and the
    sum is divided by the total number of relevant documents (not only the
    ones retrieved).  Returns 0 when there are no relevant documents.
    """
    gold = set(relevant)
    if not gold:
        return 0

    precisions_at_hits = []
    for rank, doc_id in enumerate(retrieved, start=1):
        if doc_id in gold:
            # Number of hits so far (including this one) over the current rank.
            precisions_at_hits.append((len(precisions_at_hits) + 1) / rank)

    return sum(precisions_at_hits) / len(gold)

def ndcg_at_k(retrieved, relevant, k):
    """Binary-relevance nDCG at cutoff ``k``.

    ``relevant`` is converted to a set first, like the other metric helpers
    do.  That way duplicate ids cannot inflate the ideal DCG, any iterable
    (list, dict keys, iterator) is accepted, and membership tests are O(1).

    Args:
        retrieved: Ranked list of document ids, best first.
        relevant: Iterable of relevant document ids.
        k: Rank cutoff.

    Returns:
        DCG of the top ``k`` results divided by the DCG of an ideal ranking,
        or 0 when there are no relevant documents (or ``k`` is 0).
    """
    rel_set = set(relevant)

    # Each relevant hit at 0-based position i contributes 1 / log2(i + 2).
    dcg = 0.0
    for i, doc_id in enumerate(retrieved[:k]):
        if doc_id in rel_set:
            dcg += 1 / np.log2(i + 2)

    # Ideal ranking: every relevant document (up to k) placed at the top.
    idcg = sum(1 / np.log2(i + 2) for i in range(min(len(rel_set), k)))
    return dcg / idcg if idcg > 0 else 0

def hit_at_k(retrieved, relevant, k):
    """Return 1 if any of the top ``k`` retrieved ids is in ``relevant``, else 0."""
    for candidate in retrieved[:k]:
        if candidate in relevant:
            return 1
    return 0

In [None]:
from collections import defaultdict

# metric name -> list of per-query scores
metrics = defaultdict(list)

for qid in tqdm(queries.keys()):
    # retrieval_results maps qid -> ranked list of hit dicts; only the doc ids
    # are needed for scoring.
    ranked_ids = [hit["doc_id"] for hit in retrieval_results[qid]]

    # Queries without any relevance judgement get an empty relevant set.
    gold_ids = qrels.get(qid, {}).keys()

    per_query_scores = {
        "P@5": precision_at_k(ranked_ids, gold_ids, 5),
        "R@5": recall_at_k(ranked_ids, gold_ids, 5),
        "MRR": reciprocal_rank(ranked_ids, gold_ids),
        "MAP": average_precision(ranked_ids, gold_ids),
        "nDCG@10": ndcg_at_k(ranked_ids, gold_ids, 10),
        "Hit@10": hit_at_k(ranked_ids, gold_ids, 10),
    }
    for metric_name, value in per_query_scores.items():
        metrics[metric_name].append(value)

In [None]:
print("FINAL RESULTS")

# Mean of each metric over all evaluated queries, shown to four decimals.
for metric_name in metrics:
    mean_score = np.mean(metrics[metric_name])
    print(f"{metric_name}: {mean_score:.4f}")