In [1]:
%pip install datasets sentence_transformers

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install huggingface_hub

Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install openai

Note: you may need to restart the kernel to use updated packages.


In [4]:
%pip install ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [5]:
%pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [6]:
%pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [7]:
# Authenticate with the Hugging Face Hub.
# Create an access token in your Hugging Face account and keep it somewhere safe
# (e.g. a text file or password manager) so it can be reused.
# NOTE(review): the original note said the token is needed "once per section";
# presumably this means once per kernel session - confirm.
from huggingface_hub import login
login()

In [8]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from typing import List, Optional
from sklearn.metrics.pairwise import cosine_similarity
from openai import OpenAI  # Added for OpenAI embeddings
import numpy as np
from datasets import load_dataset
import time
import torch
import os
from tqdm import tqdm
from dotenv import load_dotenv
from torch.utils.data import DataLoader
import random

class TextSimilarityModel:
    def __init__(self, corpus_name, rel_name, model_name='all-MiniLM-L6-v2', top_k=10):
        """
        Initialize the model with datasets and pre-trained sentence transformer.
        """
        load_dotenv()
        self.model = SentenceTransformer(model_name)

        print(f"CUDA available: {torch.cuda.is_available()}")
        if torch.cuda.is_available():
            print(f"Current GPU: {torch.cuda.get_device_name(0)}")
            self.model.to('cuda')
        else:
            print("Using CPU")
        
        self.corpus_name = corpus_name
        self.rel_name = rel_name
        self.top_k = top_k
        self.session_state = {}
        self.load_data()


    def load_data(self):
        """
        Load and filter datasets based on test queries and documents.
        """
        # Load query and document datasets
        dataset_queries = load_dataset(self.corpus_name, "queries")
        dataset_docs = load_dataset(self.corpus_name, "corpus")

        # Extract queries and documents
        self.queries = dataset_queries["queries"]["text"]
        self.query_ids = dataset_queries["queries"]["_id"]
        self.documents = dataset_docs["corpus"]["text"]
        self.document_ids = dataset_docs["corpus"]["_id"]

                
        # Filter queries and documents and build relevant queries and documents mapping based on test set
        test_qrels = load_dataset(self.rel_name)["test"]
        self.filtered_test_query_ids = set(test_qrels["query-id"])
        self.filtered_test_doc_ids = set(test_qrels["corpus-id"])

        self.test_queries = [q for qid, q in zip(self.query_ids, self.queries) if qid in self.filtered_test_query_ids]
        self.test_query_ids = [qid for qid in self.query_ids if qid in self.filtered_test_query_ids]
        self.test_documents = [doc for did, doc in zip(self.document_ids, self.documents) if did in self.filtered_test_doc_ids]
        self.test_document_ids = [did for did in self.document_ids if did in self.filtered_test_doc_ids]

        self.test_query_id_to_relevant_doc_ids = {qid: [] for qid in self.test_query_ids}
        for qid, doc_id in zip(test_qrels["query-id"], test_qrels["corpus-id"]):
            if qid in self.test_query_id_to_relevant_doc_ids:
                self.test_query_id_to_relevant_doc_ids[qid].append(doc_id)
                
        ## Code Below this is used for creating the training set 
        # Build query and document id to text mapping
        self.query_id_to_text = {query_id:query for query_id, query in zip(self.query_ids, self.queries)}
        self.document_id_to_text = {document_id:document for document_id, document in zip(self.document_ids, self.documents)}

        # Build relevant queries and documents mapping based on train set
        train_qrels = load_dataset(self.rel_name)["train"]
        self.train_query_id_to_relevant_doc_ids = {qid: [] for qid in train_qrels["query-id"]}

        for qid, doc_id in zip(train_qrels["query-id"], train_qrels["corpus-id"]):
            if qid in self.train_query_id_to_relevant_doc_ids:
                # Append the document ID to the relevant doc mapping
                self.train_query_id_to_relevant_doc_ids[qid].append(doc_id)
        
        # Filter queries and documents and build relevant queries and documents mapping based on validation set  
        #TODO Put your code here. 
         ###########################################################################
        val_qrels = load_dataset(self.rel_name)["validation"]
        self.filtered_val_query_ids = set(val_qrels["query-id"])
        self.filtered_val_doc_ids = set(val_qrels["corpus-id"])        

        self.val_queries = [q for qid, q in zip(self.query_ids, self.queries) if qid in self.filtered_val_query_ids]
        self.val_query_ids = [qid for qid in self.query_ids if qid in self.filtered_val_query_ids]
        self.val_documents = [doc for did, doc in zip(self.document_ids, self.documents) if did in self.filtered_val_doc_ids]
        self.val_document_ids = [did for did in self.document_ids if did in self.filtered_val_doc_ids]

        self.val_query_id_to_relevant_doc_ids = {qid: [] for qid in self.val_query_ids}
        for qid, doc_id in zip(val_qrels["query-id"], val_qrels["corpus-id"]):
            if qid in self.val_query_id_to_relevant_doc_ids:
                self.val_query_id_to_relevant_doc_ids[qid].append(doc_id)
        ###########################################################################
        

    #Task 1: Encode Queries and Documents (10 Pts)

    def encode_with_glove(self, glove_file_path: str, sentences: list[str]) -> list[np.ndarray]:

        """
        # Inputs:
            - glove_file_path (str): Path to the GloVe embeddings file (e.g., "glove.6B.50d.txt").
            - sentences (list[str]): A list of sentences to encode.

        # Output:
            - list[np.ndarray]: A list of sentence embeddings 
            
        (1) Encodes sentences by averaging GloVe 50d vectors of words in each sentence.
        (2) Return a sequence of embeddings of the sentences.
        Download the glove vectors from here. 
        https://nlp.stanford.edu/data/glove.6B.zip
        Handle unknown words by using zero vectors
        """
        #TODO Put your code here. 
        ###########################################################################
        word_to_vec = {}
        embedding_dim = 50
    
        with open(glove_file_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], dtype='float32')
                word_to_vec[word] = vector  
        
        sentence_embeddings = []
        for sentence in tqdm(sentences, desc="GloVe encoding"):
            words = sentence.lower().split()
            
            vectors = []
            for word in words:
                if word in word_to_vec:
                    vectors.append(word_to_vec[word])
                else:
                    vectors.append(np.zeros(embedding_dim))
            
            if len(vectors) > 0:
                avg_vec = np.mean(vectors, axis=0)
            else:
                avg_vec = np.zeros(embedding_dim)
                
            sentence_embeddings.append(avg_vec)

        return sentence_embeddings
        ###########################################################################
    

    def encode_with_openai(
        self,
        sentences: List[str], 
        model: str = 'text-embedding-3-small',
        api_key: Optional[str] = None,
        batch_size: int = 100
    ) -> np.ndarray:
        """
        Encodes sentences using OpenAI's embedding API.
        
        # Inputs:
            - sentences (List[str]): A list of sentences to encode.
            - model (str): OpenAI model name. Options:
                * 'text-embedding-3-small' (1536 dims, $0.02/1M tokens) - RECOMMENDED
                * 'text-embedding-3-large' (3072 dims, $0.13/1M tokens)
                * 'text-embedding-ada-002' (1536 dims, legacy)
            - api_key (str, optional): OpenAI API key. If None, reads from OPENAI_API_KEY env variable
            - batch_size (int): Number of sentences to encode per API call (max 2048)
            
        Instructions:
        - Implement batched encoding with error handling
        - Add rate limiting (sleep between batches)
        
        Expected Cost for this Assignment:
        - ~4,000 texts (320 queries + 3,600 documents)
        - text-embedding-3-small: ~$0.08-0.10 per student
        - text-embedding-3-large: ~$0.50-0.65 per student
        
        Tips:
        - Use try-except for API errors
        - Implement retry logic with exponential backoff
        - Cache embeddings to avoid re-encoding
        - Monitor your usage at: https://platform.openai.com/usage
        """
        #TODO Put your code here.
        ###########################################################################
        cache_key = "cat_embed_" + "openai" + "_" + model
        if cache_key not in self.session_state:
            self.session_state[cache_key] = {}
        if api_key is None:
            print("Getting key from os")
            api_key = os.getenv("OPENAI_API_KEY")

        input = list(set(s for s in sentences if s not in self.session_state[cache_key]))
        openai_embedding = []

        if not api_key:
            raise Exception("openai api key not exist")
        
        if input:
            client = OpenAI(api_key = api_key)
            try:
                pbar = tqdm(range(0, len(input), batch_size), desc="OpenAI API requesting")
                for i in pbar:
                    batch = input[i : i + batch_size]
                    response = client.embeddings.create(
                        input = batch,
                        model = model
                    )
                    results = [data.embedding for data in response.data]
                    
                    for text, embedding_result in zip(batch, results):
                        self.session_state[cache_key][text] = embedding_result
                    time.sleep(0.5)
            except Exception as e:
                print(f"Error getting OpenAI embeddings: {e}")
                
        dim = 3072 if model == 'text-embedding-3-large' else 1536
        
        for s in sentences:
            if s in self.session_state[cache_key]:
                openai_embedding.append(self.session_state[cache_key][s])
            else:
                openai_embedding.append(np.zeros(dim))
                
        return np.array(openai_embedding)
        ###########################################################################

    #Task 2: Calculate Cosine Similarity and Rank Documents (20 Pts)
    
    def rank_documents(self, encoding_method: str = 'sentence_transformer') -> None:
        """
         # Inputs:
            - encoding_method (str): The method used for encoding queries/documents. 
                             Options: ['glove', 'sentence_transformer'].

        # Output:
            - None (updates self.query_id_to_ranked_doc_ids with ranked document IDs).
    
        (1) Compute cosine similarity between each document and the query
        (2) Rank documents for each query and save the results in a dictionary "query_id_to_ranked_doc_ids" 
            This will be used in "mean_average_precision"
            Example format {2: [125, 673], 35: [900, 822]}
        """
        if encoding_method == 'glove':
            # Note: Ensure "glove.6B.50d.txt" is downloaded and in the local directory
            query_embeddings = self.encode_with_glove("glove.6B.50d.txt", self.queries)
            document_embeddings = self.encode_with_glove("glove.6B.50d.txt", self.documents)
        elif encoding_method == 'sentence_transformer':
            query_embeddings = self.model.encode(self.queries, show_progress_bar=True)
            document_embeddings = self.model.encode(self.documents, show_progress_bar=True)
        elif encoding_method == 'openai':
            # Use environment variable or prompt for API key
            query_embeddings = self.encode_with_openai(self.queries)
            document_embeddings = self.encode_with_openai(self.documents)
        else:
            raise ValueError("Invalid encoding method. Choose 'glove' or 'sentence_transformer'.")
        
        
        #TODO Put your code here.
        ###########################################################################
         # define a dictionary to store the ranked documents for each query
        self.query_id_to_ranked_doc_ids = {}
        similarities = cosine_similarity(query_embeddings, document_embeddings)
        print("cosine similarity finish")

        query_id_to_idx = {query_id: idx for idx, query_id in enumerate(self.query_ids)}
        for query_id in tqdm(self.test_query_ids, desc=f"Ranking ({encoding_method})"):
            if query_id in query_id_to_idx:
                # i query score to all documents
                row_idx = query_id_to_idx[query_id]
                query_scores = similarities[row_idx]
                
                # argsort sort ascending，[::-1] to revert
                ranked_indices = np.argsort(query_scores)[::-1]
                
                # track document ID
                ranked_doc_ids = [self.document_ids[idx] for idx in ranked_indices]
                
                # save
                self.query_id_to_ranked_doc_ids[query_id] = ranked_doc_ids

        print("Task 2 Finish")
      
        ###########################################################################

    @staticmethod
    def average_precision(relevant_docs: list[str], candidate_docs: list[str]) -> float:
        """
        # Inputs:
            - relevant_docs (list[str]): A list of document IDs that are relevant to the query.
            - candidate_docs (list[str]): A list of document IDs ranked by the model.

        # Output:
            - float: The average precision score
    
        Compute average precision for a single query.
        """
        y_true = [1 if doc_id in relevant_docs else 0 for doc_id in candidate_docs]
        precisions = [np.mean(y_true[:k+1]) for k in range(len(y_true)) if y_true[k]]
        return np.mean(precisions) if precisions else 0

    #Task 3: Evaluate System Performance (10 Pts)
    
    def mean_average_precision(self) -> float:
        """
        # Inputs:
            - None (uses ranked documents stored in self.query_id_to_ranked_doc_ids).

        # Output:
            - float: The MAP score, computed as the mean of all average precision scores.
    
        (1) Compute mean average precision for all queries using the "average_precision" function.
        (2) Compute the mean of all average precision scores
        Return the mean average precision score
        
        reference: https://www.evidentlyai.com/ranking-metrics/mean-average-precision-map
        https://towardsdatascience.com/map-mean-average-precision-might-confuse-you-5956f1bfa9e2
        """
         #TODO Put your code here. 
        ###########################################################################
        ap_scores = []
        
        for query_id in self.test_query_ids:
            ranked_docs = self.query_id_to_ranked_doc_ids.get(query_id, [])
            
            relevant_docs = self.test_query_id_to_relevant_doc_ids.get(query_id, [])
            
            ap = self.average_precision(relevant_docs, ranked_docs)
            ap_scores.append(ap)

        if not ap_scores:
            return 0.0
            
        return sum(ap_scores) / len(ap_scores)
        ###########################################################################
    
    #Task 4: Ranking the Top 10 Documents based on Similarity Scores (10 Pts)

    def show_ranking_documents(self, encoding_method: str, example_query: str) -> None:
                
        """
        # Inputs:
            - example_query (str): A query string for which top-ranked documents should be displayed.

        # Output:
            - None (prints the ranked documents along with similarity scores).
        
        (1) rank documents with given query with cosine similarity scores
        (2) prints the top 10 results along with its similarity score.
        
        """
        #TODO Put your code here. 

        ###########################################################################
      
        # 1. Encode the single query based on the method
        # 2. Reshape check: Ensure query_embedding is (1, n_features)
        # 3. Calculate scores
        
        if encoding_method == 'glove':
            query_embedding = self.encode_with_glove("glove.6B.50d.txt", [example_query])
            doc_embs = self.encode_with_glove("glove.6B.50d.txt", self.documents)
        elif encoding_method == 'sentence_transformer':
            query_embedding = self.model.encode([example_query])
            doc_embs = self.model.encode(self.documents)
        elif encoding_method == 'openai':
            query_embedding = self.encode_with_openai([example_query])
            doc_embs = self.encode_with_openai(self.documents)
        else:
            print("Invalid encoding method.")
            return

        query_embedding = np.array(query_embedding).reshape(1, -1)            

        scores = cosine_similarity(query_embedding, doc_embs)[0]
        top_10_indices = np.argsort(scores)[::-1][:10]

        print(f"\n--- Top 10 Results for Query (Method: {encoding_method}) ---")
        print(f"Query: {example_query}\n")

        for i, idx in enumerate(top_10_indices):
            doc_id = self.document_ids[idx]
            score = scores[idx]
            text_snippet = self.document_id_to_text.get(doc_id, "No text available")[:150]
            
            print(f"Rank {i+1} | Score: {score:.4f} | ID: {doc_id}")
            print(f"Snippet: {text_snippet}...")
            print("-" * 30)
        ###########################################################################
      
    #Task 5: Fine-tune the sentence transformer model (25 Pts)
    # Students are not graded on achieving a high MAP score. 
    # The key is to show understanding, experimentation, and thoughtful analysis.
    
    def fine_tune_model(self, batch_size: int = 32, num_epochs: int = 3, save_model_path: str = "finetuned_senBERT") -> None:

        """
        Fine-tunes the model using MultipleNegativesRankingLoss.
        (1) Prepare training examples from `self.prepare_training_examples()`
        (2) Experiment with [anchor, positive] vs [anchor, positive, negative]
        (3) Define a loss function (`MultipleNegativesRankingLoss`)
        (4) Freeze all model layers except the final layers
        (5) Train the model with the specified learning rate
        (6) Save the fine-tuned model
        """
        #TODO Put your code here.
        ###########################################################################
        train_examples = self.prepare_training_examples()
        if not train_examples:
            print("Error: No training examples found.")
            return
        
        train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
        train_loss = losses.MultipleNegativesRankingLoss(model=self.model)
        target_model = self.model[0].auto_model

        print("--- Freezing layers ---")
        for name, param in target_model.named_parameters():
            if any(key in name for key in ["layer.5", "layer.4", "pooler"]):
                param.requires_grad = True
                print(f"Trainable: {name}")
            else:
                param.requires_grad = False

        print(f"Starting training for {num_epochs} epochs...")
        self.model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=num_epochs,
            warmup_steps=int(len(train_dataloader) * 0.1),
            output_path=save_model_path,
            optimizer_params={'lr': 2e-5},
            show_progress_bar=True
        )

        print(f"Finish: {save_model_path}")
        ###########################################################################

    # Take a careful look at how the training set is created
    def prepare_training_examples(self) -> list[InputExample]:

        """
        Prepares training examples from the training data.
        # Inputs:
            - None (uses self.train_query_id_to_relevant_doc_ids to create training pairs).

         # Output:
            Output: - list[InputExample]: A list of training samples containing [anchor, positive] or [anchor, positive, negative].
            
        """
        train_examples = []
        all_doc_ids = list(self.document_id_to_text.keys())
        for qid, doc_ids in tqdm(self.train_query_id_to_relevant_doc_ids.items(), desc="Generating Triplets"):
            relevant_set = set(doc_ids)
            for doc_id in doc_ids:
                anchor = self.query_id_to_text[qid]
                positive = self.document_id_to_text[doc_id]
                # TODO: Select random negative examples that are not relevant to the query.
                negative_id = random.choice(all_doc_ids)
                while negative_id in relevant_set:
                    negative_id = random.choice(all_doc_ids)
                
                negative = self.document_id_to_text[negative_id]
                
                # TODO: Create list[InputExample] of type [anchor, positive, negative]
                train_examples.append(InputExample(texts=[anchor, positive, negative]))
                #train_examples.append(InputExample(texts=[anchor, positive]))

        return train_examples


In [9]:
# Build the retrieval model on the medical NFCorpus dataset and its relevance judgements
model = TextSimilarityModel(corpus_name="BeIR/nfcorpus", rel_name="BeIR/nfcorpus-qrels")

# Baseline 1: the pre-trained Sentence Transformer
print("Ranking with sentence_transformer...")
model.rank_documents(encoding_method='sentence_transformer')
sbert_map = model.mean_average_precision()
print("SBERT Mean Average Precision:", sbert_map)

# Baseline 2: averaged GloVe vectors (requires 'glove.6B.50d.txt' in the working directory)
print("\nRanking with glove...")
model.rank_documents(encoding_method='glove')
glove_map = model.mean_average_precision()
print("GloVe Mean Average Precision:", glove_map)

# Qualitative check: print the top-ranked documents for one sample query per method
sample_query = "Breast Cancer Cells Feed on Cholesterol"
for method in ("glove", "sentence_transformer"):
    model.show_ranking_documents(method, sample_query)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


CUDA available: True
Current GPU: NVIDIA GeForce RTX 4060 Laptop GPU
Ranking with sentence_transformer...


Batches:   0%|          | 0/102 [00:00<?, ?it/s]

Batches:   0%|          | 0/114 [00:00<?, ?it/s]

cosine similarity finish


Ranking (sentence_transformer): 100%|████████████████████████████████████████████████| 323/323 [01:31<00:00,  3.51it/s]


Task 2 Finish
SBERT Mean Average Precision: 0.16038328234687443

Ranking with glove...


GloVe encoding: 100%|███████████████████████████████████████████████████████████| 3237/3237 [00:00<00:00, 39056.23it/s]
GloVe encoding: 100%|████████████████████████████████████████████████████████████| 3633/3633 [00:00<00:00, 5658.15it/s]


cosine similarity finish


Ranking (glove): 100%|███████████████████████████████████████████████████████████████| 323/323 [01:50<00:00,  2.92it/s]


Task 2 Finish
GloVe Mean Average Precision: 0.024486486169480685


GloVe encoding: 100%|████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]
GloVe encoding: 100%|████████████████████████████████████████████████████████████| 3633/3633 [00:00<00:00, 4414.04it/s]



--- Top 10 Results for Query (Method: glove) ---
Query: Breast Cancer Cells Feed on Cholesterol

Rank 1 | Score: 0.8676 | ID: MED-2111
Snippet: Coronary artery disease is essentially nonexistent in cultures whose nutrition assures cholesterol levels <150 mg/dl. Patients with advanced coronary ...
------------------------------
Rank 2 | Score: 0.8423 | ID: MED-5160
Snippet: Pine needles (Pinus densiflora Siebold et Zuccarini) have long been used as a traditional health-promoting medicinal food in Korea. To investigate the...
------------------------------
Rank 3 | Score: 0.8418 | ID: MED-3129
Snippet: BRCA1 mutations have been associated with hereditary breast cancer only. Recent studies indicate that a subgroup of sporadic breast cancer might also ...
------------------------------
Rank 4 | Score: 0.8351 | ID: MED-865
Snippet: Prostate cancer remains the second leading cause of cancer deaths among American men. Earlier diagnosis increases survival rate in patients. However, ...
------

In [14]:
# Evaluate OpenAI embeddings on a fresh model instance
model = TextSimilarityModel(corpus_name="BeIR/nfcorpus", rel_name="BeIR/nfcorpus-qrels")
print("\nRanking with openai...")
model.rank_documents(encoding_method='openai')
openai_map = model.mean_average_precision()
print("openai Mean Average Precision:", openai_map)

# Qualitative check with the same sample query used for the other encoders
model.show_ranking_documents(
    encoding_method="openai",
    example_query="Breast Cancer Cells Feed on Cholesterol",
)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


CUDA available: True
Current GPU: NVIDIA GeForce RTX 4060 Laptop GPU

Ranking with openai...
Getting key from os


OpenAI API requesting: 100%|███████████████████████████████████████████████████████████| 33/33 [00:41<00:00,  1.25s/it]


Getting key from os


OpenAI API requesting: 100%|███████████████████████████████████████████████████████████| 36/36 [00:48<00:00,  1.36s/it]


cosine similarity finish


Ranking (openai): 100%|██████████████████████████████████████████████████████████████| 323/323 [01:45<00:00,  3.05it/s]


Task 2 Finish
openai Mean Average Precision: 0.20002033024968435
Getting key from os
Getting key from os

--- Top 10 Results for Query (Method: openai) ---
Query: Breast Cancer Cells Feed on Cholesterol

Rank 1 | Score: 0.5738 | ID: MED-2434
Snippet: The specific role of dietary fat in breast cancer progression is unclear, although a low-fat diet was associated with decreased recurrence of estrogen...
------------------------------
Rank 2 | Score: 0.5047 | ID: MED-2439
Snippet: While many factors are involved in the etiology of cancer, it has been clearly established that diet significantly impacts one’s risk for this disease...
------------------------------
Rank 3 | Score: 0.4960 | ID: MED-2427
Snippet: Lipid rafts/caveolae are membrane platforms for signaling molecules that regulate various cellular functions, including cell survival. To better under...
------------------------------
Rank 4 | Score: 0.4907 | ID: MED-3551
Snippet: Breast cancer is the leading cause of cancer-related 

In [10]:
# Fine-tune the all-MiniLM-L6-v2 sentence transformer, then re-evaluate it on the test queries
model = TextSimilarityModel(corpus_name="BeIR/nfcorpus", rel_name="BeIR/nfcorpus-qrels")

# Adjust batch size and epochs as needed
fine_tune_options = {
    "batch_size": 3,
    "num_epochs": 2,
    "save_model_path": "finetuned_senBERT_train_v2",
}
model.fine_tune_model(**fine_tune_options)

# rank_documents defaults to the (now fine-tuned) sentence transformer
model.rank_documents(encoding_method='sentence_transformer')
map_score = model.mean_average_precision()
print("Mean Average Precision:", map_score)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


CUDA available: True
Current GPU: NVIDIA GeForce RTX 4060 Laptop GPU


Generating Triplets: 100%|███████████████████████████████████████████████████████| 2590/2590 [00:00<00:00, 4505.25it/s]


--- Freezing layers ---
Trainable: encoder.layer.4.attention.self.query.weight
Trainable: encoder.layer.4.attention.self.query.bias
Trainable: encoder.layer.4.attention.self.key.weight
Trainable: encoder.layer.4.attention.self.key.bias
Trainable: encoder.layer.4.attention.self.value.weight
Trainable: encoder.layer.4.attention.self.value.bias
Trainable: encoder.layer.4.attention.output.dense.weight
Trainable: encoder.layer.4.attention.output.dense.bias
Trainable: encoder.layer.4.attention.output.LayerNorm.weight
Trainable: encoder.layer.4.attention.output.LayerNorm.bias
Trainable: encoder.layer.4.intermediate.dense.weight
Trainable: encoder.layer.4.intermediate.dense.bias
Trainable: encoder.layer.4.output.dense.weight
Trainable: encoder.layer.4.output.dense.bias
Trainable: encoder.layer.4.output.LayerNorm.weight
Trainable: encoder.layer.4.output.LayerNorm.bias
Trainable: encoder.layer.5.attention.self.query.weight
Trainable: encoder.layer.5.attention.self.query.bias
Trainable: encoder.l

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,1.878652
1000,1.806926
1500,1.630516
2000,1.539556
2500,1.530074
3000,1.472565
3500,1.488967
4000,1.47471
4500,1.436949
5000,1.452277


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Finish: finetuned_senBERT_train_v2


Batches:   0%|          | 0/102 [00:00<?, ?it/s]

Batches:   0%|          | 0/114 [00:00<?, ?it/s]

cosine similarity finish


Ranking (sentence_transformer): 100%|████████████████████████████████████████████████| 323/323 [01:50<00:00,  2.92it/s]


Task 2 Finish
Mean Average Precision: 0.20584097819372285


In [13]:
class TextSimilarityModel_fine_tune_positive:
    def __init__(self, corpus_name, rel_name, model_name='all-MiniLM-L6-v2', top_k=10):
        """
        Initialize the model with datasets and pre-trained sentence transformer.
        """
        load_dotenv()
        self.model = SentenceTransformer(model_name)

        print(f"CUDA available: {torch.cuda.is_available()}")
        if torch.cuda.is_available():
            print(f"Current GPU: {torch.cuda.get_device_name(0)}")
            self.model.to('cuda')
        else:
            print("Using CPU")
        
        self.corpus_name = corpus_name
        self.rel_name = rel_name
        self.top_k = top_k
        self.session_state = {}
        self.load_data()


    def load_data(self):
        """
        Load and filter datasets based on test queries and documents.
        """
        # Load query and document datasets
        dataset_queries = load_dataset(self.corpus_name, "queries")
        dataset_docs = load_dataset(self.corpus_name, "corpus")

        # Extract queries and documents
        self.queries = dataset_queries["queries"]["text"]
        self.query_ids = dataset_queries["queries"]["_id"]
        self.documents = dataset_docs["corpus"]["text"]
        self.document_ids = dataset_docs["corpus"]["_id"]

                
        # Filter queries and documents and build relevant queries and documents mapping based on test set
        test_qrels = load_dataset(self.rel_name)["test"]
        self.filtered_test_query_ids = set(test_qrels["query-id"])
        self.filtered_test_doc_ids = set(test_qrels["corpus-id"])

        self.test_queries = [q for qid, q in zip(self.query_ids, self.queries) if qid in self.filtered_test_query_ids]
        self.test_query_ids = [qid for qid in self.query_ids if qid in self.filtered_test_query_ids]
        self.test_documents = [doc for did, doc in zip(self.document_ids, self.documents) if did in self.filtered_test_doc_ids]
        self.test_document_ids = [did for did in self.document_ids if did in self.filtered_test_doc_ids]

        self.test_query_id_to_relevant_doc_ids = {qid: [] for qid in self.test_query_ids}
        for qid, doc_id in zip(test_qrels["query-id"], test_qrels["corpus-id"]):
            if qid in self.test_query_id_to_relevant_doc_ids:
                self.test_query_id_to_relevant_doc_ids[qid].append(doc_id)
                
        ## Code Below this is used for creating the training set 
        # Build query and document id to text mapping
        self.query_id_to_text = {query_id:query for query_id, query in zip(self.query_ids, self.queries)}
        self.document_id_to_text = {document_id:document for document_id, document in zip(self.document_ids, self.documents)}

        # Build relevant queries and documents mapping based on train set
        train_qrels = load_dataset(self.rel_name)["train"]
        self.train_query_id_to_relevant_doc_ids = {qid: [] for qid in train_qrels["query-id"]}

        for qid, doc_id in zip(train_qrels["query-id"], train_qrels["corpus-id"]):
            if qid in self.train_query_id_to_relevant_doc_ids:
                # Append the document ID to the relevant doc mapping
                self.train_query_id_to_relevant_doc_ids[qid].append(doc_id)
        
        # Filter queries and documents and build relevant queries and documents mapping based on validation set  
        #TODO Put your code here. 
         ###########################################################################
        val_qrels = load_dataset(self.rel_name)["validation"]
        self.filtered_val_query_ids = set(val_qrels["query-id"])
        self.filtered_val_doc_ids = set(val_qrels["corpus-id"])        

        self.val_queries = [q for qid, q in zip(self.query_ids, self.queries) if qid in self.filtered_val_query_ids]
        self.val_query_ids = [qid for qid in self.query_ids if qid in self.filtered_val_query_ids]
        self.val_documents = [doc for did, doc in zip(self.document_ids, self.documents) if did in self.filtered_val_doc_ids]
        self.val_document_ids = [did for did in self.document_ids if did in self.filtered_val_doc_ids]

        self.val_query_id_to_relevant_doc_ids = {qid: [] for qid in self.val_query_ids}
        for qid, doc_id in zip(val_qrels["query-id"], val_qrels["corpus-id"]):
            if qid in self.val_query_id_to_relevant_doc_ids:
                self.val_query_id_to_relevant_doc_ids[qid].append(doc_id)
        ###########################################################################
        

    #Task 1: Encode Queries and Documents (10 Pts)

    def encode_with_glove(self, glove_file_path: str, sentences: list[str]) -> list[np.ndarray]:

        """
        # Inputs:
            - glove_file_path (str): Path to the GloVe embeddings file (e.g., "glove.6B.50d.txt").
            - sentences (list[str]): A list of sentences to encode.

        # Output:
            - list[np.ndarray]: A list of sentence embeddings 
            
        (1) Encodes sentences by averaging GloVe 50d vectors of words in each sentence.
        (2) Return a sequence of embeddings of the sentences.
        Download the glove vectors from here. 
        https://nlp.stanford.edu/data/glove.6B.zip
        Handle unknown words by using zero vectors
        """
        #TODO Put your code here. 
        ###########################################################################
        word_to_vec = {}
        embedding_dim = 50
    
        with open(glove_file_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], dtype='float32')
                word_to_vec[word] = vector  
        
        sentence_embeddings = []
        for sentence in tqdm(sentences, desc="GloVe encoding"):
            words = sentence.lower().split()
            
            vectors = []
            for word in words:
                if word in word_to_vec:
                    vectors.append(word_to_vec[word])
                else:
                    vectors.append(np.zeros(embedding_dim))
            
            if len(vectors) > 0:
                avg_vec = np.mean(vectors, axis=0)
            else:
                avg_vec = np.zeros(embedding_dim)
                
            sentence_embeddings.append(avg_vec)

        return sentence_embeddings
        ###########################################################################
    

    def encode_with_openai(
        self,
        sentences: List[str], 
        model: str = 'text-embedding-3-small',
        api_key: Optional[str] = None,
        batch_size: int = 100
    ) -> np.ndarray:
        """
        Encodes sentences using OpenAI's embedding API.
        
        # Inputs:
            - sentences (List[str]): A list of sentences to encode.
            - model (str): OpenAI model name. Options:
                * 'text-embedding-3-small' (1536 dims, $0.02/1M tokens) - RECOMMENDED
                * 'text-embedding-3-large' (3072 dims, $0.13/1M tokens)
                * 'text-embedding-ada-002' (1536 dims, legacy)
            - api_key (str, optional): OpenAI API key. If None, reads from OPENAI_API_KEY env variable
            - batch_size (int): Number of sentences to encode per API call (max 2048)
            
        Instructions:
        - Implement batched encoding with error handling
        - Add rate limiting (sleep between batches)
        
        Expected Cost for this Assignment:
        - ~4,000 texts (320 queries + 3,600 documents)
        - text-embedding-3-small: ~$0.08-0.10 per student
        - text-embedding-3-large: ~$0.50-0.65 per student
        
        Tips:
        - Use try-except for API errors
        - Implement retry logic with exponential backoff
        - Cache embeddings to avoid re-encoding
        - Monitor your usage at: https://platform.openai.com/usage
        """
        #TODO Put your code here.
        ###########################################################################
        cache_key = "cat_embed_" + "openai" + "_" + model
        if cache_key not in self.session_state:
            self.session_state[cache_key] = {}
        if api_key is None:
            print("Getting key from os")
            api_key = os.getenv("OPENAI_API_KEY")

        input = list(set(s for s in sentences if s not in self.session_state[cache_key]))
        openai_embedding = []

        if not api_key:
            raise Exception("openai api key not exist")
        
        if input:
            client = OpenAI(api_key = api_key)
            try:
                pbar = tqdm(range(0, len(input), batch_size), desc="OpenAI API requesting")
                for i in pbar:
                    batch = input[i : i + batch_size]
                    response = client.embeddings.create(
                        input = batch,
                        model = model
                    )
                    results = [data.embedding for data in response.data]
                    
                    for text, embedding_result in zip(batch, results):
                        self.session_state[cache_key][text] = embedding_result
                    time.sleep(0.5)
            except Exception as e:
                print(f"Error getting OpenAI embeddings: {e}")
                
        dim = 3072 if model == 'text-embedding-3-large' else 1536
        
        for s in sentences:
            if s in self.session_state[cache_key]:
                openai_embedding.append(self.session_state[cache_key][s])
            else:
                openai_embedding.append(np.zeros(dim))
                
        return np.array(openai_embedding)
        ###########################################################################

    #Task 2: Calculate Cosine Similarity and Rank Documents (20 Pts)
    
    def rank_documents(self, encoding_method: str = 'sentence_transformer') -> None:
        """
         # Inputs:
            - encoding_method (str): The method used for encoding queries/documents. 
                             Options: ['glove', 'sentence_transformer'].

        # Output:
            - None (updates self.query_id_to_ranked_doc_ids with ranked document IDs).
    
        (1) Compute cosine similarity between each document and the query
        (2) Rank documents for each query and save the results in a dictionary "query_id_to_ranked_doc_ids" 
            This will be used in "mean_average_precision"
            Example format {2: [125, 673], 35: [900, 822]}
        """
        if encoding_method == 'glove':
            # Note: Ensure "glove.6B.50d.txt" is downloaded and in the local directory
            query_embeddings = self.encode_with_glove("glove.6B.50d.txt", self.queries)
            document_embeddings = self.encode_with_glove("glove.6B.50d.txt", self.documents)
        elif encoding_method == 'sentence_transformer':
            query_embeddings = self.model.encode(self.queries, show_progress_bar=True)
            document_embeddings = self.model.encode(self.documents, show_progress_bar=True)
        elif encoding_method == 'openai':
            # Use environment variable or prompt for API key
            query_embeddings = self.encode_with_openai(self.queries)
            document_embeddings = self.encode_with_openai(self.documents)
        else:
            raise ValueError("Invalid encoding method. Choose 'glove' or 'sentence_transformer'.")
        
        
        #TODO Put your code here.
        ###########################################################################
         # define a dictionary to store the ranked documents for each query
        self.query_id_to_ranked_doc_ids = {}
        similarities = cosine_similarity(query_embeddings, document_embeddings)
        print("cosine similarity finish")

        query_id_to_idx = {query_id: idx for idx, query_id in enumerate(self.query_ids)}
        for query_id in tqdm(self.test_query_ids, desc=f"Ranking ({encoding_method})"):
            if query_id in query_id_to_idx:
                # i query score to all documents
                row_idx = query_id_to_idx[query_id]
                query_scores = similarities[row_idx]
                
                # argsort sort ascending，[::-1] to revert
                ranked_indices = np.argsort(query_scores)[::-1]
                
                # track document ID
                ranked_doc_ids = [self.document_ids[idx] for idx in ranked_indices]
                
                # save
                self.query_id_to_ranked_doc_ids[query_id] = ranked_doc_ids

        print("Task 2 Finish")
      
        ###########################################################################

    @staticmethod
    def average_precision(relevant_docs: list[str], candidate_docs: list[str]) -> float:
        """
        # Inputs:
            - relevant_docs (list[str]): A list of document IDs that are relevant to the query.
            - candidate_docs (list[str]): A list of document IDs ranked by the model.

        # Output:
            - float: The average precision score
    
        Compute average precision for a single query.
        """
        y_true = [1 if doc_id in relevant_docs else 0 for doc_id in candidate_docs]
        precisions = [np.mean(y_true[:k+1]) for k in range(len(y_true)) if y_true[k]]
        return np.mean(precisions) if precisions else 0

    #Task 3: Evaluate System Performance (10 Pts)
    
    def mean_average_precision(self) -> float:
        """
        # Inputs:
            - None (uses ranked documents stored in self.query_id_to_ranked_doc_ids).

        # Output:
            - float: The MAP score, computed as the mean of all average precision scores.
    
        (1) Compute mean average precision for all queries using the "average_precision" function.
        (2) Compute the mean of all average precision scores
        Return the mean average precision score
        
        reference: https://www.evidentlyai.com/ranking-metrics/mean-average-precision-map
        https://towardsdatascience.com/map-mean-average-precision-might-confuse-you-5956f1bfa9e2
        """
         #TODO Put your code here. 
        ###########################################################################
        ap_scores = []
        
        for query_id in self.test_query_ids:
            ranked_docs = self.query_id_to_ranked_doc_ids.get(query_id, [])
            
            relevant_docs = self.test_query_id_to_relevant_doc_ids.get(query_id, [])
            
            ap = self.average_precision(relevant_docs, ranked_docs)
            ap_scores.append(ap)

        if not ap_scores:
            return 0.0
            
        return sum(ap_scores) / len(ap_scores)
        ###########################################################################
    
    #Task 4: Ranking the Top 10 Documents based on Similarity Scores (10 Pts)

    def show_ranking_documents(self, encoding_method: str, example_query: str) -> None:
                
        """
        # Inputs:
            - example_query (str): A query string for which top-ranked documents should be displayed.

        # Output:
            - None (prints the ranked documents along with similarity scores).
        
        (1) rank documents with given query with cosine similarity scores
        (2) prints the top 10 results along with its similarity score.
        
        """
        #TODO Put your code here. 

        ###########################################################################
      
        # 1. Encode the single query based on the method
        # 2. Reshape check: Ensure query_embedding is (1, n_features)
        # 3. Calculate scores
        
        if encoding_method == 'glove':
            query_embedding = self.encode_with_glove("glove.6B.50d.txt", [example_query])
            doc_embs = self.encode_with_glove("glove.6B.50d.txt", self.documents)
        elif encoding_method == 'sentence_transformer':
            query_embedding = self.model.encode([example_query])
            doc_embs = self.model.encode(self.documents)
        elif encoding_method == 'openai':
            query_embedding = self.encode_with_openai([example_query])
            doc_embs = self.encode_with_openai(self.documents)
        else:
            print("Invalid encoding method.")
            return

        query_embedding = np.array(query_embedding).reshape(1, -1)            

        scores = cosine_similarity(query_embedding, doc_embs)[0]
        top_10_indices = np.argsort(scores)[::-1][:10]

        print(f"\n--- Top 10 Results for Query (Method: {encoding_method}) ---")
        print(f"Query: {example_query}\n")

        for i, idx in enumerate(top_10_indices):
            doc_id = self.document_ids[idx]
            score = scores[idx]
            text_snippet = self.document_id_to_text.get(doc_id, "No text available")[:150]
            
            print(f"Rank {i+1} | Score: {score:.4f} | ID: {doc_id}")
            print(f"Snippet: {text_snippet}...")
            print("-" * 30)
        ###########################################################################
      
    #Task 5:Fine tune the sentence transformer model (25 Pts)
    # Students are not graded on achieving a high MAP score. 
    # The key is to show understanding, experimentation, and thoughtful analysis.
    
    def fine_tune_model(self, batch_size: int = 32, num_epochs: int = 3, save_model_path: str = "finetuned_senBERT") -> None:

        """
        Fine-tunes the model using MultipleNegativesRankingLoss.
        (1) Prepare training examples from `self.prepare_training_examples()`
        (2) Experiment with [anchor, positive] vs [anchor, positive, negative]
        (3) Define a loss function (`MultipleNegativesRankingLoss`)
        (4) Freeze all model layers except the final layers
        (5) Train the model with the specified learning rate
        (6) Save the fine-tuned model
        """
        #TODO Put your code here.
        ###########################################################################
        train_examples = self.prepare_training_examples()
        if not train_examples:
            print("Error: No training examples found.")
            return
        
        train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
        train_loss = losses.MultipleNegativesRankingLoss(model=self.model)
        target_model = self.model[0].auto_model

        print("--- Freezing layers ---")
        for name, param in target_model.named_parameters():
            if any(key in name for key in ["layer.5", "layer.4", "pooler"]):
                param.requires_grad = True
                print(f"Trainable: {name}")
            else:
                param.requires_grad = False

        print(f"Starting training for {num_epochs} epochs...")
        self.model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=num_epochs,
            warmup_steps=int(len(train_dataloader) * 0.1),
            output_path=save_model_path,
            optimizer_params={'lr': 2e-5},
            show_progress_bar=True
        )

        print(f"Finish: {save_model_path}")
        ###########################################################################

    # Take a careful look into how the training set is created
    def prepare_training_examples(self) -> list[InputExample]:

        """
        Prepares training examples from the training data.
        # Inputs:
            - None (uses self.train_query_id_to_relevant_doc_ids to create training pairs).

         # Output:
            Output: - list[InputExample]: A list of training samples containing [anchor, positive] or [anchor, positive, negative].
            
        """
        train_examples = []
        all_doc_ids = list(self.document_id_to_text.keys())
        for qid, doc_ids in tqdm(self.train_query_id_to_relevant_doc_ids.items(), desc="Generating Triplets"):
            relevant_set = set(doc_ids)
            for doc_id in doc_ids:
                anchor = self.query_id_to_text[qid]
                positive = self.document_id_to_text[doc_id]
                # TODO: Select random negative examples that are not relevant to the query.
                # negative_id = random.choice(all_doc_ids)
                # while negative_id in relevant_set:
                #     negative_id = random.choice(all_doc_ids)
                
                # negative = self.document_id_to_text[negative_id]
                
                # TODO: Create list[InputExample] of type [anchor, positive, negative]
                #train_examples.append(InputExample(texts=[anchor, positive, negative]))
                train_examples.append(InputExample(texts=[anchor, positive]))

        return train_examples


In [14]:
# Fine-tune all-MiniLM-L6-v2 on [anchor, positive] pairs only, then evaluate on the test set.
# The model is saved to its own directory: the earlier fine-tuning run in this notebook
# reports "Finish: finetuned_senBERT_train_v2", so reusing that path here would overwrite
# the model saved by that run and make the two experiments impossible to compare later.
model_fine_tune_positive = TextSimilarityModel_fine_tune_positive("BeIR/nfcorpus", "BeIR/nfcorpus-qrels")
model_fine_tune_positive.fine_tune_model(batch_size=3, num_epochs=2, save_model_path="finetuned_senBERT_train_positive")  # Adjust batch size and epochs as needed

# Rank with the fine-tuned sentence transformer (default method) and report MAP
model_fine_tune_positive.rank_documents()
map_score = model_fine_tune_positive.mean_average_precision()
print("Mean Average Precision:", map_score)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


CUDA available: True
Current GPU: NVIDIA GeForce RTX 4060 Laptop GPU


Generating Triplets: 100%|███████████████████████████████████████████████████████| 2590/2590 [00:00<00:00, 3895.35it/s]


--- Freezing layers ---
Trainable: encoder.layer.4.attention.self.query.weight
Trainable: encoder.layer.4.attention.self.query.bias
Trainable: encoder.layer.4.attention.self.key.weight
Trainable: encoder.layer.4.attention.self.key.bias
Trainable: encoder.layer.4.attention.self.value.weight
Trainable: encoder.layer.4.attention.self.value.bias
Trainable: encoder.layer.4.attention.output.dense.weight
Trainable: encoder.layer.4.attention.output.dense.bias
Trainable: encoder.layer.4.attention.output.LayerNorm.weight
Trainable: encoder.layer.4.attention.output.LayerNorm.bias
Trainable: encoder.layer.4.intermediate.dense.weight
Trainable: encoder.layer.4.intermediate.dense.bias
Trainable: encoder.layer.4.output.dense.weight
Trainable: encoder.layer.4.output.dense.bias
Trainable: encoder.layer.4.output.LayerNorm.weight
Trainable: encoder.layer.4.output.LayerNorm.bias
Trainable: encoder.layer.5.attention.self.query.weight
Trainable: encoder.layer.5.attention.self.query.bias
Trainable: encoder.l

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,1.228467
1000,1.195608
1500,1.025543
2000,0.975159
2500,0.946662
3000,0.917097
3500,0.933878
4000,0.916814
4500,0.863546
5000,0.88733


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Finish: finetuned_senBERT_train_v2


Batches:   0%|          | 0/102 [00:00<?, ?it/s]

Batches:   0%|          | 0/114 [00:00<?, ?it/s]

cosine similarity finish


Ranking (sentence_transformer): 100%|████████████████████████████████████████████████| 323/323 [01:44<00:00,  3.08it/s]


Task 2 Finish
Mean Average Precision: 0.19238795085269614
