### References and knowledge sources: ChatGPT (mostly for docstrings) and Guru99

# Import necessary libraries


In [None]:
"""
Imports required libraries for data processing, visualization, embeddings generation, and similarity calculations.
"""
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sentence_transformers import util
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from wordcloud import WordCloud
import torch
from transformers import AutoTokenizer, AutoModel


# Load BioBERT model


In [None]:
"""
Loads the BioBERT tokenizer and encoder used to generate contextual embeddings
from biomedical texts.
The model used is 'dmis-lab/biobert-base-cased-v1.1'.
"""
# Single source of truth for the checkpoint so the tokenizer and the model
# can never be loaded from two different names by accident.
BIOBERT_CHECKPOINT = "dmis-lab/biobert-base-cased-v1.1"

# Prefer the GPU, but fall back to the CPU instead of raising when CUDA is not
# available. Downstream code must place its inputs on the model's device.
device = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading BioBERT model...")
tokenizer = AutoTokenizer.from_pretrained(BIOBERT_CHECKPOINT)
biobert_model = AutoModel.from_pretrained(BIOBERT_CHECKPOINT).to(device)
print("BioBERT loaded successfully!")


# Define function to generate embeddings in batches


In [None]:
# Attribution: ChatGPT and some research papers were consulted for this function.
def get_biobert_embeddings_in_batches(texts, batch_size=16):
    """
    Generates embeddings for a list of texts in batches using BioBERT.

    Each batch is tokenized (padded, truncated to 512 tokens) and run through
    the module-level ``biobert_model``; the last-layer hidden state at
    position 0 (the CLS token) is used as the embedding of each text.

    Args:
        texts (list): List of input texts to generate embeddings for.
        batch_size (int, optional): Number of texts to process in each batch. Defaults to 16.

    Returns:
        torch.Tensor: CPU tensor with one embedding row per input text. For an
            empty ``texts`` a tensor of shape (0, hidden_size) is returned.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Validate up front: a negative step would silently produce zero batches
    # and only fail later inside torch.cat with an unrelated-looking error.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # torch.cat cannot concatenate an empty list, so handle "no texts" here.
    if len(texts) == 0:
        return torch.empty((0, biobert_model.config.hidden_size))

    # Send inputs to wherever the model actually lives instead of assuming CUDA.
    device = next(biobert_model.parameters()).device

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)
        with torch.no_grad():  # inference only, no gradients needed
            outputs = biobert_model(**inputs)
        # Use CLS token embeddings; move to CPU so results do not pile up on the GPU.
        embeddings.append(outputs.last_hidden_state[:, 0, :].cpu())
        if device.type == "cuda":
            torch.cuda.empty_cache()  # Clear GPU memory between batches
    return torch.cat(embeddings, dim=0)


# Load datasets


In [None]:
"""
Loads preprocessed datasets: MedQA-USMLE, Medical Meadow, and PubMedQA.
Combines all contexts (Medical Meadow outputs followed by PubMedQA contexts)
for further processing.
"""
medqa_path = "NLP_Project/Preprocessed/medqa_usmle_preprocessed.csv"
medqa_df = pd.read_csv(medqa_path).head(100)  # Load a subset for testing

medical_meadow_path = "NLP_Project/Preprocessed/medical_meadow_preprocessed.csv"
medical_meadow_df = pd.read_csv(medical_meadow_path)
# Empty CSV cells are read as NaN (a float); drop them and force str so every
# context can later be tokenized and split as text.
medical_meadow_contexts = medical_meadow_df['output'].dropna().astype(str).tolist()

pubmedqa_path = "NLP_Project/Preprocessed/pubmedqa_preprocessed.json"
# Explicit encoding: without it the platform default is used, which is not
# UTF-8 everywhere and can corrupt or reject non-ASCII text.
with open(pubmedqa_path, "r", encoding="utf-8") as f:
    pubmedqa_data = json.load(f)
# Flatten the per-entry 'CONTEXTS' lists; entries without that key contribute nothing.
pubmedqa_contexts = [
    context
    for entry in pubmedqa_data.values()
    for context in entry.get('CONTEXTS', [])
]

# Combine contexts
combined_contexts = medical_meadow_contexts + pubmedqa_contexts
print("Total contexts:", len(combined_contexts))


# Generate context embeddings


In [None]:
# Cell 5: Generate context embeddings
# Embeds every entry of `combined_contexts` with BioBERT, 16 texts per batch,
# and keeps the result in `context_embeddings` for the retrieval step.
print("Generating context embeddings in batches...")
context_embeddings = get_biobert_embeddings_in_batches(
    texts=combined_contexts,
    batch_size=16,
)
print("Context embeddings generated successfully!")


# Define function to retrieve relevant contexts


In [None]:
# Attribution: guru99 was consulted for the CPU/GPU device error.
def retrieve_contexts_with_biobert(question, options, token_limit=700):
    """
    Retrieves the most relevant context for each option using BioBERT.

    For every option, the question and the option text are joined into one
    query, embedded with the CLS token of the module-level ``biobert_model``,
    and compared by cosine similarity against ``context_embeddings``. The
    best-scoring entry of ``combined_contexts`` is kept for that option.

    Args:
        question (str): The main question text.
        options (dict): Dictionary of options (e.g., {"A": "Option A text", ...}).
        token_limit (int, optional): Maximum number of whitespace-separated
            words kept from the retrieved context (this is a word count, not
            a tokenizer token count). Defaults to 700.

    Returns:
        tuple: A dictionary of retrieved contexts and a dictionary of similarity
            scores, both keyed by option key. Both are empty when there are no
            options or no contexts to search.
    """
    retrieved_contexts = {}
    similarity_scores = {}

    # Nothing to match: argmax over an empty score vector would raise, so
    # return empty results and let the caller apply its own defaults.
    if not options or len(combined_contexts) == 0:
        return retrieved_contexts, similarity_scores

    # Send inputs to wherever the model actually lives instead of assuming CUDA.
    device = next(biobert_model.parameters()).device

    for option_key, option_text in options.items():
        query = f"{question} {option_text}"
        inputs = tokenizer(
            query,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)
        with torch.no_grad():  # inference only, no gradients needed
            query_embedding = biobert_model(**inputs).last_hidden_state[:, 0, :].cpu()

        # Compute similarity of the single query row against every context row.
        # NOTE(review): assumes context_embeddings is a CPU tensor whose rows
        # line up one-to-one with combined_contexts - confirm against the cell
        # that builds it.
        similarity_scores_batch = torch.nn.functional.cosine_similarity(
            query_embedding, context_embeddings, dim=1
        )
        top_index = torch.argmax(similarity_scores_batch).item()
        context = combined_contexts[top_index]
        # Keep at most token_limit whitespace-separated words of the context.
        retrieved_contexts[option_key] = ' '.join(context.split()[:token_limit])
        similarity_scores[option_key] = similarity_scores_batch[top_index].item()
        if device.type == "cuda":
            torch.cuda.empty_cache()  # Clear GPU memory
    return retrieved_contexts, similarity_scores


# Process questions and retrieve contexts


In [None]:
"""
Processes each question from the MedQA dataset, retrieves relevant contexts
for each option, and stores the results in a structured format: one row per
question with option_x / context_x / similarity_x columns for options A-D.
"""
import ast  # imported in this cell so the cell runs on its own

OPTION_KEYS = ("A", "B", "C", "D")

results = []
for idx, row in medqa_df.iterrows():
    question = row['question']
    # SECURITY: this used to be eval(row['options']), which executes whatever
    # expression is stored in the CSV. The column is expected to hold a dict
    # literal, so ast.literal_eval parses it without running code; it raises
    # ValueError/SyntaxError if a cell is not a plain Python literal.
    options = ast.literal_eval(row['options'])
    print(f"Processing Question {idx+1}: {question}")
    retrieved_contexts, similarity_scores = retrieve_contexts_with_biobert(question, options)

    # Row layout was formatted with help from ChatGPT.
    # Columns are inserted in the order question, then (option, context,
    # similarity) for A, B, C, D, which is the column order of the saved CSV.
    result_row = {"question": question}
    for key in OPTION_KEYS:
        suffix = key.lower()
        result_row[f"option_{suffix}"] = options.get(key, "")
        result_row[f"context_{suffix}"] = retrieved_contexts.get(key, "No relevant context found.")
        result_row[f"similarity_{suffix}"] = similarity_scores.get(key, 0)
    results.append(result_row)


# Save results to CSV


In [None]:
"""
Saves the retrieved contexts and similarity scores to a CSV file for further analysis.

Output:
    CSV file containing questions, options, contexts, and similarity scores.
"""
from pathlib import Path  # imported in this cell so the cell runs on its own

output_path = "NLP_Project/Retrieved/TRY/Biobert1.csv"
# Create the target folder first: to_csv does not create missing directories,
# and failing here would throw away the results of the whole retrieval run.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
pd.DataFrame(results).to_csv(output_path, index=False)
print(f"Results saved to: {output_path}")
