# Import necessary libraries


In [None]:
"""
Imports all required libraries for data processing, visualization, and natural language processing.
"""
import ast
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sentence_transformers import SentenceTransformer, util


# Load the Sentence-BERT model


In [None]:
"""
Loads the Sentence-BERT model for semantic similarity tasks.
The model used is 'paraphrase-mpnet-base-v2'.
"""
# Name of the pretrained Sentence-BERT checkpoint used for all embeddings below.
sbert_checkpoint = 'paraphrase-mpnet-base-v2'

# Bracket the load with status messages so it is visible in the cell output
# when the model has finished initialising.
print("Loading Sentence-BERT model...")
model = SentenceTransformer(sbert_checkpoint)
print("Model loaded successfully!")


# Load preprocessed datasets


In [None]:
"""
Loads the preprocessed datasets: MedQA-USMLE, Medical Meadow, and PubMedQA.
Combines all contexts for further processing.
"""
# MedQA-USMLE questions; only the first 100 rows are kept for quick test runs.
medqa_path = "NLP_Project/Preprocessed/medqa_usmle_preprocessed.csv"
medqa_df = pd.read_csv(medqa_path).head(100)

# Medical Meadow: every non-missing 'output' cell becomes one candidate context.
# Missing cells are dropped (and the rest coerced to str) because pandas reads
# them as float NaN, which is not valid text for encoding or for `.split()`.
medical_meadow_path = "NLP_Project/Preprocessed/medical_meadow_preprocessed.csv"
medical_meadow_df = pd.read_csv(medical_meadow_path)
medical_meadow_contexts = medical_meadow_df['output'].dropna().astype(str).tolist()

# PubMedQA: flatten the 'CONTEXTS' list of every entry into one list.
# The encoding is given explicitly so the file is decoded the same way on
# every platform instead of depending on the locale default.
pubmedqa_path = "NLP_Project/Preprocessed/pubmedqa_preprocessed.json"
with open(pubmedqa_path, "r", encoding="utf-8") as f:
    pubmedqa_data = json.load(f)
pubmedqa_contexts = [
    context
    for entry in pubmedqa_data.values()
    for context in entry.get('CONTEXTS', [])
]

# Combine all contexts and embed them once up front; positions in
# `context_embeddings` line up with positions in `combined_contexts`.
combined_contexts = medical_meadow_contexts + pubmedqa_contexts
context_embeddings = model.encode(combined_contexts, convert_to_tensor=True)


# Define context retrieval function

In [None]:

"""
Retrieves distinct contexts for each question option based on semantic similarity.

Args:
    question (str): The main question text.
    options (dict): Dictionary of options (e.g., {"A": "Option A text", ...}).
    top_n (int, optional): Number of top contexts to retrieve. Defaults to 1.
        Currently unused: exactly one context is returned per option.
    token_limit (int, optional): Maximum number of whitespace-separated words kept from each retrieved context. Defaults to 700.

Returns:
    dict: A dictionary mapping each option to its most relevant context and similarity score.
"""


def _truncate_words(text, limit):
    """Return *text* cut down to its first *limit* whitespace-separated words."""
    return ' '.join(text.split()[:limit])


def retrieve_contexts_with_scores(question, options, top_n=1, token_limit=700,
                                  min_similarity=0.4):
    """Retrieve one context per answer option, ranked by cosine similarity.

    For every option a query is built from the question plus the option text,
    embedded with the module-level ``model`` and compared against the
    module-level ``context_embeddings``. The best-scoring context above
    ``min_similarity`` that has not already been assigned to an earlier option
    is selected, so options receive distinct contexts where possible.

    Args:
        question (str): The main question text.
        options (dict): Mapping of option key to option text
            (e.g. ``{"A": "Option A text", ...}``).
        top_n (int, optional): Accepted for interface compatibility but
            currently unused; exactly one context is returned per option.
        token_limit (int, optional): Maximum number of whitespace-separated
            words kept from a retrieved context. Defaults to 700.
        min_similarity (float, optional): A context must score strictly above
            this value to be a regular candidate. Defaults to 0.4.

    Returns:
        dict: Maps option key to a ``(context_text, similarity)`` tuple.
        If no context scores above ``min_similarity``, the single best
        context is returned with ``" (Fallback)"`` appended to its text (it is
        not marked as used, so several options may share it). If candidates
        exist but all were already assigned to earlier options, the option is
        left out of the result.
    """
    retrieved_contexts = {}
    used_indices = set()

    for option_key, option_text in options.items():
        # The option text appears three times in total (once verbatim, twice
        # via the repeated word list) so that it weighs more heavily in the
        # query embedding than the shared question text.
        query = f"{question} {option_text} {' '.join(option_text.split() * 2)}"
        query_embedding = model.encode(query, convert_to_tensor=True)

        scores = util.cos_sim(query_embedding, context_embeddings)[0]
        scores = scores.cpu().numpy()

        if np.any(scores > min_similarity):
            # Walk the contexts from best to worst score and take the first
            # one that clears the threshold and is still unassigned. Checking
            # the score directly avoids a linear list-membership test on
            # every iteration.
            for idx in np.argsort(scores)[::-1]:
                idx = int(idx)
                if scores[idx] > min_similarity and idx not in used_indices:
                    used_indices.add(idx)
                    retrieved_contexts[option_key] = (
                        _truncate_words(combined_contexts[idx], token_limit),
                        scores[idx],
                    )
                    break
        else:
            # Nothing clears the threshold: fall back to the best available
            # context and flag it in the text.
            max_index = int(np.argmax(scores))
            truncated_context = _truncate_words(combined_contexts[max_index], token_limit)
            retrieved_contexts[option_key] = (
                truncated_context + " (Fallback)",
                scores[max_index],
            )

    return retrieved_contexts


# Process questions and retrieve contexts

In [None]:

"""
Processes each question from the MedQA dataset, retrieves relevant contexts
for each option, and stores the results in a structured format.
"""
# One output row per question: the question text followed by an
# (option, context, similarity) column triple for each of the options A-D.
results = []

for idx, row in medqa_df.iterrows():
    question = row['question']
    # The CSV stores each options dict as its Python literal representation.
    # ast.literal_eval parses literals only, so a malformed or malicious cell
    # raises an error instead of executing arbitrary code as eval() would.
    options = ast.literal_eval(row['options'])

    print(f"\nProcessing Question {idx+1}:")
    print(f"Question: {question}")
    print("Options:")
    for option_key, option_text in options.items():
        print(f"  {option_key}: {option_text}")

    retrieved_contexts = retrieve_contexts_with_scores(question, options)

    # Flat row layout for CSV export (originally drafted with ChatGPT's help).
    # Options missing from the question or from the retrieval result get an
    # empty option text / a placeholder context with similarity 0.
    result_row = {"question": question}
    for option_key in ("A", "B", "C", "D"):
        context, similarity = retrieved_contexts.get(
            option_key, ("No relevant context found.", 0)
        )
        suffix = option_key.lower()
        result_row[f"option_{suffix}"] = options.get(option_key, "")
        result_row[f"context_{suffix}"] = context
        result_row[f"similarity_{suffix}"] = similarity
    results.append(result_row)


# Save results to CSV

In [None]:

"""
Saves the retrieved contexts and similarity scores to a CSV file for further analysis.

Output:
    CSV file containing questions, options, contexts, and similarity scores.
"""
output_path = "NLP_Project/Retrieved/TRY/Sentence_BERT1.csv"

# Create the destination folder first: to_csv does not create missing parent
# directories, and failing here would discard all retrieval results.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# One CSV row per question, without the DataFrame index column.
results_df = pd.DataFrame(results)
results_df.to_csv(output_path, index=False)
print(f"Results saved to: {output_path}")
