# In [3]:
import os
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"   
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"    
import xml.etree.ElementTree as ET
import streamlit as st
import spacy
from sentence_transformers import SentenceTransformer, util

# In [None]:
def load_medquad_data_recursive(root_folder: str):
    """Collect question/answer pairs from every XML file under ``root_folder``.

    The directory tree is walked recursively. Each file whose name ends in
    ``.xml`` (case-insensitive) is parsed, and every ``QAPair`` element that
    has non-blank ``Question`` and ``Answer`` text contributes one stripped
    question and one stripped answer.

    Directories and files are visited in sorted order so the returned lists
    are reproducible regardless of filesystem ordering. Files that cannot be
    read or parsed are skipped with a logged warning instead of being
    silently ignored or aborting the whole load.

    Args:
        root_folder: Directory to search. A directory that does not exist
            yields two empty lists.

    Returns:
        A ``(questions, answers)`` tuple of equal-length lists of strings in
        which ``answers[i]`` is the answer to ``questions[i]``.
    """
    import logging

    # basicConfig does nothing when the root logger already has handlers, so
    # this only takes effect if the host application has not set up logging.
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    questions, answers = [], []
    file_count = 0
    for current_dir, subdirs, files in os.walk(root_folder):
        # Sorting in place also fixes the order in which os.walk descends.
        subdirs.sort()
        for filename in sorted(files):
            if not filename.lower().endswith(".xml"):
                continue
            file_count += 1
            filepath = os.path.join(current_dir, filename)
            logger.info("Parsing file #%d: %s", file_count, filepath)
            try:
                root = ET.parse(filepath).getroot()
            except (ET.ParseError, OSError) as exc:
                # Best effort: one bad file must not discard the rest of the
                # dataset, but the skip should be visible in the logs.
                logger.warning("Skipping unreadable XML file %s: %s", filepath, exc)
                continue
            for qa_pair in root.findall(".//QAPair"):
                # findtext gives None for a missing child and "" for an empty
                # one; both collapse to "" and are rejected below.
                q_text = (qa_pair.findtext("Question") or "").strip()
                a_text = (qa_pair.findtext("Answer") or "").strip()
                if q_text and a_text:
                    questions.append(q_text)
                    answers.append(a_text)
    logger.info("Total XML files found: %d", file_count)
    logger.info(
        "Total questions: %d; Total answers: %d", len(questions), len(answers)
    )
    return questions, answers

def load_spacy_model(model_name: str = "en_core_web_sm"):
    """Load a spaCy pipeline by name, or return ``None`` if loading fails.

    Args:
        model_name: Name passed straight to ``spacy.load``.

    Returns:
        The loaded pipeline, or ``None`` when ``spacy.load`` raises
        ``OSError`` (presumably because the model package is not installed).
    """
    pipeline = None
    try:
        pipeline = spacy.load(model_name)
    except OSError:
        # Deliberate best effort: callers treat None as "no NER available".
        pass
    return pipeline

def _build_answer_index(root_folder: str, model_name: str):
    """Load the Q&A data and the embedding model, and embed every answer.

    Args:
        root_folder: Dataset directory handed to ``load_medquad_data_recursive``.
        model_name: Model name handed to ``SentenceTransformer``.

    Returns:
        ``(answers, embedding_model, answer_embeddings)``. ``answer_embeddings``
        is a tensor with one row per answer, or ``None`` when no answers were
        loaded.
    """
    import torch

    _questions, answers = load_medquad_data_recursive(root_folder)
    embedding_model = SentenceTransformer(model_name)
    if not answers:
        return answers, embedding_model, None

    # Encode in slices and concatenate, so a single encode call never receives
    # the whole corpus at once.
    batch_size = 256
    batches = [
        embedding_model.encode(
            answers[start:start + batch_size], convert_to_tensor=True
        )
        for start in range(0, len(answers), batch_size)
    ]
    return answers, embedding_model, torch.cat(batches, dim=0)


def main():
    """Render the Streamlit page and answer the user's medical question.

    The dataset folder defaults to the path below and can be overridden with
    the ``MEDQUAD_ROOT`` environment variable. For a submitted question the
    page shows any named entities spaCy recognises (when a spaCy model is
    available) and the stored answer whose embedding has the highest cosine
    similarity to the question.
    """
    st.title("Medical Q&A Chatbot")
    medquad_root_folder = os.environ.get(
        "MEDQUAD_ROOT",
        r"C:\Users\Roopesh P\OneDrive\Pictures\Documents\Task 3\MedQuAD-master\MedQuAD-master",  #add your own root folder
    )

    # Streamlit re-executes the script on every interaction. Without caching,
    # each submitted question would re-parse the dataset, reload the models
    # and re-embed every answer. st.cache_resource keeps the results for the
    # lifetime of the server process, keyed by the function and its arguments.
    answers, embedding_model, answer_embeddings = st.cache_resource(
        _build_answer_index
    )(medquad_root_folder, "multi-qa-MiniLM-L6-cos-v1")
    # NOTE: a failed load (None) is cached too; restart the app after
    # installing the spaCy model.
    nlp = st.cache_resource(load_spacy_model)("en_core_web_sm")

    user_question = st.text_input("Ask a medical question:")
    if not user_question:
        return

    if nlp is not None:
        entities = [(ent.text, ent.label_) for ent in nlp(user_question).ents]
        if entities:
            st.write("Recognized Entities:")
            for text, label in entities:
                st.write(f"• {text} ({label})")

    if answer_embeddings is None:
        st.write("No embeddings found; please check if the dataset is loaded correctly.")
        return

    query_embedding = embedding_model.encode(user_question, convert_to_tensor=True)
    # Row 0 holds the similarities of the single query against every answer.
    cos_scores = util.pytorch_cos_sim(query_embedding, answer_embeddings)[0]
    best_answer = answers[int(cos_scores.argmax())]
    st.write("Question:")
    st.write(user_question)
    st.write("Answer:")
    st.write(best_answer)


# Script entry point: run the app only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()