In [3]:
import os
import logging
import xml.etree.ElementTree as ET
import streamlit as st
import spacy
from sentence_transformers import SentenceTransformer, util
from langdetect import detect
from googletrans import Translator

In [None]:
# Configure root logging at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# One spaCy pipeline per two-letter language code, loaded eagerly at import.
# NOTE(review): nothing in the visible code reads `nlp_models` -- confirm it
# is used elsewhere before relying on (or paying for) these loads.
nlp_models = {
    lang_code: spacy.load(pipeline_name)
    for lang_code, pipeline_name in (
        ('en', 'en_core_web_sm'),
        ('es', 'es_core_news_sm'),
        ('fr', 'fr_core_news_sm'),
        ('de', 'de_core_news_sm'),
    )
}

# Sentence encoder used to embed both the user query and the corpus questions.
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Shared googletrans client used by translate_text().
translator = Translator()

def load_medquad_data_recursive(root_folder: str):
    """Collect question/answer pairs from every XML file under *root_folder*.

    The directory tree is walked recursively. Each file whose name ends in
    ``.xml`` (case-insensitive) is parsed, and every ``QAPair`` element found
    anywhere in the document contributes one entry, provided that both its
    ``Question`` and ``Answer`` children contain non-blank text.

    Args:
        root_folder: Directory to search. If it does not exist, a warning is
            logged and two empty lists are returned.

    Returns:
        A ``(questions, answers)`` tuple of two parallel lists of stripped
        strings; ``answers[i]`` belongs to ``questions[i]``.

    Files that cannot be read or parsed are logged and skipped, so a single
    bad file never aborts the whole load.
    """
    questions, answers = [], []
    file_count = 0

    # os.walk() silently yields nothing for a missing directory; make that
    # situation visible instead of returning an unexplained empty corpus.
    if not os.path.isdir(root_folder):
        logger.warning(
            "MedQuAD root folder does not exist or is not a directory: %s",
            root_folder,
        )

    for current_dir, _subdirs, files in os.walk(root_folder):
        for filename in files:
            if not filename.lower().endswith(".xml"):
                continue
            file_count += 1
            filepath = os.path.join(current_dir, filename)
            logger.info("Parsing file #%d: %s", file_count, filepath)
            try:
                root = ET.parse(filepath).getroot()
            except (ET.ParseError, OSError) as e:
                # Malformed XML or an unreadable file: skip it, keep going.
                logger.warning("Failed to parse %s: %s", filepath, e)
                continue
            for qa_pair in root.findall(".//QAPair"):
                # findtext() gives None for a missing child and "" for an
                # empty one; normalise both, then strip *before* testing so
                # whitespace-only text is rejected rather than stored as "".
                q_text = (qa_pair.findtext("Question") or "").strip()
                a_text = (qa_pair.findtext("Answer") or "").strip()
                if q_text and a_text:
                    questions.append(q_text)
                    answers.append(a_text)

    logger.info("Total XML files found: %d", file_count)
    logger.info(
        "Total questions: %d; Total answers: %d", len(questions), len(answers)
    )
    return questions, answers

def detect_language(text: str) -> str:
    """Return the language code that ``detect`` reports for *text*.

    Detection is best-effort: if ``detect`` raises for any reason, the error
    is logged and ``'en'`` is returned so callers can continue.
    """
    fallback_lang = 'en'
    try:
        lang_code = detect(text)
    except Exception as err:
        logger.error(f"Language detection failed: {err}")
        return fallback_lang
    return lang_code

def translate_text(text: str, target_lang: str) -> str:
    """Translate *text* into *target_lang* using the shared ``translator``.

    Translation is best-effort: on any exception the error is logged and the
    untranslated *text* is returned unchanged.

    NOTE(review): this assumes ``translator.translate`` is a synchronous call
    whose result exposes ``.text`` -- confirm against the installed
    googletrans version.
    """
    try:
        result = translator.translate(text, dest=target_lang)
        translated_text = result.text
    except Exception as err:
        logger.error(f"Translation failed: {err}")
        translated_text = text
    return translated_text

# Single-entry cache holding the embeddings of the most recently seen corpus.
# "key" is a tuple snapshot of the questions the embeddings were computed for.
_corpus_embedding_cache = {"key": None, "embeddings": None}


def generate_response(user_input: str, questions: list, answers: list) -> str:
    """Return the answer whose question is most similar to *user_input*.

    The query and all corpus questions are embedded with ``embedding_model``;
    the answer at the index of the highest cosine similarity is returned.

    Args:
        user_input: The (English) user query.
        questions: Corpus questions to match against.
        answers: Answers parallel to *questions* (``answers[i]`` answers
            ``questions[i]``); the lists are expected to have equal length.

    Returns:
        The best-matching answer, or a short apology message when the corpus
        is empty (previously this case crashed inside the similarity search).
    """
    # Nothing to search: bail out before touching the model.
    if not questions or not answers:
        return (
            "Sorry, no medical question/answer data is available "
            "to answer your question."
        )

    # Encoding the whole corpus is by far the most expensive step, so reuse
    # the embeddings for as long as the questions themselves are unchanged.
    corpus_key = tuple(questions)
    if _corpus_embedding_cache["key"] != corpus_key:
        # Store the embeddings first so a failed encode never leaves a key
        # pointing at stale data.
        _corpus_embedding_cache["embeddings"] = embedding_model.encode(
            questions, convert_to_tensor=True
        )
        _corpus_embedding_cache["key"] = corpus_key
    corpus_embeddings = _corpus_embedding_cache["embeddings"]

    user_embedding = embedding_model.encode(user_input, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(user_embedding, corpus_embeddings)
    best_match_idx = scores.argmax().item()
    return answers[best_match_idx]

def process_user_input(user_input: str, questions: list, answers: list) -> str:
    """Answer *user_input* in the language it was written in.

    The input language is detected; non-English input is translated to
    English before matching against the corpus, and the selected answer is
    translated back into the detected language.

    Args:
        user_input: Raw text typed by the user.
        questions: Corpus questions passed through to ``generate_response``.
        answers: Corpus answers parallel to *questions*.

    Returns:
        The response text, translated to the detected language when that
        language is not English.
    """
    detected_lang = detect_language(user_input)
    logger.info(f"Detected language: {detected_lang}")

    # Decide once whether a translation round-trip is required.
    needs_translation = detected_lang != 'en'

    query = translate_text(user_input, 'en') if needs_translation else user_input
    response = generate_response(query, questions, answers)
    if needs_translation:
        response = translate_text(response, detected_lang)

    return response

def main():
    """Render the Streamlit chatbot page and answer the user's question.

    The MedQuAD dataset folder is read from the ``MEDQUAD_ROOT`` environment
    variable when it is set; otherwise the original hard-coded path is used.
    If the folder is missing or contains no question/answer pairs, an error
    is shown on the page instead of failing later on the first question.
    """
    st.title("Multilingual Medical Chatbot")
    st.write("Ask your medical questions in English, Spanish, French, German.")

    # Allow the dataset location to be configured without editing the code;
    # the previous literal path remains the default for backward compatibility.
    root_folder = os.environ.get(
        "MEDQUAD_ROOT",
        r"C:\Users\Roopesh P\OneDrive\Pictures\Documents\Task 3\MedQuAD-master\MedQuAD-master",  # add path to your file
    )

    if not os.path.isdir(root_folder):
        st.error(f"MedQuAD data folder not found: {root_folder}")
        return

    # Streamlit re-executes the script on every interaction, so without
    # caching the whole dataset would be re-parsed for each question.
    # Fall back to the plain loader on Streamlit versions without cache_data.
    cache_data = getattr(st, "cache_data", None)
    load_data = (
        cache_data(load_medquad_data_recursive)
        if cache_data is not None
        else load_medquad_data_recursive
    )
    questions, answers = load_data(root_folder)

    if not questions:
        st.error(f"No question/answer pairs were found under: {root_folder}")
        return

    user_input = st.text_input("Your question:")
    if user_input:
        response = process_user_input(user_input, questions, answers)
        st.write("Response:", response)

# Entry point: call main() only when this file is executed as the main
# program, not when it is imported as a module.
if __name__ == "__main__":
    main()
