In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os

  from .autonotebook import tqdm as notebook_tqdm
2025-05-31 17:05:44.859136: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-31 17:05:45.108112: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748685945.211500    1932 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748685945.246247    1932 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748685945.481291    1932 computation_placer.cc:177] computation placer already r

In [7]:
# --- Notebook configuration -------------------------------------------------
# Input CSV holding the predefined symptoms; read by
# setup_symptom_data_and_embeddings() via load_symptoms_df_from_csv().
CSV_SYMPTOMS_FILE_PATH = '../data/symptoms_1.csv'
# Column of the input CSV whose text values are embedded and used as labels.
SYMPTOM_COLUMN_NAME = 'Description'
# Destination of the input DataFrame after an 'embedding' column has been added.
OUTPUT_CSV_WITH_EMBEDDINGS_PATH = '../data/symptoms_1_with_embeddings.csv'

# Model identifier handed to SentenceTransformer(...) both when the embeddings
# are generated and when SymptomMapper is constructed.
MODEL_NAME = 'all-MiniLM-L6-v2'
# Artifact written with np.save() during setup and loaded by SymptomMapper.
# Relative path: resolved against the current working directory.
PRECOMPUTED_EMBEDDINGS_FILE = 'predefined_symptom_embeddings.npy'
# Text artifact with one symptom label per line, written during setup and read
# back by SymptomMapper. Relative path, like the embeddings file.
PREDEFINED_SYMPTOMS_LABELS_FILE = 'predefined_symptoms_labels.txt' 

In [3]:
def load_symptoms_df_from_csv(csv_path):
    """Read the CSV at ``csv_path`` into a pandas DataFrame.

    Args:
        csv_path: Path of the CSV file, passed unchanged to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        FileNotFoundError: If ``pd.read_csv`` reports that the file is missing.
        Exception: For any other failure while reading or parsing the file.

    Both exceptions are chained to the underlying error (``__cause__``), so the
    original traceback is still available to whoever handles them.
    """
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError as e:
        # Re-raise with a friendlier message, explicitly linked to the original.
        raise FileNotFoundError(f"Error: The CSV file '{csv_path}' was not found.") from e
    except Exception as e:
        raise Exception(f"An error occurred while reading the CSV '{csv_path}': {e}") from e

    # Only the read itself is guarded; the success path lives outside the try.
    print(f"Successfully loaded DataFrame from {csv_path}")
    return df

In [4]:
def setup_symptom_data_and_embeddings():
    """Embed the predefined symptoms and write the artifacts used by the mapper.

    Steps performed:
      1. Load ``CSV_SYMPTOMS_FILE_PATH`` and take column ``SYMPTOM_COLUMN_NAME``
         (values are converted to ``str``).
      2. Encode every symptom with the ``MODEL_NAME`` SentenceTransformer.
      3. Save the embeddings to ``PRECOMPUTED_EMBEDDINGS_FILE`` with ``np.save``.
      4. Write the labels, one per line, to ``PREDEFINED_SYMPTOMS_LABELS_FILE``.
      5. Add an ``'embedding'`` column (stringified list per row) to the
         DataFrame and write it to ``OUTPUT_CSV_WITH_EMBEDDINGS_PATH``.

    Returns:
        None.

    Any exception raised along the way is caught and reported with ``print``;
    it is not re-raised, so callers must check for the output files themselves.
    """
    try:
        print("--- Starting Symptom Data and Embedding Setup ---")

        symptoms_df = load_symptoms_df_from_csv(CSV_SYMPTOMS_FILE_PATH)

        if SYMPTOM_COLUMN_NAME not in symptoms_df.columns:
            raise ValueError(
                f"Column '{SYMPTOM_COLUMN_NAME}' not found in CSV file {CSV_SYMPTOMS_FILE_PATH}. "
                f"Available columns: {symptoms_df.columns.tolist()}"
            )
        symptoms_list = symptoms_df[SYMPTOM_COLUMN_NAME].astype(str).tolist()
        if not symptoms_list:
            raise ValueError(f"No symptoms found in column '{SYMPTOM_COLUMN_NAME}' of {CSV_SYMPTOMS_FILE_PATH}.")
        print(f"Loaded {len(symptoms_list)} symptoms for embedding.")

        print(f"Loading model: {MODEL_NAME}...")
        model = SentenceTransformer(MODEL_NAME)
        print("Model loaded.")

        print(f"Generating embeddings for {len(symptoms_list)} predefined symptoms...")
        symptom_embeddings_np = model.encode(symptoms_list, show_progress_bar=True)

        np.save(PRECOMPUTED_EMBEDDINGS_FILE, symptom_embeddings_np)
        print(f"Predefined symptom embeddings saved to {PRECOMPUTED_EMBEDDINGS_FILE} (for mapper)")

        with open(PREDEFINED_SYMPTOMS_LABELS_FILE, 'w') as f:
            for symptom in symptoms_list:
                # The labels file is line-oriented: line i must describe
                # embedding row i. A symptom containing a line break would
                # otherwise occupy several lines and shift every later label,
                # so line breaks are collapsed to single spaces here.
                single_line_label = " ".join(symptom.splitlines())
                f.write(f"{single_line_label}\n")
        print(f"Predefined symptom labels saved to {PREDEFINED_SYMPTOMS_LABELS_FILE} (for mapper)")

        # Each embedding is stored as the string form of a Python list so it
        # fits in a single CSV cell.
        symptoms_df['embedding'] = [str(embedding.tolist()) for embedding in symptom_embeddings_np]

        symptoms_df.to_csv(OUTPUT_CSV_WITH_EMBEDDINGS_PATH, index=False)
        print(f"DataFrame with embeddings saved to: {OUTPUT_CSV_WITH_EMBEDDINGS_PATH}")

        print("--- Symptom Data and Embedding Setup Complete ---")

    except Exception as e:
        # Best-effort by design: report the failure and let the notebook continue.
        print(f"Error during symptom data and embedding setup: {e}")

In [5]:
class SymptomMapper:
    """Map free-text input onto the most similar predefined symptom labels.

    The mapper holds a SentenceTransformer model, a matrix of precomputed
    symptom embeddings and the list of labels, where label ``i`` belongs to
    embedding row ``i``.
    """

    def __init__(self, model_name_or_path, embeddings_path, labels_path):
        """Load the model, the precomputed embeddings and their labels.

        Args:
            model_name_or_path: Value passed to ``SentenceTransformer(...)``.
            embeddings_path: File loaded with ``np.load``.
            labels_path: Text file with one symptom label per line.

        Raises:
            FileNotFoundError: If either artifact file does not exist.
            ValueError: If the number of labels differs from the number of
                embedding rows, since results would then be mislabelled.
        """
        # Check the cheap precondition first so a missing artifact does not
        # cost a model load.
        if not os.path.exists(embeddings_path) or not os.path.exists(labels_path):
            raise FileNotFoundError(
                f"Embeddings file ({embeddings_path}) or labels file ({labels_path}) not found. "
                "Ensure `setup_symptom_data_and_embeddings()` was run successfully."
            )

        print(f"Loading model: {model_name_or_path} for the mapper...")
        self.model = SentenceTransformer(model_name_or_path)
        print("Model loaded.")

        print(f"Loading precomputed symptom embeddings from {embeddings_path}...")
        self.predefined_symptom_embeddings = np.load(embeddings_path)
        print("Precomputed embeddings loaded.")

        print(f"Loading predefined symptom labels from {labels_path}...")
        with open(labels_path, 'r') as f:
            self.predefined_symptoms = [line.strip() for line in f]
        print("Predefined symptom labels loaded.")

        # Labels and embeddings are paired purely by position, so a count
        # mismatch means the two artifacts come from different runs or data.
        label_count = len(self.predefined_symptoms)
        embedding_count = len(self.predefined_symptom_embeddings)
        if label_count != embedding_count:
            raise ValueError(
                f"Labels file ({labels_path}) has {label_count} entries but embeddings file "
                f"({embeddings_path}) has {embedding_count} rows; regenerate both artifacts together."
            )
        print(f"Mapper initialized with {len(self.predefined_symptoms)} predefined symptoms.")

    def map_symptoms(self, user_input_text, top_n=3, threshold=0.5):
        """Return the predefined symptoms most similar to ``user_input_text``.

        Args:
            user_input_text: Free-text description. ``None``, empty or
                whitespace-only input yields an empty list.
            top_n: Maximum number of matches to return. ``None`` means no
                limit; zero or a negative value yields an empty list.
            threshold: Minimum cosine similarity a symptom needs to be kept.

        Returns:
            A list of ``(symptom_label, score)`` tuples with ``score`` as a
            Python ``float``, ordered from highest to lowest similarity.
        """
        if user_input_text is None or not user_input_text.strip():
            return []
        if top_n is not None and top_n <= 0:
            # A negative slice bound would silently drop matches from the end
            # instead of limiting the result size.
            return []

        user_embedding = self.model.encode([user_input_text])
        # One query row against all symptom rows -> take the single result row.
        similarities = cosine_similarity(
            user_embedding,
            self.predefined_symptom_embeddings
        )[0]
        results = sorted(
            [(self.predefined_symptoms[i], score) for i, score in enumerate(similarities)],
            key=lambda x: x[1],
            reverse=True
        )
        matched_symptoms = [
            (symptom, float(score)) for symptom, score in results if score >= threshold
        ][:top_n]
        return matched_symptoms


In [8]:
def _report_matches(query_text, matches):
    """Print the symptoms mapped to one query, or a notice when none matched."""
    print(f"\nFor input: '{query_text}'")
    if not matches:
        print("No significant symptoms mapped.")
        return
    for label, similarity in matches:
        print(f"- {label} (Score: {similarity:.4f})")


# Build (or rebuild) the embedding and label artifacts before using the mapper.
setup_symptom_data_and_embeddings()

# Setup reports its own failures without raising, so confirm both artifacts
# are actually on disk before going further.
artifacts_ready = all(
    os.path.exists(artifact_path)
    for artifact_path in (PRECOMPUTED_EMBEDDINGS_FILE, PREDEFINED_SYMPTOMS_LABELS_FILE)
)

if not artifacts_ready:
    print("Critical error: Embeddings or labels file missing. Cannot initialize SymptomMapper.")
else:
    try:
        mapper = SymptomMapper(
            model_name_or_path=MODEL_NAME,
            embeddings_path=PRECOMPUTED_EMBEDDINGS_FILE,
            labels_path=PREDEFINED_SYMPTOMS_LABELS_FILE,
        )

        # Example 1: two complaints in one sentence, lenient threshold.
        user_query1 = "I have a terrible headache and I'm feeling feverish."
        matches1 = mapper.map_symptoms(user_query1, top_n=3, threshold=0.3)
        _report_matches(user_query1, matches1)

        # Example 2: colloquial phrasing, stricter threshold and fewer results.
        user_query2 = "my stomach feels upset and i might throw up"
        matches2 = mapper.map_symptoms(user_query2, top_n=2, threshold=0.4)
        _report_matches(user_query2, matches2)

    except FileNotFoundError as missing_file_error:
        print(f"Error initializing SymptomMapper: {missing_file_error}")
        print("Please ensure `setup_symptom_data_and_embeddings()` ran successfully.")
    except Exception as unexpected_error:
        print(f"An unexpected error occurred during SymptomMapper usage: {unexpected_error}")

--- Starting Symptom Data and Embedding Setup ---
Successfully loaded DataFrame from ../data/symptoms_1.csv
Loaded 837 symptoms for embedding.
Loading model: all-MiniLM-L6-v2...
Model loaded.
Generating embeddings for 837 predefined symptoms...


Batches: 100%|██████████| 27/27 [00:01<00:00, 19.83it/s]


Predefined symptom embeddings saved to predefined_symptom_embeddings.npy (for mapper)
Predefined symptom labels saved to predefined_symptoms_labels.txt (for mapper)
DataFrame with embeddings saved to: ../data/symptoms_1_with_embeddings.csv
--- Symptom Data and Embedding Setup Complete ---
Loading model: all-MiniLM-L6-v2 for the mapper...
Model loaded.
Loading precomputed symptom embeddings from predefined_symptom_embeddings.npy...
Precomputed embeddings loaded.
Loading predefined symptom labels from predefined_symptoms_labels.txt...
Predefined symptom labels loaded.
Mapper initialized with 837 predefined symptoms.

For input: 'I have a terrible headache and I'm feeling feverish.'
- Headache behind the eyes (typically during fever) (Score: 0.6642)
- fever (Score: 0.6497)
- Fever with chills and sweating (Score: 0.6205)

For input: 'my stomach feels upset and i might throw up'
- Nausea and vomiting (Score: 0.6961)
- Vomiting (Score: 0.6379)
