In [None]:
import os
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors
import joblib
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import torch.nn.functional as F

# === Configuration ===
# Module-wide compute device: CUDA when torch reports it as available,
# otherwise CPU. The functions and classes below read this global directly.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Sets the WANDB_DISABLED environment variable for this process.
# NOTE(review): presumably meant to switch off Weights & Biases logging in
# libraries that honor the variable; nothing in this file reads it directly.
os.environ["WANDB_DISABLED"] = "true"

# === Load and process data ===
def load_data(filepath="dataset.csv"):
    """Read the disease-symptom CSV into a DataFrame.

    ``filepath`` is tried first; if it does not exist, two fallback
    locations are tried in order.

    Args:
        filepath: Preferred location of the CSV file.

    Returns:
        The DataFrame read from the first location that exists.

    Raises:
        FileNotFoundError: If none of the candidate locations exists.
    """
    candidate_paths = (
        filepath,
        "/kaggle/input/disease-symptom-description-dataset/dataset.csv",
        "disease_symptom_dataset.csv",
    )

    df = None
    for candidate in candidate_paths:
        try:
            df = pd.read_csv(candidate)
        except FileNotFoundError:
            # Missing file: fall through to the next candidate location.
            continue
        break

    if df is None:
        raise FileNotFoundError(f"Could not find dataset at {filepath} or alternative locations")

    print(f"Dataset loaded with shape: {df.shape}")
    return df

def prepare_training_data(df):
    """Build text pairs and label encoders from the disease-symptom table.

    For every row that has a non-empty disease and at least one symptom,
    three groups of pairs are emitted, in this order:

    * ``(disease, symptom, "disease_to_symptom")`` for each symptom,
    * one ``(all symptoms joined by ", ", disease, "symptoms_to_disease")``,
    * ``(symptom, disease, "symptom_to_disease")`` for each symptom.

    Args:
        df: DataFrame with a ``Disease`` column and ``Symptom_*`` columns.

    Returns:
        ``(pairs, disease_encoder, symptom_encoder)`` where the encoders are
        ``LabelEncoder`` instances fitted on the distinct diseases/symptoms.
    """
    symptom_cols = [name for name in df.columns if name.startswith("Symptom_")]

    pairs = []
    distinct_diseases = set()
    distinct_symptoms = set()

    for _, record in df.iterrows():
        disease = str(record["Disease"]).strip()

        # Keep cells that are neither missing nor the literal text "nan".
        symptoms = []
        for col in symptom_cols:
            if not pd.notna(record[col]):
                continue
            cleaned = str(record[col]).strip()
            if cleaned.lower() == "nan":
                continue
            symptoms.append(cleaned)

        if not (disease and symptoms):
            continue

        distinct_diseases.add(disease)
        distinct_symptoms.update(symptoms)

        # Disease -> each individual symptom.
        pairs.extend((disease, symptom, "disease_to_symptom") for symptom in symptoms)
        # Full symptom list -> disease.
        pairs.append((", ".join(symptoms), disease, "symptoms_to_disease"))
        # Each individual symptom -> disease, for more granular training.
        pairs.extend((symptom, disease, "symptom_to_disease") for symptom in symptoms)

    disease_encoder = LabelEncoder()
    disease_encoder.fit(list(distinct_diseases))

    symptom_encoder = LabelEncoder()
    symptom_encoder.fit(list(distinct_symptoms))

    print(f"Found {len(pairs)} training pairs")
    print(f"Unique diseases: {len(disease_encoder.classes_)}")
    print(f"Unique symptoms: {len(symptom_encoder.classes_)}")

    return pairs, disease_encoder, symptom_encoder

# Split train/test
def split_data(pairs):
    """Partition ``pairs`` into an 80% train / 20% test split.

    The split is seeded (``random_state=42``) so repeated runs give the
    same partition.

    Returns:
        ``(train_pairs, test_pairs)``.
    """
    held_out_fraction = 0.2
    train_pairs, test_pairs = train_test_split(
        pairs,
        test_size=held_out_fraction,
        random_state=42,
    )
    return train_pairs, test_pairs

# === Embedding Model ===
class JointEmbeddingModel(torch.nn.Module):
    """BERT encoder that maps disease and symptom text into one vector space.

    The sentence vector is the hidden state at sequence position 0,
    rescaled to unit L2 norm.
    """

    def __init__(self, model_name="dmis-lab/biobert-v1.1", embedding_dim=768):
        """Load the pretrained encoder.

        Args:
            model_name: Identifier or path handed to ``BertModel.from_pretrained``.
            embedding_dim: Stored on the instance as ``embedding_dim``;
                ``forward`` does not read it.
        """
        super().__init__()
        self.embedding_dim = embedding_dim
        self.bert = BertModel.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask):
        """Return one unit-length embedding per input sequence."""
        encoder_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the sequence axis is used as the sentence vector ([CLS]).
        first_token_state = encoder_output.last_hidden_state[:, 0]
        return F.normalize(first_token_state, p=2, dim=1)

# === Dataset ===
class ContrastiveDataset(Dataset):
    """Tokenized view over ``(text1, text2, pair_type)`` tuples.

    Each item carries the token ids and attention masks of both texts,
    padded/truncated to ``max_length``, plus the pair-type tag.
    """

    def __init__(self, pairs, tokenizer, max_length=128):
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def _encode(self, text):
        """Tokenize one text to fixed-length tensors with a leading batch axis."""
        return self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

    def __getitem__(self, idx):
        first_text, second_text, pair_type = self.pairs[idx]

        first = self._encode(first_text)
        second = self._encode(second_text)

        # squeeze(0) drops the leading batch axis so the DataLoader can
        # stack samples into batches itself.
        return {
            "text1_input_ids": first["input_ids"].squeeze(0),
            "text1_attention_mask": first["attention_mask"].squeeze(0),
            "text2_input_ids": second["input_ids"].squeeze(0),
            "text2_attention_mask": second["attention_mask"].squeeze(0),
            "pair_type": pair_type
        }

# === Training Functions ===
def _contrastive_batch_loss(embedding_model, batch):
    """Return the in-batch contrastive loss for one ``ContrastiveDataset`` batch.

    Row ``i`` of ``text1`` is the positive for row ``i`` of ``text2``; every
    other combination inside the batch is treated as a negative.
    """
    text1_embeddings = embedding_model(
        batch["text1_input_ids"].to(device),
        batch["text1_attention_mask"].to(device),
    )
    text2_embeddings = embedding_model(
        batch["text2_input_ids"].to(device),
        batch["text2_attention_mask"].to(device),
    )

    # The model returns unit-length vectors, so the dot product is the
    # cosine similarity of every text1/text2 combination in the batch.
    similarity = torch.matmul(text1_embeddings, text2_embeddings.T)

    # Identity targets: only the diagonal (the aligned pair) is positive.
    targets = torch.eye(similarity.shape[0], device=similarity.device)

    # NOTE(review): the logits are raw cosines in [-1, 1], so the sigmoid can
    # only reach ~0.27..0.73 and the loss cannot approach zero; pairs sharing
    # the same text in one batch are also scored as negatives. Consider a
    # temperature and duplicate masking if training plateaus.
    return F.binary_cross_entropy_with_logits(similarity, targets)


def train_embedding_model(train_pairs, val_pairs, model_save_path="./joint_embedding_model"):
    """Fine-tune the joint embedding model with an in-batch contrastive loss.

    Args:
        train_pairs: ``(text1, text2, pair_type)`` tuples used for training.
        val_pairs: Tuples of the same shape used only to report a validation
            loss after each epoch. May be ``None`` or empty to skip it.
        model_save_path: Directory that receives ``model.pt`` (the state
            dict) and the tokenizer files.

    Returns:
        ``(embedding_model, tokenizer)``. When validation ran, the model is
        left in eval mode.

    Raises:
        ValueError: If ``train_pairs`` is empty.
    """
    if len(train_pairs) == 0:
        raise ValueError("train_pairs is empty; nothing to train on")

    # Initialize model and tokenizer
    model_name = "dmis-lab/biobert-v1.1"
    tokenizer = BertTokenizer.from_pretrained(model_name)
    embedding_model = JointEmbeddingModel(model_name).to(device)

    # Prepare datasets
    train_dataset = ContrastiveDataset(train_pairs, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

    val_loader = None
    if val_pairs is not None and len(val_pairs) > 0:
        val_dataset = ContrastiveDataset(val_pairs, tokenizer)
        val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

    # Optimizer
    optimizer = torch.optim.AdamW(embedding_model.parameters(), lr=2e-5)

    # Training loop
    num_epochs = 3
    print(f"Training embedding model for {num_epochs} epochs...")

    for epoch in range(num_epochs):
        embedding_model.train()
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            optimizer.zero_grad()
            loss = _contrastive_batch_loss(embedding_model, batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

        if val_loader is not None:
            # Held-out loss: no gradients, dropout disabled.
            embedding_model.eval()
            val_total = 0.0
            with torch.no_grad():
                for batch in val_loader:
                    val_total += _contrastive_batch_loss(embedding_model, batch).item()
            val_loss = val_total / len(val_loader)
            print(f"Epoch {epoch+1}/{num_epochs}, Validation loss: {val_loss:.4f}")

    # Save model and tokenizer
    os.makedirs(model_save_path, exist_ok=True)
    torch.save(embedding_model.state_dict(), os.path.join(model_save_path, "model.pt"))
    tokenizer.save_pretrained(model_save_path)

    return embedding_model, tokenizer

# === Embedding Generation ===
def generate_embeddings(model, tokenizer, texts, batch_size=16, max_length=128):
    """Embed ``texts`` with ``model`` and return the vectors as one array.

    Args:
        model: Callable module taking ``(input_ids, attention_mask)`` and
            returning one embedding row per input text.
        tokenizer: Tokenizer used to encode each batch of texts.
        texts: Iterable of strings to embed.
        batch_size: Number of texts encoded per forward pass.
        max_length: Token length every text is padded/truncated to.

    Returns:
        A 2-D ``numpy`` array with one row per text, in input order. For
        empty input the array has zero rows; its width is taken from
        ``model.embedding_dim`` when that attribute exists, else 0.
    """
    model.eval()
    texts = list(texts)

    if not texts:
        # np.vstack rejects an empty list, so return an explicit empty matrix.
        # NOTE(review): assumes embedding_dim matches the real output width.
        return np.empty((0, getattr(model, "embedding_dim", 0)), dtype=np.float32)

    embeddings = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]

            encoded = tokenizer(
                batch_texts,
                padding="max_length",
                truncation=True,
                max_length=max_length,
                return_tensors="pt"
            ).to(device)

            batch_embeddings = model(
                encoded["input_ids"],
                encoded["attention_mask"]
            )

            # Move to host memory before converting to numpy.
            embeddings.append(batch_embeddings.cpu().numpy())

    return np.vstack(embeddings)

# === Game Logic ===
class MedicalEmbeddingGame:
    """Nearest-neighbour lookups over joint disease/symptom embeddings.

    Typical use: construct with the saved model directory, call
    ``load_embeddings()`` to read the precomputed tables, then query with
    ``predict_disease_from_symptoms`` or ``evaluate_hypothesis``.
    """

    def __init__(self, model_path="./joint_embedding_model"):
        """Load the tokenizer and fine-tuned weights stored in ``model_path``.

        Args:
            model_path: Directory holding the tokenizer files and ``model.pt``.
        """
        self.tokenizer = BertTokenizer.from_pretrained(model_path)

        # Load model
        self.model = JointEmbeddingModel().to(device)
        weights_file = os.path.join(model_path, "model.pt")
        # map_location remaps tensors saved on another device (e.g. a GPU
        # checkpoint opened on a CPU-only machine) onto the active device.
        self.model.load_state_dict(torch.load(weights_file, map_location=device))
        self.model.eval()

        # Embedding tables and metadata; populated by load_embeddings().
        self.disease_embeddings = None
        self.disease_names = None
        self.symptom_embeddings = None
        self.symptom_names = None
        self.disease_to_symptoms = None

        # Nearest-neighbour indexes; populated by load_embeddings().
        self.disease_nn = None
        self.symptom_nn = None

    def load_embeddings(self, embeddings_path="./embeddings"):
        """Read precomputed embeddings/metadata and fit the neighbour indexes.

        Args:
            embeddings_path: Directory containing ``disease_embeddings.npy``,
                ``symptom_embeddings.npy``, ``disease_names.pkl``,
                ``symptom_names.pkl`` and ``disease_to_symptoms.pkl``.
        """
        self.disease_embeddings = np.load(os.path.join(embeddings_path, "disease_embeddings.npy"))
        self.symptom_embeddings = np.load(os.path.join(embeddings_path, "symptom_embeddings.npy"))

        # NOTE(review): joblib.load unpickles; only point this at trusted files.
        self.disease_names = joblib.load(os.path.join(embeddings_path, "disease_names.pkl"))
        self.symptom_names = joblib.load(os.path.join(embeddings_path, "symptom_names.pkl"))
        self.disease_to_symptoms = joblib.load(os.path.join(embeddings_path, "disease_to_symptoms.pkl"))

        self.disease_nn = NearestNeighbors(n_neighbors=5, metric="cosine")
        self.disease_nn.fit(self.disease_embeddings)

        self.symptom_nn = NearestNeighbors(n_neighbors=5, metric="cosine")
        self.symptom_nn.fit(self.symptom_embeddings)

        print(f"Loaded {len(self.disease_names)} diseases and {len(self.symptom_names)} symptoms")

    @staticmethod
    def _cosine_similarity(vec_a, vec_b):
        """Return the cosine similarity of two vectors (0.0 if either is all zeros)."""
        scale = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
        if scale == 0:
            return 0.0
        return float(np.dot(vec_a, vec_b) / scale)

    def _nearest(self, index, names, embedding, top_k):
        """Return up to ``top_k`` ``(name, cosine similarity)`` pairs from ``index``."""
        if index is None or names is None:
            raise RuntimeError("Embeddings are not loaded; call load_embeddings() first")
        # kneighbors raises when asked for more neighbours than fitted rows.
        k = min(top_k, len(names))
        distances, indices = index.kneighbors([embedding], n_neighbors=k)
        # The indexes use the cosine metric, so 1 - distance is the similarity.
        return [(names[idx], float(1 - dist)) for dist, idx in zip(distances[0], indices[0])]

    def _lookup_or_embed(self, name, names, embeddings):
        """Return the stored vector for ``name``, or embed it with the model."""
        if names is not None and embeddings is not None:
            try:
                return embeddings[names.index(name)]
            except ValueError:
                pass  # not a known name; fall back to the model
        return self._embed_text(name)

    def find_similar_diseases(self, embedding, top_k=5):
        """Return the ``top_k`` diseases closest to ``embedding``.

        Returns:
            List of ``(disease_name, cosine_similarity)``, nearest first.
            Shorter than ``top_k`` when fewer diseases are loaded.

        Raises:
            RuntimeError: If ``load_embeddings()`` has not been called.
        """
        return self._nearest(self.disease_nn, self.disease_names, embedding, top_k)

    def find_similar_symptoms(self, embedding, top_k=5):
        """Return the ``top_k`` symptoms closest to ``embedding``.

        Returns:
            List of ``(symptom_name, cosine_similarity)``, nearest first.
            Shorter than ``top_k`` when fewer symptoms are loaded.

        Raises:
            RuntimeError: If ``load_embeddings()`` has not been called.
        """
        return self._nearest(self.symptom_nn, self.symptom_names, embedding, top_k)

    def get_disease_embedding(self, disease_name):
        """Return the stored embedding for a known disease, else embed the text."""
        return self._lookup_or_embed(disease_name, self.disease_names, self.disease_embeddings)

    def get_symptom_embedding(self, symptom_name):
        """Return the stored embedding for a known symptom, else embed the text."""
        return self._lookup_or_embed(symptom_name, self.symptom_names, self.symptom_embeddings)

    def _embed_text(self, text):
        """Generate a 1-D embedding for ``text`` using the model."""
        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            embedding = self.model(
                encoded["input_ids"],
                encoded["attention_mask"]
            )

        # The model returns a batch of one; unwrap it.
        return embedding.cpu().numpy()[0]

    def merge_symptoms(self, symptom_names):
        """Average the embeddings of ``symptom_names`` and rescale to unit length.

        Raises:
            ValueError: If ``symptom_names`` is empty (the mean is undefined).
        """
        symptom_names = list(symptom_names)
        if not symptom_names:
            raise ValueError("symptom_names must contain at least one symptom")

        vectors = [self.get_symptom_embedding(symptom) for symptom in symptom_names]
        merged_embedding = np.mean(vectors, axis=0)

        # Opposing vectors can average to zero; leave that as-is rather than
        # dividing by zero and producing NaNs.
        norm = np.linalg.norm(merged_embedding)
        if norm > 0:
            merged_embedding = merged_embedding / norm

        return merged_embedding

    def predict_disease_from_symptoms(self, symptom_names):
        """Return ``(disease_name, similarity)`` for the best-matching disease."""
        merged_embedding = self.merge_symptoms(symptom_names)
        return self.find_similar_diseases(merged_embedding, top_k=1)[0]

    def calculate_embedding_similarity(self, disease_name, symptom_names):
        """Return the cosine similarity between a disease and its merged symptoms.

        Higher means closer (1.0 = same direction). The previous
        implementation returned ``1 - dot``, i.e. a distance.
        """
        disease_emb = self.get_disease_embedding(disease_name)
        merged_symptom_emb = self.merge_symptoms(symptom_names)
        return self._cosine_similarity(disease_emb, merged_symptom_emb)

    def evaluate_hypothesis(self, test_samples=None):
        """Measure how well merged symptom embeddings identify their disease.

        Args:
            test_samples: Iterable of ``(disease, symptoms)``. Defaults to
                every entry of ``disease_to_symptoms``. Entries with no
                symptoms are skipped.

        Returns:
            Dict with ``accuracy`` (fraction of samples whose nearest disease
            is the true one), ``avg_similarity`` (mean cosine similarity
            between the true disease and the merged symptoms) and
            ``num_samples``. The two means are NaN when nothing was evaluated.

        Raises:
            RuntimeError: If embeddings have not been loaded.
        """
        if test_samples is None:
            if self.disease_to_symptoms is None:
                raise RuntimeError("Embeddings are not loaded; call load_embeddings() first")
            test_samples = list(self.disease_to_symptoms.items())

        accuracies = []
        similarities = []

        for disease, symptoms in test_samples:
            if not symptoms:
                continue

            merged_emb = self.merge_symptoms(symptoms)
            predicted_disease, _ = self.find_similar_diseases(merged_emb, top_k=1)[0]
            accuracies.append(1 if predicted_disease == disease else 0)

            # True cosine similarity to the labelled disease (previously
            # 1 - Euclidean distance, which is not a cosine).
            disease_emb = self.get_disease_embedding(disease)
            similarities.append(self._cosine_similarity(disease_emb, merged_emb))

        if not accuracies:
            return {
                "accuracy": float("nan"),
                "avg_similarity": float("nan"),
                "num_samples": 0
            }

        return {
            "accuracy": float(np.mean(accuracies)),
            "avg_similarity": float(np.mean(similarities)),
            "num_samples": len(accuracies)
        }

# === Main Execution ===
def _collect_disease_symptoms(df):
    """Return ``(disease_names, symptom_names, disease_to_symptoms)`` from ``df``.

    Names keep first-seen order. Each disease maps to the union of the
    symptoms found on all of its rows.
    """
    symptom_cols = [col for col in df.columns if col.startswith("Symptom_")]

    # Dicts are used as insertion-ordered sets: O(1) membership, stable order.
    symptoms_by_disease = {}
    seen_symptoms = {}

    for _, row in df.iterrows():
        disease = str(row["Disease"]).strip()

        symptoms = []
        for col in symptom_cols:
            if pd.isna(row[col]):
                continue
            symptom = str(row[col]).strip()
            if symptom and symptom.lower() != "nan":
                symptoms.append(symptom)

        seen_symptoms.update(dict.fromkeys(symptoms))
        if disease:
            symptoms_by_disease.setdefault(disease, {}).update(dict.fromkeys(symptoms))

    disease_to_symptoms = {name: list(found) for name, found in symptoms_by_disease.items()}
    return list(symptoms_by_disease), list(seen_symptoms), disease_to_symptoms


def main():
    """Run the full pipeline: load data, train, embed, evaluate, and demo.

    Writes the trained model to ``./joint_embedding_model`` and the
    embeddings plus metadata to ``./embeddings``.

    Returns:
        The ``MedicalEmbeddingGame`` with embeddings loaded.

    Raises:
        ValueError: If the dataset contains no diseases.
    """
    # Step 1: Load data
    df = load_data()

    # Step 2: Prepare training data
    pairs, disease_encoder, symptom_encoder = prepare_training_data(df)
    train_pairs, test_pairs = split_data(pairs)

    # Step 3: Train embedding model
    model_save_path = "./joint_embedding_model"
    embedding_model, tokenizer = train_embedding_model(train_pairs, test_pairs, model_save_path)

    # Step 4: Generate and save embeddings
    print("Generating embeddings...")

    # Previously only the first row of each disease contributed to the
    # mapping, although the symptom list was gathered from every row.
    disease_names, symptom_names, disease_to_symptoms = _collect_disease_symptoms(df)
    if not disease_names:
        raise ValueError("No diseases found in dataset; cannot build embeddings")

    disease_embeddings = generate_embeddings(embedding_model, tokenizer, disease_names)
    symptom_embeddings = generate_embeddings(embedding_model, tokenizer, symptom_names)

    # Save embeddings and metadata
    embeddings_path = "./embeddings"
    os.makedirs(embeddings_path, exist_ok=True)

    np.save(os.path.join(embeddings_path, "disease_embeddings.npy"), disease_embeddings)
    np.save(os.path.join(embeddings_path, "symptom_embeddings.npy"), symptom_embeddings)

    joblib.dump(disease_names, os.path.join(embeddings_path, "disease_names.pkl"))
    joblib.dump(symptom_names, os.path.join(embeddings_path, "symptom_names.pkl"))
    joblib.dump(disease_to_symptoms, os.path.join(embeddings_path, "disease_to_symptoms.pkl"))

    # Step 5: Initialize game and test hypothesis
    print("Testing hypothesis...")
    game = MedicalEmbeddingGame(model_save_path)
    game.load_embeddings(embeddings_path)

    results = game.evaluate_hypothesis()
    print("Hypothesis results:")
    print(f"Accuracy: {results['accuracy']:.4f}")
    print(f"Average similarity: {results['avg_similarity']:.4f}")
    print(f"Number of samples: {results['num_samples']}")

    # Step 6: Interactive demo
    print("\nDemo: Predicting disease from symptoms")
    # Use the first disease that has symptoms; merging an empty list is undefined.
    sample_disease = next((name for name in disease_names if disease_to_symptoms[name]), None)
    if sample_disease is None:
        print("No disease with symptoms available for the demo")
        return game

    sample_symptoms = disease_to_symptoms[sample_disease]

    print(f"Known disease: {sample_disease}")
    print(f"Known symptoms: {sample_symptoms}")

    # Predict using merged embeddings
    merged_emb = game.merge_symptoms(sample_symptoms)
    predicted_disease, similarity = game.find_similar_diseases(merged_emb, top_k=1)[0]

    print(f"Predicted disease: {predicted_disease}")
    print(f"Similarity score: {similarity:.4f}")

    return game

# === Example Usage ===
def example_predict_disease(game, symptoms):
    """Print and return the disease ``game`` predicts for ``symptoms``.

    Args:
        game: Object exposing ``predict_disease_from_symptoms``.
        symptoms: Symptom names to merge for the prediction.

    Returns:
        The predicted disease name.
    """
    prediction = game.predict_disease_from_symptoms(symptoms)
    disease, similarity = prediction
    print(f"Symptoms: {symptoms}")
    print(f"Predicted disease: {disease} (similarity: {similarity:.4f})")
    return disease

def example_analyze_embedding_space(game):
    """Plot a per-disease score for up to the first 10 known diseases.

    For each sampled disease that has symptoms, the score is whatever
    ``game.calculate_embedding_similarity(disease, symptoms)`` returns. The
    bar chart is written to ``disease_symptom_similarity.png``.

    Args:
        game: Object exposing ``disease_to_symptoms`` and
            ``calculate_embedding_similarity``.

    Returns:
        List of ``(disease, score)`` tuples; empty (and no plot is written)
        when none of the sampled diseases has symptoms.
    """
    sample_diseases = list(game.disease_to_symptoms.keys())[:10]

    similarities = []
    for disease in sample_diseases:
        symptoms = game.disease_to_symptoms[disease]
        if symptoms:
            sim = game.calculate_embedding_similarity(disease, symptoms)
            similarities.append((disease, sim))

    if not similarities:
        # zip(*[]) cannot be unpacked into two names, so stop before plotting.
        print("No diseases with symptoms available; skipping similarity plot")
        return similarities

    diseases, scores = zip(*similarities)
    plt.figure(figsize=(12, 6))
    plt.bar(diseases, scores)
    plt.xticks(rotation=90)
    plt.title("Disease-Symptom Embedding Similarity")
    plt.ylabel("Cosine Similarity")
    plt.tight_layout()
    plt.savefig("disease_symptom_similarity.png")
    plt.close()

    print("Saved similarity analysis to disease_symptom_similarity.png")
    return similarities

# Script entry point: runs only when this cell/file is executed directly.
if __name__ == "__main__":
    # Runs the whole pipeline and keeps the returned game object as a
    # module-level name for the examples below.
    game = main()

    # Additional examples
    # Free-text symptoms: names absent from the stored symptom list are
    # embedded on the fly by the game object.
    print("\nExample: Predicting disease from symptoms")
    example_predict_disease(game, ["fever", "cough", "fatigue"])

    # Writes disease_symptom_similarity.png to the working directory.
    print("\nExample: Analyzing embedding space")
    example_analyze_embedding_space(game)

In [None]:
import zipfile
import os

def zip_folder(folder_path, output_path):
    """Compresses the contents of a folder into a ZIP file.

    Files are stored under their path relative to ``folder_path`` and added
    in sorted order so the archive layout is reproducible. If the archive is
    created inside ``folder_path`` it is not added to itself. Empty
    directories are not recorded, and a missing ``folder_path`` yields an
    empty archive.

    Args:
        folder_path (str): The path to the folder you want to compress.
        output_path (str): The desired path for the output ZIP file.
    """
    archive_realpath = os.path.realpath(output_path)

    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_path):
            # Sorting in place also fixes the order os.walk descends in.
            dirs.sort()
            for file in sorted(files):
                file_path = os.path.join(root, file)
                # The archive already exists on disk at this point; skip it
                # so a half-written copy is not packed into itself.
                if os.path.realpath(file_path) == archive_realpath:
                    continue
                relative_path = os.path.relpath(file_path, folder_path)
                zipf.write(file_path, relative_path)

# Example usage (runs at import/cell execution time, not behind a main guard):
folder_to_compress = '/kaggle/working/data'  # Source folder; replace with the actual path to your folder
output_zip_file = '/kaggle/working/data.zip' # Destination archive; sits next to the folder, not inside it

zip_folder(folder_to_compress, output_zip_file)

# Printed whenever zip_folder returns without raising; the archive contents
# are not checked here.
print(f"Folder '{folder_to_compress}' has been successfully compressed to '{output_zip_file}'")

In [None]:
import os
import json
import numpy as np
import joblib
from sklearn.neighbors import NearestNeighbors

def convert_embeddings_to_json(embeddings_dir="./kaggle/working/data", output_dir="./frontend/public/data"):
    """
    Convert saved embeddings and metadata to JSON format for use in React

    Writes ``diseases.json``, ``symptoms.json``, ``game_levels.json`` and an
    embedding-free ``metadata.json`` into ``output_dir``. Up to 20 game
    levels are generated from randomly chosen diseases that have at least
    3 symptoms, each with up to 5 random distractor symptoms.

    Args:
        embeddings_dir: Directory containing saved embeddings
        output_dir: Directory to save JSON files

    Returns:
        None. If the inputs cannot be loaded, the error is printed and no
        JSON file is written.
    """
    import random

    # NOTE(review): the default embeddings_dir is a *relative* "./kaggle/..."
    # path; confirm it is not meant to be the absolute "/kaggle/working/data".

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Load embeddings and metadata
    try:
        disease_embeddings = np.load(os.path.join(embeddings_dir, "disease_embeddings.npy"))
        symptom_embeddings = np.load(os.path.join(embeddings_dir, "symptom_embeddings.npy"))

        # NOTE(review): joblib.load unpickles; only use with trusted files.
        disease_names = joblib.load(os.path.join(embeddings_dir, "disease_names.pkl"))
        symptom_names = joblib.load(os.path.join(embeddings_dir, "symptom_names.pkl"))
        disease_to_symptoms = joblib.load(os.path.join(embeddings_dir, "disease_to_symptoms.pkl"))

        print(f"Loaded {len(disease_names)} diseases and {len(symptom_names)} symptoms")
    except Exception as e:
        print(f"Error loading embeddings: {e}")
        return

    # (The nearest-neighbour index previously fitted here was never used.)

    # 1. Disease data with embeddings
    disease_data = [
        {
            "id": i,
            "name": disease,
            "embedding": disease_embeddings[i].tolist(),
            # list(...) keeps tuples/sets/arrays JSON-serializable.
            "symptoms": list(disease_to_symptoms.get(disease, [])),
        }
        for i, disease in enumerate(disease_names)
    ]

    # 2. Symptom data with embeddings
    symptom_data = [
        {
            "id": i,
            "name": symptom,
            "embedding": symptom_embeddings[i].tolist(),
        }
        for i, symptom in enumerate(symptom_names)
    ]

    # 3. Generate some game levels (randomly select diseases)
    # Select diseases that have at least 3 symptoms
    eligible_diseases = [d for d in disease_data if len(d["symptoms"]) >= 3]

    # Create 20 game levels or fewer if we don't have enough eligible diseases
    num_levels = min(20, len(eligible_diseases))
    game_levels = []

    for level, disease in enumerate(random.sample(eligible_diseases, num_levels), start=1):
        own_symptoms = set(disease["symptoms"])
        distractor_pool = [s["name"] for s in symptom_data if s["name"] not in own_symptoms]
        game_levels.append({
            "level": level,
            "targetDisease": disease["name"],
            "diseaseId": disease["id"],
            "availableSymptoms": disease["symptoms"],
            # Size the sample from the filtered pool: random.sample raises
            # ValueError when asked for more items than the pool holds.
            "distractorSymptoms": random.sample(distractor_pool, min(5, len(distractor_pool))),
        })

    # Create a smaller metadata file without embeddings for faster loading
    metadata = {
        "diseases": [{"id": d["id"], "name": d["name"], "symptoms": d["symptoms"]}
                     for d in disease_data],
        "symptoms": [{"id": s["id"], "name": s["name"]} for s in symptom_data],
        "levels": game_levels,
    }

    # Write to JSON files
    outputs = (
        ("diseases.json", disease_data),
        ("symptoms.json", symptom_data),
        ("game_levels.json", game_levels),
        ("metadata.json", metadata),
    )
    for filename, payload in outputs:
        with open(os.path.join(output_dir, filename), "w", encoding="utf-8") as f:
            json.dump(payload, f)

    print(f"Successfully converted embeddings to JSON. Files saved to {output_dir}")
    print(f"Generated {len(game_levels)} game levels")

# Script entry point: convert using the function's default directories.
if __name__ == "__main__":
    convert_embeddings_to_json()

In [None]:
import numpy as np
import joblib
import json
import os

def convert_embeddings_to_json(embeddings_path="/kaggle/input/embeddings/", output_path="./data"):
    """
    Export saved embeddings and metadata as name-keyed JSON for the web app.

    Writes ``diseases.json`` and ``symptoms.json`` (keyed by name, with
    embeddings) plus ``symptom_list.json`` and ``disease_list.json`` (plain
    name lists) into ``output_path``.

    Returns:
        True when all four files were written; False when loading the inputs
        or writing the outputs failed (the error is printed).
    """
    # Make sure the destination exists before anything else.
    os.makedirs(output_path, exist_ok=True)

    def _source(filename):
        return os.path.join(embeddings_path, filename)

    # Load disease and symptom data
    try:
        disease_embeddings = np.load(_source("disease_embeddings.npy"))
        symptom_embeddings = np.load(_source("symptom_embeddings.npy"))
        disease_names = joblib.load(_source("disease_names.pkl"))
        symptom_names = joblib.load(_source("symptom_names.pkl"))
        disease_to_symptoms = joblib.load(_source("disease_to_symptoms.pkl"))

        print(f"Loaded {len(disease_names)} diseases and {len(symptom_names)} symptoms")
    except Exception as e:
        print(f"Error loading embeddings: {e}")
        return False

    # Arrays are not JSON-serializable; turn them into nested lists once.
    disease_rows = disease_embeddings.tolist()
    symptom_rows = symptom_embeddings.tolist()

    # Name-keyed records; a repeated name keeps its last row.
    disease_data = {
        disease: {
            "embedding": disease_rows[i],
            "symptoms": disease_to_symptoms.get(disease, [])
        }
        for i, disease in enumerate(disease_names)
    }
    symptom_data = {
        symptom: {"embedding": symptom_rows[i]}
        for i, symptom in enumerate(symptom_names)
    }

    # Save data to JSON files
    try:
        outputs = (
            ("diseases.json", disease_data),
            ("symptoms.json", symptom_data),
            # Simplified name lists (without embeddings) for the UI.
            ("symptom_list.json", list(symptom_names)),
            ("disease_list.json", list(disease_names)),
        )
        for filename, payload in outputs:
            with open(os.path.join(output_path, filename), "w") as f:
                json.dump(payload, f)

        print(f"Successfully saved JSON data to {output_path}")
        return True
    except Exception as e:
        print(f"Error saving JSON: {e}")
        return False

# Script entry point: runs only when this cell/file is executed directly.
if __name__ == "__main__":
    # Convert embeddings to JSON using the default input/output paths.
    # The boolean result is not checked here.
    convert_embeddings_to_json()