<a href="https://www.kaggle.com/code/avtnshm/clinical-modernbert-v-biomedicalmb-on-ddxplus-data?scriptVersionId=255160735" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd

# Load the train, test, and validate datasets
train_df = pd.read_csv('/kaggle/input/mldataset/ddxplus/train.csv')
test_df = pd.read_csv('/kaggle/input/mldataset/ddxplus/test.csv')
validate_df = pd.read_csv('/kaggle/input/mldataset/ddxplus/validate.csv')

# Display the first few rows of the train dataset
train_df.head()

In [None]:

import json

# Load JSON files
with open('/kaggle/input/mldataset/ddxplus/release_conditions.json', 'r') as f:
    conditions = json.load(f)
with open('/kaggle/input/mldataset/ddxplus/release_evidences.json', 'r') as f:
    evidences = json.load(f)

# Convert JSON to DataFrames for easier inspection (optional)
conditions_df = pd.DataFrame.from_dict(conditions, orient='index')
evidences_df = pd.DataFrame.from_dict(evidences, orient='index')

# Display the first few rows
print("Conditions (Diseases):")
print(conditions_df.head())
print("\nEvidences (Symptoms):")
print(evidences_df.head())

In [None]:
def map_symptom_codes(symptom_dict, evidences_df):
    return [evidences_df.loc[code]["question_en"] for code in symptom_dict.keys() if code in evidences_df.index]

conditions_df["symptom_questions"] = conditions_df["symptoms"].apply(lambda x: map_symptom_codes(x, evidences_df))
conditions_df["antecedent_questions"] = conditions_df["antecedents"].apply(lambda x: map_symptom_codes(x, evidences_df))

In [None]:
print(conditions_df[["condition_name", "symptom_questions", "antecedent_questions"]].head())

In [None]:
# Map PATHOLOGY to condition_name
train_df['disease_name'] = train_df['PATHOLOGY'].map(lambda x: conditions.get(x, {}).get('condition_name', x))

# Display the result
print(train_df[['PATHOLOGY', 'disease_name']].head())

In [None]:
# Delete the symptom_questions and antecedent_questions columns
conditions_df = conditions_df.drop(columns=["symptom_questions", "antecedent_questions"], errors="ignore")

# Verify the columns are removed
print(conditions_df.columns)

In [None]:
def map_symptom_codes(symptom_dict, evidences_df):
    # Return list of English questions for valid symptom codes
    return [evidences_df.loc[code, "question_en"] for code in symptom_dict.keys() if code in evidences_df.index]

# Apply to symptoms and antecedents in conditions_df
conditions_df["symptom_questions"] = conditions_df["symptoms"].apply(lambda x: map_symptom_codes(x, evidences_df))
conditions_df["antecedent_questions"] = conditions_df["antecedents"].apply(lambda x: map_symptom_codes(x, evidences_df))

# Display the result
print(conditions_df[["condition_name", "symptom_questions", "antecedent_questions"]].head())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot top 10 diseases
plt.figure(figsize=(10, 6))
train_df['disease_name'].value_counts().head(10).plot(kind='bar', color='#1f77b4')
plt.title('Top 10 Most Frequent Diseases')
plt.xlabel('Disease')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Column names from conditions_df
print("Conditions DataFrame columns:")
print(conditions_df.columns.tolist())

# Column names from evidences_df
print("\nEvidences DataFrame columns:")
print(evidences_df.columns.tolist())


print("Train columns:", train_df.columns.tolist())
print("Validate columns:", validate_df.columns.tolist())
print("Test columns:", test_df.columns.tolist())

In [None]:
!pip install transformers -q
from transformers import AutoTokenizer, AutoModel
import torch

In [None]:
CMB = AutoModel.from_pretrained('Simonlee711/Clinical_ModernBERT')
tokenizer1 = AutoTokenizer.from_pretrained('Simonlee711/Clinical_ModernBERT')
CMB.__class__

In [None]:
from transformers import AutoModel

BioClinical_ModernBERT = "thomas-sounack/BioClinical-ModernBERT-base"

tokenizer2 = AutoTokenizer.from_pretrained(BioClinical_ModernBERT)
BMB = AutoModel.from_pretrained(BioClinical_ModernBERT)

BMB.__class__

In [None]:
print(train_df['DIFFERENTIAL_DIAGNOSIS'].nunique())
print(test_df['DIFFERENTIAL_DIAGNOSIS'].nunique())
validate_df['DIFFERENTIAL_DIAGNOSIS'].nunique()

In [None]:
!nvidia-smi

In [None]:
# Full final cell: batched embedding for full dataset, per-batch save, and timing
import os
import time
import numpy as np
from tqdm import tqdm
import torch

# ---------------- Configuration ----------------
BATCH_SIZE = 64
OUT_DIR = "/kaggle/working/embeddings"              # final files and chunks will be saved here
os.makedirs(OUT_DIR, exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ---------------- Helper: build text for each row ----------------
def get_symptom_text(evidence_codes, evidences_df):
    # safe guard if NaN or empty
    if not isinstance(evidence_codes, str) or evidence_codes.strip() == "":
        return ""
    parts = []
    for code in evidence_codes.split(';'):
        if code in evidences_df.index:
            parts.append(evidences_df.loc[code, "question_en"])
    return " ".join(parts)

def build_text_list(train_df, validate_df, test_df, evidences_df):
    # concatenate dataframes in the same order you want embeddings saved
    dfs = [train_df, validate_df, test_df]
    texts = []
    for df in dfs:
        for _, row in df.iterrows():
            # prefer EVIDENCES -> question_en mapping if present/available
            if 'EVIDENCES' in df.columns and isinstance(row.get('EVIDENCES', ""), str) and row['EVIDENCES'].strip() != "":
                txt = get_symptom_text(row['EVIDENCES'], evidences_df)
                if txt:
                    texts.append(txt)
                    continue
            # fallback to PATHOLOGY column (string)
            if 'PATHOLOGY' in df.columns and not pd.isna(row.get('PATHOLOGY', None)):
                texts.append(str(row['PATHOLOGY']))
            else:
                texts.append("")  # keep alignment for counts
    return texts

# build full_texts (ordered: train, validate, test)
import pandas as pd
full_texts = build_text_list(train_df, validate_df, test_df, evidences_df)
n_samples = len(full_texts)
print(f"Total texts to embed (train+validate+test): {n_samples:,}")

# ---------------- Batched embedding (per-batch save) ----------------
def embed_and_save_in_batches(text_list, tokenizer, model, model_tag, batch_size=64, out_dir=OUT_DIR):
    model = model.to(device)
    model.eval()
    chunk_files = []
    total_batches = (len(text_list) + batch_size - 1) // batch_size

    start_time = time.time()
    processed = 0

    for batch_idx in tqdm(range(total_batches), desc=f"{model_tag} batches"):
        i = batch_idx * batch_size
        batch_texts = text_list[i : i + batch_size]
        # tokenizer -> tensors, move to device
        inputs = tokenizer(
            list(batch_texts),
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()  # (batch_size, hidden_dim)

        # Save this chunk immediately
        chunk_fname = os.path.join(out_dir, f"{model_tag}_emb_chunk_{batch_idx:05d}.npy")
        np.save(chunk_fname, cls_embeddings)
        chunk_files.append(chunk_fname)

        processed += cls_embeddings.shape[0]
        # print a lightweight progress summary every 100 batches
        if (batch_idx + 1) % 100 == 0 or (batch_idx + 1) == total_batches:
            elapsed = time.time() - start_time
            avg_per_batch = elapsed / (batch_idx + 1)
            remaining_batches = total_batches - (batch_idx + 1)
            eta = avg_per_batch * remaining_batches
            print(f"{model_tag} processed {processed}/{len(text_list)} rows — elapsed {elapsed:.1f}s, ETA {eta/60:.2f}min")

    total_time = time.time() - start_time
    print(f"\n{model_tag} finished. Total time: {total_time:.2f} seconds ({total_time/60:.2f} minutes).")
    return chunk_files, total_time

# ---------------- Run embeddings for both models and save chunks ----------------
all_times = {}

# Clinical ModernBERT (CMB)
cmb_chunks, cmb_time = embed_and_save_in_batches(
    full_texts, tokenizer1, CMB, model_tag="CMB", batch_size=BATCH_SIZE, out_dir=OUT_DIR
)
all_times['CMB'] = cmb_time

# BioClinical ModernBERT (BMB)
bmb_chunks, bmb_time = embed_and_save_in_batches(
    full_texts, tokenizer2, BMB, model_tag="BMB", batch_size=BATCH_SIZE, out_dir=OUT_DIR
)
all_times['BMB'] = bmb_time

# ---------------- Concatenate chunk files into final single .npy (streaming) ----------------
def concatenate_chunks(chunk_files, final_path):
    # load first to get dtype & shape
    if not chunk_files:
        raise ValueError("No chunk files to concatenate.")
    # compute total rows
    total_rows = 0
    for f in chunk_files:
        arr = np.load(f, mmap_mode='r')
        total_rows += arr.shape[0]
    # infer dim from first file
    first = np.load(chunk_files[0], mmap_mode='r')
    emb_dim = first.shape[1]

    # create memmap to write final output without storing all in memory
    final_np = np.memmap(final_path, dtype=first.dtype, mode='w+', shape=(total_rows, emb_dim))
    pos = 0
    for f in chunk_files:
        arr = np.load(f)
        final_np[pos: pos + arr.shape[0]] = arr
        pos += arr.shape[0]
    # flush to disk
    del final_np

# final filenames
cmb_final = os.path.join(OUT_DIR, "cmb_embeddings_full.npy")
bmb_final = os.path.join(OUT_DIR, "bmb_embeddings_full.npy")

print("\nConcatenating CMB chunks into final file...")
conc_start = time.time()
concatenate_chunks(cmb_chunks, cmb_final)
conc_time_cmb = time.time() - conc_start
print(f"CMB concatenation time: {conc_time_cmb:.2f} sec. Final file: {cmb_final}")

print("\nConcatenating BMB chunks into final file...")
conc_start = time.time()
concatenate_chunks(bmb_chunks, bmb_final)
conc_time_bmb = time.time() - conc_start
print(f"BMB concatenation time: {conc_time_bmb:.2f} sec. Final file: {bmb_final}")

# ---------------- Summary & verification ----------------
total_time = all_times['CMB'] + all_times['BMB'] + conc_time_cmb + conc_time_bmb
print("\n----- SUMMARY -----")
print(f"Samples embedded: {n_samples:,}")
print(f"CMB embedding time: {all_times['CMB']:.2f} s ({all_times['CMB']/60:.2f} min)")
print(f"BMB embedding time: {all_times['BMB']:.2f} s ({all_times['BMB']/60:.2f} min)")
print(f"CMB concatenation time: {conc_time_cmb:.2f} s")
print(f"BMB concatenation time: {conc_time_bmb:.2f} s")
print(f"Total elapsed time (including concatenation): {total_time/60:.2f} minutes")

# quick shape check
cmb_arr = np.load(cmb_final, mmap_mode='r')
bmb_arr = np.load(bmb_final, mmap_mode='r')
print("\nFinal shapes (memmap):")
print("CMB:", cmb_arr.shape)
print("BMB:", bmb_arr.shape)

In [None]:
import os
import shutil

# Path where your final embeddings are
embeddings_dir = "/kaggle/working/embeddings"

# Check if files are there
print("Files in embeddings_dir:", os.listdir(embeddings_dir))

# Move them to /kaggle/working (Kaggle only lets you create dataset from /kaggle/working root)
for fname in os.listdir(embeddings_dir):
    shutil.copy(os.path.join(embeddings_dir, fname), f"/kaggle/working/{fname}")

print("Copied to /kaggle/working for dataset saving.")

In [None]:
import os, shutil

embeddings_dir = "/kaggle/working/embeddings"

for fname in os.listdir(embeddings_dir):
    shutil.move(os.path.join(embeddings_dir, fname), f"/kaggle/working/{fname}")

print("Moved embeddings to /kaggle/working.")

In [None]:
import os

embeddings_dir = '/kaggle/working/embeddings'

if os.path.exists(embeddings_dir):
    print(f"Directory '{embeddings_dir}' exists.")
    files = os.listdir(embeddings_dir)
    print(f"Number of files: {len(files)}")
    print("Files:", files)
else:
    print(f"Directory '{embeddings_dir}' does NOT exist.")

In [None]:
import os

working_dir = '/kaggle/working'

if os.path.exists(working_dir):
    files = os.listdir(working_dir)
    if files:
        print(f"Files found in {working_dir}:")
        for f in files:
            print(f" - {f}")
    else:
        print(f"No files found in {working_dir}.")
else:
    print(f"Directory {working_dir} does not exist.")

In [None]:
!df -h /kaggle/working

In [None]:
import os
import shutil

source_dir = '/kaggle/working/embeddings'  # Your embeddings chunk files here
target_root = '/kaggle/working/split_embeddings'

max_size_bytes = 1.5 * 1024**3  # 1.5 GB

os.makedirs(target_root, exist_ok=True)

current_folder_idx = 1
current_folder_path = os.path.join(target_root, f'embeddings_part_{current_folder_idx:02d}')
os.makedirs(current_folder_path, exist_ok=True)

current_size = 0

for fname in sorted(os.listdir(source_dir)):
    fpath = os.path.join(source_dir, fname)
    fsize = os.path.getsize(fpath)

    # If adding this file exceeds size limit, start new folder
    if current_size + fsize > max_size_bytes:
        current_folder_idx += 1
        current_folder_path = os.path.join(target_root, f'embeddings_part_{current_folder_idx:02d}')
        os.makedirs(current_folder_path, exist_ok=True)
        current_size = 0

    # Move file to current folder
    shutil.move(fpath, os.path.join(current_folder_path, fname))
    current_size += fsize

print(f"Split embeddings into {current_folder_idx} folders under {target_root}")

In [None]:
import os

working_dir = '/kaggle/working'

files = [f for f in os.listdir(working_dir) if os.path.isfile(os.path.join(working_dir, f))]

print(f"Number of files in {working_dir}: {len(files)}")

In [None]:
import os
import shutil

source_dir = '/kaggle/working'
target_root = '/kaggle/working/split_chunks'

os.makedirs(target_root, exist_ok=True)

files = [f for f in os.listdir(source_dir) if os.path.isfile(os.path.join(source_dir, f))]
files.sort()

files_per_folder = 3000
folder_count = 1

for i in range(0, len(files), files_per_folder):
    folder_name = f'embeddings_part_{folder_count:03d}'
    folder_path = os.path.join(target_root, folder_name)
    os.makedirs(folder_path, exist_ok=True)

    batch_files = files[i:i+files_per_folder]
    for f in batch_files:
        shutil.move(os.path.join(source_dir, f), os.path.join(folder_path, f))

    print(f"Moved {len(batch_files)} files to {folder_name}")
    folder_count += 1

print(f"Done splitting files into {folder_count - 1} folders at {target_root}")

In [None]:
import os

# Path where split chunk folders should be
split_path = "/kaggle/working/split_chunks"

if os.path.exists(split_path):
    folders = sorted(os.listdir(split_path))
    print(f"Found {len(folders)} folders in split_chunks:\n")
    for f in folders:
        folder_path = os.path.join(split_path, f)
        file_count = len(os.listdir(folder_path))
        print(f"{f} - {file_count} files")
else:
    print("split_chunks folder not found.")

In [None]:
# List categorical columns from all required DataFrames

def get_categorical_columns(df, df_name):
    cat_cols = df.select_dtypes(include=['object']).columns.tolist()
    print(f"{df_name} categorical columns ({len(cat_cols)}): {cat_cols}")
    return cat_cols

train_cat_cols = get_categorical_columns(train_df, "Train")
validate_cat_cols = get_categorical_columns(validate_df, "Validate")
test_cat_cols = get_categorical_columns(test_df, "Test")

In [20]:
import numpy as np
import pandas as pd
import os
import glob
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report, 
    confusion_matrix, top_k_accuracy_score
)
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [22]:
# Step 1: Find and inspect embedding files
import os
import glob
import numpy as np

print("=== STEP 1: FINDING YOUR EMBEDDING FILES ===")

# Check what's in your working directory
working_dir = '/kaggle/working'
print(f"\nContents of {working_dir}:")
items = os.listdir(working_dir)
for item in sorted(items):
    item_path = os.path.join(working_dir, item)
    if os.path.isdir(item_path):
        print(f"📁 {item}/")
    else:
        size_mb = os.path.getsize(item_path) / (1024*1024)
        print(f"📄 {item} ({size_mb:.1f} MB)")

# Look for any .npy files recursively
print(f"\n=== SEARCHING FOR .NPY FILES ===")
all_npy_files = glob.glob(f"{working_dir}/**/*.npy", recursive=True)
print(f"Found {len(all_npy_files)} .npy files total:")

cmb_files = []
bmb_files = []

for file in sorted(all_npy_files):
    size_mb = os.path.getsize(file) / (1024*1024)
    rel_path = file.replace(working_dir, "")
    print(f"  {rel_path} ({size_mb:.1f} MB)")
    
    if 'CMB' in file:
        cmb_files.append(file)
    if 'BMB' in file:
        bmb_files.append(file)

print(f"\n=== SUMMARY ===")
print(f"CMB files found: {len(cmb_files)}")
print(f"BMB files found: {len(bmb_files)}")

# Check if we have final embeddings already
final_cmb = os.path.join(working_dir, 'cmb_embeddings_full.npy')
final_bmb = os.path.join(working_dir, 'bmb_embeddings_full.npy')

if os.path.exists(final_cmb) and os.path.exists(final_bmb):
    print(f"\n✅ FOUND FINAL EMBEDDINGS:")
    cmb_size = os.path.getsize(final_cmb) / (1024*1024)
    bmb_size = os.path.getsize(final_bmb) / (1024*1024)
    print(f"  cmb_embeddings_full.npy ({cmb_size:.1f} MB)")
    print(f"  bmb_embeddings_full.npy ({bmb_size:.1f} MB)")
    
    # Quick shape check
    try:
        cmb_shape = np.load(final_cmb, mmap_mode='r').shape
        bmb_shape = np.load(final_bmb, mmap_mode='r').shape
        print(f"  CMB shape: {cmb_shape}")
        print(f"  BMB shape: {bmb_shape}")
        print(f"  ✅ Final embeddings are ready to use!")
    except Exception as e:
        print(f"  ❌ Error reading final embeddings: {e}")
        
else:
    print(f"\n⚠️  FINAL EMBEDDINGS NOT FOUND")
    print(f"Need to reconstruct from {len(cmb_files)} CMB chunks and {len(bmb_files)} BMB chunks")

# Test load one chunk file to see what we're dealing with
if cmb_files:
    print(f"\n=== TESTING ONE CHUNK FILE ===")
    test_file = cmb_files[0]
    try:
        test_chunk = np.load(test_file)
        print(f"✅ Successfully loaded: {test_file}")
        print(f"   Shape: {test_chunk.shape}")
        print(f"   Dtype: {test_chunk.dtype}")
    except Exception as e:
        print(f"❌ Error loading {test_file}: {e}")

=== STEP 1: FINDING YOUR EMBEDDING FILES ===

Contents of /kaggle/working:
📁 .virtual_documents/
📁 embeddings/
📁 split_chunks/
📄 state.db (1635.9 MB)

=== SEARCHING FOR .NPY FILES ===
Found 38848 .npy files total:
  /split_chunks/embeddings_part_001/BMB_emb_chunk_00000.npy (0.2 MB)
  /split_chunks/embeddings_part_001/BMB_emb_chunk_00001.npy (0.2 MB)
  /split_chunks/embeddings_part_001/BMB_emb_chunk_00002.npy (0.2 MB)
  /split_chunks/embeddings_part_001/BMB_emb_chunk_00003.npy (0.2 MB)
  /split_chunks/embeddings_part_001/BMB_emb_chunk_00004.npy (0.2 MB)
  /split_chunks/embeddings_part_001/BMB_emb_chunk_00005.npy (0.2 MB)
  /split_chunks/embeddings_part_001/BMB_emb_chunk_00006.npy (0.2 MB)
  /split_chunks/embeddings_part_001/BMB_emb_chunk_00007.npy (0.2 MB)
  /split_chunks/embeddings_part_001/BMB_emb_chunk_00008.npy (0.2 MB)
  /split_chunks/embeddings_part_001/BMB_emb_chunk_00009.npy (0.2 MB)
  /split_chunks/embeddings_part_001/BMB_emb_chunk_00010.npy (0.2 MB)
  /split_chunks/embeddings_