In [1]:
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModel
import torch
from pytorch_tabular.models.tab_transformer import TabTransformerModel
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# Step 1: Load Dataset
def load_data(file_path):
    """Loads the dataset and provides an initial overview."""
    data = pd.read_csv(file_path)
    print("Initial Dataset Info:\n", data.info())
    print("\nSample Data:\n", data.head())
    return data
data = load_data("C:\\Users\\sriha\\Music\\Case Comps\\NEST\\Data\\category_updated.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257577 entries, 0 to 257576
Data columns (total 22 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Unnamed: 0                  257577 non-null  int64  
 1   NCT Number                  257577 non-null  object 
 2   Study Title                 257577 non-null  object 
 3   Study Status                257577 non-null  object 
 4   Brief Summary               257577 non-null  object 
 5   Conditions                  257577 non-null  object 
 6   Primary Outcome Measures    247086 non-null  object 
 7   Secondary Outcome Measures  185779 non-null  object 
 8   Other Outcome Measures      18272 non-null   object 
 9   Sponsor                     257577 non-null  object 
 10  Collaborators               83679 non-null   object 
 11  Sex                         257317 non-null  object 
 12  Age                         257577 non-null  object 
 13  Phases        

In [3]:
data= data.head(5)
data.columns

Index(['Unnamed: 0', 'NCT Number', 'Study Title', 'Study Status',
       'Brief Summary', 'Conditions', 'Primary Outcome Measures',
       'Secondary Outcome Measures', 'Other Outcome Measures', 'Sponsor',
       'Collaborators', 'Sex', 'Age', 'Phases', 'Enrollment', 'Funder Type',
       'Study Type', 'Study Design', 'Start Month', 'Start Quarter',
       'Condition Category', 'Conditions_Category'],
      dtype='object')

In [3]:
"""Extracts embeddings for text data using the specified BERT model."""
biotokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
biomodel = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

In [5]:
def bio_extract_text_embeddings(text_list, batch_size=512):
    """Extracts embeddings for text data using BioBERT with batch processing on GPU."""
    embeddings = []
    
    dataloader = DataLoader(text_list, batch_size=batch_size, shuffle=False)
    
    for batch_texts in tqdm(dataloader, desc="Processing Batches", unit="batch"):
        inputs = biotokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
        
        with torch.no_grad():  # No gradient tracking for inference
            outputs = biomodel(**inputs)
        
        # Compute mean pooling of token embeddings (Sentence Representation)
        batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        embeddings.append(batch_embeddings)
    
    return np.vstack(embeddings)

In [4]:
# Combine all text attributes into a single column 'Unstructured'
data["Unstructured"] = data[
    ["Brief Summary", "Study Title", "Primary Outcome Measures", "Secondary Outcome Measures"]
].astype(str).agg(" [SEP] ".join, axis=1)

# Drop the original text columns
data = data.drop(columns=["Brief Summary", "Study Title", "Primary Outcome Measures", "Secondary Outcome Measures"])

# Display updated DataFrame
print(data)

        Unnamed: 0   NCT Number   Study Status  \
0                0  NCT00559130      Completed   
1                1  NCT00937664  Not_Completed   
2                2  NCT00441597      Completed   
3                3  NCT03296228      Completed   
4                4  NCT00421603      Completed   
...            ...          ...            ...   
257572      257572  NCT02360800      Completed   
257573      257573  NCT02352506      Completed   
257574      257574  NCT04996381      Completed   
257575      257575  NCT00380640      Completed   
257576      257576  NCT01844336      Completed   

                                               Conditions  \
0       Acute Respiratory Distress Syndrome|Acute Lung...   
1         Cancer|Solid Tumors|Advanced Solid Malignancies   
2       Ischemia Reperfusion Injury|Cardiovascular Dis...   
3                         Adolescent Idiopathic Scoliosis   
4                                      Cocaine Dependence   
...                              

In [17]:
torch.cuda.empty_cache()

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [14]:
data =data.head()

In [5]:
import pandas as pd
import torch
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader

# Load BioBERT model & tokenizer
biotokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
biomodel = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1").to("cuda")  # Move model to GPU

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Function to extract BioBERT embeddings in GPU-friendly batches
def bio_extract_text_embeddings(text_list, batch_size=512):
    """Extracts embeddings for text data using BioBERT with batch processing on GPU."""
    embeddings = []
    
    dataloader = DataLoader(text_list, batch_size=batch_size, shuffle=False)

    for batch_texts in tqdm(dataloader, desc="Processing Batches", unit="batch"):
        try:
            # Tokenization and moving inputs to GPU
            inputs = biotokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=512)
            inputs = {key: val.to(device) for key, val in inputs.items()}  # Ensure all tensors are on GPU
            
            with torch.no_grad():  # No gradient tracking for inference
                outputs = biomodel(**inputs)

            # Compute mean pooling of token embeddings (Sentence Representation)
            batch_embeddings = outputs.last_hidden_state.mean(dim=1)  # Stays on GPU

            embeddings.append(batch_embeddings)

            # Free GPU memory after every batch
            del inputs, outputs
            torch.cuda.empty_cache()

        except torch.cuda.OutOfMemoryError:
            print("\n⚠️ CUDA OOM Error: Reducing batch size and retrying...\n")
            torch.cuda.empty_cache()
            return bio_extract_text_embeddings(text_list, batch_size=max(batch_size // 2, 1))  # Reduce batch size & retry

    return torch.cat(embeddings, dim=0).cpu().numpy()  # Move embeddings to CPU & convert to NumPy

# Split dataset into 100 chunks
num_chunks = 100
chunk_size = len(data) // num_chunks

for i in range(num_chunks):
    print(f"\n🚀 Processing chunk {i+1}/{num_chunks}...")

    # Select subset of data for this chunk
    start_idx = i * chunk_size
    end_idx = (i + 1) * chunk_size if i != num_chunks - 1 else len(data)  # Last chunk gets remaining data
    chunk = data.iloc[start_idx:end_idx].copy()  # Use .copy() to avoid warnings

    # Convert text column to list
    bio_text_data = chunk["Unstructured"].astype(str).tolist()

    # Extract embeddings using batch processing on GPU
    embeddings_cpu = bio_extract_text_embeddings(bio_text_data, batch_size=512)  # Returns NumPy array

    # Create DataFrame with embedding columns
    bioembedding_df = pd.DataFrame(embeddings_cpu, index=chunk.index)
    bioembedding_df.columns = [f"Unstructured_embed_{j}" for j in range(embeddings_cpu.shape[1])]

    # Merge embeddings with original text in the chunk
    chunk = pd.concat([chunk, bioembedding_df], axis=1)

    # Save the chunk with both text and embeddings
    output_filename = f"NEST_chunk_{i+1}.csv"
    chunk.to_csv(output_filename, index=True)
    
    print(f"✅ Saved chunk {i+1} to {output_filename}")

    # Free GPU memory after each chunk
    del embeddings_cpu, bioembedding_df, chunk
    torch.cuda.empty_cache()

print("\n🎉 All chunks successfully saved as separate CSV files!")


Using device: cuda

🚀 Processing chunk 1/100...


Processing Batches: 100%|██████████| 6/6 [02:22<00:00, 23.70s/batch]


✅ Saved chunk 1 to NEST_chunk_1.csv

🚀 Processing chunk 2/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.25s/batch]


✅ Saved chunk 2 to NEST_chunk_2.csv

🚀 Processing chunk 3/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.18s/batch]


✅ Saved chunk 3 to NEST_chunk_3.csv

🚀 Processing chunk 4/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.19s/batch]


✅ Saved chunk 4 to NEST_chunk_4.csv

🚀 Processing chunk 5/100...


Processing Batches: 100%|██████████| 6/6 [02:18<00:00, 23.14s/batch]


✅ Saved chunk 5 to NEST_chunk_5.csv

🚀 Processing chunk 6/100...


Processing Batches: 100%|██████████| 6/6 [02:18<00:00, 23.15s/batch]


✅ Saved chunk 6 to NEST_chunk_6.csv

🚀 Processing chunk 7/100...


Processing Batches: 100%|██████████| 6/6 [02:18<00:00, 23.13s/batch]


✅ Saved chunk 7 to NEST_chunk_7.csv

🚀 Processing chunk 8/100...


Processing Batches: 100%|██████████| 6/6 [02:20<00:00, 23.46s/batch]


✅ Saved chunk 8 to NEST_chunk_8.csv

🚀 Processing chunk 9/100...


Processing Batches: 100%|██████████| 6/6 [02:18<00:00, 23.14s/batch]


✅ Saved chunk 9 to NEST_chunk_9.csv

🚀 Processing chunk 10/100...


Processing Batches: 100%|██████████| 6/6 [02:18<00:00, 23.13s/batch]


✅ Saved chunk 10 to NEST_chunk_10.csv

🚀 Processing chunk 11/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.25s/batch]


✅ Saved chunk 11 to NEST_chunk_11.csv

🚀 Processing chunk 12/100...


Processing Batches: 100%|██████████| 6/6 [02:18<00:00, 23.13s/batch]


✅ Saved chunk 12 to NEST_chunk_12.csv

🚀 Processing chunk 13/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.22s/batch]


✅ Saved chunk 13 to NEST_chunk_13.csv

🚀 Processing chunk 14/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.18s/batch]


✅ Saved chunk 14 to NEST_chunk_14.csv

🚀 Processing chunk 15/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.18s/batch]


✅ Saved chunk 15 to NEST_chunk_15.csv

🚀 Processing chunk 16/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.18s/batch]


✅ Saved chunk 16 to NEST_chunk_16.csv

🚀 Processing chunk 17/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.23s/batch]


✅ Saved chunk 17 to NEST_chunk_17.csv

🚀 Processing chunk 18/100...


Processing Batches: 100%|██████████| 6/6 [02:18<00:00, 23.16s/batch]


✅ Saved chunk 18 to NEST_chunk_18.csv

🚀 Processing chunk 19/100...


Processing Batches: 100%|██████████| 6/6 [02:18<00:00, 23.13s/batch]


✅ Saved chunk 19 to NEST_chunk_19.csv

🚀 Processing chunk 20/100...


Processing Batches: 100%|██████████| 6/6 [16:49<00:00, 168.24s/batch]  


✅ Saved chunk 20 to NEST_chunk_20.csv

🚀 Processing chunk 21/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.20s/batch]


✅ Saved chunk 21 to NEST_chunk_21.csv

🚀 Processing chunk 22/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.24s/batch]


✅ Saved chunk 22 to NEST_chunk_22.csv

🚀 Processing chunk 23/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.24s/batch]


✅ Saved chunk 23 to NEST_chunk_23.csv

🚀 Processing chunk 24/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.18s/batch]


✅ Saved chunk 24 to NEST_chunk_24.csv

🚀 Processing chunk 25/100...


Processing Batches: 100%|██████████| 6/6 [02:20<00:00, 23.35s/batch]


✅ Saved chunk 25 to NEST_chunk_25.csv

🚀 Processing chunk 26/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.19s/batch]


✅ Saved chunk 26 to NEST_chunk_26.csv

🚀 Processing chunk 27/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.18s/batch]


✅ Saved chunk 27 to NEST_chunk_27.csv

🚀 Processing chunk 28/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.20s/batch]


✅ Saved chunk 28 to NEST_chunk_28.csv

🚀 Processing chunk 29/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.21s/batch]


✅ Saved chunk 29 to NEST_chunk_29.csv

🚀 Processing chunk 30/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.24s/batch]


✅ Saved chunk 30 to NEST_chunk_30.csv

🚀 Processing chunk 31/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.23s/batch]


✅ Saved chunk 31 to NEST_chunk_31.csv

🚀 Processing chunk 32/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.18s/batch]


✅ Saved chunk 32 to NEST_chunk_32.csv

🚀 Processing chunk 33/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.22s/batch]


✅ Saved chunk 33 to NEST_chunk_33.csv

🚀 Processing chunk 34/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.17s/batch]


✅ Saved chunk 34 to NEST_chunk_34.csv

🚀 Processing chunk 35/100...


Processing Batches: 100%|██████████| 6/6 [12:26<00:00, 124.45s/batch]


✅ Saved chunk 35 to NEST_chunk_35.csv

🚀 Processing chunk 36/100...


Processing Batches: 100%|██████████| 6/6 [02:22<00:00, 23.78s/batch]


✅ Saved chunk 36 to NEST_chunk_36.csv

🚀 Processing chunk 37/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.20s/batch]


✅ Saved chunk 37 to NEST_chunk_37.csv

🚀 Processing chunk 38/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.21s/batch]


✅ Saved chunk 38 to NEST_chunk_38.csv

🚀 Processing chunk 39/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.21s/batch]


✅ Saved chunk 39 to NEST_chunk_39.csv

🚀 Processing chunk 40/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.21s/batch]


✅ Saved chunk 40 to NEST_chunk_40.csv

🚀 Processing chunk 41/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.19s/batch]


✅ Saved chunk 41 to NEST_chunk_41.csv

🚀 Processing chunk 42/100...


Processing Batches: 100%|██████████| 6/6 [29:44<00:00, 297.44s/batch]


✅ Saved chunk 42 to NEST_chunk_42.csv

🚀 Processing chunk 43/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.18s/batch]


✅ Saved chunk 43 to NEST_chunk_43.csv

🚀 Processing chunk 44/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.23s/batch]


✅ Saved chunk 44 to NEST_chunk_44.csv

🚀 Processing chunk 45/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.30s/batch]


✅ Saved chunk 45 to NEST_chunk_45.csv

🚀 Processing chunk 46/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.22s/batch]


✅ Saved chunk 46 to NEST_chunk_46.csv

🚀 Processing chunk 47/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.21s/batch]


✅ Saved chunk 47 to NEST_chunk_47.csv

🚀 Processing chunk 48/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.22s/batch]


✅ Saved chunk 48 to NEST_chunk_48.csv

🚀 Processing chunk 49/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.21s/batch]


✅ Saved chunk 49 to NEST_chunk_49.csv

🚀 Processing chunk 50/100...


Processing Batches: 100%|██████████| 6/6 [02:21<00:00, 23.56s/batch]


✅ Saved chunk 50 to NEST_chunk_50.csv

🚀 Processing chunk 51/100...


Processing Batches: 100%|██████████| 6/6 [02:18<00:00, 23.16s/batch]


✅ Saved chunk 51 to NEST_chunk_51.csv

🚀 Processing chunk 52/100...


Processing Batches: 100%|██████████| 6/6 [5:36:34<00:00, 3365.80s/batch]    


✅ Saved chunk 52 to NEST_chunk_52.csv

🚀 Processing chunk 53/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.21s/batch]


✅ Saved chunk 53 to NEST_chunk_53.csv

🚀 Processing chunk 54/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.18s/batch]


✅ Saved chunk 54 to NEST_chunk_54.csv

🚀 Processing chunk 55/100...


Processing Batches: 100%|██████████| 6/6 [02:18<00:00, 23.16s/batch]


✅ Saved chunk 55 to NEST_chunk_55.csv

🚀 Processing chunk 56/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.20s/batch]


✅ Saved chunk 56 to NEST_chunk_56.csv

🚀 Processing chunk 57/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.18s/batch]


✅ Saved chunk 57 to NEST_chunk_57.csv

🚀 Processing chunk 58/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.19s/batch]


✅ Saved chunk 58 to NEST_chunk_58.csv

🚀 Processing chunk 59/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.20s/batch]


✅ Saved chunk 59 to NEST_chunk_59.csv

🚀 Processing chunk 60/100...


Processing Batches: 100%|██████████| 6/6 [02:18<00:00, 23.15s/batch]


✅ Saved chunk 60 to NEST_chunk_60.csv

🚀 Processing chunk 61/100...


Processing Batches: 100%|██████████| 6/6 [02:18<00:00, 23.16s/batch]


✅ Saved chunk 61 to NEST_chunk_61.csv

🚀 Processing chunk 62/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.18s/batch]


✅ Saved chunk 62 to NEST_chunk_62.csv

🚀 Processing chunk 63/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.18s/batch]


✅ Saved chunk 63 to NEST_chunk_63.csv

🚀 Processing chunk 64/100...


Processing Batches: 100%|██████████| 6/6 [12:07<00:00, 121.24s/batch]


✅ Saved chunk 64 to NEST_chunk_64.csv

🚀 Processing chunk 65/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.28s/batch]


✅ Saved chunk 65 to NEST_chunk_65.csv

🚀 Processing chunk 66/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.20s/batch]


✅ Saved chunk 66 to NEST_chunk_66.csv

🚀 Processing chunk 67/100...


Processing Batches: 100%|██████████| 6/6 [02:18<00:00, 23.17s/batch]


✅ Saved chunk 67 to NEST_chunk_67.csv

🚀 Processing chunk 68/100...


Processing Batches: 100%|██████████| 6/6 [02:18<00:00, 23.17s/batch]


✅ Saved chunk 68 to NEST_chunk_68.csv

🚀 Processing chunk 69/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.19s/batch]


✅ Saved chunk 69 to NEST_chunk_69.csv

🚀 Processing chunk 70/100...


Processing Batches: 100%|██████████| 6/6 [02:18<00:00, 23.13s/batch]


✅ Saved chunk 70 to NEST_chunk_70.csv

🚀 Processing chunk 71/100...


Processing Batches: 100%|██████████| 6/6 [11:22<00:00, 113.76s/batch]


✅ Saved chunk 71 to NEST_chunk_71.csv

🚀 Processing chunk 72/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.21s/batch]


✅ Saved chunk 72 to NEST_chunk_72.csv

🚀 Processing chunk 73/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.18s/batch]


✅ Saved chunk 73 to NEST_chunk_73.csv

🚀 Processing chunk 74/100...


Processing Batches: 100%|██████████| 6/6 [02:20<00:00, 23.35s/batch]


✅ Saved chunk 74 to NEST_chunk_74.csv

🚀 Processing chunk 75/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.19s/batch]


✅ Saved chunk 75 to NEST_chunk_75.csv

🚀 Processing chunk 76/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.19s/batch]


✅ Saved chunk 76 to NEST_chunk_76.csv

🚀 Processing chunk 77/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.18s/batch]


✅ Saved chunk 77 to NEST_chunk_77.csv

🚀 Processing chunk 78/100...


Processing Batches: 100%|██████████| 6/6 [20:35<00:00, 205.94s/batch]


✅ Saved chunk 78 to NEST_chunk_78.csv

🚀 Processing chunk 79/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.22s/batch]


✅ Saved chunk 79 to NEST_chunk_79.csv

🚀 Processing chunk 80/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.24s/batch]


✅ Saved chunk 80 to NEST_chunk_80.csv

🚀 Processing chunk 81/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.27s/batch]


✅ Saved chunk 81 to NEST_chunk_81.csv

🚀 Processing chunk 82/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.20s/batch]


✅ Saved chunk 82 to NEST_chunk_82.csv

🚀 Processing chunk 83/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.18s/batch]


✅ Saved chunk 83 to NEST_chunk_83.csv

🚀 Processing chunk 84/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.22s/batch]


✅ Saved chunk 84 to NEST_chunk_84.csv

🚀 Processing chunk 85/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.17s/batch]


✅ Saved chunk 85 to NEST_chunk_85.csv

🚀 Processing chunk 86/100...


Processing Batches: 100%|██████████| 6/6 [02:24<00:00, 24.07s/batch]


✅ Saved chunk 86 to NEST_chunk_86.csv

🚀 Processing chunk 87/100...


Processing Batches: 100%|██████████| 6/6 [02:16<00:00, 22.77s/batch]


✅ Saved chunk 87 to NEST_chunk_87.csv

🚀 Processing chunk 88/100...


Processing Batches: 100%|██████████| 6/6 [02:16<00:00, 22.72s/batch]


✅ Saved chunk 88 to NEST_chunk_88.csv

🚀 Processing chunk 89/100...


Processing Batches: 100%|██████████| 6/6 [02:16<00:00, 22.74s/batch]


✅ Saved chunk 89 to NEST_chunk_89.csv

🚀 Processing chunk 90/100...


Processing Batches: 100%|██████████| 6/6 [02:16<00:00, 22.71s/batch]


✅ Saved chunk 90 to NEST_chunk_90.csv

🚀 Processing chunk 91/100...


Processing Batches: 100%|██████████| 6/6 [02:16<00:00, 22.74s/batch]


✅ Saved chunk 91 to NEST_chunk_91.csv

🚀 Processing chunk 92/100...


Processing Batches: 100%|██████████| 6/6 [02:16<00:00, 22.74s/batch]


✅ Saved chunk 92 to NEST_chunk_92.csv

🚀 Processing chunk 93/100...


Processing Batches: 100%|██████████| 6/6 [02:18<00:00, 23.08s/batch]


✅ Saved chunk 93 to NEST_chunk_93.csv

🚀 Processing chunk 94/100...


Processing Batches: 100%|██████████| 6/6 [02:22<00:00, 23.73s/batch]


✅ Saved chunk 94 to NEST_chunk_94.csv

🚀 Processing chunk 95/100...


Processing Batches: 100%|██████████| 6/6 [02:21<00:00, 23.60s/batch]


✅ Saved chunk 95 to NEST_chunk_95.csv

🚀 Processing chunk 96/100...


Processing Batches: 100%|██████████| 6/6 [02:18<00:00, 23.05s/batch]


✅ Saved chunk 96 to NEST_chunk_96.csv

🚀 Processing chunk 97/100...


Processing Batches: 100%|██████████| 6/6 [02:19<00:00, 23.30s/batch]


✅ Saved chunk 97 to NEST_chunk_97.csv

🚀 Processing chunk 98/100...


Processing Batches: 100%|██████████| 6/6 [02:16<00:00, 22.82s/batch]


✅ Saved chunk 98 to NEST_chunk_98.csv

🚀 Processing chunk 99/100...


Processing Batches: 100%|██████████| 6/6 [02:16<00:00, 22.76s/batch]


✅ Saved chunk 99 to NEST_chunk_99.csv

🚀 Processing chunk 100/100...


Processing Batches: 100%|██████████| 6/6 [02:17<00:00, 22.84s/batch]


✅ Saved chunk 100 to NEST_chunk_100.csv

🎉 All chunks successfully saved as separate CSV files!


In [8]:
bioembedding_df_1 = pd.read_csv('NEST_embed_chunk_1.csv',)
bioembedding_df_1

Unnamed: 0.1,Unnamed: 0,Unstructured_embed_0,Unstructured_embed_1,Unstructured_embed_2,Unstructured_embed_3,Unstructured_embed_4,Unstructured_embed_5,Unstructured_embed_6,Unstructured_embed_7,Unstructured_embed_8,...,Unstructured_embed_758,Unstructured_embed_759,Unstructured_embed_760,Unstructured_embed_761,Unstructured_embed_762,Unstructured_embed_763,Unstructured_embed_764,Unstructured_embed_765,Unstructured_embed_766,Unstructured_embed_767
0,0,-0.173068,-0.077015,-0.525726,-0.037300,-0.203847,0.030538,-0.040831,0.19147,0.245301,...,-0.253266,0.127309,-0.11036,0.348753,-0.166995,0.212118,0.242659,-0.229306,-0.052761,-0.388725
1,1,-0.199231,-0.065837,-0.115439,-0.000676,-0.122404,0.003565,-0.092257,-0.12714,0.089963,...,-0.047347,0.167718,0.06174,0.211448,0.024834,0.416763,0.272169,-0.318060,0.294743,-0.060007
2,2,-0.173068,-0.077015,-0.525726,-0.037300,-0.203847,0.030538,-0.040831,0.19147,0.245301,...,-0.253266,0.127309,-0.11036,0.348753,-0.166995,0.212118,0.242659,-0.229306,-0.052761,-0.388725
3,3,-0.199231,-0.065837,-0.115439,-0.000676,-0.122404,0.003565,-0.092257,-0.12714,0.089963,...,-0.047347,0.167718,0.06174,0.211448,0.024834,0.416763,0.272169,-0.318060,0.294743,-0.060007
4,4,-0.173068,-0.077015,-0.525726,-0.037300,-0.203847,0.030538,-0.040831,0.19147,0.245301,...,-0.253266,0.127309,-0.11036,0.348753,-0.166995,0.212118,0.242659,-0.229306,-0.052761,-0.388725
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1360,1360,-0.173068,-0.077015,-0.525726,-0.037300,-0.203847,0.030538,-0.040831,0.19147,0.245301,...,-0.253266,0.127309,-0.11036,0.348753,-0.166995,0.212118,0.242659,-0.229306,-0.052761,-0.388725
1361,1361,-0.199231,-0.065837,-0.115439,-0.000676,-0.122404,0.003565,-0.092257,-0.12714,0.089963,...,-0.047347,0.167718,0.06174,0.211448,0.024834,0.416763,0.272169,-0.318060,0.294743,-0.060007
1362,1362,-0.173068,-0.077015,-0.525726,-0.037300,-0.203847,0.030538,-0.040831,0.19147,0.245301,...,-0.253266,0.127309,-0.11036,0.348753,-0.166995,0.212118,0.242659,-0.229306,-0.052761,-0.388725
1363,1363,-0.199231,-0.065837,-0.115439,-0.000676,-0.122404,0.003565,-0.092257,-0.12714,0.089963,...,-0.047347,0.167718,0.06174,0.211448,0.024834,0.416763,0.272169,-0.318060,0.294743,-0.060007


In [19]:
import pandas as pd
import torch
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader

# Load BioBERT model & tokenizer
biotokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
biomodel = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1").to("cuda")  # Move model to GPU

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Function to extract BioBERT embeddings in GPU-friendly batches with memory control
def bio_extract_text_embeddings(text_list, batch_size=256):
    """Extracts embeddings for text data using BioBERT with batch processing on GPU while handling OOM errors."""
    embeddings = []
    
    dataloader = DataLoader(text_list, batch_size=batch_size, shuffle=False)

    for batch_texts in tqdm(dataloader, desc="Processing Batches", unit="batch"):
        try:
            # Tokenization and moving inputs to GPU
            inputs = biotokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=512)
            inputs = {key: val.to(device) for key, val in inputs.items()}  # Move all tensors to GPU

            with torch.no_grad():  # No gradient tracking for inference
                outputs = biomodel(**inputs)

            # Compute mean pooling of token embeddings (Sentence Representation)
            batch_embeddings = outputs.last_hidden_state.mean(dim=1)  # Keep on GPU
            
            embeddings.append(batch_embeddings)

            # Free GPU memory after each batch
            del inputs, outputs
            torch.cuda.empty_cache()

        except torch.cuda.OutOfMemoryError:
            print("\n⚠️ CUDA OOM Error: Reducing batch size to avoid crash.\n")
            torch.cuda.empty_cache()
            return bio_extract_text_embeddings(text_list, batch_size=max(batch_size // 2, 1))  # Reduce batch size & retry

    return torch.cat(embeddings, dim=0).cpu().numpy()  # Ensure output is on CPU

# Split dataset into 15 chunks
num_chunks = 15
chunk_size = len(data) // num_chunks

for i in range(num_chunks):
    print(f"\n🚀 Processing chunk {i+1}/{num_chunks}...")

    # Select subset of data for this chunk
    start_idx = i * chunk_size
    end_idx = (i + 1) * chunk_size if i != num_chunks - 1 else len(data)  # Last chunk gets remaining data
    chunk = data.iloc[start_idx:end_idx]

    # Convert text column to list
    bio_text_data = chunk["Unstructured"].astype(str).tolist()

    # Extract embeddings using batch processing on GPU
    embeddings_cpu = bio_extract_text_embeddings(bio_text_data, batch_size=256)  # Dynamically adjusts batch size

    # Create DataFrame with embedding columns
    bioembedding_df = pd.DataFrame(embeddings_cpu, index=chunk.index)
    bioembedding_df.columns = [f"Unstructured_embed_{i}" for i in range(embeddings_cpu.shape[1])]

    # Save each chunk separately to avoid memory overload
    output_filename = f"NEST_embed_chunk_{i+1}.csv"
    bioembedding_df.to_csv(output_filename, index=True)
    
    print(f"✅ Saved chunk {i+1} to {output_filename}")

    # Free GPU memory after each chunk
    del embeddings_cpu, bioembedding_df
    torch.cuda.empty_cache()

print("\n🎉 All embeddings successfully saved in separate CSV files!")

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [15]:
import pandas as pd
import torch
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader

# Load BioBERT model & tokenizer
biotokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
biomodel = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1").to("cuda")  # Move model to GPU

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Function to extract BioBERT embeddings in GPU-friendly batches
def bio_extract_text_embeddings(text_list, batch_size=512):
    """Extracts embeddings for text data using BioBERT with batch processing on GPU."""
    embeddings = []
    
    dataloader = DataLoader(text_list, batch_size=batch_size, shuffle=False)

    for batch_texts in tqdm(dataloader, desc="Processing Batches", unit="batch"):
        # Tokenization and moving inputs to GPU
        inputs = biotokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}  # Ensure all tensors are on GPU
        
        with torch.no_grad():  # No gradient tracking for inference
            outputs = biomodel(**inputs)
        
        # Compute mean pooling of token embeddings (Sentence Representation)
        batch_embeddings = outputs.last_hidden_state.mean(dim=1)  # Stays on GPU
        
        embeddings.append(batch_embeddings)

    return torch.cat(embeddings, dim=0)  # Returns a single tensor on GPU

# Process 'Unstructured' text attribute in 512-size batches
attribute = "Unstructured"
print(f"Processing attribute: {attribute}")

bio_text_data = data[attribute].astype(str).tolist()

# Extract embeddings using batch processing on GPU
embeddings_tensor = bio_extract_text_embeddings(bio_text_data, batch_size=512)  # Returns tensor on GPU

# Move embeddings back to CPU for DataFrame creation
embeddings_cpu = embeddings_tensor.cpu().numpy()

# Create DataFrame with embedding columns
bioembedding_df = pd.DataFrame(embeddings_cpu, index=data.index)
bioembedding_df.columns = [f"{attribute}_embed_{i}" for i in range(embeddings_cpu.shape[1])]

# Drop the original attribute column and merge embeddings
data = data.drop(columns=[attribute])
data = pd.concat([data, bioembedding_df], axis=1)

print("Embeddings successfully replaced the original attributes.")
data.to_csv('NEST_embed.csv')

Using device: cuda
Processing attribute: Unstructured


Processing Batches:   0%|          | 0/49 [00:01<?, ?batch/s]


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [75]:
data.to_csv('NEST_embed.csv')

Unnamed: 0.1,Unnamed: 0,NCT Number,Study Status,Conditions,Other Outcome Measures,Sponsor,Collaborators,Sex,Age,Phases,...,Unstructured_embed_758,Unstructured_embed_759,Unstructured_embed_760,Unstructured_embed_761,Unstructured_embed_762,Unstructured_embed_763,Unstructured_embed_764,Unstructured_embed_765,Unstructured_embed_766,Unstructured_embed_767
0,0,NCT00559130,Completed,Acute Respiratory Distress Syndrome|Acute Lung...,,"MedaSorb Technologies, Inc",,ALL,"ADULT, OLDER_ADULT",,...,-0.098257,0.107153,-0.193596,0.135309,0.092266,-0.020007,-0.063864,0.123716,-0.106573,0.03129
1,1,NCT00937664,Not_Completed,Cancer|Solid Tumors|Advanced Solid Malignancies,,AstraZeneca,,ALL,"ADULT, OLDER_ADULT",PHASE1,...,0.018887,0.115186,-0.080183,0.21976,0.035297,0.03431,0.078921,-0.03946,-0.072698,-0.054465
2,2,NCT00441597,Completed,Ischemia Reperfusion Injury|Cardiovascular Dis...,,Radboud University Medical Center,Pfizer,MALE,ADULT,PHASE4,...,-0.112456,0.116638,-0.116839,0.092405,-0.204348,0.03137,-0.016791,0.158138,-0.017895,0.021559
3,3,NCT03296228,Completed,Adolescent Idiopathic Scoliosis,,The University of Hong Kong,"AO Foundation, AO Spine",ALL,"CHILD, ADULT",,...,-0.013137,-0.063034,0.019267,-0.061944,-0.091516,-0.104973,0.211876,0.175831,-0.115984,-0.035802
4,4,NCT00421603,Completed,Cocaine Dependence,,New York State Psychiatric Institute,National Institute on Drug Abuse (NIDA),ALL,ADULT,PHASE2,...,-0.024513,0.121777,0.032171,0.037329,-0.005663,-0.278086,0.064402,0.270871,-0.001445,0.048874


In [26]:
clinicaltokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
clinicalmodel = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [27]:
def clinical_extract_text_embeddings(text_data):

    embeddings = []
    for text in text_data:
        inputs = clinicaltokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        outputs = clinicalmodel(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).detach().numpy())

    return np.vstack(embeddings)


In [32]:
data = data.head(2)
data

Unnamed: 0.1,Unnamed: 0,NCT Number,Study Title,Study Status,Brief Summary,Conditions,Primary Outcome Measures,Secondary Outcome Measures,Other Outcome Measures,Sponsor,...,Age,Phases,Enrollment,Funder Type,Study Type,Study Design,Start Month,Start Quarter,Condition Category,Conditions_Category
0,0,NCT00559130,Efficacy Study of CytoSorb Hemoperfusion Devic...,Completed,The hypothesis of this study is use of CytoSor...,Acute Respiratory Distress Syndrome|Acute Lung...,Relative IL-6 levels as a percent (%) of basel...,"Ventilator Free Days, Reduction cytokines TNF-...",,"MedaSorb Technologies, Inc",...,"ADULT, OLDER_ADULT",,100.0,INDUSTRY,INTERVENTIONAL,Allocation: RANDOMIZED|Intervention Model: PAR...,11,4,Other Rare or Unclassified,Other Rare or Unclassified
1,1,NCT00937664,Safety and Tolerability Study of AZD7762 in Co...,Not_Completed,The primary purpose of this study is to find o...,Cancer|Solid Tumors|Advanced Solid Malignancies,Assessment of adverse events (based on CTCAE v...,Pharmacokinetic effect of AZD7762 when adminis...,,AstraZeneca,...,"ADULT, OLDER_ADULT",PHASE1,24.0,INDUSTRY,INTERVENTIONAL,Allocation: NON_RANDOMIZED|Intervention Model:...,7,3,Oncology,Oncology


In [33]:
for attribute in tqdm(['Primary Outcome Measures', 'Secondary Outcome Measures'], desc="Processing Attributes"):
    print(f"Processing attribute: {attribute}")
    
    # Convert text column to string
    clinical_text_data = data[attribute].astype(str).tolist()

    # Extract embeddings and ensure proper NumPy array structure
    embeddings = clinical_extract_text_embeddings(bio_text_data)
    
    # Convert to torch tensor and move to device (ensure dtype compatibility)
    clinical_text_data_tensor = torch.tensor(embeddings, dtype=torch.float32).to(device)

    # Move embeddings back to CPU for DataFrame creation
    clinicalembeddings_cpu = clinical_text_data_tensor.cpu().numpy()

    # Create DataFrame with embedding columns
    clinicalembedding_df = pd.DataFrame(clinicalembeddings_cpu, index=data.index)
    clinicalembedding_df.columns = [f"{attribute}_embed_{i}" for i in range(embeddings_cpu.shape[1])]

    # Drop the original attribute column and merge embeddings
    data = data.drop(columns=[attribute])
    data = pd.concat([data, clinicalembedding_df], axis=1)

print("Embeddings successfully replaced the original attributes.")

Processing Attributes:   0%|          | 0/2 [00:00<?, ?it/s]

Processing attribute: Primary Outcome Measures


Processing Attributes:   0%|          | 0/2 [05:39<?, ?it/s]


KeyboardInterrupt: 

In [51]:
# Step 4: Combine Features and Embeddings
# Combine structured features with embeddings
X_combined = np.hstack((numerical_scaled, categorical_encoded, bio_embeddings, clinical_embeddings))
y = data['Study Status']  # Target column

# Train-Test Split for Combined Data
X_train_combined, X_test_combined, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)



NameError: name 'clinical_embeddings' is not defined

In [None]:
# Step 5: TabTransformer for Combined Data
def train_tab_transformer(X_train, y_train):
    """Trains a TabTransformer model on the combined structured and text data."""
    model = TabTransformerModel(
        task="classification",
        input_dim=X_train.shape[1],
        n_classes=len(np.unique(y_train)),
        cat_idxs=[],
        cat_dims=[],
        n_dnn_layers=2,
        n_dnn_units=64,
        learning_rate=1e-3,
        batch_size=32,
        epochs=10
    )
    model.fit(X_train, y_train)
    return model

# Train TabTransformer
tab_transformer_model = train_tab_transformer(X_train_combined, y_train)

In [None]:
# Step 6: Evaluate TabTransformer
predictions = tab_transformer_model.predict(X_test_combined)
print("TabTransformer Classification Report:\n", classification_report(y_test, predictions))