In [1]:
from lib.MINDS import MINDS
import pandas as pd
import json
import random
from tqdm import tqdm
from pathlib import Path
import numpy as np
from transformers import AutoModel, AutoTokenizer, AutoConfig
import torch
import redis
from redis.commands.search.field import TagField, VectorField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import Query

# Check if a GPU is available and if not, use a CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


def generate_summary(patient_data):
    # Extracting basic patient information
    patient_id = patient_data.get("case_submitter_id", "N/A")
    age = patient_data.get("age_at_index", "N/A")
    gender = patient_data.get("gender", "N/A")
    ethnicity = patient_data.get("ethnicity", "N/A")
    diagnosis = patient_data.get("primary_diagnosis", "N/A")
    tumor_stage = patient_data.get("ajcc_pathologic_stage", "N/A")

    # Extracting treatment information
    treatments = patient_data.get("clinical", [])
    treatment_types = ", ".join(t.get("treatment_type", "N/A") for t in treatments)

    # Sample and slide information
    sample_details = patient_data.get("sample", [])
    sample_info = "Sample details: " + "; ".join(
        f"ID: {s.get('sample_id', 'N/A')}, Type: {s.get('sample_type', 'N/A')}, FFPE: {s.get('is_ffpe', 'N/A')}"
        for s in sample_details
    )

    # Smoking and alcohol history
    cigarettes_per_day = patient_data.get("cigarettes_per_day", "N/A")
    pack_years = patient_data.get("pack_years_smoked", "N/A")
    alcohol_history = patient_data.get("alcohol_history", "N/A")

    # Creating the summary
    summary = (
        f"Patient ID: {patient_id}, a {age}-year-old {gender} of {ethnicity} ethnicity, "
        f"was diagnosed with {diagnosis}, staged as {tumor_stage}. "
        f"Treatment administered includes: {treatment_types}. "
        f"{sample_info} "
        f"The patient has a smoking history of {cigarettes_per_day} cigarettes per day over {pack_years} pack-years. "
        f"Alcohol history: {alcohol_history}."
    )
    return summary


def generate_summary_from_json(patient_data):
    # Initialize an empty list to store sentences
    summary_sentences = []

    # Iterate through each key-value pair in the JSON object
    for key, value in patient_data.items():
        # if the key is "case_id" then skip it
        if key == "case_id" or key == "pathology_report_uuid":
            continue

        # remove all _ from the key
        key = key.replace("_", " ")
        sentence = f"{key}: {value};"

        # if the value is a list, then skip it
        if isinstance(value, list):
            continue

        summary_sentences.append(sentence)

    # Compile all sentences into a single summary string
    summary = " ".join(summary_sentences)

    return summary


def process_group(group):
    common_fields = {}
    nested_objects = []
    for col in group.columns:
        unique_values = group[col].dropna().unique()
        if len(unique_values) == 1:
            # If only one unique value exists, it's a common field
            common_fields[col] = unique_values[0]

    # Create nested objects for fields that are not common
    for idx, row in group.iterrows():
        nested_object = {
            col: row[col]
            for col in group.columns
            if col not in common_fields and pd.notna(row[col])
        }
        if nested_object:  # Only add if the nested object is not empty
            nested_objects.append(nested_object)

    return common_fields, nested_objects


def convert_numpy(obj):
    if isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    else:
        return obj


class ClinincalEmbeddings:
    def __init__(self, model):
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.config = AutoConfig.from_pretrained(model)
        self.model = AutoModel.from_pretrained(model)
        self.model = self.model.to(device)

    def generate_embeddings(self, sentences):
        inputs = self.tokenizer(
            str(sentences),
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        inputs = {key: val.to(device) for key, val in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)

        embedding = outputs["last_hidden_state"].cpu().numpy()
        return embedding


class RedisClient:
    def __init__(self):
        self.client = redis.Redis(host="10.0.1.16", port=6379, db=0)
        self.client.ping()
        self.INDEX_NAME = "clinical"
        self.DOC_PREFIX = "patient:"
        # drop the index if it already exists
        try:
            self.client.execute_command("FT.DROPINDEX", self.INDEX_NAME)
            print("Dropped Index")
        except:
            pass
        self.create_index()

    def create_index(self):
        schema = (
            TagField("case_submitter_id"),
            VectorField(
                "embedding",
                "FLAT",
                {
                    "TYPE": "FLOAT64",
                    "DIM": 512 * 1024,
                    "DISTANCE_METRIC": "COSINE",
                },
            ),
        )
        definition = IndexDefinition(
            prefix=[self.DOC_PREFIX], index_type=IndexType.HASH
        )
        self.client.ft(self.INDEX_NAME).create_index(
            fields=schema, definition=definition
        )

    def add_document(self, case_id, embedding):
        embedding = embedding.astype(np.float64).tobytes()
        pipe = self.client.pipeline()
        pipe.hset(
            self.DOC_PREFIX + case_id,
            mapping={"case_submitter_id": case_id},
        )
        pipe.hset(
            self.DOC_PREFIX + case_id,
            mapping={"embedding": embedding},
        )
        pipe.execute()

    def query(self, query):
        result = np.frombuffer(self.client.hget(query, "embedding"), dtype=np.float64)
        return result


def main():
    minds = MINDS()
    clinical_model = ClinincalEmbeddings(model="UFNLP/gatortron-base")
    redis_client = RedisClient()

    project_id = "TCGA-LUAD"
    filename = project_id
    tables = minds.get_tables()

    if Path(f"{filename}.json").exists():
        json_objects = json.load(open(f"{filename}.json"))
    else:
        json_objects = {}
        for table in tqdm(tables["Tables_in_nihnci"]):
            query = f"SELECT * FROM nihnci.{table} WHERE project_id='{project_id}'"
            df = minds.query(query)
            for case_id, group in tqdm(df.groupby("case_submitter_id"), leave=False):
                if case_id not in json_objects:
                    json_objects[case_id] = {}
                common_fields, nested_objects = process_group(group)
                json_objects[case_id].update(common_fields)
                json_objects[case_id][table] = nested_objects
        with open(f"{filename}.json", "w") as fp:
            json.dump(json_objects, fp, indent=4, default=convert_numpy)

    # if the redis already has the embeddings, for a given patient, then skip
    for case_id, patient_data in tqdm(json_objects.items()):
        if redis_client.client.exists(f"patient:{case_id}"):
            continue
        summary = generate_summary_from_json(patient_data)
        embedding = clinical_model.generate_embeddings(summary)
        redis_client.add_document(case_id, embedding)

    random_case_id = random.choice(list(json_objects.keys()))
    patient_data = json_objects[random_case_id]
    summary = generate_summary_from_json(patient_data)
    print("\n", summary, "\n")

    result = redis_client.query(f"patient:{random_case_id}")
    print("\n", result.shape, "->", f"({result.shape[0] // 1024}, 1024)")


if __name__ == "__main__":
    main()

Using device: cuda


100%|██████████| 585/585 [00:44<00:00, 13.04it/s]



 project id: TCGA-LUAD; case submitter id: TCGA-55-6971; source center: 22; state: released; normal tumor genotype snp match: Yes; age at index: 59.0; days to birth: -21734.0; ethnicity: not reported; gender: female; race: white; vital status: Alive; year of birth: 1951.0; age at diagnosis: 21734.0; ajcc pathologic m: MX; ajcc pathologic n: N0; ajcc pathologic stage: Stage IB; ajcc pathologic t: T2; ajcc staging system edition: 7th; classification of tumor: not reported; days to diagnosis: 0.0; days to last follow up: 1400.0; icd 10 code: C34.3; last known disease status: not reported; morphology: 8140/3; primary diagnosis: Adenocarcinoma, NOS; prior malignancy: no; prior treatment: No; progression or recurrence: not reported; site of resection or biopsy: Lower lobe, lung; synchronous malignancy: No; tissue or organ of origin: Lower lobe, lung; tumor grade: not reported; year of diagnosis: 2010.0; treatment or therapy: no; alcohol history: Not Reported; cigarettes per day: 3.287671232

In [6]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


class ClinicalEmbeddings:
    def __init__(self, model_name):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.to(device)

    def generate_embeddings(self, text):
        inputs = self.tokenizer(
            text, return_tensors="pt", padding=True, truncation=True, max_length=512
        ).to(device)
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state.mean(
            dim=1
        )  # Return mean pooling of the token embeddings


class MultimodalDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.clinical_embeddings = ClinicalEmbeddings("UFNLP/gatortron-base")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        clinical_text = self.data[idx]
        embeddings = self.clinical_embeddings.generate_embeddings(clinical_text)
        return embeddings


# Step 1: Define the Transformer Encoder Model
class TransformerEncoder(nn.Module):
    def __init__(self, input_dim, num_heads, hidden_dim, num_layers):
        super(TransformerEncoder, self).__init__()
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=input_dim, nhead=num_heads, dim_feedforward=hidden_dim
        )
        self.transformer_encoder = nn.TransformerEncoder(
            self.encoder_layer, num_layers=num_layers
        )

    def forward(self, x):
        return self.transformer_encoder(x)


# Example parameters for the transformer
input_dim = 1024  # Size of each input embedding
num_heads = 8
hidden_dim = 2048
num_layers = 6

model = TransformerEncoder(input_dim, num_heads, hidden_dim, num_layers)
model.to(device)

# Example data (replace with actual clinical summaries)

for i in range(10):
    dummy_data = ["This is a dummy clinical summary"] * 32

dataset = MultimodalDataset(dummy_data)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
target = torch.rand(32, 1024)  # Replace with actual target variable
target = target.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.MSELoss()  # Adjust the criterion based on your specific task

num_epochs = 10
for epoch in range(num_epochs):
    for embeddings in dataloader:
        optimizer.zero_grad()
        output = model(embeddings)
        # Assuming you have a target variable corresponding to each embedding
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} Loss: {loss.item()}")

# Save or use your model as needed

cuda


  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1 Loss: 1.3388501405715942
Epoch 2 Loss: 1.2981369495391846
Epoch 3 Loss: 1.2753379344940186
Epoch 4 Loss: 1.2600343227386475
Epoch 5 Loss: 1.2499806880950928
Epoch 6 Loss: 1.2428406476974487
Epoch 7 Loss: 1.2376083135604858
Epoch 8 Loss: 1.2342753410339355
Epoch 9 Loss: 1.2320477962493896
Epoch 10 Loss: 1.2306965589523315


---

In [8]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW

model_name = "bert-base-uncased"  # Example model, replace with your choice
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.to(device)


class EmbeddingAdjuster(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(EmbeddingAdjuster, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.linear(x)


# Example: Adjust from 1024 to 768 (BERT's hidden size for the base model)
adjuster = EmbeddingAdjuster(1024, 768)
adjuster.to(device)


class EmbeddingDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings  # embeddings should be a list of tensors
        self.labels = labels

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        embedding = self.embeddings[idx]
        # Add sequence length dimension (assuming embedding is a single token)
        embedding = embedding.unsqueeze(0)
        return embedding, self.labels[idx]


# Example usage
# embeddings = [mapped_embedding1, mapped_embedding2, ...]
# labels = [label1, label2, ...]
# Assuming embeddings is a list of 1D tensors each of shape [1024]
embeddings = [torch.rand(1024) for _ in range(8)]  # Example embeddings
labels = torch.tensor([0, 1, 0, 1, 0, 1, 0, 1])  # Corresponding labels
dataset = EmbeddingDataset(embeddings, labels)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

optimizer = AdamW(list(model.parameters()) + list(adjuster.parameters()), lr=5e-5)
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for batch in dataloader:
        embeddings, labels = batch
        embeddings, labels = embeddings.to(device), labels.to(device)
        # Adjust embeddings to match BERT's input size
        adjusted_embeddings = adjuster(embeddings)
        optimizer.zero_grad()
        outputs = model(inputs_embeds=adjusted_embeddings, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} Loss: {loss.item()}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 Loss: 0.860691487789154
Epoch 2 Loss: 0.65171879529953
Epoch 3 Loss: 0.7139180898666382
Epoch 4 Loss: 0.6716339588165283
Epoch 5 Loss: 0.6878353953361511
Epoch 6 Loss: 0.7487881183624268
Epoch 7 Loss: 0.7014307975769043
Epoch 8 Loss: 0.7196025848388672
Epoch 9 Loss: 1.1362229585647583
Epoch 10 Loss: 0.7736325263977051


---

In [1]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
from lib.MINDS import MINDS
import pandas as pd
import json
from tqdm import tqdm
from pathlib import Path
from transformers import AutoModel, AutoTokenizer, AutoConfig
import redis
from redis.commands.search.field import TagField, VectorField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import Query

#  Setting up the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

def generate_summary_from_json(patient_data):
    # Initialize an empty list to store sentences
    summary_sentences = []

    # Iterate through each key-value pair in the JSON object
    for key, value in patient_data.items():
        # if the key is "case_id" then skip it
        if key == "case_id" or key == "pathology_report_uuid":
            continue

        # remove all _ from the key
        key = key.replace("_", " ")
        sentence = f"{key}: {value};"

        # if the value is a list, then skip it
        if isinstance(value, list):
            continue

        summary_sentences.append(sentence)

    # Compile all sentences into a single summary string
    summary = " ".join(summary_sentences)

    return summary

def process_group(group):
    common_fields = {}
    nested_objects = []
    for col in group.columns:
        unique_values = group[col].dropna().unique()
        if len(unique_values) == 1:
            # If only one unique value exists, it's a common field
            common_fields[col] = unique_values[0]

    # Create nested objects for fields that are not common
    for idx, row in group.iterrows():
        nested_object = {
            col: row[col]
            for col in group.columns
            if col not in common_fields and pd.notna(row[col])
        }
        if nested_object:  # Only add if the nested object is not empty
            nested_objects.append(nested_object)

    return common_fields, nested_objects

def convert_numpy(obj):
    if isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    else:
        return obj

class ClinincalEmbeddings:
    def __init__(self, model):
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.config = AutoConfig.from_pretrained(model)
        self.model = AutoModel.from_pretrained(model)
        self.model = self.model.to(device)

    def generate_embeddings(self, sentences):
        inputs = self.tokenizer(
            str(sentences),
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        inputs = {key: val.to(device) for key, val in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)

        embedding = outputs["last_hidden_state"].cpu().numpy()
        return embedding

class RedisClient:
    def __init__(self):
        self.client = redis.Redis(host="10.0.1.15", port=6379, db=0)
        self.client.ping()
        self.INDEX_NAME = "clinical"
        self.DOC_PREFIX = "patient:"
        # drop the index if it already exists
        try:
            self.client.execute_command("FT.DROPINDEX", self.INDEX_NAME)
            print("Dropped Index")
        except:
            pass
        self.create_index()

    def create_index(self):
        schema = (
            TagField("case_submitter_id"),
            VectorField(
                "embedding",
                "FLAT",
                {
                    "TYPE": "FLOAT64",
                    "DIM": 512 * 1024,
                    "DISTANCE_METRIC": "COSINE",
                },
            ),
        )
        definition = IndexDefinition(
            prefix=[self.DOC_PREFIX], index_type=IndexType.HASH
        )
        self.client.ft(self.INDEX_NAME).create_index(
            fields=schema, definition=definition
        )

    def add_document(self, case_id, embedding):
        embedding = embedding.astype(np.float64).tobytes()
        pipe = self.client.pipeline()
        pipe.hset(
            self.DOC_PREFIX + case_id,
            mapping={"case_submitter_id": case_id},
        )
        pipe.hset(
            self.DOC_PREFIX + case_id,
            mapping={"embedding": embedding},
        )
        pipe.execute()

    def query(self, query):
        result = np.frombuffer(self.client.hget(query, "embedding"), dtype=np.float64)
        return result

class EmbeddingsDataset(Dataset):
    def __init__(self, embeddings):
        """
        Constructor for the Dataset.
        Args:
        embeddings (list of numpy arrays): The embeddings generated from clinical reports.
        """
        self.embeddings = [torch.tensor(e, dtype=torch.float32) for e in embeddings]

    def __len__(self):
        """
        Returns the total number of samples in the dataset.
        """
        return len(self.embeddings)

    def __getitem__(self, idx):
        """
        Retrieves the item (embedding) at the specified index.
        Args:
        idx (int): The index of the item to retrieve.
        Returns:
        Tensor: The embedding at the specified index.
        """
        return self.embeddings[idx].to(device)
    
class TransformerModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, n_layers, n_heads, dropout_rate):
        """
        Constructor for the TransformerModel.
        Args:
        input_dim (int): Dimension of the input embeddings.
        hidden_dim (int): The dimension of the hidden layer.
        n_layers (int): Number of transformer layers.
        n_heads (int): Number of heads in the MultiHeadAttention layer.
        dropout_rate (float): Dropout rate.
        """
        super(TransformerModel, self).__init__()
        # Transformer layers
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=input_dim, 
            nhead=n_heads, 
            dim_feedforward=hidden_dim, 
            dropout=dropout_rate
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

    def forward(self, x):
        """
        Forward pass of the model.
        Args:
        x (Tensor): Input tensor (embeddings).
        Returns:
        Tensor: The output of the transformer encoder.
        """
        transformed = self.transformer_encoder(x)
        return transformed

def train_model(model, criterion, optimizer, dataloader, num_epochs=10):
    """
    Trains the model.
    Args:
    model (nn.Module): The model to train.
    criterion (nn.Module): The loss function.
    optimizer (torch.optim): The optimizer to use.
    dataloader (DataLoader): The DataLoader object.
    num_epochs (int): Number of epochs to train the model for.
    """
    for epoch in range(num_epochs):
        for i, embeddings in enumerate(dataloader):
            # Forward pass
            outputs = model(embeddings)
            # Compute the loss
            loss = criterion(outputs, embeddings)
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


def main():
    minds = MINDS()
    clinical_model = ClinincalEmbeddings(model="UFNLP/gatortron-base")
    redis_client = RedisClient()

    project_id = "TCGA-LUAD"
    filename = project_id
    tables = minds.get_tables()

    if Path(f"{filename}.json").exists():
        json_objects = json.load(open(f"{filename}.json"))
    else:
        json_objects = {}
        for table in tqdm(tables["Tables_in_nihnci"]):
            query = f"SELECT * FROM nihnci.{table} WHERE project_id='{project_id}'"
            df = minds.query(query)
            for case_id, group in tqdm(df.groupby("case_submitter_id"), leave=False):
                if case_id not in json_objects:
                    json_objects[case_id] = {}
                common_fields, nested_objects = process_group(group)
                json_objects[case_id].update(common_fields)
                json_objects[case_id][table] = nested_objects
        with open(f"{filename}.json", "w") as fp:
            json.dump(json_objects, fp, indent=4, default=convert_numpy)

    # if the redis already has the embeddings, for a given patient, then skip
    for case_id, patient_data in tqdm(json_objects.items()):
        if redis_client.client.exists(f"patient:{case_id}"):
            continue
        summary = generate_summary_from_json(patient_data)
        embedding = clinical_model.generate_embeddings(summary)
        redis_client.add_document(case_id, embedding)

    embeddings = []
    for case_id, patient_data in tqdm(json_objects.items()):
        embedding = redis_client.query(f"patient:{case_id}")
        embedding = embedding.reshape(512, 1024)
        embeddings.append(embedding)

    print(f'Number of embeddings: {len(embeddings)}')
    print(f'Embedding shape: {embeddings[0].shape}')

    # Create the dataset and DataLoader
    dataset = EmbeddingsDataset(embeddings)
    dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

    # Model parameters
    input_dim = 1024  # Embedding dimension
    hidden_dim = 2048  # Hidden dimension
    n_layers = 4  # Number of transformer layers
    n_heads = 8  # Number of heads in MultiHeadAttention
    dropout_rate = 0.1  # Dropout rate

    # Initialize the model and move it to the device
    model = TransformerModel(input_dim, hidden_dim, n_layers, n_heads, dropout_rate).to(device)
    # Loss function
    criterion = nn.MSELoss()
    # Optimizer
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    # Train the model
    train_model(model, criterion, optimizer, dataloader, num_epochs=10)

if __name__ == '__main__':
    main()


Using device: cuda
Dropped Index


100%|██████████| 585/585 [00:01<00:00, 315.29it/s]
100%|██████████| 585/585 [00:23<00:00, 24.56it/s]


Number of embeddings: 585
Embedding shape: (512, 1024)




Epoch [1/10], Loss: 0.2970
Epoch [2/10], Loss: 0.1899
Epoch [3/10], Loss: 0.1197
Epoch [4/10], Loss: 0.0774
Epoch [5/10], Loss: 0.0464
Epoch [6/10], Loss: 0.0271
Epoch [7/10], Loss: 0.0201
Epoch [8/10], Loss: 0.0152
Epoch [9/10], Loss: 0.0125
Epoch [10/10], Loss: 0.0089
