In [10]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import os
import numpy as np

In [2]:
# Load the dataset
file_name = "../filtered_combined.xlsx"
df = pd.read_excel(file_name)

In [3]:

# Detect if GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
# BioBERT model initialization
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to the device


In [5]:
output_file = "similar_trials_results.xlsx"
model_file = "biobert_embeddings.pt"

In [6]:
# Define a dataset class for tokenization
class ClinicalTrialsDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data  # This should be a list of texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]  # Access list item directly
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {key: val.squeeze(0).to(device) for key, val in encoding.items()}  # Return tensors and move to device


In [7]:
# Generate embeddings using BioBERT
def generate_embeddings(texts, tokenizer, model, batch_size=16):
    dataset = ClinicalTrialsDataset(texts, tokenizer)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    embeddings = []
    model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Generating embeddings"):
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]

            # Move tensors to the device
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            embeddings.append(cls_embeddings.cpu().numpy())  # Move embeddings back to CPU for numpy conversion

    return torch.tensor(np.vstack(embeddings))

In [8]:
# Preprocess the text data
df["Combined_Text"] = df["Combined Column"].fillna("")
texts = df["Combined_Text"].tolist()

In [11]:
# Check if embeddings are already saved
if os.path.exists(model_file):
    embeddings = torch.load(model_file)
    print("Loaded embeddings from saved model.")
else:
    # Generate embeddings for all clinical trials
    embeddings = generate_embeddings(texts, tokenizer, model)
    torch.save(embeddings, model_file)
    print(f"Embeddings saved to {model_file}")

Generating embeddings: 100%|███████████████████████████████████████████████████████| 7374/7374 [19:26<00:00,  6.32it/s]


Embeddings saved to biobert_embeddings.pt


In [21]:
# Function to retrieve top N similar trials
def get_similar_trials(query_embedding, embeddings, top_n=10):
    # Ensure the tensors are moved to CPU before passing to cosine_similarity
    query_embedding_cpu = query_embedding.cpu().numpy()  # Move to CPU and convert to numpy
    embeddings_cpu = embeddings.cpu().numpy()  # Move to CPU and convert to numpy

    similarities = cosine_similarity(query_embedding_cpu, embeddings_cpu)
    similar_indices = similarities.argsort(axis=1)[:, -top_n-1:-1][:, ::-1]
    return similar_indices


In [22]:
# Trials to evaluate
evaluation_trials = ["NCT00385736", "NCT00386607", "NCT03518073"]

In [23]:
# Create a mapping of NCT IDs to indices
nct_id_to_index = {nct_id: idx for idx, nct_id in enumerate(df["nct_id"])}


In [24]:

# Output DataFrame for similar trials
output_writer = pd.ExcelWriter(output_file, engine="xlsxwriter")

In [25]:
# Generate similar trials for evaluation NCT IDs
for trial_id in evaluation_trials:
    if trial_id in nct_id_to_index:
        query_idx = nct_id_to_index[trial_id]
        query_embedding = embeddings[query_idx].unsqueeze(0).to(device)  # Move query embedding to device
        similar_indices = get_similar_trials(query_embedding, embeddings)

        similar_trials = df.iloc[similar_indices[0]]
        similar_trials["Similarity_Score"] = [
            cosine_similarity(query_embedding, embeddings[idx].unsqueeze(0).to(device)).item()
            for idx in similar_indices[0]
        ]

        # Save the results to an Excel sheet
        similar_trials.to_excel(output_writer, sheet_name=f"{trial_id}", index=False)


TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [28]:
from sklearn.metrics.pairwise import cosine_similarity

def get_similar_trials(query_embedding, embeddings, top_n=10):
    # Ensure both tensors are on the CPU before calling cosine_similarity
    query_embedding_cpu = query_embedding.cpu().detach().numpy()
    embeddings_cpu = embeddings.cpu().detach().numpy()

    # Compute cosine similarity between the query and all embeddings
    similarities = cosine_similarity(query_embedding_cpu, embeddings_cpu)
    
    # Get the indices of the top_n most similar trials (excluding the query itself)
    similar_indices = similarities.argsort(axis=1)[:, -top_n-1:-1][:, ::-1]
    
    return similar_indices

# Generate similar trials for evaluation NCT IDs
for trial_id in evaluation_trials:
    if trial_id in nct_id_to_index:
        query_idx = nct_id_to_index[trial_id]
        query_embedding = embeddings[query_idx].unsqueeze(0).to(device)  # Move query embedding to device
        
        # Get similar trial indices
        similar_indices = get_similar_trials(query_embedding, embeddings)

        # Retrieve the similar trials from the DataFrame
        similar_trials = df.iloc[similar_indices[0]]
        
        # Calculate and store similarity scores
        similar_trials["Similarity_Score"] = [
            cosine_similarity(query_embedding.cpu().detach().numpy().reshape(1, -1), embeddings[idx].cpu().detach().numpy().reshape(1, -1)).item()
            for idx in similar_indices[0]
        ]


        # Save the results to an Excel sheet
        similar_trials.to_excel(output_writer, sheet_name=f"{trial_id}", index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_trials["Similarity_Score"] = [
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_trials["Similarity_Score"] = [
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_trials["Similarity_Score"] = [


In [29]:
# Save the output results
output_writer.save()
print(f"Similar trials saved to {output_file}")

AttributeError: 'XlsxWriter' object has no attribute 'save'

In [30]:
import pandas as pd

# Assuming similar_trials is your DataFrame
output_file = "similar_trials.xlsx"

# Using XlsxWriter engine
with pd.ExcelWriter(output_file, engine='xlsxwriter') as output_writer:
    similar_trials.to_excel(output_writer, index=False, sheet_name='Similar Trials')

# No need to call save() or close() explicitly when using 'with' context manager
print(f"Similar trials saved to {output_file}")


Similar trials saved to similar_trials.xlsx


In [31]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

def get_similar_trials(query_embedding, embeddings, top_n=10):
    # Ensure both tensors are on the CPU before calling cosine_similarity
    query_embedding_cpu = query_embedding.cpu().detach().numpy()
    embeddings_cpu = embeddings.cpu().detach().numpy()

    # Compute cosine similarity between the query and all embeddings
    similarities = cosine_similarity(query_embedding_cpu, embeddings_cpu)
    
    # Get the indices of the top_n most similar trials (excluding the query itself)
    similar_indices = similarities.argsort(axis=1)[:, -top_n-1:-1][:, ::-1]
    
    return similar_indices

# Generate similar trials for evaluation NCT IDs
output_data = []  # List to collect the results for each NCT ID

for trial_id in evaluation_trials:
    if trial_id in nct_id_to_index:
        query_idx = nct_id_to_index[trial_id]
        query_embedding = embeddings[query_idx].unsqueeze(0).to(device)  # Move query embedding to device
        
        # Get similar trial indices
        similar_indices = get_similar_trials(query_embedding, embeddings)

        # Retrieve the similar trials from the DataFrame
        similar_trials = df.iloc[similar_indices[0]]
        
        # Calculate and store similarity scores
        similar_trials["Similarity_Score"] = [
            cosine_similarity(query_embedding.cpu().detach().numpy().reshape(1, -1), embeddings[idx].cpu().detach().numpy().reshape(1, -1)).item()
            for idx in similar_indices[0]
        ]
        
        # Add the NCT ID (trial_id) as a new column to track which trial it corresponds to
        similar_trials["Query_NCT_ID"] = trial_id
        
        # Append the results to the output list
        output_data.append(similar_trials)

# Combine all results into a single DataFrame
final_results = pd.concat(output_data, ignore_index=True)

# Save the results to an Excel sheet
output_file = "similar_trials_with_nct_id.xlsx"
with pd.ExcelWriter(output_file, engine='xlsxwriter') as output_writer:
    final_results.to_excel(output_writer, index=False, sheet_name='Similar Trials')

print(f"Similar trials with NCT IDs saved to {output_file}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_trials["Similarity_Score"] = [
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_trials["Query_NCT_ID"] = trial_id
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_trials["Similarity_Score"] = [
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row

Similar trials with NCT IDs saved to similar_trials_with_nct_id.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_trials["Similarity_Score"] = [
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_trials["Query_NCT_ID"] = trial_id
