In [89]:
# Download all the necessary libraries
import pandas as pd
import numpy as np
import json
import xml.etree.ElementTree as ET
import glob
import re
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer

In [90]:
# Download stopwords once
nltk.download("stopwords")
nltk.download("punkt_tab")  # Needed for Word2Vec tokenization
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [91]:
# Function to clean text (applied at the earliest point)
def clean_text(text):
    """Cleans text by converting to lowercase, removing punctuation, and removing stopwords."""
    if not isinstance(text, str) or pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)  # Remove punctuation
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text


In [92]:
patient_data = pd.read_csv('/content/patients.csv')

# Extract relevant attributes
patient_data = patient_data[['Id', 'BIRTHDATE', 'GENDER']]

# Convert birthdate to age
from datetime import datetime

def calculate_age(birthdate):
    birthdate = datetime.strptime(birthdate, "%Y-%m-%d")
    today = datetime.today()
    return today.year - birthdate.year - ((today.month, today.day) < (birthdate.month, birthdate.day))

In [93]:
patient_data['AGE'] = patient_data['BIRTHDATE'].apply(calculate_age)

# Ensure Id is clean and string type
patient_data['Id'] = patient_data['Id'].astype(str).str.strip()

diagnosis_data = pd.read_csv('/content/conditions.csv')
# Ensure Patient is clean and string type
diagnosis_data['PATIENT'] = diagnosis_data['PATIENT'].astype(str).str.strip()

# Aggregate conditions for each patient
patient_conditions = diagnosis_data.groupby('PATIENT')['DESCRIPTION'].apply(list).reset_index()

# Merge patient conditions
patient_data = patient_data.merge(patient_conditions, left_on='Id', right_on='PATIENT', how='left')

# Convert NaN conditions to empty lists
patient_data['DESCRIPTION'] = patient_data['DESCRIPTION'].apply(lambda x: x if isinstance(x, list) else [])


In [94]:
# Verify the merge
print(patient_data.head())
print(patient_data.isnull().sum())

output_file = "merged_patient_data.csv"
patient_data.to_csv(output_file, index=False)

print(f"CSV file saved as: {output_file}")

                                     Id   BIRTHDATE GENDER  AGE  \
0  30a6452c-4297-a1ac-977a-6a23237c7b46  1994-02-06      M   31   
1  34a4dcc4-35fb-6ad5-ab98-be285c586a4f  1968-08-06      M   56   
2  7179458e-d6e3-c723-2530-d4acfe1c2668  2008-12-21      M   16   
3  37c177ea-4398-fb7a-29fa-70eb3d673876  1994-01-27      F   31   
4  0fef2411-21f0-a269-82fb-c42b55471405  2019-07-27      M    5   

                                PATIENT  \
0  30a6452c-4297-a1ac-977a-6a23237c7b46   
1  34a4dcc4-35fb-6ad5-ab98-be285c586a4f   
2  7179458e-d6e3-c723-2530-d4acfe1c2668   
3  37c177ea-4398-fb7a-29fa-70eb3d673876   
4  0fef2411-21f0-a269-82fb-c42b55471405   

                                         DESCRIPTION  
0  [Housing unsatisfactory (finding), Received hi...  
1  [Serving in military service (finding), Receiv...  
2  [Medication review due (situation), Traumatic ...  
3  [Chronic intractable migraine without aura (di...  
4  [Medication review due (situation), Medication...  
Id      

In [95]:
# Step 1: Load and Clean Patient Data EARLY
patient_file = "merged_patient_data.csv"
patients_df = pd.read_csv(patient_file)

# Convert DESCRIPTION column to list if stored as a string
import ast
patients_df['DESCRIPTION'] = patients_df['DESCRIPTION'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])

# Clean patient descriptions **before merging**
patients_df['DESCRIPTION'] = patients_df['DESCRIPTION'].apply(lambda desc: [clean_text(cond) for cond in desc])

# Merge conditions into a single cleaned text string per patient
patients_df['combined_conditions'] = patients_df['DESCRIPTION'].apply(lambda x: " ".join(x))

In [96]:
# Step 2: Load and Clean XML Trial Data EARLY
xml_files = glob.glob("/content/NCT*.xml")

def extract_criteria(trial_file):
    tree = ET.parse(trial_file)
    root = tree.getroot()

    # Extract eligibility criteria **and clean it immediately**
    eligibility_criteria = root.find(".//eligibility/criteria/textblock")
    eligibility_text = clean_text(eligibility_criteria.text) if eligibility_criteria is not None else ""

    # Extract and clean age range & gender requirements
    min_age = root.find(".//eligibility/minimum_age")
    max_age = root.find(".//eligibility/maximum_age")
    gender = root.find(".//eligibility/gender")

    min_age = clean_text(min_age.text) if min_age is not None else "0 years"
    max_age = clean_text(max_age.text) if max_age is not None else "100 years"
    gender = clean_text(gender.text) if gender is not None else "all"

    # Convert age range into numerical values
    def extract_age(age_text):
        return int(age_text.split()[0]) if "years" in age_text else 0

    min_age = extract_age(min_age)
    max_age = extract_age(max_age)

    # Extract inclusion and exclusion criteria (and clean them)
    inclusion_criteria = []
    exclusion_criteria = []
    parsing_exclusion = False

    for line in eligibility_text.split("\n"):
        line = clean_text(line.strip())  # Ensure each line is cleaned
        if "exclusion" in line:
            parsing_exclusion = True
        elif "inclusion" in line:
            parsing_exclusion = False
        elif line:
            if parsing_exclusion:
                exclusion_criteria.append(line)
            else:
                inclusion_criteria.append(line)

    return min_age, max_age, gender, inclusion_criteria, exclusion_criteria

In [97]:
# Strategy 1: Process String-Based Matching on Cleaned Data
def is_eligible(patient, min_age, max_age, gender, inclusion_criteria, exclusion_criteria):
    """Check if a patient is eligible for a trial based on cleaned conditions and criteria."""
    # Check age
    if not (min_age <= patient["AGE"] <= max_age):
        return False

    # Check gender
    if gender != "All" and patient["GENDER"] != gender:
        return False

    # Clean inclusion/exclusion criteria
    inclusion_criteria = [clean_text(inc) for inc in inclusion_criteria]
    exclusion_criteria = [clean_text(exc) for exc in exclusion_criteria]

    # Convert patient conditions to cleaned string
    patient_conditions = " ".join(patient["DESCRIPTION"]).lower()

    # Inclusion: At least one criterion should match
    if not any(any(word in patient_conditions for word in inc.split()) for inc in inclusion_criteria):
        return False  # No match

    # Exclusion: No disqualifying criteria should be present
    if any(exc in patient_conditions for exc in exclusion_criteria):
        return False  # Disqualified

    return True

In [100]:
# Strategy 1: Process each XML file
all_eligible_patients = []
if all_eligible_patients:
    final_eligible_patients_df = pd.concat(all_eligible_patients, ignore_index=True)
    final_eligible_patients_df = final_eligible_patients_df.groupby(["Id", "AGE", "GENDER"])["Trial_ID"].apply(list).reset_index()

    # Save to file
    output_file = "eligible_patients.csv"
    final_eligible_patients_df.to_csv(output_file, index=False)
    print(f"Eligible patients saved to {output_file}")
else:
    print("No eligible patients found. Skipping CSV export.")

No eligible patients found. Skipping CSV export.


In [101]:
# Strategy 2: Word2Vec Training on Cleaned Data
sentences = [word_tokenize(" ".join(desc)) for desc in patients_df['DESCRIPTION']]

# Add cleaned inclusion criteria for trials
for xml_file in xml_files:
    _, _, _, inclusion_criteria, _ = extract_criteria(xml_file)  # Extract only inclusion criteria
    if inclusion_criteria:
        sentences.append(word_tokenize(" ".join(inclusion_criteria)))  # Convert list to string before tokenizing

# Train Word2Vec on cleaned text
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
w2v_model.save("word2vec_patient_trials.model")

In [104]:
# Strategy 2: Compute Patient & Trial Embeddings
def get_w2v_embedding(text):
    words = word_tokenize(text.lower())
    vectors = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(100)  # Use zero vector if no words found

# Compute embeddings for all patients and trials
patients_df["w2v_embedding"] = patients_df["combined_conditions"].apply(get_w2v_embedding)
trials_df["w2v_embedding"] = trials_df["text_cleaned"].apply(get_w2v_embedding)

# Convert embeddings to NumPy arrays
patient_embeddings = np.vstack(patients_df["w2v_embedding"].values)
trial_embeddings = np.vstack(trials_df["w2v_embedding"].values)

# Step 5: Compute Cosine Similarity
similarity_matrix = cosine_similarity(patient_embeddings, trial_embeddings)

# Set similarity threshold
SIMILARITY_THRESHOLD = 0.4  # Match the BERT threshold

# Find matches using NumPy filtering
patient_indices, trial_indices = np.where(similarity_matrix > SIMILARITY_THRESHOLD)



In [105]:
# Strategy 2: Construct the Output in eligible Format
matched_patients_w2v = [
    {
        "patientId": patients_df.iloc[p_idx]["Id"],
        "trialId": trials_df.iloc[t_idx]["Trial_ID"],
        "trialName": f"Trial {trials_df.iloc[t_idx]['Trial_ID']}",
        "eligibilityCriteriaMet": [f"{trials_df.iloc[t_idx]['Trial_ID']}[{similarity_matrix[p_idx, t_idx]:.4f}]"]
    }
    for p_idx, t_idx in zip(patient_indices, trial_indices)
]

In [106]:
# Strategy 2: Save Results in BERT Format
output_df_w2v = pd.DataFrame(matched_patients_w2v)
output_df_w2v.to_excel("word2vec_matched_patients.xlsx", index=False)

with open("word2vec_matched_patients.json", "w") as json_file:
    json.dump(matched_patients_w2v, json_file, indent=4)

print(f"Word2Vec-based matching completed. {len(matched_patients_w2v)} patient-trial pairs found.")

Word2Vec-based matching completed. 19398 patient-trial pairs found.


In [107]:
# Load all XML files
def extract_inclusion_criteria(trial_file):
    tree = ET.parse(trial_file)
    root = tree.getroot()
    text = root.find(".//eligibility/criteria/textblock")
    return word_tokenize(text.text.lower()) if text is not None else []

# Add inclusion criteria to Word2Vec training data
for xml_file in xml_files:
    inclusion_criteria = extract_inclusion_criteria(xml_file)
    if inclusion_criteria:
        sentences.append(inclusion_criteria)  # Include trial criteria in training data

In [108]:
# Strategy 3: Compute BERT Embeddings on Fully Cleaned Data
bert_model = SentenceTransformer("all-MiniLM-L6-v2")

def get_bert_embedding(text):
    """Returns the BERT embedding of the given text as a NumPy array."""
    if not text:
        return np.zeros(bert_model.get_sentence_embedding_dimension())
    return bert_model.encode(text, convert_to_numpy=True)

# Compute embeddings on cleaned text
patients_df["embedding"] = patients_df["combined_conditions"].apply(get_bert_embedding)
trials_df["embedding"] = trials_df["text_cleaned"].apply(get_bert_embedding)

# Convert embeddings to NumPy arrays
patient_embeddings = np.vstack(patients_df["embedding"].values)
trial_embeddings = np.vstack(trials_df["embedding"].values)

In [109]:
# Strategy 3: Compute Similarity (Using Fully Cleaned Data)
similarity_matrix = cosine_similarity(patient_embeddings, trial_embeddings)

# Set similarity threshold
SIMILARITY_THRESHOLD = 0.4  # Adjusted from 0.6 to improve matching

# Find matches using NumPy filtering
patient_indices, trial_indices = np.where(similarity_matrix > SIMILARITY_THRESHOLD)

# Construct the matched patients list
matched_patients = [
    {
        "patientId": patients_df.iloc[p_idx]["Id"],
        "trialId": trials_df.iloc[t_idx]["Trial_ID"],
        "trialName": f"Trial {trials_df.iloc[t_idx]['Trial_ID']}",
        "eligibilityCriteriaMet": [f"{trials_df.iloc[t_idx]['Trial_ID']}[{similarity_matrix[p_idx, t_idx]:.4f}]"]
    }
    for p_idx, t_idx in zip(patient_indices, trial_indices)
]

In [110]:
# Strategy 3: Save Results
output_df = pd.DataFrame(matched_patients)
output_df.to_excel("bert_matched_patients.xlsx", index=False)

with open("bert_matched_patients.json", "w") as json_file:
    json.dump(matched_patients, json_file, indent=4)

print(f"BERT-based matching completed. {len(matched_patients)} patient-trial pairs found.")

BERT-based matching completed. 1763 patient-trial pairs found.
