In [123]:
import pandas as pd
import os
import zipfile

In [124]:
patient_data = pd.read_csv('/content/patients.csv')

# Extract relevant attributes
patient_data = patient_data[['Id', 'BIRTHDATE', 'GENDER']]

# Convert birthdate to age
from datetime import datetime

def calculate_age(birthdate):
    birthdate = datetime.strptime(birthdate, "%Y-%m-%d")
    today = datetime.today()
    return today.year - birthdate.year - ((today.month, today.day) < (birthdate.month, birthdate.day))

patient_data['AGE'] = patient_data['BIRTHDATE'].apply(calculate_age)

# Ensure Id is clean and string type
patient_data['Id'] = patient_data['Id'].astype(str).str.strip()

diagnosis_data = pd.read_csv('/content/conditions.csv')
# Ensure Patient is clean and string type
diagnosis_data['PATIENT'] = diagnosis_data['PATIENT'].astype(str).str.strip()

In [125]:
# we need to check whether they both have Patient_ID to ensure effective merging
# Check data types before merging
print("Patient data: ", patient_data.dtypes)
print("Diagnosis data: ", diagnosis_data.dtypes)

Patient data:  Id           object
BIRTHDATE    object
GENDER       object
AGE           int64
dtype: object
Diagnosis data:  START          object
STOP           object
PATIENT        object
ENCOUNTER      object
SYSTEM         object
CODE            int64
DESCRIPTION    object
dtype: object


In [126]:
# Aggregate conditions for each patient
patient_conditions = diagnosis_data.groupby('PATIENT')['DESCRIPTION'].apply(list).reset_index()

# Merge patient conditions
patient_data = patient_data.merge(patient_conditions, left_on='Id', right_on='PATIENT', how='left')

# Convert NaN conditions to empty lists
patient_data['DESCRIPTION'] = patient_data['DESCRIPTION'].apply(lambda x: x if isinstance(x, list) else [])

# Verify the merge
print(patient_data.head())
print(patient_data.isnull().sum())

                                     Id   BIRTHDATE GENDER  AGE  \
0  30a6452c-4297-a1ac-977a-6a23237c7b46  1994-02-06      M   31   
1  34a4dcc4-35fb-6ad5-ab98-be285c586a4f  1968-08-06      M   56   
2  7179458e-d6e3-c723-2530-d4acfe1c2668  2008-12-21      M   16   
3  37c177ea-4398-fb7a-29fa-70eb3d673876  1994-01-27      F   31   
4  0fef2411-21f0-a269-82fb-c42b55471405  2019-07-27      M    5   

                                PATIENT  \
0  30a6452c-4297-a1ac-977a-6a23237c7b46   
1  34a4dcc4-35fb-6ad5-ab98-be285c586a4f   
2  7179458e-d6e3-c723-2530-d4acfe1c2668   
3  37c177ea-4398-fb7a-29fa-70eb3d673876   
4  0fef2411-21f0-a269-82fb-c42b55471405   

                                         DESCRIPTION  
0  [Housing unsatisfactory (finding), Received hi...  
1  [Serving in military service (finding), Receiv...  
2  [Medication review due (situation), Traumatic ...  
3  [Chronic intractable migraine without aura (di...  
4  [Medication review due (situation), Medication...  
Id      

In [127]:
output_file = "merged_patient_data.csv"
patient_data.to_csv(output_file, index=False)

print(f"CSV file saved as: {output_file}")

CSV file saved as: merged_patient_data.csv


In [128]:
import ast

# Load patient data
patient_file = "merged_patient_data.csv"
patients_df = pd.read_csv(patient_file)

# Convert conditions from string to list if needed
patients_df['DESCRIPTION'] = patients_df['DESCRIPTION'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])

print("Patient data loaded successfully!")


Patient data loaded successfully!


In [129]:
import xml.etree.ElementTree as ET
import glob

# Get XML files from NCT00000102.xml to NCT00000300.xml
xml_files = glob.glob("/content/NCT*.xml")

# Function to extract criteria from XML
def extract_criteria(trial_file):
    tree = ET.parse(trial_file)
    root = tree.getroot()

    # Extract eligibility criteria
    eligibility_criteria = root.find(".//eligibility/criteria/textblock").text if root.find(".//eligibility/criteria/textblock") is not None else ""

    # Extract age range and gender requirements
    min_age = root.find(".//eligibility/minimum_age").text if root.find(".//eligibility/minimum_age") is not None else "0 Years"
    max_age = root.find(".//eligibility/maximum_age").text if root.find(".//eligibility/maximum_age") is not None else "100 Years"
    gender = root.find(".//eligibility/gender").text if root.find(".//eligibility/gender") is not None else "All"

    # Convert age range into numerical values
    def extract_age(age_text):
        return int(age_text.split()[0]) if "Years" in age_text else 0

    min_age = extract_age(min_age)
    max_age = extract_age(max_age)

    # Extract inclusion and exclusion criteria separately
    inclusion_criteria = []
    exclusion_criteria = []
    parsing_exclusion = False

    for line in eligibility_criteria.split("\n"):
        line = line.strip()
        if "Exclusion" in line:
            parsing_exclusion = True
        elif "Inclusion" in line:
            parsing_exclusion = False
        elif line:
            if parsing_exclusion:
                exclusion_criteria.append(line)
            else:
                inclusion_criteria.append(line)

    return min_age, max_age, gender, inclusion_criteria, exclusion_criteria

print("Clinical trial extraction function is ready!")


Clinical trial extraction function is ready!


In [130]:
# Function to check if a patient is eligible
# String-based matching,
def is_eligible(patient, min_age, max_age, gender, inclusion_criteria, exclusion_criteria):
    # Check age eligibility
    if not (min_age <= patient["AGE"] <= max_age):
        return False

    # Check gender eligibility
    if gender != "All" and patient["GENDER"] != gender:
        return False

    # Convert patient conditions into a single lowercase string
    patient_conditions = " ".join(patient["DESCRIPTION"]).lower()

    # **Looser matching**: If any inclusion criterion **partially matches**, accept the patient
    if not any(any(word in patient_conditions for word in inc.lower().split()) for inc in inclusion_criteria):
        return False  # Patient does not meet ANY inclusion criteria

    # **Exclusion criteria**: Ensure no disqualifying conditions are present
    if any(exc.lower() in patient_conditions for exc in exclusion_criteria):
        return False  # Patient has an exclusion condition

    return True

In [131]:
# Process each XML file
all_eligible_patients = []
for xml_file in xml_files:
    min_age, max_age, gender, inclusion_criteria, exclusion_criteria = extract_criteria(xml_file)

    eligible_patients_df = patients_df[
        patients_df.apply(lambda p: is_eligible(p, min_age, max_age, gender, inclusion_criteria, exclusion_criteria), axis=1)
    ]

    if not eligible_patients_df.empty:  # Only modify if not empty
        eligible_patients_df = eligible_patients_df.copy()  # Ensure safe modification
        eligible_patients_df.loc[:, "Trial_ID"] = xml_file.split("/")[-1].replace(".xml", "")
        all_eligible_patients.append(eligible_patients_df)  # Append only non-empty results


# Combine results from all trials
final_eligible_patients_df = pd.concat(all_eligible_patients, ignore_index=True)

final_eligible_patients_df = final_eligible_patients_df.groupby(["Id", "AGE", "GENDER"])["Trial_ID"].apply(list).reset_index()

# Save eligible patients to a new file
output_file = "eligible_patients.csv"
final_eligible_patients_df.to_csv(output_file, index=False)

print(f"Eligible patients saved to {output_file}")

Eligible patients saved to eligible_patients.csv


In [132]:
# Now I'm gonna use Word2Vec to calculate similarity scores
!pip install gensim
import glob
import xml.etree.ElementTree as ET
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import numpy as np
import nltk

nltk.download('punkt_tab')  # Ensure word tokenization works



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [133]:
# Load patient data
patient_file = "merged_patient_data.csv"
patients_df = pd.read_csv(patient_file)

# Convert condition descriptions from strings to lists
patients_df['DESCRIPTION'] = patients_df['DESCRIPTION'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])

# Tokenize descriptions (Word2Vec requires tokenized text)
sentences = [word_tokenize(" ".join(desc)) for desc in patients_df['DESCRIPTION']]

In [134]:
# Load all XML files
def extract_inclusion_criteria(trial_file):
    tree = ET.parse(trial_file)
    root = tree.getroot()
    text = root.find(".//eligibility/criteria/textblock")
    return word_tokenize(text.text.lower()) if text is not None else []

# Add inclusion criteria to Word2Vec training data
for xml_file in xml_files:
    inclusion_criteria = extract_inclusion_criteria(xml_file)
    if inclusion_criteria:
        sentences.append(inclusion_criteria)  # Include trial criteria in training data

In [135]:
# Train Word2Vec model
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Save the model for future use
w2v_model.save("word2vec_patient_trials.model")

In [136]:
# Function to compute similarity score between two texts using Word2Vec
def compute_similarity(patient_conditions, trial_criteria):
    patient_tokens = word_tokenize(" ".join(patient_conditions).lower())
    trial_tokens = word_tokenize(" ".join(trial_criteria).lower())

    # Get vector representations (ignore words not in vocab)
    patient_vectors = [w2v_model.wv[word] for word in patient_tokens if word in w2v_model.wv]
    trial_vectors = [w2v_model.wv[word] for word in trial_tokens if word in w2v_model.wv]

    if not patient_vectors or not trial_vectors:
        return 0  # No meaningful comparison possible

    # Compute average vector for both
    patient_avg_vector = np.mean(patient_vectors, axis=0)
    trial_avg_vector = np.mean(trial_vectors, axis=0)

    # Compute cosine similarity
    similarity = np.dot(patient_avg_vector, trial_avg_vector) / (
        np.linalg.norm(patient_avg_vector) * np.linalg.norm(trial_avg_vector)
    )
    return similarity

In [137]:
!pip install openpyxl



In [138]:
# Set similarity threshold (0.6 is moderate, 0.8 is strong)
SIMILARITY_THRESHOLD = 0.6

# Create a list to store eligible patients
matched_patients = []

# Iterate over trials and match patients
for xml_file in xml_files:
    inclusion_criteria = extract_inclusion_criteria(xml_file)

    for _, patient in patients_df.iterrows():
        similarity_score = compute_similarity(patient["DESCRIPTION"], inclusion_criteria)

        if similarity_score >= SIMILARITY_THRESHOLD:
            matched_patients.append({
                "Patient_ID": patient["Id"],
                "Age": patient["AGE"],
                "Gender": patient["GENDER"],
                "Trial_ID": xml_file.split("/")[-1].replace(".xml", ""),
                "Similarity_Score": similarity_score
            })

# Convert to DataFrame
matched_patients_df = pd.DataFrame(matched_patients)

# Aggregate Trial_IDs with their respective Similarity Scores in the format "Trial_ID[similarity_score]"
aggregated_df = matched_patients_df.groupby(["Patient_ID", "Age", "Gender"]).apply(
    lambda x: [f"{trial_id}[{similarity_score:.4f}]" for trial_id, similarity_score in zip(x["Trial_ID"], x["Similarity_Score"])]
).reset_index(name="Trial_Matches")

# Save the results to an Excel file
aggregated_df.to_excel("word2vec_matched_patients.xlsx", index=False)

  aggregated_df = matched_patients_df.groupby(["Patient_ID", "Age", "Gender"]).apply(


In [139]:
# Now, we use BERT
!pip install transformers torch sentence-transformers scikit-learn



In [145]:
import json
import xml.etree.ElementTree as ET
import glob
import re
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Load a fast, efficient BERT-based model
bert_model = SentenceTransformer("all-MiniLM-L6-v2")

# Function to get BERT embedding
def get_bert_embedding(text):
    """Returns the BERT embedding of the given text as a NumPy array."""
    if not isinstance(text, str) or pd.isna(text):
        return np.zeros(bert_model.get_sentence_embedding_dimension())
    return bert_model.encode(text, convert_to_numpy=True)

# Load patient data
patient_file = "merged_patient_data.csv"
patients_df = pd.read_csv(patient_file)

# Convert DESCRIPTION column to list if stored as a string
patients_df['DESCRIPTION'] = patients_df['DESCRIPTION'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])

# Convert patient conditions into a single text string
patients_df['combined_conditions'] = patients_df['DESCRIPTION'].apply(lambda x: " ".join(x))

# Load all XML trial files
xml_files = glob.glob("/content/NCT*.xml")

# Function to extract eligibility criteria from XML
def extract_criteria(trial_file):
    tree = ET.parse(trial_file)
    root = tree.getroot()
    text = root.find(".//eligibility/criteria/textblock")
    return text.text if text is not None else ""

# Extract and store trial criteria
trials_data = []
for xml_file in xml_files:
    criteria_text = extract_criteria(xml_file)
    if criteria_text:
        trials_data.append({"Trial_ID": xml_file.split("/")[-1].replace(".xml", ""), "text_cleaned": criteria_text})

trials_df = pd.DataFrame(trials_data)

### 🔹 **Place Text Preprocessing Here (Before Encoding!)**
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Cleans text by converting to lowercase, removing punctuation, and removing stopwords."""
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)  # Remove punctuation
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

# Apply text cleaning
patients_df["combined_conditions"] = patients_df["combined_conditions"].apply(clean_text)
trials_df["text_cleaned"] = trials_df["text_cleaned"].apply(clean_text)

### Now Compute BERT Embeddings (On Cleaned Text)**
patients_df["embedding"] = patients_df["combined_conditions"].apply(get_bert_embedding)
trials_df["embedding"] = trials_df["text_cleaned"].apply(get_bert_embedding)

# Convert embeddings to NumPy arrays
patient_embeddings = np.vstack(patients_df["embedding"].values)
trial_embeddings = np.vstack(trials_df["embedding"].values)

# Compute cosine similarity
similarity_matrix = cosine_similarity(patient_embeddings, trial_embeddings)

# Set similarity threshold
SIMILARITY_THRESHOLD = 0.4  # Adjusted from 0.6 to improve matching

# Find matches using NumPy filtering
patient_indices, trial_indices = np.where(similarity_matrix > SIMILARITY_THRESHOLD)

# Construct the matched patients list
matched_patients = [
    {
        "patientId": patients_df.iloc[p_idx]["Id"],
        "trialId": trials_df.iloc[t_idx]["Trial_ID"],
        "trialName": f"Trial {trials_df.iloc[t_idx]['Trial_ID']}",
        "eligibilityCriteriaMet": [f"{trials_df.iloc[t_idx]['Trial_ID']}[{similarity_matrix[p_idx, t_idx]:.4f}]"]
    }
    for p_idx, t_idx in zip(patient_indices, trial_indices)
]



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


BERT-based matching completed. 1763 patient-trial pairs found.


In [None]:
# Save results
output_df = pd.DataFrame(matched_patients)
output_df.to_excel("bert_matched_patients.xlsx", index=False)

with open("bert_matched_patients.json", "w") as json_file:
    json.dump(matched_patients, json_file, indent=4)

print(f"BERT-based matching completed. {len(matched_patients)} patient-trial pairs found.")