In [1]:
# OpenAI API usage and cost dashboard: https://platform.openai.com/usage
# Related OpenAI Cookbook examples:
# https://cookbook.openai.com/examples/question_answering_using_embeddings
# https://cookbook.openai.com/examples/embedding_wikipedia_articles_for_search

In [2]:
import ast  # for converting embeddings saved as strings back to arrays
from IPython.display import Markdown, display #displays ChatGPT's response neatly and cleanly
import json # for configuration file
import numpy as np
from openai import OpenAI # for calling the OpenAI API
import os # for loading directory files
import pandas as pd  # for storing text and embeddings data
from PyPDF2 import PdfReader # for reading PDF files
import re # used for matching words and phrases for anonymization purposes
from scipy import spatial  # for calculating vector similarities for search
import tiktoken  # for counting tokens

In [3]:
# see here for other model options: https://platform.openai.com/docs/models
EMBEDDING_MODEL = "text-embedding-3-large"
GPT_MODEL = "gpt-4-turbo" #128,000 tokens

# Largest number of tokens a single text may contain when it is sent to the
# embedding model. OpenAI's embeddings documentation lists an input limit of
# about 8k tokens (8191) for these models -- verify against the current model
# page if you switch models. The chat model's 128,000-token context window does
# NOT apply to embedding requests.
EMBEDDING_MAX_INPUT_TOKENS = {
    "text-embedding-ada-002": 8191,
    "text-embedding-3-small": 8191,
    "text-embedding-3-large": 8191,
}

# Fall back to the same conservative limit for models not listed above, so that
# max_tokens is always defined whatever EMBEDDING_MODEL is set to.
max_tokens = EMBEDDING_MAX_INPUT_TOKENS.get(EMBEDDING_MODEL, 8191)

# One CSV cache file per embedding model
embeddings_filename = "embeddings_" + EMBEDDING_MODEL + ".csv"

# Load private variables from configuration file

In [4]:
# Parse the private settings file; the file handle is only needed while parsing
with open('config.json') as config_file:
    config = json.load(config_file)

# Words/phrases to strip from the text before it is embedded
phi_words = config['phi_words']
# Folder that holds the EHR files
ehr_directory = config['ehr_directory']
# Key used to create the OpenAI client
api_key = config['api_key']

# Get list of EHR files

In [5]:
def list_filenames(directory):
    """Return the names of the regular files directly inside ``directory``.

    Args:
        directory: Path of the folder to scan. Sub-directories are not
            descended into and are not included in the result.

    Returns:
        A list of file names (not full paths), sorted alphabetically so the
        result does not depend on the order in which the OS lists entries.

    Raises:
        FileNotFoundError: If ``directory`` is not an existing directory.
            A list is the only thing ever returned, so callers can iterate
            over the result without first checking for an error string.
    """
    if not os.path.isdir(directory):
        raise FileNotFoundError(f"Directory does not exist: {directory!r}")

    return sorted(
        entry
        for entry in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, entry))
    )

# Get list of EHR filenames in EHR directory.
# These are bare file names; they are joined with ehr_directory again when each file is opened.
ehr_filenames = list_filenames(ehr_directory)

# Load previously embedded EHR files

In [6]:
# Reuse the embeddings saved by an earlier run when the CSV exists; otherwise
# start from an empty frame that already has the expected columns.
df_old_embeddings = (
    pd.read_csv(embeddings_filename)
    if os.path.exists(embeddings_filename)
    else pd.DataFrame(columns=['text', 'embedding'])
)

# Texts that already have a stored embedding, as a plain Python list
already_embedded = df_old_embeddings['text'].to_list()

# Extract, anonymize and embed text from new EHR files

In [7]:
def anonymize_text(text, phi_words):
    """Strip every word or phrase listed in ``phi_words`` out of ``text``.

    Matching is literal (regex metacharacters are escaped) and
    case-insensitive. It is not anchored to word boundaries, so an entry is
    also removed where it occurs inside a longer word. Each match is replaced
    by the empty string.

    Args:
        text: The text to anonymize.
        phi_words: Iterable of words/phrases to remove.

    Returns:
        The text with all matches removed.
    """
    # Longest entries first, so a long phrase is removed whole before any
    # shorter entry contained in it gets a chance to match part of it.
    for phrase in sorted(phi_words, key=len, reverse=True):
        text = re.sub(re.escape(phrase), '', text, flags=re.IGNORECASE)
    return text

In [8]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Count the tokens in ``text``.

    The text is encoded with the tiktoken encoding that
    ``tiktoken.encoding_for_model`` returns for ``model``; the length of the
    resulting token sequence is the count.
    """
    tokenizer = tiktoken.encoding_for_model(model)
    token_ids = tokenizer.encode(text)
    return len(token_ids)

In [9]:
def truncated_string(string: str, model: str, max_tokens: int, print_warning: bool = True) -> str:
    """Cut ``string`` down to at most ``max_tokens`` tokens.

    Args:
        string: Text to shorten.
        model: Model name used to look up the tiktoken encoding.
        max_tokens: Number of leading tokens to keep.
        print_warning: When true, print a warning if tokens were dropped.

    Returns:
        The text decoded from the first ``max_tokens`` tokens of ``string``.
    """
    tokenizer = tiktoken.encoding_for_model(model)
    token_ids = tokenizer.encode(string)
    shortened = tokenizer.decode(token_ids[:max_tokens])

    original_length = len(token_ids)
    if original_length > max_tokens and print_warning:
        print(f"Warning: Truncated string from {original_length} tokens to {max_tokens} tokens.")
    return shortened

In [10]:
def halved_by_delimiter(string: str, delimiter: str = "\n") -> list[str]:
    """Split a string in two, on a delimiter, trying to balance tokens on each side.

    Args:
        string: Text to split.
        delimiter: Separator at which the text may be cut.

    Returns:
        A two-element list ``[left, right]``. The delimiter occurrence at the
        chosen cut is not part of either half. If the delimiter does not occur,
        the result is ``[string, ""]``.

    Token counts come from ``num_tokens`` with its default model.
    """
    chunks = string.split(delimiter)
    if len(chunks) == 1:
        return [string, ""]  # no delimiter found
    if len(chunks) == 2:
        return chunks  # no need to search for halfway point

    halfway = num_tokens(string) // 2

    # Candidate cuts keep at least one chunk on each side. The first candidate
    # is always accepted, so a string that starts with the delimiter (empty
    # first chunk) is still halved instead of yielding an empty left side.
    best_split = 1
    best_diff = None
    for split in range(1, len(chunks)):
        left_tokens = num_tokens(delimiter.join(chunks[:split]))
        diff = abs(halfway - left_tokens)
        # The left side only grows, so once the distance to the halfway point
        # stops shrinking, the previous candidate was the best one.
        if best_diff is not None and diff >= best_diff:
            break
        best_split, best_diff = split, diff

    left = delimiter.join(chunks[:best_split])
    right = delimiter.join(chunks[best_split:])
    return [left, right]

In [11]:
def split_paragraphs_into_sections(paragraph: str, max_tokens: int = 1000, model: str = GPT_MODEL, max_recursion: int = 5) -> list[str]:
    """
    Split a paragraph into a list of sections no longer than max_tokens.

    The paragraph is halved recursively, trying the delimiters "\\n\\n", "\\n"
    and ". " in that order, until every piece fits. The delimiter occurrence at
    each cut is dropped by halved_by_delimiter and does not appear in the pieces.

    Args:
        paragraph: Text to split.
        max_tokens: Maximum number of tokens allowed per returned section.
        model: Model name used for token counting and truncation.
        max_recursion: Remaining number of halving levels; at 0 the text is
            truncated instead of split further.

    Returns:
        A list of strings, in original order.
    """
    # if length is fine, return the paragraph unchanged.
    # Count with the caller's model so the check agrees with the truncation below.
    if num_tokens(paragraph, model=model) <= max_tokens:
        return [paragraph]

    # if recursion hasn't found a split after X iterations, just truncate
    if max_recursion == 0:
        return [truncated_string(paragraph, model=model, max_tokens=max_tokens)]

    # otherwise, split in half and recurse
    for delimiter in ["\n\n", "\n", ". "]:
        left, right = halved_by_delimiter(paragraph, delimiter=delimiter)
        if left == "" or right == "":
            # if either half is empty, retry with a more fine-grained delimiter
            continue

        # recurse on each half
        results = []
        for half in (left, right):
            results.extend(
                split_paragraphs_into_sections(half, max_tokens=max_tokens, model=model, max_recursion=max_recursion - 1)
            )
        return results

    # no delimiter produced two non-empty halves, so just truncate (should be very rare)
    return [truncated_string(paragraph, model=model, max_tokens=max_tokens)]

In [12]:
all_text_sections = []

# Texts that must not be queued (again): everything embedded by an earlier run,
# plus everything queued during this run. A set keeps each lookup cheap, and
# updating it as we go stops text that repeats across pages or files from
# being embedded more than once.
seen_texts = set(already_embedded)

# gather text from each EHR file in the EHR folder
for filename in ehr_filenames:
    reader = PdfReader(os.path.join(ehr_directory, filename))

    # Load all pages in pdf file
    for page in reader.pages:

        # get the text from each individual page (treat "no text" as empty)
        text = page.extract_text() or ""

        # split the page text into paragraphs and run through every paragraph
        for paragraph in text.split("\n\n"):

            # break up any paragraph that exceeds the model's token limit
            for paragraph_split in split_paragraphs_into_sections(paragraph, max_tokens):

                # anonymize the text
                anonymized_line = anonymize_text(paragraph_split, phi_words)

                # Skip sections with no content: blank paragraphs are common after
                # splitting on "\n\n", and the embeddings endpoint rejects empty input.
                if not anonymized_line.strip():
                    continue

                # only queue text that has not been embedded or queued already
                if anonymized_line in seen_texts:
                    continue
                seen_texts.add(anonymized_line)
                all_text_sections.append(anonymized_line)

In [13]:
# create openAI client for embeddings and later querying ChatGPT
client = OpenAI(api_key=api_key)
BATCH_SIZE = 1000  # number of texts sent per embeddings request

embeddings = []
for batch_start in range(0, len(all_text_sections), BATCH_SIZE):
    batch = all_text_sections[batch_start:batch_start + BATCH_SIZE]
    # End of the range actually sent (the last batch is usually shorter than BATCH_SIZE)
    batch_end = batch_start + len(batch)
    print(f"Batch {batch_start} to {batch_end-1}")
    response = client.embeddings.create(model=EMBEDDING_MODEL, input=batch)
    for i, be in enumerate(response.data):
        assert i == be.index  # double check embeddings are in same order as input
    embeddings.extend(e.embedding for e in response.data)

df_new_embeddings = pd.DataFrame({"text": all_text_sections, "embedding": embeddings})
# ignore_index gives the combined frame one continuous index instead of
# repeating the row labels 0..n of both inputs.
df_embeddings = pd.concat([df_old_embeddings, df_new_embeddings], ignore_index=True)

In [14]:
# Save the combined (previously stored + newly created) embeddings so that they do not have to be re-embedded for future uses.
# index=False keeps the DataFrame index out of the CSV, so only the data columns are written.
df_embeddings.to_csv(embeddings_filename, index=False)

In [15]:
# search function
def strings_ranked_by_relatedness(query: str, df: pd.DataFrame, relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y), top_n: int = 100) -> tuple[list[str], list[float]]:
    """Returns a list of strings and relatednesses, sorted from most related to least.

    Args:
        query: Text to compare against the stored texts; it is embedded with
            EMBEDDING_MODEL through the module-level OpenAI client.
        df: DataFrame with a "text" column and an "embedding" column. Each
            embedding may be a numeric sequence or its string form (as read
            back from the CSV). The DataFrame is not modified.
        relatedness_fn: Function scoring (query_embedding, text_embedding);
            higher means more related. Defaults to cosine similarity.
        top_n: Maximum number of results to return.

    Returns:
        Two parallel tuples ``(strings, relatednesses)`` holding at most
        ``top_n`` entries, best match first. Both are empty if ``df`` has no rows.
    """
    # Nothing to rank: avoid a pointless API call and the zip(*[]) unpack error
    if len(df) == 0:
        return (), ()

    query_embedding_response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response.data[0].embedding

    def _as_vector(emb):
        """Convert a stored embedding (string form or sequence) to a numpy array."""
        if isinstance(emb, str):
            # literal_eval only parses Python literals, unlike eval, which
            # would execute arbitrary code found in the CSV file.
            emb = ast.literal_eval(emb)
        return np.array(emb)

    # Convert into local values rather than writing back into df: replacing the
    # column with numpy arrays would change what a later df.to_csv() writes.
    strings_and_relatednesses = [
        (text, relatedness_fn(query_embedding, _as_vector(emb)))
        for text, emb in zip(df["text"], df["embedding"])
    ]
    strings_and_relatednesses.sort(key=lambda pair: pair[1], reverse=True)
    strings, relatednesses = zip(*strings_and_relatednesses)
    return strings[:top_n], relatednesses[:top_n]

In [16]:
def query_message(query: str, df: pd.DataFrame, model: str, token_budget: int) -> str:
    """Return a message for GPT, with relevant source texts pulled from a dataframe.

    The message is the introduction, then as many of the most related texts as
    fit in the budget (each preceded by a blank line so neighbouring records do
    not run into one another), then the question.

    Args:
        query: The user's question.
        df: DataFrame passed to strings_ranked_by_relatedness to find texts.
        model: Model name used for token counting.
        token_budget: Maximum number of tokens for source texts to be added;
            texts stop being added at the first one that would exceed it.

    Returns:
        The assembled prompt string.
    """
    input_texts, _ = strings_ranked_by_relatedness(query, df)
    introduction = 'Please answer questions as an expert in medicine.'
    question = f"\n\nQuestion: {query}"
    separator = "\n\n"

    message = introduction
    for input_text in input_texts:
        candidate = message + separator + input_text
        # Texts arrive best-first; stop at the first one that no longer fits
        if num_tokens(candidate + question, model=model) > token_budget:
            break
        message = candidate
    return message + question

In [17]:
def ask(query: str, df: pd.DataFrame = df_embeddings, model: str = GPT_MODEL, token_budget: int = 4096 - 500, print_message: bool = False) -> str:
    """Answer ``query`` with the chat model, using related texts from ``df`` as context.

    Args:
        query: The question to answer.
        df: DataFrame of texts and embeddings to draw context from. The default
            is the ``df_embeddings`` object that exists when this function is defined.
        model: Chat model name; also used for token counting.
        token_budget: Token limit handed to query_message when building the prompt.
        print_message: When true, print the assembled prompt before sending it.

    Returns:
        The content of the first choice in the chat completion response.
    """
    prompt = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(prompt)

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "Please answer prompts as an expert in medicine."},
            {"role": "user", "content": prompt},
        ],
        temperature=0,
    )
    return completion.choices[0].message.content

In [18]:
# Prompt sent to the chat model (spelling of "recommend" corrected)
query = "Please summarize this patient's medical history and general state of health. What are the patient's most pressing medical concerns? What course of treatment do you recommend?"

In [19]:
# Ask the chat model the question (ask() adds the related stored texts as context)
# and render the returned text as Markdown.
response = ask(query)
display(Markdown(response))

**Summary of Medical History and General State of Health:**

The patient is a 70-year-old male with a complex medical history that includes:
- Chronic conditions such as psoriasis, hyperlipidemia, essential hypertension, and obstructive sleep apnea.
- History of skin cancer (basal cell carcinoma excised in 2013) and ongoing skin issues like benign nevus and seborrheic keratosis.
- Cardiovascular issues including coronary artery disease with stable angina, atrial tachycardia, and a history of carotid endarterectomy and coronary stent placements.
- Neurological concerns such as intractable chronic migraine and a history of cryptogenic stroke.
- Orthopedic issues including bilateral knee replacements and degenerative disc disease.

The patient's general state of health appears managed through multiple medications and regular medical follow-ups, though the complexity of his conditions requires careful monitoring and coordination of care.

**Most Pressing Medical Concerns:**
1. **Cardiovascular Health:** Given his history of coronary artery disease, atrial tachycardia, and carotid artery stenosis, maintaining cardiovascular health is crucial. His recent procedures like cardiac catheterization and EP study indicate active management but necessitate ongoing vigilance.
2. **Skin Cancer Surveillance:** With a history of basal cell carcinoma and ongoing new lesions of seborrheic keratosis, regular dermatological evaluations are essential to monitor for potential malignant changes.
3. **Psoriasis Management:** Chronic psoriasis requires ongoing treatment to manage symptoms and prevent flare-ups.
4. **Migraine Management:** Chronic migraines can significantly impact quality of life and require effective management strategies.

**Recommended Course of Treatment:**
1. **Cardiovascular Management:**
   - Continue current medications including aspirin, ezetimibe, and Repatha for lipid management.
   - Regular follow-up with a cardiologist to monitor heart function and adjust treatments as necessary.
   - Maintain lifestyle modifications including diet, exercise, and stress management.

2. **Dermatological Care:**
   - Regular skin examinations to monitor existing conditions and check for new lesions.
   - Continue with prescribed treatments for psoriasis and manage seborrheic keratosis as recommended by the dermatologist.
   - Emphasize the importance of photoprotection to reduce the risk of new skin cancers.

3. **Neurological Support:**
   - Optimize migraine management possibly through adjustments in medication or exploring new treatment options if current regimen is insufficient.
   - Monitor for neurological symptoms that could suggest vascular issues or stroke recurrence.

4. **General Health Maintenance:**
   - Continue with regular screenings and vaccinations as recommended for age and medical history.
   - Address lifestyle factors such as diet, exercise, and weight management to support overall health.
   - Regular follow-ups with primary care to coordinate care across specialties and adjust treatments as needed.

5. **Mental and Emotional Health:**
   - Consider regular assessments for mood and cognitive function, given the extensive medical history and potential stressors associated with chronic illness management.

This comprehensive approach should help manage the patient's complex health needs and maintain his quality of life. Regular communication between the patient and his healthcare providers is crucial to effectively address his medical concerns.

In [None]:
# Show the five stored texts most related to the query (default relatedness: cosine similarity)
strings, relatednesses = strings_ranked_by_relatedness(query, df_embeddings, top_n=5)
for string, relatedness in zip(strings, relatednesses):
    print(f"{relatedness=:.3f}")
    display(Markdown(string))
    # divider followed by two blank lines
    print("--------------------------------", end="\n\n\n")

In [None]:
# Print the token count of each displayed string, then the sum of all counts
total = 0
for string in strings:
    token_count = num_tokens(string)
    print(token_count)
    total += token_count
print(total)