In [1]:
#libraries
import openai
import os
import pandas as pd
import time
import csv
import re

In [None]:
# API key
# Read the key from the environment so the secret is never stored in the
# notebook file (or leaked through version control / shared copies).
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this notebook."
    )

In [3]:
# Load the referral letters and preview the first few letter texts
df = pd.read_csv(filepath_or_buffer='data.csv')
print(df.loc[:, "letter_text"].head())

0    Thank you for seeing this 60 year old woman wi...
1    I am referring this 37-year-old female patient...
2    Dear Colleague, This 72 yr old gentleman has h...
3    Thank you for seeing this 45F with symptomatic...
4    The doctor thank you for seeing this lady who ...
Name: letter_text, dtype: object


In [4]:
# Remove the urgency column.
# errors="ignore" keeps this cell re-runnable: `df` is reassigned here, so a
# second execution would otherwise raise KeyError (the column is already gone).
df = df.drop(columns=["urgency"], errors="ignore")

In [5]:
# Separate the data into one frame per referral class
referral_kind = df['referal_type']
pvd_letters = df.loc[referral_kind.eq("pvd")]
vv_letters = df.loc[referral_kind.eq("varicose veins")]
nv_letters = df.loc[referral_kind.eq("non vascular")]

In [None]:
# Function for prompts
def generate_synthetic_letters(letters_df, letter_type = 'nonvascular', model = "gpt-4-turbo", temperature = 0.7):
    """Ask the chat model for a batch of synthetic referral letters.

    The letters in ``letters_df`` are embedded in the prompt as numbered
    examples, followed by class-specific generation instructions.

    Parameters
    ----------
    letters_df : pandas.DataFrame
        Example letters; only the ``letter_text`` column is read.
    letter_type : str
        ``'pvd'`` selects the PVD prompt, ``'vv'`` the varicose veins prompt;
        any other value (including the default) selects the non-vascular prompt.
    model : str
        Model name passed to the chat completions endpoint.
    temperature : float
        Sampling temperature passed to the chat completions endpoint.

    Returns
    -------
    The ``content`` of the first choice in the API response.
    """
    # Construct the examples section from given letters.
    # Number by position (1..n) rather than by DataFrame index label: callers
    # pass filtered subsets whose index labels are arbitrary and non-contiguous.
    examples_text = "\n---\n".join([
        f"[Letter {position}]\n{letter}"
        for position, letter in enumerate(letters_df['letter_text'], start=1)
    ])

    if letter_type == 'pvd':
        # PVD prompt
        prompt = f"""
        You are a medical assistant tasked with generating synthetic referral letters for machine learning training.
        Below is a set of real, anonymized referral letters. Your task is to learn the writing style, structure, and medical content of these letters. Then generate new synthetic letters that resemble the same clinical tone and structure.
        REAL LETTERS:
        ---
        {examples_text}
        ---

        TASK:
        Generate 10 new synthetic referral letters that:
        - Reflect the letters in the examples
        - Use a formal, clinical tone
        - Mention symptoms, medical history, and examination findings
        - Vary the patient age and context realistically
        - Maintain medical plausibility and formal tone

        Your symptoms should include some of the following:
        - Cardiovascular issues such as:
            - Pulse
            - Blood pressure
            - Any ECGs done
        - Smoking History
        - Cholesterol levels
        - Diabetes
        - Any Medications

        Output only the 10 synthetic letters.
        each letter should be separated by "---" followed by Letter X where X is the letter number.
        """
    elif letter_type == 'vv':
        #Varicose veins prompt
        prompt = f"""
        You are a medical assistant tasked with generating synthetic referral letters for machine learning training.
        Below is a set of real, anonymized referral letters. Your task is to learn the writing style, structure, and medical content of these letters. Then generate new synthetic letters that resemble the same clinical tone and structure.
        REAL LETTERS:
        ---
        {examples_text}
        ---

        TASK:
        Generate 10 new synthetic referral letters that:
        - Reflect the letters in the examples
        - Use a formal, clinical tone
        - Mention symptoms, medical history, and examination findings
        - Vary the patient age and context realistically
        - Maintain medical plausibility and formal tone

        Your symptoms should include some of the following:
        - Bleeding
        - Ulceration (healing or not)
        - If the patient is a woman and under 50 if they have recently been pregnant in the last 2 years
        - Any aching pain persisting for more than 12 months with no improvements

        Output only the 10 synthetic letters.
        each letter should be separated by "---" followed by Letter X where X is the letter number.
        """
    else:
        # Non-vascular prompt (also used for any unrecognised letter_type)
        prompt = f"""
        You are a medical assistant tasked with generating synthetic referral letters for machine learning training.
        Below is a set of real, anonymized referral letters. Your task is to learn the writing style, structure, and medical content of these letters. Then generate new synthetic letters that resemble the same clinical tone and structure. They must not be on strokes and should not be about any form of vascular ailment
        REAL LETTERS:
        ---
        {examples_text}
        ---

        TASK:
        Generate 10 new synthetic referral letters that:
        - Reflect the letters in the examples
        - Use a formal, clinical tone
        - Mention symptoms, medical history, and examination findings
        - Vary the patient age and context realistically
        - Maintain medical plausibility and formal tone

        Output only the 10 synthetic letters.
        each letter should be separated by "---" followed by Letter X where X is the letter number.
        """
    # Call the API
    response = openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful medical assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
    )

    return response.choices[0].message.content

In [7]:
def clean_and_split_letters(text, referral_type):
    """Split one raw model response into individual letters.

    Parameters
    ----------
    text : str or None
        Raw response text. ``None`` or an empty string yields an empty frame.
    referral_type : str
        Label written to the ``referal_type`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        Columns ``letter_text`` (one whitespace-normalised letter per row) and
        ``referal_type``.
    """
    # The response content may be missing; treat that as "no letters".
    # Remove all occurrences of '---'.
    text = (text or '').replace('---', '').strip()

    # Use the letter header to determine the start of a letter. The bracketed
    # form "[Letter N]" is recognised anywhere. The unbracketed forms
    # ("Letter N", "Letter N:", "**Letter N**", "## Letter N") are recognised
    # only at the start of a line, and only when followed by ':'/'*' or the end
    # of the line, so a sentence such as "Letter 3 was sent ..." is not split.
    header_pattern = (
        r'\[Letter \d+\]'
        r'|^[ \t]*[#*]*[ \t]*Letter[ \t]+\d+[ \t]*(?:[:*]+|$)'
    )
    letters = re.split(header_pattern, text, flags=re.MULTILINE)

    cleaned_letters = []
    for letter in letters:
        # Collapse newlines and runs of whitespace into single spaces
        letter = re.sub(r'\s+', ' ', letter).strip()
        # Remove any leftover [Letter X] patterns
        letter = re.sub(r'\[Letter\s+[^\]]+\]', '', letter).strip()
        # Skip fragments that are empty once cleaned
        if letter:
            cleaned_letters.append(letter)
    return pd.DataFrame({
        'letter_text': cleaned_letters,
        'referal_type': [referral_type] * len(cleaned_letters)
    })

In [None]:
# Request ten batches of synthetic PVD letters; each raw response is parsed
# into a DataFrame with one row per letter as soon as it arrives.
pvd_batches = []

for _ in range(10):
    pvd_batches.append(
        clean_and_split_letters(
            generate_synthetic_letters(pvd_letters, 'pvd'),
            referral_type='pvd',
        )
    )

# Stack every batch into a single DataFrame with a fresh 0..n-1 index
pvd_synthetic = pd.concat(pvd_batches, ignore_index=True)


In [15]:
# Print the combined synthetic PVD frame to inspect the parsed letters
print(pvd_synthetic)

                                          letter_text referal_type
0   Dear Colleague, I am referring a 67-year-old f...          pvd
1   Thank you for seeing this 72-year-old male who...          pvd
2   Dear Doctor, I am referring a 63-year-old male...          pvd
3   Dear Colleague, this 70-year-old female presen...          pvd
4   Thank you for seeing this 69-year-old male who...          pvd
..                                                ...          ...
95  Dear Doctor, Thank you for seeing this 82-year...          pvd
96  Dear Colleague, I am referring a 64-year-old f...          pvd
97  Dear Doctor, Please evaluate this 71-year-old ...          pvd
98  Dear Colleague, Thank you for seeing this 68-y...          pvd
99  Dear Doctor, I am referring a 75-year-old fema...          pvd

[100 rows x 2 columns]


In [16]:
# Request ten batches of synthetic varicose veins letters; each raw response is
# parsed into a DataFrame with one row per letter as soon as it arrives.
vv_batches = []

for _ in range(10):
    vv_batches.append(
        clean_and_split_letters(
            generate_synthetic_letters(vv_letters, 'vv'),
            referral_type='varicose veins',
        )
    )

# Stack every batch into a single DataFrame with a fresh 0..n-1 index
vv_synthetic = pd.concat(vv_batches, ignore_index=True)


In [17]:
# Print the combined synthetic varicose veins frame to inspect the parsed letters
print(vv_synthetic)

                                          letter_text    referal_type
0   Dear Colleague, I am referring a 34-year-old w...  varicose veins
1   Dear Specialist, I am writing to seek your opi...  varicose veins
2   Dear Vascular Surgeon, Please see this 47-year...  varicose veins
3   Dear Colleague, This referral concerns a 62-ye...  varicose veins
4   Dear Vascular Specialist, I am referring a 29-...  varicose veins
..                                                ...             ...
95  Dear Specialist, I am urgently referring Ms. N...  varicose veins
96  Dear Vascular Team, I am referring a 48-year-o...  varicose veins
97  Dear Esteemed Colleague, I would like your exp...  varicose veins
98  Dear Vascular Surgeon, Please assess Mr. GHI, ...  varicose veins
99  Dear Colleague, I am referring a 62-year-old f...  varicose veins

[100 rows x 2 columns]


In [None]:
# Request ten batches of synthetic non-vascular letters (the generator's default
# letter type); each raw response is parsed into a DataFrame with one row per
# letter as soon as it arrives.
nv_batches = []

for _ in range(10):
    nv_batches.append(
        clean_and_split_letters(
            generate_synthetic_letters(nv_letters),
            referral_type='non vascular',
        )
    )

# Stack every batch into a single DataFrame with a fresh 0..n-1 index
nv_synthetic = pd.concat(nv_batches, ignore_index=True)


In [9]:
# Print the combined synthetic non-vascular frame to inspect the parsed letters
print(nv_synthetic)

                                         letter_text  referal_type
0  Dear Colleague, I am referring a 63-year-old f...  non vascular
1  Dear Colleague, This 47-year-old male presents...  non vascular
2  Dear doctor, A 38-year-old female reports freq...  non vascular
3  Dear Colleague, I am referring a 54-year-old m...  non vascular
4  Dear Colleague, This 29-year-old female presen...  non vascular
5  Dear doctor, I am referring a 50-year-old male...  non vascular
6  Dear Colleague, This 45-year-old female presen...  non vascular
7  Dear Colleague, A 60-year-old male with a hist...  non vascular
8  Dear doctor, I am referring a 34-year-old fema...  non vascular
9  Dear Colleague, This 72-year-old female presen...  non vascular


In [None]:
# Stack the three per-class synthetic frames into one, renumbering the rows
combined_df = pd.concat(
    objs=[pvd_synthetic, vv_synthetic, nv_synthetic],
    ignore_index=True,
)

# Write the combined frame to disk without the index column
combined_df.to_csv(path_or_buf='synthetic_data.csv', index=False)

In [None]:
# Also write the non-vascular letters to their own CSV, without the index column
nv_synthetic.to_csv(path_or_buf='nv_synthetic.csv', index=False)