In [164]:
import os
import pandas as pd
from scripts.import_dataset import ImportDataset

In [165]:
# Root output directory for every prompt file generated by this notebook.
prompt_folder = "prompts"
# exist_ok=True makes this a no-op when the directory is already present.
os.makedirs(name=prompt_folder, exist_ok=True)

In [166]:
# Load the dataset through the project's importer class.
data = ImportDataset()
df = data.read_dataset()

# Report the number of rows loaded as a quick sanity check.
sample_count = df.shape[0]
print(f"✅ Combined dataset size: {sample_count} samples")

✅ Combined dataset size: 14438 samples


In [167]:
# Base instruction shared by every prompt built in this notebook: it sets the
# classifier persona, names the five target categories, and asks for a
# two-column CSV (note, category) with no explanations.
# The text is runtime data that is pasted into the model, so it is kept verbatim.
prompt = '''
You are a health professional, and you are classifying medical notes in one of the five categories: digestive system diseases, cardiovascular diseases, neoplasms, nervous system diseases, and general pathological conditions. 

Give downloadable output in CSV format, where the first column would be the note that I give as an input, and the second column will be one of the five categories associated with notes. Please do not write any explanations. 
'''


In [None]:
# Stratified draw: 20 abstracts per condition_label, reproducible via the fixed seed.
balanced_picks = df.groupby('condition_label').sample(n=20, random_state=42)

# Rows that were not picked stay in the pool used by the later sampling cells.
df_filtered = df.drop(index=balanced_picks.index)

# Shuffle the picks so the classes are interleaved, then renumber the rows from 0.
df_balanced = balanced_picks.sample(frac=1, random_state=42).reset_index(drop=True)

# Export is intentionally disabled: the CSV was handed to ChatGPT manually and must not change.
# df_balanced.to_csv(os.path.join("prompts", "Balanced_GPT_input.csv"))
df_balanced

Unnamed: 0,condition_label,medical_abstract
0,4,Future directions in vasodilator therapy for h...
1,2,The auditory P300 event-related potential: an ...
2,3,Multidisciplinary baseline assessment of homos...
3,2,Anal sphincter function after intersphincteric...
4,2,Recurrence of Crohn's disease after resection....
...,...,...
95,3,Septic arthritis of the C1-C2 lateral facet jo...
96,3,Experiential phenomena of temporal lobe epilep...
97,0,Acute water intoxication as a complication of ...
98,4,Biobehavioral factors in Cardiac Arrhythmia Pi...


In [169]:
# Prompts for the balanced sample get their own sub-directory of prompt_folder.
folder = "Balanced"
folder_path = os.path.join(prompt_folder, folder)
# Safe to re-run: an existing directory is left untouched.
os.makedirs(name=folder_path, exist_ok=True)

In [None]:
# Build one prompt per batch of balanced abstracts: the shared instruction
# header followed by the batch's abstracts, numbered from 1 within each batch.
medical_notes = df_balanced['medical_abstract'].values
update_prompt = prompt + "\n\nClassify the following notes:\n"
batch_size = 10

# Step over the notes that are actually present instead of assuming exactly 100
# rows, so a differently sized sample never yields empty trailing batches.
for index, i in enumerate(range(0, len(medical_notes), batch_size), start=1):
    notes = [
        str(idx) + "." + item
        for idx, item in enumerate(medical_notes[i:i + batch_size], start=1)
    ]
    # Deliberately not called `new_prompt`: that name holds the few-shot header
    # built further down the notebook, and overwriting it when this cell is
    # re-run would silently corrupt the "with example" prompts.
    balanced_batch_prompt = update_prompt + "\n".join(notes)
    file_name = os.path.join(folder_path, str(index) + ".csv")
    # Writing is disabled on purpose: the files were passed to ChatGPT manually
    # and must not change.
    # with open(file_name, 'w', newline='') as f:
    #     f.write(balanced_batch_prompt)

In [None]:
# Plain random draw of 100 abstracts from the remaining pool (class mix left as-is).
# NOTE: df_filtered is both read and reassigned in this cell, so re-running the
# cell on its own draws from an already-reduced pool and yields a different sample.
unbalanced_picks = df_filtered.sample(n=100, random_state=42)

# Take the drawn rows out of the pool so later cells cannot pick them again.
df_filtered = df_filtered.drop(index=unbalanced_picks.index)

# Shuffle and renumber the rows from 0.
df_unbalanced = unbalanced_picks.sample(frac=1, random_state=42).reset_index(drop=True)

# Export is intentionally disabled: the CSV was handed to ChatGPT manually and must not change.
# df_unbalanced.to_csv(os.path.join("prompts", "Unbalanced_GPT_input.csv"))
df_unbalanced.groupby("condition_label").size()

condition_label
0    35
1    26
2     7
3     8
4    24
dtype: int64

In [172]:
# Prompts for the unbalanced sample get their own sub-directory of prompt_folder.
folder = "Unbalanced"
folder_path = os.path.join(prompt_folder, folder)
# Safe to re-run: an existing directory is left untouched.
os.makedirs(name=folder_path, exist_ok=True)

In [None]:
# Build one prompt per batch of unbalanced abstracts: the shared instruction
# header followed by the batch's abstracts, numbered from 1 within each batch.
medical_notes = df_unbalanced['medical_abstract'].values
update_prompt = prompt + "\n\nClassify the following notes:\n"
batch_size = 10

# Step over the notes that are actually present instead of assuming exactly 100
# rows, so a differently sized sample never yields empty trailing batches.
for index, i in enumerate(range(0, len(medical_notes), batch_size), start=1):
    notes = [
        str(idx) + "." + item
        for idx, item in enumerate(medical_notes[i:i + batch_size], start=1)
    ]
    # Deliberately not called `new_prompt`: that name holds the few-shot header
    # built further down the notebook, and overwriting it when this cell is
    # re-run would silently corrupt the "with example" prompts.
    unbalanced_batch_prompt = update_prompt + "\n".join(notes)
    file_name = os.path.join(folder_path, str(index) + ".csv")
    # Writing is disabled on purpose: the files were passed to ChatGPT manually
    # and must not change.
    # with open(file_name, 'w', newline='') as f:
    #     f.write(unbalanced_batch_prompt)

In [174]:
# Hold out two abstracts per condition_label to serve as few-shot examples.
# NOTE: df_filtered is both read and reassigned in this cell, so re-running the
# cell on its own samples from an already-reduced pool.
df_examples = df_filtered.groupby(by='condition_label').sample(n=2, random_state=42)

# Remove the examples from the pool so they can never appear among the notes to classify.
df_filtered = df_filtered.drop(index=df_examples.index)
df_examples

Unnamed: 0,condition_label,medical_abstract
11590,0,Pulmonary arterial and venous constriction dur...
3301,0,Pars plana vitrectomy for acute retinal detach...
13260,1,Resection and reconstruction for bone tumors i...
8532,1,Computed tomography-guided fine-needle aspirat...
9577,2,Simultaneous bilateral hernia repair. A case a...
8262,2,Mucosal injury and gamma-irradiation produce p...
2555,3,Liver failure occurring as a component of exer...
13265,3,Diabetic amyotrophy without pain. A puzzling c...
858,4,Fluosol: an oxygen-delivery fluid for use in p...
9263,4,Failure of transluminal angioplasty in the tre...


In [175]:
# Attach the human-readable category name to each few-shot example.
df_labels = pd.read_csv("data/medical_tc_labels.csv")

# Remap label 5 to 0 before joining; presumably this aligns the label file with
# the 0-4 coding used in the dataset's condition_label column (confirm against
# the importer).
df_labels.loc[df_labels['condition_label'] == 5, 'condition_label'] = 0

# Merge only once. Without this guard a re-run would either create
# condition_name_x / condition_name_y columns, or raise a KeyError when
# condition_label has already been dropped from df_examples by the next cell.
if 'condition_name' not in df_examples.columns:
    df_examples = pd.merge(df_examples, df_labels, on='condition_label')

In [176]:

# Keep only the two columns needed to render the examples.
df_examples = df_examples[['medical_abstract', 'condition_name']]

# Intro paragraph that precedes the labelled examples (runtime text, kept verbatim).
example_prompt = '''
Use the examples to learn more about the five categories: digestive system diseases, cardiovascular diseases, neoplasms, nervous system diseases, 
and general pathological conditions. Here are the two examples of each categories:
'''

# One line per example, each preceded by a newline; the number shown is the
# frame's index label plus one.
example_lines = [
    "\n" + str(row_label + 1) + " Category: " + category + ". Medical Notes: " + abstract
    for row_label, category, abstract in zip(
        df_examples.index,
        df_examples["condition_name"],
        df_examples['medical_abstract'],
    )
]
example_prompt += "".join(example_lines)

# Few-shot header: base instruction, then the examples, then the classification cue.
new_prompt = prompt + " " + example_prompt + "\n\nClassify the following notes:\n"

In [None]:
# Plain random draw of 100 abstracts from what is left of the pool, to be
# classified with the few-shot prompt.
# NOTE: df_filtered is both read and reassigned in this cell, so re-running the
# cell on its own draws from an already-reduced pool and yields a different sample.
with_example_picks = df_filtered.sample(n=100, random_state=42)

# Take the drawn rows out of the pool.
df_filtered = df_filtered.drop(index=with_example_picks.index)

# Shuffle and renumber the rows from 0.
df_unbalanced_with_example = with_example_picks.sample(frac=1, random_state=42).reset_index(drop=True)

# Export is intentionally disabled: the CSV was handed to ChatGPT manually and must not change.
# df_unbalanced_with_example.to_csv(os.path.join("prompts", "Unbalanced_GPT_input_with_examples.csv"))
df_unbalanced_with_example.groupby("condition_label").size()

condition_label
0    28
1    21
2    13
3    15
4    23
dtype: int64

In [178]:
# Few-shot prompts get their own sub-directory of prompt_folder.
folder = "Unbalanced_with_example"
folder_path = os.path.join(prompt_folder, folder)
# Safe to re-run: an existing directory is left untouched.
os.makedirs(name=folder_path, exist_ok=True)

In [None]:
# Build one few-shot prompt per batch: the header held in `new_prompt`
# (instruction + examples + classification cue) followed by the batch's
# abstracts, numbered from 1 within each batch.
medical_notes = df_unbalanced_with_example['medical_abstract'].values
batch_size = 10

# Step over the notes that are actually present instead of repeating the
# sample size (100) as a literal, so the loop stays correct if the draw changes.
for index, i in enumerate(range(0, len(medical_notes), batch_size), start=1):
    notes = [
        str(idx) + "." + item
        for idx, item in enumerate(medical_notes[i:i + batch_size], start=1)
    ]
    new_prompt_example = new_prompt + "\n".join(notes)
    file_name = os.path.join(folder_path, str(index) + ".csv")
    # Writing is disabled on purpose: the files were passed to ChatGPT manually
    # and must not change.
    # with open(file_name, 'w', newline='') as f:
    #     f.write(new_prompt_example)