In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from Levenshtein import distance
from ast import literal_eval

In [4]:
def move_surgery(data):
    """Fold the "Surgery" intent of one row into "Other Treatments".

    The row's ``intents`` list is edited in place: one "Surgery" entry is
    removed and "Other Treatments" is appended unless it is already present.
    Rows without a "Surgery" intent are left untouched.

    Returns:
        The same row object that was passed in.
    """
    labels = data.intents
    if "Surgery" not in labels:
        return data

    labels.remove("Surgery")
    already_listed = "Other Treatments" in labels
    if not already_listed:
        labels.append("Other Treatments")
    return data

def prepare_next_intent_prediction_data(sample, num_prev, source, intents_only):
    """Serialise the dialogue history that precedes ``sample`` into one string.

    Each earlier turn of the same encounter is rendered as its intents in
    square brackets, followed (unless ``intents_only``) by the turn text with
    the speaker prefixes replaced by ``[Doctor]`` / ``[Patient]`` /
    ``[Patient Guest]`` and a closing ``[SEP]``.

    Args:
        sample: Row-like object exposing ``encounter_id`` and ``text_id``.
        num_prev: Size of the look-back window. ``0`` means "every turn from
            the start of the encounter".
        source: DataFrame with ``encounter_id``, ``text_id``, ``text`` and
            ``intents`` columns holding all turns.
        intents_only: When true, the turn text is left out.

    Returns:
        The serialised history, or ``None`` when nothing was produced.

    Notes:
        * Window mode emits ``[Conversation Start][SEP]`` once if the window
          reaches before turn 0, and closes intent-only turns with ``[SEP]``.
        * Full-history mode emits ``[Conversation Start]`` (no separator) and
          adds no ``[SEP]`` after intent-only turns.
        * Turn ids missing from ``source`` are skipped silently.
    """
    encounter_turns = source[source["encounter_id"] == sample.encounter_id]
    speaker_tags = (
        ("doctor:", "[Doctor]"),
        ("patient:", "[Patient]"),
        ("patient_guest:", "[Patient Guest]"),
    )

    def serialise_turn(turn_id, lone_separator):
        """Render one turn; ``lone_separator`` controls the intent-only suffix."""
        rows = encounter_turns[encounter_turns["text_id"] == turn_id]
        if rows.empty:
            return ""
        utterance = rows.text.item()
        for prefix, tag in speaker_tags:
            utterance = utterance.replace(prefix, tag)
        intent_tags = "".join("[" + intent + "]" for intent in rows.intents.item())
        if not intents_only:
            return intent_tags + utterance + "[SEP]"
        if lone_separator:
            return intent_tags + "[SEP]"
        return intent_tags

    pieces = []
    if num_prev == 0:
        # Full history: walk every turn that comes before the sample.
        for turn_id in range(sample.text_id):
            if turn_id == 0:
                pieces.append("[Conversation Start]")
            pieces.append(serialise_turn(turn_id, lone_separator=False))
    else:
        # Fixed window: oldest turn first, i.e. largest offset first.
        start_emitted = False
        for offset in range(num_prev, 0, -1):
            turn_id = sample.text_id - offset
            if turn_id >= 0:
                pieces.append(serialise_turn(turn_id, lone_separator=True))
            elif not start_emitted:
                pieces.append("[Conversation Start][SEP]")
                start_emitted = True

    history = "".join(pieces)
    return history or None
def map_speciality(sample, specialities_by_id):
    """Look up the speciality whose encounter list contains the sample.

    Args:
        sample: Row-like object exposing ``encounter_id``.
        specialities_by_id: DataFrame whose first column (by position) holds
            the value to return and whose second column holds the encounter
            ids, either as a Python-literal string (e.g. ``"['E1', 'E2']"``)
            or as an already parsed collection.

    Returns:
        The first-column value of the matching row, or ``None`` when no row
        lists the encounter. If several rows match, the last one wins.
    """
    # Start from None so an unmatched encounter no longer raises
    # UnboundLocalError at the return statement.
    spec = None
    for _, row in specialities_by_id.iterrows():
        # Explicit positional access: plain integer keys on a label-indexed
        # Series rely on a deprecated positional fallback.
        encounter_ids = row.iloc[1]
        if isinstance(encounter_ids, str):
            encounter_ids = literal_eval(encounter_ids)
        if sample.encounter_id in encounter_ids:
            spec = row.iloc[0]
    return spec

def map_sections(sample):
    """Map the intents of one row to the note sections they belong to.

    Args:
        sample: Row-like object exposing ``intents`` (iterable of intent
            names) and ``name`` (used only in the error message).

    Returns:
        The distinct sections touched by the intents, always in the fixed
        order Subjective, Objective, Assessment, Plan, Null. A fixed order is
        needed because the list is later joined into a lookup key; the order
        of ``list(set(...))`` is arbitrary and may differ between runs.

    Intents that belong to no section are reported on stdout and skipped.
    """
    # Insertion order of this dict defines the order of the returned list.
    section_members = {
        "Subjective": ["Acute Symptoms", "Personal History", "Drug History", "Vegetative History",
                       "Other Socials", "Family History", "Greetings", "Therapeutic History"],
        "Objective": ["Physical Examination", "Lab Examination", "Radiology Examination"],
        "Assessment": ["Acute Assessment", "Reassessment"],
        "Plan": ["Discussion", "Referral", "Medication", "Follow-up", "Other Treatments",
                 "Diagnostic Testing"],
        "Null": ["Chitchat"],
    }
    section_of = {
        intent: section
        for section, intents in section_members.items()
        for intent in intents
    }

    found = set()
    for intent in sample.intents:
        section = section_of.get(intent)
        if section is None:
            print(f"Error: {sample.name}")
        else:
            found.add(section)
    return [section for section in section_members if section in found]


In [None]:
def _read_split_with_sections(path):
    """Read one JSON split and attach the per-row ``sections`` column."""
    frame = pd.read_json(path)
    frame["sections"] = frame.apply(map_sections, axis=1)
    return frame


# NOTE(review): the paths are empty strings here -- presumably placeholders
# that have to be filled in before this cell can run.
train = _read_split_with_sections("")
val = _read_split_with_sections("")
test = _read_split_with_sections("")

In [None]:
def _as_label_list(value):
    """Wrap a bare string label in a one-element list; pass other values through."""
    if type(value) == str:
        return [value]
    return value


# NOTE(review): empty path -- presumably a placeholder for the full dataset.
data = pd.read_json("")
# Single labels may be stored as plain strings; normalise both label columns to lists.
data["intents"] = data["intents"].map(_as_label_list)
data["sections"] = data["sections"].map(_as_label_list)
# reset_index() without drop=True keeps the previous index as an "index" column.
data = data.apply(move_surgery, axis=1).reset_index()

In [13]:
def _number_label_combinations(label_lists):
    """Give every distinct comma-joined label list an id in first-seen order."""
    numbering = {}
    for labels in label_lists:
        # len(numbering) is evaluated before insertion, i.e. it is the next free id.
        numbering.setdefault(",".join(labels), len(numbering))
    return numbering


# "intent_a,intent_b" -> integer id, and the number of ids handed out.
unique_intents_lookup = _number_label_combinations(data["intents"])
unique_intents_lookup_id = len(unique_intents_lookup)

# Same numbering for the section combinations.
unique_sections_lookup = _number_label_combinations(data["sections"])
unique_sections_lookup_id = len(unique_sections_lookup)

In [None]:

def gather_intent_sequence(conv, lookup=None):
    """Translate a conversation's per-turn intent lists into integer ids.

    Args:
        conv: Mapping with an ``"intents"`` entry holding one list of intent
            names per turn.
        lookup: Dict from comma-joined intent names to ids. Defaults to the
            notebook-level ``unique_intents_lookup``.

    Returns:
        One id per turn, in turn order.

    Raises:
        KeyError: if a turn's intent combination is missing from ``lookup``.
    """
    if lookup is None:
        # Resolve the default at call time: a default bound in the signature
        # keeps pointing at the old dict after the lookup cell is re-run.
        lookup = unique_intents_lookup
    return [lookup[",".join(intents)] for intents in conv["intents"]]
def gather_section_sequence(conv, lookup=None):
    """Translate a conversation's per-turn section lists into integer ids.

    Args:
        conv: Mapping with a ``"sections"`` entry holding one list of section
            names per turn.
        lookup: Dict from comma-joined section names to ids. Defaults to the
            notebook-level ``unique_sections_lookup``.

    Returns:
        One id per turn, in turn order.

    Raises:
        KeyError: if a turn's section combination is missing from ``lookup``.
    """
    if lookup is None:
        # Resolve the default at call time: a default bound in the signature
        # keeps pointing at the old dict after the lookup cell is re-run.
        lookup = unique_sections_lookup
    return [lookup[",".join(sections)] for sections in conv["sections"]]
def calculate_sequence_similarity(sequence, encounter_id, sequence_id, all_sequences, special_index=None):
    """Compare one section sequence with every other encounter's sequence.

    Args:
        sequence: The id sequence of the encounter under inspection.
        encounter_id: Encounter id belonging to ``sequence``.
        sequence_id: Index label of that encounter's row in ``all_sequences``;
            the row is dropped so the sequence is not compared with itself.
        all_sequences: DataFrame with ``encounter_id`` and
            ``section_sequences`` columns.
        special_index: Optional DataFrame whose ``"0"`` column holds, per row,
            the encounter ids of one speciality, either as a Python-literal
            string or as an already parsed collection. When ``None`` the
            speciality-restricted statistics are left empty.

    Returns:
        ``pd.Series`` with the keys ``avg_all_dist``, ``all_all_dist``,
        ``avg_filtered_dist``, ``filtered_all_dist``, ``closest_id`` and
        ``share_speciality``. Without ``special_index`` the filtered average
        is NaN, the filtered dict is empty and ``share_speciality`` is None.

    Raises:
        ValueError: if ``special_index`` is given and the encounter is not
            listed in exactly one of its rows.
    """
    all_sequences = all_sequences.drop(sequence_id)

    if special_index is None:
        speciality_index = None
        filtered_distances = pd.Series(dtype=float)
    else:
        # Parse the id lists first. Testing membership on the raw string cell
        # is a substring match and can select the wrong (or several) rows.
        parsed_index = special_index["0"].map(
            lambda cell: literal_eval(cell) if isinstance(cell, str) else cell
        )
        member_mask = parsed_index.map(lambda ids: encounter_id in ids)
        speciality_index = list(parsed_index[member_mask].item())
        speciality_index.remove(encounter_id)
        filtered_sequences = all_sequences.query("encounter_id in @speciality_index")
        filtered_distances = filtered_sequences["section_sequences"].map(lambda x: distance(sequence, x))

    all_distances = all_sequences["section_sequences"].map(lambda x: distance(sequence, x))
    # NOTE(review): idxmin presumably fails when no other encounter is left -- confirm.
    closest_id = all_sequences["encounter_id"].loc[all_distances.idxmin()]

    share_speciality = None if speciality_index is None else closest_id in speciality_index
    return pd.Series({
        "avg_all_dist": all_distances.mean(),
        "all_all_dist": all_distances.to_dict(),
        "avg_filtered_dist": filtered_distances.mean(),
        "filtered_all_dist": filtered_distances.to_dict(),
        "closest_id": closest_id,
        "share_speciality": share_speciality,
    })


In [None]:
# Collapse the turn-level frame to one row per encounter; the "data" column
# holds a dict mapping each remaining column name to its list of values.
# errors="ignore": pandas has deprecated handing the grouping column to the
# applied function, so "encounter_id" may not be present in the group frame.
grouped = (
    data.groupby("encounter_id")
    .apply(lambda x: x.drop("encounter_id", axis=1, errors="ignore").to_dict(orient="list"))
    .reset_index(name="data")
)

In [67]:
# Encode every encounter's per-turn section lists as a sequence of lookup ids.
grouped["section_sequences"] = grouped["data"].map(gather_section_sequence)

In [None]:
# Model input for next-intent prediction: the serialised history of the
# 5 preceding turns (intents plus text) for every turn.
data["next_i_pred_src"] = data.apply(lambda x: prepare_next_intent_prediction_data(x, 5, data, False), axis=1)

# Keep only rows with a usable history. Chained instead of dropna(inplace=True):
# mutating a column slice of `data` in place triggers SettingWithCopy behaviour.
n_i_p_data = (
    data[["next_i_pred_src", "intents", "id"]]
    .dropna()
    .reset_index(drop=True)
)

In [7]:
def _tag_speakers(utterance):
    """Replace the transcript's speaker prefixes with bracketed speaker tokens."""
    for prefix, token in (
        ("doctor:", "[Doctor]"),
        ("patient:", "[Patient]"),
        ("patient_guest:", "[Patient Guest]"),
    ):
        utterance = utterance.replace(prefix, token)
    return utterance


data["text"] = data["text"].map(_tag_speakers)

In [None]:
# Stratified train/dev/test split for next-intent prediction.
# Intent lists are unhashable, so tuple copies act as the class labels for
# duplicate detection, stratification and the distribution report.
n_i_p_data["tuples"] = n_i_p_data.intents.apply(tuple)

# A label combination that occurs only once cannot be stratified; set those
# rows aside here and append them to the training split at the end.
unique_rows = n_i_p_data[~n_i_p_data["tuples"].duplicated(keep=False)]
n_i_p_data = n_i_p_data.drop(unique_rows.index)
strat_labels = n_i_p_data["tuples"]
n_i_p_data = n_i_p_data[["next_i_pred_src", "intents"]]

# Split into train+dev (85%) and test (15%) with stratification
train_dev, test = train_test_split(n_i_p_data, test_size=0.15, stratify=strat_labels, random_state=42)

# Split train+dev into train and dev with stratification
# (dev = 0.15 * 0.85 = 12.75% of the stratifiable rows)
train, dev = train_test_split(
    train_dev, test_size=0.15, stratify=strat_labels.loc[train_dev.index], random_state=42
)

# Print the sizes of each split
print(f"Train size: {len(train)}, Dev size: {len(dev)}, Test size: {len(test)}")

# Verify the stratification. value_counts needs hashable values, so the tuple
# labels are counted instead of the raw intent lists (which raise TypeError).
print("Train label distribution:\n", strat_labels.loc[train.index].value_counts(normalize=True))
print("Dev label distribution:\n", strat_labels.loc[dev.index].value_counts(normalize=True))
print("Test label distribution:\n", strat_labels.loc[test.index].value_counts(normalize=True))

# Singleton label combinations are only ever seen during training.
train = pd.concat((train, unique_rows[["next_i_pred_src", "intents"]]))


In [None]:
# Stratified train/dev/test split for intent prediction.
# Intent lists are unhashable, so tuple copies act as the class labels for
# duplicate detection, stratification and the distribution report.
data["tuples"] = data.intents.apply(tuple)

# A label combination that occurs only once cannot be stratified; set those
# rows aside here and append them to the training split at the end.
unique_rows = data[~data["tuples"].duplicated(keep=False)]
data = data.drop(unique_rows.index)
strat_labels = data["tuples"]
# NOTE: from here on `data` only holds the stratifiable rows and these three columns.
data = data[["text", "intents", "sections"]]

# Split into train+dev (85%) and test (15%) with stratification
train_dev, test = train_test_split(data, test_size=0.15, stratify=strat_labels, random_state=42)

# Split train+dev into train and dev with stratification
# (dev = 0.15 * 0.85 = 12.75% of the stratifiable rows)
train, dev = train_test_split(
    train_dev, test_size=0.15, stratify=strat_labels.loc[train_dev.index], random_state=42
)

# Print the sizes of each split
print(f"Train size: {len(train)}, Dev size: {len(dev)}, Test size: {len(test)}")

# Verify the stratification. value_counts needs hashable values, so the tuple
# labels are counted instead of the raw intent lists (which raise TypeError).
print("Train label distribution:\n", strat_labels.loc[train.index].value_counts(normalize=True))
print("Dev label distribution:\n", strat_labels.loc[dev.index].value_counts(normalize=True))
print("Test label distribution:\n", strat_labels.loc[test.index].value_counts(normalize=True))

# Singleton label combinations are only ever seen during training.
train = pd.concat((train, unique_rows[["text", "intents", "sections"]]))
