In [1]:
import pandas as pd
from metrics import *

In [3]:
def calculate_mean_std(results, cat):
    """Aggregate the metrics of one category across several evaluation runs.

    Parameters
    ----------
    results : iterable of dict
        One entry per run. ``result[cat]`` must be a mapping providing the
        keys ``'precision'``, ``'recall'``, ``'f1'``, ``'support'`` and
        ``'num_errors'``.
    cat : hashable
        Category key to aggregate (for example ``'T1'``).

    Returns
    -------
    dict
        ``mean_*`` / ``std_*`` for precision, recall and f1 (rounded to three
        decimals), ``sum_support`` / ``sum_num_errors`` (totals over the
        runs) and ``raw_mean_*`` (the unrounded means).

    Raises
    ------
    ValueError
        If ``results`` contains no runs; the mean would be undefined.
    KeyError
        If a run lacks ``cat`` or one of the metric keys.
    """
    # Materialise once so that one-shot iterables (generators) also work;
    # the values are traversed several times below.
    results = list(results)
    if not results:
        raise ValueError("calculate_mean_std() needs at least one run in `results`")

    def column(metric):
        # Values of one metric for `cat`, in run order.
        return [run[cat][metric] for run in results]

    def mean(values):
        return sum(values) / len(values)

    def population_std(values, centre):
        # Divides by N (not N - 1), i.e. the population standard deviation.
        return (sum((v - centre) ** 2 for v in values) / len(values)) ** 0.5

    summary = {}
    raw_means = {}
    for metric in ('precision', 'recall', 'f1'):
        values = column(metric)
        raw_means[metric] = mean(values)
        summary['mean_' + metric] = round(raw_means[metric], 3)
        summary['std_' + metric] = round(population_std(values, raw_means[metric]), 3)

    summary['sum_support'] = sum(column('support'))
    summary['sum_num_errors'] = sum(column('num_errors'))

    # Unrounded means, so that callers can average further without
    # compounding rounding error.
    for metric, value in raw_means.items():
        summary['raw_mean_' + metric] = value

    return summary

def output_tabular_performance(results, categories=('T1', 'T2', 'T3', 'T4')):
    """Print a per-category performance table followed by the macro average.

    For every category one line is printed in the form
    ``<category> P(std) R(std) F1(std)``, using the rounded means and
    standard deviations returned by ``calculate_mean_std``. The final
    ``MacroAvg.`` line is the unweighted mean, over the categories, of the
    unrounded per-category means.

    Parameters
    ----------
    results : iterable of dict
        Per-run metric dictionaries, passed through to ``calculate_mean_std``.
    categories : iterable
        Category keys to report, in print order.

    Raises
    ------
    ValueError
        If ``categories`` is empty; the macro average would be undefined.
    """
    categories = list(categories)
    if not categories:
        raise ValueError("output_tabular_performance() needs at least one category")

    row_template = "{} {:.3f}({:.3f}) {:.3f}({:.3f}) {:.3f}({:.3f})"
    raw_precisions = []
    raw_recalls = []
    raw_f1s = []

    for category in categories:
        # Named `stats` rather than `eval` to avoid shadowing the builtin.
        stats = calculate_mean_std(results, category)
        print(row_template.format(
            category,
            stats["mean_precision"], stats["std_precision"],
            stats["mean_recall"], stats["std_recall"],
            stats["mean_f1"], stats["std_f1"],
        ))

        # Keep the unrounded means for the macro average.
        raw_precisions.append(stats['raw_mean_precision'])
        raw_recalls.append(stats['raw_mean_recall'])
        raw_f1s.append(stats['raw_mean_f1'])

    n_categories = len(categories)
    print("MacroAvg. {:.3f} {:.3f} {:.3f}".format(
        round(sum(raw_precisions) / n_categories, 3),
        round(sum(raw_recalls) / n_categories, 3),
        round(sum(raw_f1s) / n_categories, 3),
    ))

# kepa (reported in the draft)

In [14]:
# Collect per-run T-stage metrics for KEPA and for the zero-shot (zs) and
# zero-shot chain-of-thought (zscot) baselines on the same patients.
kepa_t_results = []
zs_t_results = []
zscot_t_results = []

# Run indices to evaluate; runs 7 and 9 are not in this list.
kepa_run_lst = [0, 1, 2, 3, 4, 5, 6, 8]
pred_column = "cmem_t_40reports_ans_str"

# The baseline files do not depend on the run, so read and sort them once
# instead of on every iteration.
t_zs_df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/result/0716_t14_zs_test_800.csv").sort_values(by="patient_filename")
t_zscot_df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/result/0716_t14_zscot_test_800.csv").sort_values(by="patient_filename")

for run in kepa_run_lst:
    t_test_df = pd.read_csv(f"/home/yl3427/cylab/selfCorrectionAgent/result/0718_t14_dynamic_test_{run}_outof_10runs.csv").sort_values(by="patient_filename")

    # Patients evaluated in this run; used to restrict the baselines.
    split_ids = t_test_df.patient_filename
    label_column = t_test_df['t']

    t_test_pred_df = t_test_df[pred_column]
    kepa_t_results.append(t14_calculate_metrics(true_labels=label_column, predictions=t_test_pred_df))

    # NOTE(review): labels come from t_test_df while the baseline predictions
    # come from a different frame. They only line up if both frames hold
    # exactly the same patients once filtered (no duplicates or missing rows),
    # since both are sorted by patient_filename - confirm.
    t_zs_pred_df = t_zs_df[t_zs_df.patient_filename.isin(split_ids)]['zs_t_ans_str']
    zs_t_results.append(t14_calculate_metrics(true_labels=label_column, predictions=t_zs_pred_df))

    t_zscot_pred_df = t_zscot_df[t_zscot_df.patient_filename.isin(split_ids)]['zs_t_ans_str']
    zscot_t_results.append(t14_calculate_metrics(true_labels=label_column, predictions=t_zscot_pred_df))
    

In [14]:
# Print mean(std) precision / recall / F1 per T category (default T1-T4)
# and the macro average for the runs collected in kepa_t_results.
output_tabular_performance(kepa_t_results)

T1 0.904(0.017) 0.812(0.040) 0.855(0.018)
T2 0.882(0.022) 0.938(0.018) 0.909(0.005)
T3 0.834(0.054) 0.810(0.058) 0.818(0.018)
T4 0.807(0.082) 0.634(0.038) 0.707(0.029)
MacroAvg. 0.857 0.799 0.822


In [15]:
# Collect per-run N-stage metrics for KEPA and for the zero-shot (zs) and
# zero-shot chain-of-thought (zscot) baselines on the same patients.
kepa_n_results = []
zs_n_results = []
zscot_n_results = []

# Run indices to evaluate; runs 2 and 8 are not in this list.
kepa_run_lst = [0, 1, 3, 4, 5, 6, 7, 9]
pred_column = "cmem_n_40reports_ans_str"

# The baseline files do not depend on the run, so read and sort them once
# instead of on every iteration.
n_zs_df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/result/0716_n03_zs_test_800.csv").sort_values(by="patient_filename")
n_zscot_df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/result/0716_n03_zscot_test_800.csv").sort_values(by="patient_filename")

for run in kepa_run_lst:
    n_test_df = pd.read_csv(f"/home/yl3427/cylab/selfCorrectionAgent/result/0718_n03_dynamic_test_{run}_outof_10runs.csv").sort_values(by="patient_filename")

    # Patients evaluated in this run; used to restrict the baselines.
    split_ids = n_test_df.patient_filename
    label_column = n_test_df['n']

    n_test_pred_df = n_test_df[pred_column]
    kepa_n_results.append(n03_calculate_metrics(true_labels=label_column, predictions=n_test_pred_df))

    # NOTE(review): labels come from n_test_df while the baseline predictions
    # come from a different frame. They only line up if both frames hold
    # exactly the same patients once filtered (no duplicates or missing rows),
    # since both are sorted by patient_filename - confirm.
    n_zs_pred_df = n_zs_df[n_zs_df.patient_filename.isin(split_ids)]['zs_n_ans_str']
    zs_n_results.append(n03_calculate_metrics(true_labels=label_column, predictions=n_zs_pred_df))

    n_zscot_pred_df = n_zscot_df[n_zscot_df.patient_filename.isin(split_ids)]['zs_n_ans_str']
    zscot_n_results.append(n03_calculate_metrics(true_labels=label_column, predictions=n_zscot_pred_df))

In [16]:
# Print mean(std) precision / recall / F1 for N0-N3 and the macro average
# for the runs collected in kepa_n_results.
output_tabular_performance(kepa_n_results, categories=['N0', 'N1', 'N2', 'N3'])

N0 0.944(0.008) 0.952(0.018) 0.948(0.011)
N1 0.885(0.020) 0.883(0.026) 0.884(0.010)
N2 0.713(0.031) 0.745(0.054) 0.727(0.022)
N3 0.886(0.058) 0.784(0.042) 0.830(0.017)
MacroAvg. 0.857 0.841 0.847


# GPT

In [6]:
# Inspect the columns of whichever T-stage frame was loaded last.
# NOTE(review): the recorded output lists gpt4o_* columns, so this cell was
# evidently executed after the GPT loading loop that appears later in the
# notebook; on a fresh top-to-bottom run t_test_df would still be the frame
# from the earlier KEPA loop.
t_test_df.columns

Index(['patient_filename', 't', 'text', 'gpt4o_t_reasoning', 'gpt4o_t_stage'], dtype='object')

In [15]:
# GPT-4o T-stage evaluation, one CSV per run.
# NOTE: the results are stored under the `kepa_*` names (the printing cell
# reads `kepa_t_results`), which rebinds the names used in the KEPA section.
kepa_t_results = []
kepa_run_lst = [0, 1, 2, 3, 4, 5, 6, 8]

# Column holding the model's predicted stage; the same for every run.
pred_column = 'gpt4o_t_stage'

for run in kepa_run_lst:
    t_test_df = pd.read_csv(
        f"/home/yl3427/cylab/selfCorrectionAgent/result/1112_t14_gpt_test_{run}_outof_8runs.csv"
    ).sort_values(by="patient_filename")

    # Ground truth and predictions come from the same frame.
    label_column = t_test_df['t']
    t_test_pred_df = t_test_df[pred_column]
    run_metrics = t14_calculate_metrics(true_labels=label_column, predictions=t_test_pred_df)
    kepa_t_results.append(run_metrics)


In [16]:
# Same table as in the KEPA section. Despite its name, kepa_t_results holds
# the GPT-4o T-stage metrics here, assuming the GPT loading cell just above
# was the last one to fill it.
output_tabular_performance(kepa_t_results)

T1 0.902(0.009) 0.903(0.025) 0.902(0.013)
T2 0.935(0.023) 0.939(0.015) 0.936(0.007)
T3 0.905(0.048) 0.813(0.080) 0.852(0.039)
T4 0.622(0.136) 0.728(0.052) 0.659(0.052)
MacroAvg. 0.841 0.846 0.837


In [13]:
# Inspect the columns of whichever N-stage frame was loaded last.
# NOTE(review): the recorded output lists gpt4o_* columns, so this cell was
# evidently executed after the GPT N-stage loading loop that follows it; on a
# fresh top-to-bottom run n_test_df would still be the frame from the earlier
# KEPA loop.
n_test_df.columns

Index(['patient_filename', 'n', 'text', 'gpt4o_n_reasoning', 'gpt4o_n_stage'], dtype='object')

In [11]:
# GPT-4o N-stage evaluation, one CSV per run.
# NOTE: the results are stored under the `kepa_*` names (the printing cell
# reads `kepa_n_results`), which rebinds the names used in the KEPA section.
kepa_n_results = []
kepa_run_lst = [0, 1, 3, 4, 5, 6, 7, 9]

# Column holding the model's predicted stage; the same for every run.
pred_column = 'gpt4o_n_stage'

for run in kepa_run_lst:
    n_test_df = pd.read_csv(
        f"/home/yl3427/cylab/selfCorrectionAgent/result/1112_n03_gpt_test_{run}_outof_8runs.csv"
    ).sort_values(by="patient_filename")

    # Ground truth and predictions come from the same frame.
    label_column = n_test_df['n']
    n_test_pred_df = n_test_df[pred_column]
    run_metrics = n03_calculate_metrics(true_labels=label_column, predictions=n_test_pred_df)
    kepa_n_results.append(run_metrics)

In [12]:
# Same table as in the KEPA section. Despite its name, kepa_n_results holds
# the GPT-4o N-stage metrics here, assuming the GPT loading cell just above
# was the last one to fill it.
output_tabular_performance(kepa_n_results, categories=['N0', 'N1', 'N2', 'N3'])

N0 0.928(0.006) 0.962(0.051) 0.944(0.026)
N1 0.921(0.006) 0.875(0.013) 0.897(0.008)
N2 0.777(0.110) 0.786(0.034) 0.778(0.077)
N3 0.855(0.047) 0.850(0.016) 0.852(0.031)
MacroAvg. 0.870 0.868 0.868


# other models

In [2]:
from huggingface_hub import InferenceClient
import pandas as pd
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Client bound to an HTTP endpoint on this machine (127.0.0.1, port 8081);
# a model server must already be listening there for later calls to succeed.
client = InferenceClient(model="http://127.0.0.1:8081")

In [13]:
# Build a single-turn prompt in the <|system|> / <|prompter|> / <|assistant|>
# layout and request a completion from the endpoint behind `client`.
prompt = "What are the symptoms of diabetes ?"
prompt_template=f'''
<|system|>: You are a helpful medical assistant created by M42 Health in the UAE.
<|prompter|>:{prompt}
<|assistant|>:
'''
# The former `prompt.format(system_instruction=..., prompt=...)` call was
# removed: `prompt` has no placeholders and the result was discarded, so it
# had no effect on the request.
# Sampling is disabled and the completion is capped at 1024 new tokens.
res = client.text_generation(prompt=prompt_template, do_sample=False, max_new_tokens=1024)

In [14]:
# Show the type returned by text_generation, then the generated text itself.
print(type(res))
print(res)

<class 'str'>
 The symptoms of diabetes can vary depending on the type of diabetes you have. Here are some common symptoms of both type 1 and type 2 diabetes:


1. Frequent urination
2. Increased thirst
3. Increased hunger
4. Unexplained weight loss
5. Fatigue
6. Blurred vision
7. Slow-healing sores
8. Frequent infections
9. Presence of ketones in the urine (ketonuria)
10. Irritability


However, it is important to note that not everyone with diabetes will experience all of these symptoms, and some people may have no symptoms at all. This is why regular check-ups and screenings are important for early detection and management of the condition.
