In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from collections import OrderedDict

# Render floats with 2 decimal places when DataFrames are displayed.
pd.set_option("display.precision", 2)

In [2]:
# Feature columns under evaluation. For every name `f` listed here the notebook
# reads a ground-truth column `f` and an augmented column `aug_<f>` from each
# spreadsheet.
aug_features = [
    "temperature",
    "heartrate",
    "resprate",
    "o2sat",
    "sbp",
    "dbp",
]

In [3]:
# Spreadsheets to evaluate, in display order: every language model contributes a
# "text" and a "report" prompt variant, followed by the two non-LLM baselines.
df_paths = [
    f"./spreadsheets/{model}_aug_{prompt}.csv"
    for model in ("biogpt", "cb", "bcb", "mistral", "zephyr", "llama2", "gpt3_5", "gpt4")
    for prompt in ("text", "report")
] + [
    "./spreadsheets/gaussian_aug.csv",
    "./spreadsheets/mean_aug.csv",
]

In [4]:
# Display label for every spreadsheet. LLM labels have the form
# "<model>\n<prompt type>"; the two baselines carry a single-line label.
path_to_name = OrderedDict(
    [
        ("./spreadsheets/biogpt_aug_text.csv", "BioGPT\nplain-text"),
        ("./spreadsheets/biogpt_aug_report.csv", "BioGPT\nreport-template"),
        ("./spreadsheets/cb_aug_text.csv", "ClinicalBERT\nplain-text"),
        ("./spreadsheets/cb_aug_report.csv", "ClinicalBERT\nreport-template"),
        ("./spreadsheets/bcb_aug_text.csv", "BioClinicalBERT\nplain-text"),
        ("./spreadsheets/bcb_aug_report.csv", "BioClinicalBERT\nreport-template"),
        ("./spreadsheets/mistral_aug_text.csv", "Mistral\nplain-text"),
        ("./spreadsheets/mistral_aug_report.csv", "Mistral\nreport-template"),
        ("./spreadsheets/zephyr_aug_text.csv", "Zephyr\nplain-text"),
        ("./spreadsheets/zephyr_aug_report.csv", "Zephyr\nreport-template"),
        ("./spreadsheets/llama2_aug_text.csv", "Llama2\nplain-text"),
        ("./spreadsheets/llama2_aug_report.csv", "Llama2\nreport-template"),
        ("./spreadsheets/gpt3_5_aug_text.csv", "GPT3.5\nplain-text"),
        ("./spreadsheets/gpt3_5_aug_report.csv", "GPT3.5\nreport-template"),
        ("./spreadsheets/gpt4_aug_text.csv", "GPT4\nplain-text"),
        ("./spreadsheets/gpt4_aug_report.csv", "GPT4\nreport-template"),
        ("./spreadsheets/gaussian_aug.csv", "Gaussian"),
        ("./spreadsheets/mean_aug.csv", "Mean"),
    ]
)

In [5]:
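# NOTE: unused alternative mapping to short, file-style labels. It does not cover
# the gpt3_5, gpt4 and mean_aug spreadsheets, so extend it before re-enabling.
# path_to_name = OrderedDict({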
#     "./spreadsheets/gaussian_aug.csv": "gaussian_aug",
#     "./spreadsheets/biogpt_aug_text.csv": "biogpt_aug_text",
#     "./spreadsheets/biogpt_aug_report.csv": "biogpt_aug_report",
#     "./spreadsheets/cb_aug_text.csv": "cb_aug_text",
#     "./spreadsheets/cb_aug_report.csv": "cb_aug_report",
#     "./spreadsheets/bcb_aug_text.csv": "bcb_aug_text",
#     "./spreadsheets/bcb_aug_report.csv": "bcb_aug_report",
#     "./spreadsheets/mistral_aug_text.csv": "mistral_aug_text",
#     "./spreadsheets/mistral_aug_report.csv": "mistral_aug_report",
#     "./spreadsheets/zephyr_aug_text.csv": "zephyr_aug_text",
#     "./spreadsheets/zephyr_aug_report.csv": "zephyr_aug_report",
#     "./spreadsheets/llama2_aug_text.csv": "llama2_aug_text",
#     "./spreadsheets/llama2_aug_report.csv": "llama2_aug_report",
# })

In [6]:
# We also have to count the missing values for each feature, as well as the total.

In [7]:
import numpy as np
from sklearn.preprocessing import StandardScaler

def get_mse(path, aug_features, standardise=False):
    """Compute the per-feature MSE between original and augmented columns of a CSV.

    For every feature ``f`` in ``aug_features`` the spreadsheet must contain a
    ground-truth column ``f`` and an augmented column ``aug_<f>``.

    Rows whose augmented value is missing are left out of that feature's
    error, because ``mean_squared_error`` rejects NaN input. This mirrors the
    masking done by the notebook's main evaluation loop.

    Parameters
    ----------
    path : str or path-like
        CSV file, read with ``pd.read_csv``.
    aug_features : list of str
        Names of the ground-truth feature columns.
    standardise : bool, default False
        When True, a ``StandardScaler`` is fitted on the ground-truth columns
        and applied to both the ground-truth and the augmented columns before
        the error is computed.

    Returns
    -------
    dict
        Maps each feature name to its MSE. A feature with no augmented values
        at all maps to NaN.
    """
    df = pd.read_csv(path)
    aug_columns = [f"aug_{f}" for f in aug_features]

    if standardise:
        scaler = StandardScaler()
        scaler.fit(df[aug_features])
        df[aug_features] = scaler.transform(df[aug_features])
        # The scaler was fitted on the ground-truth column names, so the
        # augmented columns are renamed to match before being transformed.
        df[aug_columns] = scaler.transform(
            df[aug_columns].rename(columns=dict(zip(aug_columns, aug_features)))
        )

    mse_by_feature = {}
    for f, aug_col in zip(aug_features, aug_columns):
        observed = df[aug_col].notna()
        if not observed.any():
            # Nothing to compare against; mean_squared_error would raise on empty input.
            mse_by_feature[f] = float("nan")
            continue
        mse_by_feature[f] = mean_squared_error(
            df.loc[observed, f], df.loc[observed, aug_col]
        )
    return mse_by_feature

In [8]:
def evaluate_augmentation(df, aug_features):
    """Summarise how well the ``aug_<f>`` columns of ``df`` reproduce the ``f`` columns.

    For every feature ``f`` in ``aug_features`` this reports:

    * ``#missing (f)``   - number of missing values in ``aug_<f>``;
    * ``mse_f``          - MSE between ``f`` and ``aug_<f>`` on the original scale;
    * ``mse_f (norm)``   - the same error after scaling both columns with a
      ``StandardScaler`` fitted on the ground-truth column ``f``.

    Rows whose augmented value is missing are excluded from both errors
    (``mean_squared_error`` rejects NaN input). A feature with no augmented
    values at all gets NaN for both errors, which also makes the two means NaN.

    ``df`` is not modified.

    Returns
    -------
    dict
        The per-feature entries above (all ``#missing`` entries first, then all
        raw MSEs, then all normalised MSEs), followed by ``#missing (total)``,
        ``mse_mean`` and ``mse_mean (norm)``.
    """
    missing, mse_raw, mse_norm = {}, {}, {}

    for f in aug_features:
        aug_col = f"aug_{f}"
        is_missing = df[aug_col].isna()
        observed = ~is_missing
        missing[f"#missing ({f})"] = is_missing.sum()

        if not observed.any():
            # Nothing to compare against; mean_squared_error would raise on empty input.
            mse_raw[f"mse_{f}"] = float("nan")
            mse_norm[f"mse_{f} (norm)"] = float("nan")
            continue

        mse_raw[f"mse_{f}"] = mean_squared_error(
            df.loc[observed, f], df.loc[observed, aug_col]
        )

        # Scale both columns with statistics of the ground-truth column only.
        # The augmented column is renamed so its name matches the one the
        # scaler was fitted on.
        scaler = StandardScaler().fit(df[[f]])
        truth_norm = scaler.transform(df[[f]])[:, 0]
        aug_norm = scaler.transform(df[[aug_col]].rename(columns={aug_col: f}))[:, 0]
        keep = observed.to_numpy()
        mse_norm[f"mse_{f} (norm)"] = mean_squared_error(truth_norm[keep], aug_norm[keep])

    n_features = len(aug_features)
    return {
        **missing,
        **mse_raw,
        **mse_norm,
        "#missing (total)": sum(missing.values()),
        "mse_mean": sum(mse_raw.values()) / n_features,
        "mse_mean (norm)": sum(mse_norm.values()) / n_features,
    }


# One result row per spreadsheet, labelled with its display name.
eval_dict_list = []
for p in df_paths:
    eval_dict = evaluate_augmentation(pd.read_csv(p), aug_features)
    eval_dict["name"] = path_to_name[p]
    eval_dict_list.append(eval_dict)

    

In [9]:
# One row per spreadsheet, in the order the evaluation loop processed `df_paths`.
# (To rank the methods instead, chain `.sort_values("mse_mean (norm)")`.)
llms_eval_df = pd.DataFrame(eval_dict_list)

In [10]:
# Display the evaluation table (one row per evaluated spreadsheet).
llms_eval_df

Unnamed: 0,#missing (temperature),#missing (heartrate),#missing (resprate),#missing (o2sat),#missing (sbp),#missing (dbp),mse_temperature,mse_heartrate,mse_resprate,mse_o2sat,...,mse_temperature (norm),mse_heartrate (norm),mse_resprate (norm),mse_o2sat (norm),mse_sbp (norm),mse_dbp (norm),#missing (total),mse_mean,mse_mean (norm),name
0,0,0,0,0,0,0,6.17,847.01,147.03,67.54,...,4.11,2.2,16.38,10.52,3.97,1.31,0,652.84,6.42,BioGPT\nplain-text
1,0,0,0,0,0,0,9.18,666.71,971.75,50.05,...,6.12,1.73,108.27,7.8,3.63,1.84,0,752.07,21.57,BioGPT\nreport-template
2,0,0,0,0,0,0,7.44,639.34,421.58,17.82,...,4.96,1.66,46.97,2.78,1.82,6.4,0,713.92,10.76,ClinicalBERT\nplain-text
3,0,0,0,0,0,0,8.43,1024.25,205.35,14.75,...,5.62,2.66,22.88,2.3,6.09,1.44,0,906.79,6.83,ClinicalBERT\nreport-template
4,0,0,0,0,0,0,24.06,594.1,83.42,58.85,...,16.04,1.54,9.29,9.17,2.78,3.09,0,577.91,6.99,BioClinicalBERT\nplain-text
5,124,0,0,0,0,1,6.3,606.79,64.89,9.41,...,4.2,1.58,7.23,1.47,5.01,1.66,125,714.6,3.52,BioClinicalBERT\nreport-template
6,0,0,0,0,0,0,7.69,926.78,97.65,71.82,...,5.13,2.41,10.88,11.19,2.61,3.86,0,659.01,6.01,Mistral\nplain-text
7,0,0,0,0,0,0,5.45,738.45,52.43,50.22,...,3.63,1.92,5.84,7.82,2.28,3.06,0,539.59,4.09,Mistral\nreport-template
8,81,452,695,0,19,37,3.47,834.23,130.07,29.1,...,2.31,2.17,14.49,4.53,1.8,2.51,1284,486.04,4.64,Zephyr\nplain-text
9,0,0,0,0,0,0,3.21,592.6,27.47,26.61,...,2.14,1.54,3.06,4.15,1.77,1.7,0,380.87,2.39,Zephyr\nreport-template


In [11]:
# List the column names of the evaluation table.
llms_eval_df.columns

Index(['#missing (temperature)', '#missing (heartrate)', '#missing (resprate)',
       '#missing (o2sat)', '#missing (sbp)', '#missing (dbp)',
       'mse_temperature', 'mse_heartrate', 'mse_resprate', 'mse_o2sat',
       'mse_sbp', 'mse_dbp', 'mse_temperature (norm)', 'mse_heartrate (norm)',
       'mse_resprate (norm)', 'mse_o2sat (norm)', 'mse_sbp (norm)',
       'mse_dbp (norm)', '#missing (total)', 'mse_mean', 'mse_mean (norm)',
       'name'],
      dtype='object')

In [12]:
# Missing augmented values per spreadsheet, summed over all features.
llms_eval_df['#missing (total)']

0        0
1        0
2        0
3        0
4        0
5      125
6        0
7        0
8     1284
9        0
10       0
11       0
12       2
13    1707
14       0
15       0
16       0
17       0
Name: #missing (total), dtype: int64

In [55]:
def get_prompt_type(model_name):
    """Return the prompt-type suffix of a display label.

    Yields "plain-text" or "report-template" when ``model_name`` ends with
    that suffix, and an empty string for any other label.
    """
    for prompt_type in ("plain-text", "report-template"):
        if model_name.endswith(prompt_type):
            return prompt_type
    return ""

In [56]:
def get_model_name(name):
    """Return the part of a display label that precedes its first newline."""
    model, _, _ = name.partition("\n")
    return model

In [57]:
# Derive the prompt type from each row's display label.
llms_eval_df["Prompt Type"] = llms_eval_df["name"].map(get_prompt_type)

In [58]:
# Derive the model name (first line of the display label) for each row.
llms_eval_df["Model"] = llms_eval_df["name"].map(get_model_name)

In [59]:
# Display the table again, now with the derived `Prompt Type` and `Model` columns.
llms_eval_df

Unnamed: 0,#missing (temperature),#missing (heartrate),#missing (resprate),#missing (o2sat),#missing (sbp),#missing (dbp),mse_temperature,mse_heartrate,mse_resprate,mse_o2sat,...,mse_resprate (norm),mse_o2sat (norm),mse_sbp (norm),mse_dbp (norm),#missing (total),mse_mean,mse_mean (norm),name,Prompt Type,Model
0,0,0,0,0,0,0,6.17,847.01,147.03,67.54,...,16.38,10.52,3.97,1.31,0,652.84,6.42,BioGPT\nplain-text,plain-text,BioGPT
1,0,0,0,0,0,0,9.18,666.71,971.75,50.05,...,108.27,7.8,3.63,1.84,0,752.07,21.57,BioGPT\nreport-template,report-template,BioGPT
2,0,0,0,0,0,0,7.44,639.34,421.58,17.82,...,46.97,2.78,1.82,6.4,0,713.92,10.76,ClinicalBERT\nplain-text,plain-text,ClinicalBERT
3,0,0,0,0,0,0,8.43,1024.25,205.35,14.75,...,22.88,2.3,6.09,1.44,0,906.79,6.83,ClinicalBERT\nreport-template,report-template,ClinicalBERT
4,0,0,0,0,0,0,24.06,594.1,83.42,58.85,...,9.29,9.17,2.78,3.09,0,577.91,6.99,BioClinicalBERT\nplain-text,plain-text,BioClinicalBERT
5,124,0,0,0,0,1,6.3,606.79,64.89,9.41,...,7.23,1.47,5.01,1.66,125,714.6,3.52,BioClinicalBERT\nreport-template,report-template,BioClinicalBERT
6,0,0,0,0,0,0,7.69,926.78,97.65,71.82,...,10.88,11.19,2.61,3.86,0,659.01,6.01,Mistral\nplain-text,plain-text,Mistral
7,0,0,0,0,0,0,5.45,738.45,52.43,50.22,...,5.84,7.82,2.28,3.06,0,539.59,4.09,Mistral\nreport-template,report-template,Mistral
8,81,452,695,0,19,37,3.47,834.23,130.07,29.1,...,14.49,4.53,1.8,2.51,1284,486.04,4.64,Zephyr\nplain-text,plain-text,Zephyr
9,0,0,0,0,0,0,3.21,592.6,27.47,26.61,...,3.06,4.15,1.77,1.7,0,380.87,2.39,Zephyr\nreport-template,report-template,Zephyr


In [60]:
# Save the full evaluation table; the row index is written too (to_csv default).
llms_eval_df.to_csv("eval.csv")

In [61]:
# Missing-value counts per model and prompt type. The "#missing (...)" headers
# are shortened to the bare feature names; the mapping's key order fixes the
# column order.
_missing_labels = {
    "#missing (temperature)": "temperature",
    "#missing (heartrate)": "heartrate",
    "#missing (resprate)": "resprate",
    "#missing (o2sat)": "o2sat",
    "#missing (sbp)": "sbp",
    "#missing (dbp)": "dbp",
    "#missing (total)": "total",
}
missing_df = llms_eval_df[["Model", "Prompt Type", *_missing_labels]].rename(
    columns=_missing_labels
)

In [62]:
# Keep only the rows that have a prompt type (an empty string marks rows without one).
missing_df = missing_df.loc[missing_df["Prompt Type"] != ""]

In [63]:
# Save the missing-value table; the row index is written too (to_csv default).
missing_df.to_csv("missing.csv")

In [67]:
# Standardised MSE per model and prompt type. The "mse_... (norm)" headers are
# shortened to the bare feature names; the mapping's key order fixes the column
# order.
_mse_labels = {
    "mse_temperature (norm)": "temperature",
    "mse_heartrate (norm)": "heartrate",
    "mse_resprate (norm)": "resprate",
    "mse_o2sat (norm)": "o2sat",
    "mse_sbp (norm)": "sbp",
    "mse_dbp (norm)": "dbp",
    "mse_mean (norm)": "Mean",
}
mse_df = llms_eval_df[["Model", "Prompt Type", *_mse_labels]].rename(
    columns=_mse_labels
)

In [68]:
# Display the standardised per-feature MSE table.
mse_df

Unnamed: 0,Model,Prompt Type,temperature,heartrate,resprate,o2sat,sbp,dbp,Mean
0,BioGPT,plain-text,4.11,2.2,16.38,10.52,3.97,1.31,6.42
1,BioGPT,report-template,6.12,1.73,108.27,7.8,3.63,1.84,21.57
2,ClinicalBERT,plain-text,4.96,1.66,46.97,2.78,1.82,6.4,10.76
3,ClinicalBERT,report-template,5.62,2.66,22.88,2.3,6.09,1.44,6.83
4,BioClinicalBERT,plain-text,16.04,1.54,9.29,9.17,2.78,3.09,6.99
5,BioClinicalBERT,report-template,4.2,1.58,7.23,1.47,5.01,1.66,3.52
6,Mistral,plain-text,5.13,2.41,10.88,11.19,2.61,3.86,6.01
7,Mistral,report-template,3.63,1.92,5.84,7.82,2.28,3.06,4.09
8,Zephyr,plain-text,2.31,2.17,14.49,4.53,1.8,2.51,4.64
9,Zephyr,report-template,2.14,1.54,3.06,4.15,1.77,1.7,2.39


In [69]:
# Save the standardised MSE table; the row index is written too (to_csv default).
mse_df.to_csv("mse.csv")

In [14]:
# mse_df = pd.DataFrame([get_mse(p, aug_features, standardise=True) for p in df_paths])
# mse_df.index = ["text", "report-like", "gaussian"]
# mse_df['mean'] = mse_df.mean(axis=1)
# mse_df