In [None]:
import pandas as pd
import altair as alt

In [None]:
# Mistral augmentation results: first the no-DB run, then the with-DB run.
mistral_df_without_db, mistral_df_with_db = (
    pd.read_csv(csv_name)
    for csv_name in (
        "combined_augmented_mistral_nodb.csv",
        "combined_augmented_mistral.csv",
    )
)

In [None]:
# Show every column of the no-DB Mistral frame whose name contains "Augmented".
augmented_column_names = [
    name for name in mistral_df_without_db.columns if "Augmented" in name
]
for name in augmented_column_names:
    print(name)

In [None]:


# Augmented columns used by the Llama3 charts in this notebook:
#   'Augmented_Age (years)'
#   'Augmented_Body Temperature (°C)'
#       (the no-DB file is read with 'Augmented_Body Temperature (degrees Celsius)')
#   'Augmented_Breathing Rate (breaths/min)'
# These used to be bare string expressions in this cell; they had no effect at
# runtime, so they are kept here as a comment only.

# Llama3 augmentation results, with and without the DB.
df_with_db = pd.read_csv("./llama3_augmented_results/combined_augmented_all_llama3_1.csv")
df_without_db = pd.read_csv("./llama3_augmented_results/combined_augmented_all_nodb_llama3_1.csv")

In [None]:
# Upper bound on the number of bins for the histograms below; each chart passes
# it to Altair as alt.Bin(maxbins=maxbins).
maxbins = 50

In [None]:
# Age distribution: one binned count histogram per data source.
# Each entry is (source frame, Altair shorthand for the age column, panel title).
# The Llama3 frames use "Augmented_Age (years)" while the Mistral frames use
# "Augmented_Age" for the augmented column.
age_panel_specs = [
    (df_with_db, "age:Q", "Ground Truth"),
    (df_with_db, "Augmented_Age (years):Q", "LLama3 With DB"),
    (df_without_db, "Augmented_Age (years):Q", "LLama3 Without DB"),
    (mistral_df_with_db, "Augmented_Age:Q", "Mistral With DB"),
    (mistral_df_without_db, "Augmented_Age:Q", "Mistral Without DB"),
]

# All panels share the same mark, binning, y encoding and size; only the data,
# the x field and the title differ, so they are built from the table above.
(
    gt_age_chart,
    db_age_chart,
    no_db_age_chart,
    mistral_db_age_chart,
    mistral_no_db_age_chart,
) = [
    alt.Chart(frame)
    .mark_bar()
    .encode(
        alt.X(shorthand, bin=alt.Bin(maxbins=maxbins)),
        y="count()",
    )
    .properties(width=200, height=100, title={"text": panel_title})
    for frame, shorthand, panel_title in age_panel_specs
]

# Stack the panels vertically with a shared x scale, then title the whole group.
age_combined_chart = (
    alt.vconcat(
        gt_age_chart,
        db_age_chart,
        no_db_age_chart,
        mistral_db_age_chart,
        mistral_no_db_age_chart,
    )
    .resolve_scale(x="shared")
    .properties(title={"text": "Age"})
)
age_combined_chart

In [None]:
# Convert the ground-truth temperature from Fahrenheit to Celsius and store it
# in a new "body_temperature" column (the original column is left untouched).
# Vectorized Series arithmetic replaces the per-element lambda; the operations
# and their order are unchanged.
# NOTE(review): assumes every "temperature" value is in Fahrenheit — confirm.
df_with_db["body_temperature"] = (df_with_db["temperature"] - 32) * 5.0 / 9.0
# Body-temperature distribution: ground truth vs. Llama3 with and without DB.
# Each entry is (source frame, Altair shorthand, panel title). The two Llama3
# frames label the augmented temperature column differently
# ("(°C)" vs. "(degrees Celsius)").
temp_panel_specs = [
    (df_with_db, "body_temperature:Q", "Ground Truth"),
    (df_with_db, "Augmented_Body Temperature (°C):Q", "LLama3 With DB"),
    (
        df_without_db,
        "Augmented_Body Temperature (degrees Celsius):Q",
        "LLama3 Without DB",
    ),
]

# Same binned count histogram for every panel; only data, field and title vary.
gt_temp_chart, db_temp_chart, no_db_temp_chart = [
    alt.Chart(frame)
    .mark_bar()
    .encode(
        alt.X(shorthand, bin=alt.Bin(maxbins=maxbins)),
        y="count()",
    )
    .properties(width=200, height=100, title={"text": panel_title})
    for frame, shorthand, panel_title in temp_panel_specs
]

# Stack the panels vertically with a shared x scale, then title the whole group.
temp_combined_chart = (
    alt.vconcat(
        gt_temp_chart,
        db_temp_chart,
        no_db_temp_chart,
    )
    .resolve_scale(x="shared")
    .properties(title={"text": "Body Temperature"})
)

temp_combined_chart


In [None]:
# Count temperature readings that fall on the unexpected side of 45.
# Augmented columns are labelled as Celsius, so values above 45 are counted.
with_db_temperature_outliers_count = (
    df_with_db["Augmented_Body Temperature (°C)"] > 45
).sum()
without_db_temperature_outliers_count = (
    df_without_db["Augmented_Body Temperature (degrees Celsius)"] > 45
).sum()
# NOTE(review): "temperature" is treated as Fahrenheit elsewhere in this
# notebook, so `< 45` presumably flags ground-truth values that look like
# Celsius readings — confirm that this asymmetric threshold is intended.
gt_outliers_count = (df_with_db["temperature"] < 45).sum()

print(gt_outliers_count, with_db_temperature_outliers_count, without_db_temperature_outliers_count)

In [None]:
# Display the (min, max) of the raw ground-truth temperature column.
gt_temperature_raw = df_with_db["temperature"]
(gt_temperature_raw.min(), gt_temperature_raw.max())

In [None]:
# Build a corrected copy of the with-DB augmented temperature column: readings
# above 45 are run through the Fahrenheit-to-Celsius formula, all other values
# (including missing ones) are kept unchanged. Series.mask replaces the
# per-element lambda.
augmented_temp_with_db = df_with_db["Augmented_Body Temperature (°C)"]
df_with_db["Augmented_Body Temperature (°C) fixed"] = augmented_temp_with_db.mask(
    augmented_temp_with_db > 45,
    (augmented_temp_with_db - 32) * 5.0 / 9.0,
)

# 40 before calibration, but not after

In [None]:
# Body-temperature distribution again, this time using the corrected with-DB
# column ("... fixed"). Each entry is (source frame, Altair shorthand, title).
# NOTE: this cell rebinds gt_temp_chart, db_temp_chart and no_db_temp_chart,
# which the uncorrected temperature cell also assigns.
fixed_temp_panel_specs = [
    (df_with_db, "body_temperature:Q", "Ground Truth"),
    (
        df_with_db,
        "Augmented_Body Temperature (°C) fixed:Q",
        "LLama3 With GraphRAG",
    ),
    (
        df_without_db,
        "Augmented_Body Temperature (degrees Celsius):Q",
        "LLama3 Without GraphRAG",
    ),
]

# Same binned count histogram for every panel; only data, field and title vary.
gt_temp_chart, db_temp_chart, no_db_temp_chart = [
    alt.Chart(frame)
    .mark_bar()
    .encode(
        alt.X(shorthand, bin=alt.Bin(maxbins=maxbins)),
        y="count()",
    )
    .properties(width=200, height=100, title={"text": panel_title})
    for frame, shorthand, panel_title in fixed_temp_panel_specs
]

# Stack the panels vertically with a shared x scale, then title the whole group.
fixed_temp_combined_chart = (
    alt.vconcat(
        gt_temp_chart,
        db_temp_chart,
        no_db_temp_chart,
    )
    .resolve_scale(x="shared")
    .properties(title={"text": "Fixed Body Temperature"})
)
fixed_temp_combined_chart

In [None]:
# Respiratory-rate distribution: ground truth vs. Llama3 with and without DB.
# Each entry is (source frame, Altair shorthand, panel title).
resprate_panel_specs = [
    (df_with_db, "resprate:Q", "Ground Truth"),
    (
        df_with_db,
        "Augmented_Breathing Rate (breaths/min):Q",
        "LLama3 With GraphRAG",
    ),
    (
        df_without_db,
        "Augmented_Breathing Rate (breaths/min):Q",
        "LLama3 Without GraphRAG",
    ),
]

# Same binned count histogram for every panel; only data, field and title vary.
gt_resprate_chart, db_resprate_chart, no_db_resprate_chart = [
    alt.Chart(frame)
    .mark_bar()
    .encode(
        alt.X(shorthand, bin=alt.Bin(maxbins=maxbins)),
        y="count()",
    )
    .properties(width=200, height=100, title={"text": panel_title})
    for frame, shorthand, panel_title in resprate_panel_specs
]

# Stack the panels vertically with a shared x scale, then title the whole group.
resprate_combined_chart = (
    alt.vconcat(
        gt_resprate_chart,
        db_resprate_chart,
        no_db_resprate_chart,
    )
    .resolve_scale(x="shared")
    .properties(title={"text": "Respiratory Rate"})
)
resprate_combined_chart

In [None]:
def to_oxygen_saturation(P_O2, P50=26.6, n=2.8):
    """
    Convert a partial pressure of oxygen into an oxygen saturation percentage.

    Evaluates S_O2 = 100 * P_O2^n / (P_O2^n + P50^n).

    Parameters:
    - P_O2: Partial pressure of oxygen in mmHg
    - P50: Partial pressure at which the result is 50% (default: 26.6 mmHg)
    - n: Hill coefficient, controlling the steepness of the curve (default: 2.8)

    Returns:
    - Oxygen saturation as a percentage (%)
    """
    oxygen_term = P_O2**n
    half_saturation_term = P50**n
    saturated_fraction = oxygen_term / (oxygen_term + half_saturation_term)
    return saturated_fraction * 100

In [None]:
# Add an "Augmented_o2Sat" column to both Llama3 frames by passing each
# augmented oxygen level (mmHg) through to_oxygen_saturation with its defaults.
for llama_frame in (df_with_db, df_without_db):
    llama_frame["Augmented_o2Sat"] = llama_frame[
        "Augmented_Oxygen levels (mmHg)"
    ].apply(to_oxygen_saturation)

In [None]:
# Oxygen-saturation distribution: ground truth vs. the saturation derived from
# the Llama3 augmented oxygen levels, with and without DB.
# Each entry is (source frame, Altair shorthand, panel title).
o2sat_panel_specs = [
    (df_with_db, "o2sat:Q", "Ground Truth"),
    (df_with_db, "Augmented_o2Sat:Q", "LLama3 With GraphRAG"),
    (df_without_db, "Augmented_o2Sat:Q", "LLama3 Without GraphRAG"),
]

# Same binned count histogram for every panel; only data, field and title vary.
gt_o2sat_chart, db_o2sat_chart, no_db_o2sat_chart = [
    alt.Chart(frame)
    .mark_bar()
    .encode(
        alt.X(shorthand, bin=alt.Bin(maxbins=maxbins)),
        y="count()",
    )
    .properties(width=200, height=100, title={"text": panel_title})
    for frame, shorthand, panel_title in o2sat_panel_specs
]

# Stack the panels vertically with a shared x scale, then title the whole group.
o2sat_combined_chart = (
    alt.vconcat(
        gt_o2sat_chart,
        db_o2sat_chart,
        no_db_o2sat_chart,
    )
    .resolve_scale(x="shared")
    .properties(title={"text": "Oxygen Saturation"})
)
o2sat_combined_chart


In [None]:
# Lay out all per-feature comparison columns side by side.
overview_panels = [
    age_combined_chart,
    temp_combined_chart,
    fixed_temp_combined_chart,
    o2sat_combined_chart,
    resprate_combined_chart,
]
alt.hconcat(*overview_panels)

In [None]:
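# NOTE(review): this whole cell is disabled. Before re-enabling it, check the
# column pairings below: "temperature" is the raw ground-truth column that this
# notebook converts from Fahrenheit, while the augmented temperature columns are
# labelled Celsius (the charts compare against "body_temperature" instead); and
# "o2sat" is paired with the raw "Augmented_Oxygen levels (mmHg)" rather than
# the derived "Augmented_o2Sat" column used by the charts.
# from scipy.stats import ks_2samp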

# # Perform the Kolmogorov-Smirnov test
# ks_statistic_with_db, p_value_with_db = ks_2samp(df_with_db['age'], df_with_db["Augmented_Age (years)"])
# ks_statistic_without_db, p_value_without_db = ks_2samp(df_with_db['age'], df_without_db["Augmented_Age (years)"])


# print(ks_statistic_with_db, p_value_with_db)
# print(ks_statistic_without_db, p_value_without_db)

# from sklearn.metrics import root_mean_squared_error

# def get_rmse(
#     df_with_db,
#     df_without_db,
#     gt_col,
#     db_col,
#     without_db_col,
# ):
#     with_db_mse = root_mean_squared_error(df_with_db[gt_col], df_with_db[db_col])
#     without_db_mse = root_mean_squared_error(
#         df_with_db[gt_col], df_without_db[without_db_col]
#     )

#     ks_statistic_with_db, p_value_with_db = ks_2samp(df_with_db[gt_col], df_with_db[db_col])
#     ks_statistic_without_db, p_value_without_db = ks_2samp(df_with_db[gt_col], df_without_db[without_db_col])

#     print("RMSE with DB:", with_db_mse)
#     print("RMSE without DB:", without_db_mse)
#     print(f"KS Statistic with DB: [{ks_statistic_with_db}], P-value: [{p_value_with_db}]")
#     print(f"KS Statistic without DB: [{ks_statistic_without_db}], P-value: [{p_value_without_db}]")

# print("="*50)
# print("Age")
# get_rmse(
#     df_with_db,
#     df_without_db,
#     "age",
#     "Augmented_Age (years)",
#     "Augmented_Age (years)",
# )
# print("="*50)
# print("Body Temperature")
# get_rmse(
#     df_with_db,
#     df_without_db,
#     "temperature",
#     "Augmented_Body Temperature (°C)",
#     "Augmented_Body Temperature (degrees Celsius)",
# )
# print("="*50)
# print("Body Temperature Fixed")
# get_rmse(
#     df_with_db,
#     df_without_db,
#     "temperature",
#     "Augmented_Body Temperature (°C) fixed",
#     "Augmented_Body Temperature (degrees Celsius)",
# )
# print("="*50)
# print("Respiratory Rate")
# get_rmse(
#     df_with_db,
#     df_without_db,
#     "resprate",
#     "Augmented_Breathing Rate (breaths/min)",
#     "Augmented_Breathing Rate (breaths/min)",
# )
# print("="*50)
# print("Oxygen Saturation")
# get_rmse(
#     df_with_db,
#     df_without_db,
#     "o2sat",
#     "Augmented_Oxygen levels (mmHg)",
#     "Augmented_Oxygen levels (mmHg)",
# )