In [None]:
import pandas as pd
import altair as alt

# 'Augmented_Age (years)',
# 'Augmented_Body Temperature (°C)'
# 'Augmented_Breathing Rate (breaths/min)',

# Load both augmentation runs (with and without the DB) and put their rows in
# the same order by sorting on the "Unnamed: 0" column, then renumber the index.
# NOTE(review): "Unnamed: 0" is presumably the row id saved with the CSV — confirm.
df_with_db = (
    pd.read_csv("combined_augmented_all_llama3_1.csv")
    .sort_values(by="Unnamed: 0")
    .reset_index(drop=True)
)
df_without_db = (
    pd.read_csv("combined_augmented_all_nodb_llama3_1.csv")
    .sort_values(by="Unnamed: 0")
    .reset_index(drop=True)
)

In [None]:
# Upper bound on the number of histogram bins; passed to alt.Bin(maxbins=...)
# by the age histogram cell below.
maxbins = 20

In [None]:
# Age histograms: ground truth vs. augmented values with and without the DB.
# Every panel bins its column into at most `maxbins` bins and plots row counts.
gt_age_chart, db_age_chart, no_db_age_chart = [
    alt.Chart(frame)
    .mark_bar()
    .encode(
        alt.X(f"{column}:Q", bin=alt.Bin(maxbins=maxbins)),
        y="count()",
    )
    .properties(width=200, height=100, title={"text": label})
    for frame, column, label in (
        (df_with_db, "age", "Ground Truth"),
        (df_with_db, "Augmented_Age (years)", "With DB"),
        (df_without_db, "Augmented_Age (years)", "Without DB"),
    )
]

# Stack the three panels vertically on a shared x scale and title the figure.
age_combined_chart = (
    alt.vconcat(gt_age_chart, db_age_chart, no_db_age_chart)
    .resolve_scale(x="shared")
    .properties(title={"text": "Age"})
)
age_combined_chart

In [None]:
# Age density plots: one kernel-density panel per source. Each density
# transform writes its sample points back under the source column name and the
# estimate under "density".
gt_age_chart, db_age_chart, no_db_age_chart = [
    alt.Chart(frame)
    .transform_density(column, as_=[column, "density"])
    .mark_area()
    .encode(x=f"{column}:Q", y="density:Q")
    .properties(width=200, height=100, title={"text": label})
    for frame, column, label in (
        (df_with_db, "age", "Ground Truth"),
        (df_with_db, "Augmented_Age (years)", "With DB"),
        (df_without_db, "Augmented_Age (years)", "Without DB"),
    )
]

# Stack the panels vertically on a shared x scale. This replaces the histogram
# version of `age_combined_chart`; the figure is not displayed in this cell.
age_combined_chart = (
    alt.vconcat(gt_age_chart, db_age_chart, no_db_age_chart)
    .resolve_scale(x="shared")
    .properties(title={"text": "Age"})
)

In [None]:
# Augmented (with DB) body temperature readings above 40.
df_with_db.loc[
    df_with_db["Augmented_Body Temperature (°C)"] > 40,
    "Augmented_Body Temperature (°C)",
]

In [None]:
# Same selection as the preceding cell: augmented (with DB) temperatures above 40.
df_with_db["Augmented_Body Temperature (°C)"].loc[lambda temps: temps > 40]

In [None]:
# # db_temp_chart = (
# #     alt.Chart(df_with_db)
# #     .transform_density(
# #         "Augmented_Body Temperature (°C)",
# #         as_=["Augmented_Body Temperature (°C)", "density"],
# #     )
# #     .mark_area()
# #     .encode(
# #         x="Augmented_Body Temperature (°C):Q",
# #         y="density:Q",
# #         # facet="With DB:N",  # Replace 'column_name' with the actual column you want to facet by
# #     )
# #     .properties(
# #         width=200,
# #         height=100,
# #         title={
# #             "text": "With GraphRAG",
# #         },
# #     )
# # )
# # db_temp_chart

# import seaborn as sns
# import matplotlib.pyplot as plt

# # Plot the density of "Augmented_Body Temperature (°C)" using seaborn
# plt.figure(figsize=(10, 6))
# sns.kdeplot(df_with_db["Augmented_Body Temperature (°C)"], label="With GraphRAG")
# plt.title("Density Plot of Augmented Body Temperature (°C)")
# plt.xlabel("Augmented Body Temperature (°C)")
# plt.ylabel("Density")
# plt.legend()
# plt.show()


# # If the kernel crashes, it might be due to a variety of reasons. Here are some common issues and search queries to help you troubleshoot:

# # 1. Memory Issues:
# # If your dataset is too large, it might be causing memory issues.
# # Search query: "Jupyter notebook kernel crashes due to memory issues"

# # 2. Compatibility Issues:
# # There might be compatibility issues between different versions of libraries.
# # Search query: "Seaborn matplotlib compatibility issues kernel crash"

# # 3. Backend Issues:
# # Sometimes, the backend used by matplotlib can cause issues.
# # Search query: "Matplotlib backend causing kernel crash Jupyter notebook"

# # 4. Data Issues:
# # There might be issues with the data itself, such as NaN values or infinite values.
# # Search query: "Seaborn kernel crash due to NaN values"

# # 5. Environment Issues:
# # There might be issues with the Python environment or Jupyter notebook installation.
# # Search query: "Jupyter notebook kernel crashes environment issues"

# # Example search query:
# # "Jupyter notebook kernel crashes when plotting with seaborn"

In [None]:
# Derive a ground-truth column by applying the Fahrenheit -> Celsius formula to
# `temperature`, so it can be compared with the augmented Celsius columns.
df_with_db["body_temperature"] = (df_with_db["temperature"] - 32) * 5.0 / 9.0

# Body-temperature density panels. Each spec is
# (frame, source column, name given to the density sample points, panel title).
gt_temp_chart, db_temp_chart, no_db_temp_chart = [
    alt.Chart(frame)
    .transform_density(source, as_=[value_field, "density"])
    .mark_area()
    .encode(x=f"{value_field}:Q", y="density:Q")
    .properties(width=200, height=100, title={"text": label})
    for frame, source, value_field, label in (
        (df_with_db, "body_temperature", "temperature", "Ground Truth"),
        (
            df_with_db,
            "Augmented_Body Temperature (°C)",
            "Augmented_Body Temperature (°C)",
            "With GraphRAG",
        ),
        (
            df_without_db,
            "Augmented_Body Temperature (degrees Celsius)",
            "Augmented_Body Temperature (degrees Celsius)",
            "Without GraphRAG",
        ),
    )
]

# Stack the panels vertically on a shared x scale and show the figure.
temp_combined_chart = (
    alt.vconcat(gt_temp_chart, db_temp_chart, no_db_temp_chart)
    .resolve_scale(x="shared")
    .properties(title={"text": "Body Temperature"})
)

temp_combined_chart

In [None]:
# Count readings on the unexpected side of 45 for each source.
# NOTE(review): the augmented columns are checked with `> 45` but the ground
# truth with `< 45`; presumably because `temperature` is in Fahrenheit (it is
# converted with (x - 32) * 5 / 9 elsewhere in this notebook) — confirm.
with_db_temperature_outliers_count = (
    df_with_db["Augmented_Body Temperature (°C)"].gt(45).sum()
)
without_db_temperature_outliers_count = (
    df_without_db["Augmented_Body Temperature (degrees Celsius)"].gt(45).sum()
)
gt_outliers_count = df_with_db["temperature"].lt(45).sum()

print(
    gt_outliers_count,
    with_db_temperature_outliers_count,
    without_db_temperature_outliers_count,
)

In [None]:
# (min, max) of the raw ground-truth `temperature` column.
(
    df_with_db["temperature"].min(),
    df_with_db["temperature"].max(),
)

In [None]:
# Build a corrected copy of the with-DB temperature column: readings above 45
# are run through the Fahrenheit -> Celsius formula, everything else is kept.
df_with_db["Augmented_Body Temperature (°C) fixed"] = df_with_db[
    "Augmented_Body Temperature (°C)"
].mask(
    lambda temps: temps > 45,
    lambda temps: (temps - 32) * 5.0 / 9.0,
)

# 40 before calibration, but not after



In [None]:
# Inspect the `resprate` column (plotted as the ground-truth respiratory rate below).
df_with_db.loc[:, "resprate"]

In [None]:
# Body-temperature density panels again, this time using the corrected
# with-DB column ("... fixed"). Each spec is
# (frame, source column, name given to the density sample points, panel title).
gt_temp_chart, db_temp_chart, no_db_temp_chart = [
    alt.Chart(frame)
    .transform_density(source, as_=[value_field, "density"])
    .mark_area()
    .encode(x=f"{value_field}:Q", y="density:Q")
    .properties(width=200, height=100, title={"text": label})
    for frame, source, value_field, label in (
        (df_with_db, "body_temperature", "temperature", "Ground Truth"),
        (
            df_with_db,
            "Augmented_Body Temperature (°C) fixed",
            "Augmented_Body Temperature (°C) fixed",
            "With GraphRAG",
        ),
        (
            df_without_db,
            "Augmented_Body Temperature (degrees Celsius)",
            "Augmented_Body Temperature (degrees Celsius)",
            "Without GraphRAG",
        ),
    )
]

# Stack the panels vertically on a shared x scale; the figure is stored for the
# final overview rather than displayed here.
fixed_temp_combined_chart = (
    alt.vconcat(gt_temp_chart, db_temp_chart, no_db_temp_chart)
    .resolve_scale(x="shared")
    .properties(title={"text": "Fixed Body Temperature"})
)

In [None]:
# Respiratory-rate density panels: ground truth vs. augmented values with and
# without GraphRAG. Each density transform keeps the source column name for its
# sample points and stores the estimate under "density".
gt_resprate_chart, db_resprate_chart, no_db_resprate_chart = [
    alt.Chart(frame)
    .transform_density(column, as_=[column, "density"])
    .mark_area()
    .encode(x=f"{column}:Q", y="density:Q")
    .properties(width=200, height=100, title={"text": label})
    for frame, column, label in (
        (df_with_db, "resprate", "Ground Truth"),
        (df_with_db, "Augmented_Breathing Rate (breaths/min)", "With GraphRAG"),
        (df_without_db, "Augmented_Breathing Rate (breaths/min)", "Without GraphRAG"),
    )
]

# Stack the panels vertically on a shared x scale; the figure is stored for the
# final overview rather than displayed here.
resprate_combined_chart = (
    alt.vconcat(gt_resprate_chart, db_resprate_chart, no_db_resprate_chart)
    .resolve_scale(x="shared")
    .properties(title={"text": "Respiratory Rate"})
)

In [None]:
# Inspect the `o2sat` column (plotted as the ground-truth oxygen saturation below).
df_with_db.loc[:, "o2sat"]

In [None]:
def to_oxygen_saturation(P_O2, P50=26.6, n=2.8):
    """
    Convert an oxygen partial pressure (mmHg) to oxygen saturation (%).

    Evaluates the Hill-type expression
    ``P_O2**n / (P_O2**n + P50**n) * 100``.

    Parameters:
    - P_O2: Partial pressure of oxygen in mmHg
    - P50: Partial pressure at which the result is 50 % (default: 26.6 mmHg)
    - n: Hill coefficient controlling the steepness of the curve (default: 2.8)

    Returns:
    - Oxygen saturation as a percentage (%)
    """
    oxygen_term = P_O2**n
    half_saturation_term = P50**n
    return oxygen_term / (oxygen_term + half_saturation_term) * 100

In [None]:
# Derive an oxygen-saturation (%) column from the augmented partial-pressure
# column in both frames, one value at a time.
for frame in (df_with_db, df_without_db):
    frame["Augmented_o2Sat"] = frame["Augmented_Oxygen levels (mmHg)"].apply(
        to_oxygen_saturation
    )

In [None]:
# 'o2sat'
# 'Augmented_Oxygen levels (mmHg)'

# Oxygen-saturation density panels: ground-truth `o2sat` vs. the saturation
# derived from the augmented oxygen levels, with and without GraphRAG.
gt_o2sat_chart, db_o2sat_chart, no_db_o2sat_chart = [
    alt.Chart(frame)
    .transform_density(column, as_=[column, "density"])
    .mark_area()
    .encode(x=f"{column}:Q", y="density:Q")
    .properties(width=200, height=100, title={"text": label})
    for frame, column, label in (
        (df_with_db, "o2sat", "Ground Truth"),
        (df_with_db, "Augmented_o2Sat", "With GraphRAG"),
        (df_without_db, "Augmented_o2Sat", "Without GraphRAG"),
    )
]

# Stack the panels vertically on a shared x scale and show the figure.
o2sat_combined_chart = (
    alt.vconcat(gt_o2sat_chart, db_o2sat_chart, no_db_o2sat_chart)
    .resolve_scale(x="shared")
    .properties(title={"text": "Oxygen Saturation"})
)
o2sat_combined_chart

In [None]:
# Overview: place every per-feature comparison figure side by side.
overview_panels = (
    age_combined_chart,
    temp_combined_chart,
    fixed_temp_combined_chart,
    o2sat_combined_chart,
    resprate_combined_chart,
)
alt.hconcat(*overview_panels)

In [None]:
from sklearn.metrics import root_mean_squared_error
from scipy.stats import ks_2samp
from scipy.spatial.distance import jensenshannon
import pandas as pd

def calculate_jsd(p, q, bins=20):
    """Jensen-Shannon distance (base 2) between the distributions of two samples.

    ``scipy.spatial.distance.jensenshannon`` expects two probability vectors.
    Passing raw observations (as this function used to) makes the result depend
    on row order and requires equal lengths, so it does not compare the two
    distributions. Here both samples are first binned on a shared set of bin
    edges, and the distance is computed between the two histograms.

    Parameters:
    - p, q: array-likes of numeric observations (lists, arrays, pandas Series).
      Non-finite values (NaN, +/-inf) are ignored. Lengths may differ.
    - bins: number of equal-width bins spanning the pooled range of both
      samples (default: 20).

    Returns:
    - float in [0, 1]: 0 when the binned distributions are identical, 1 when
      they do not overlap. ``nan`` if either sample has no finite values.
      Note that scipy returns the Jensen-Shannon *distance*, i.e. the square
      root of the divergence.
    """
    import numpy as np  # local import: numpy is not imported at file level

    p_values = np.asarray(p, dtype=float).ravel()
    q_values = np.asarray(q, dtype=float).ravel()
    p_values = p_values[np.isfinite(p_values)]
    q_values = q_values[np.isfinite(q_values)]
    if p_values.size == 0 or q_values.size == 0:
        return float("nan")

    # Shared edges over the pooled data so both histograms are comparable.
    edges = np.histogram_bin_edges(np.concatenate([p_values, q_values]), bins=bins)
    p_hist, _ = np.histogram(p_values, bins=edges)
    q_hist, _ = np.histogram(q_values, bins=edges)

    # jensenshannon normalises the count vectors to probabilities itself.
    return float(jensenshannon(p_hist, q_hist, base=2))

def get_rmse(
    df_with_db,
    df_without_db,
    gt_col,
    db_col,
    without_db_col,
):
    """Compare one ground-truth column against its two augmented counterparts.

    The ground truth is always read from ``df_with_db[gt_col]``. It is compared
    with ``df_with_db[db_col]`` and ``df_without_db[without_db_col]`` using
    RMSE, the two-sample Kolmogorov-Smirnov test and ``calculate_jsd``.
    Every metric is printed and also returned.

    Returns:
    - dict mapping metric labels (e.g. "RMSE with DB") to their values.
    """
    truth = df_with_db[gt_col]
    with_db_values = df_with_db[db_col]
    without_db_values = df_without_db[without_db_col]

    rmse_with = root_mean_squared_error(truth, with_db_values)
    rmse_without = root_mean_squared_error(truth, without_db_values)

    ks_with = ks_2samp(truth, with_db_values)
    ks_without = ks_2samp(truth, without_db_values)

    jsd_with = calculate_jsd(truth, with_db_values)
    jsd_without = calculate_jsd(truth, without_db_values)

    # Key order determines the column order of the results table built from it.
    metrics = {
        "RMSE with DB": rmse_with,
        "RMSE without DB": rmse_without,
        "KS Statistic with DB": ks_with.statistic,
        "KS Statistic without DB": ks_without.statistic,
        "P-value with DB": ks_with.pvalue,
        "P-value without DB": ks_without.pvalue,
        "Jensen-Shannon Divergence with DB": jsd_with,
        "Jensen-Shannon Divergence without DB": jsd_without,
    }

    print(f"RMSE with DB: {rmse_with}")
    print(f"RMSE without DB: {rmse_without}")
    print(f"KS Statistic with DB: {ks_with.statistic}, P-value with DB: {ks_with.pvalue}")
    print(f"KS Statistic without DB: {ks_without.statistic}, P-value without DB: {ks_without.pvalue}")
    print(f"Jensen-Shannon Divergence with DB: {jsd_with}")
    print(f"Jensen-Shannon Divergence without DB: {jsd_without}")

    return metrics

# One row per evaluated feature:
# (feature label, ground-truth column, with-DB column, without-DB column).
#
# Ground truth and augmented columns must be in the same unit, so:
# - body temperature uses `body_temperature` (the Celsius conversion of
#   `temperature` computed in the temperature plotting cell), not the raw
#   `temperature` column;
# - oxygen saturation uses `Augmented_o2Sat` (the percentage derived from the
#   augmented mmHg values via `to_oxygen_saturation`), not the mmHg column.
feature_specs = [
    ("Age", "age", "Augmented_Age (years)", "Augmented_Age (years)"),
    (
        "Body Temperature",
        "body_temperature",
        "Augmented_Body Temperature (°C)",
        "Augmented_Body Temperature (degrees Celsius)",
    ),
    (
        "Body Temperature Fixed",
        "body_temperature",
        "Augmented_Body Temperature (°C) fixed",
        "Augmented_Body Temperature (degrees Celsius)",
    ),
    (
        "Respiratory Rate",
        "resprate",
        "Augmented_Breathing Rate (breaths/min)",
        "Augmented_Breathing Rate (breaths/min)",
    ),
    ("Oxygen Saturation", "o2sat", "Augmented_o2Sat", "Augmented_o2Sat"),
]

results = []
for feature, gt_col, db_col, without_db_col in feature_specs:
    print("=" * 40)
    print(f"Running feature: {feature}")
    results.append(
        {
            "Feature": feature,
            **get_rmse(df_with_db, df_without_db, gt_col, db_col, without_db_col),
        }
    )

# Convert results to DataFrame (metrics as rows after the transpose) and save to CSV
results_df = pd.DataFrame(results).transpose()
results_df.to_csv("augmentation_eval.csv")

In [None]:
# Display the evaluation table built in the previous cell (transposed, so each
# evaluated feature is a column and each metric a row).
results_df