# Setup

In [None]:
import i2bmi as i2bmi

# Widen the notebook cells (helper provided by the i2bmi package; its
# implementation is not visible in this notebook).
i2bmi.jupyter_widen()

Read dataset

In [None]:
import pandas as pd
import numpy as np

# Placeholder strings that should be treated as missing values.
miss_dict = {"Unknown": np.nan}

# Load the audiogram spreadsheet and normalise the placeholders to NaN.
df_raw = pd.read_excel("data/Audiograms.xlsx").replace(miss_dict)

df_raw.head()

Audiogram-only dataset

In [None]:
# Audiogram columns, one per tested frequency.
ag_cols = [
    f"{hz}hz"
    for hz in (125, 250, 500, 750, 1000, 1500, 2000, 3000, 4000, 6000, 8000)
]

print("Original size:", df_raw.shape)

# Keep only the rows whose Age is above 17.
df = df_raw[df_raw["Age"] > 17]
print("Filtered (age > 17):", df.shape)

# Restrict to the audiogram columns.
df = df[ag_cols]
print("Audiogram only:", df.shape)

# Show how many rows survive for each minimum number of non-missing values.
for min_present in range(1, 7):
    print(f"Dropna (thresh={min_present}):",
          df.dropna(thresh=min_present).shape)

# Keep every row that has at least one audiogram value.
df = df.dropna(subset=ag_cols, thresh=1)

df.head()

Visualize audiogram correlation

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme(style="white")

# Pairwise correlation between the audiogram frequency columns.
corr = df.corr()

f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw on the axes created above explicitly instead of relying on
# matplotlib's implicit "current axes" state.
_ = sns.heatmap(corr,
                cmap=cmap,
                vmin=-1,
                vmax=1,
                annot=True,
                square=True,
                linewidths=.5,
                cbar_kws={"shrink": .5},
                ax=ax)

# Visualize Missing Values
## Audiogram

In [None]:
# Summary statistics per audiogram column (one row per column), with the
# "count" row removed and the 50% quantile labelled as the median.
(
    df.describe()
    .round(1)
    .drop("count")
    .T
    .rename(columns={"50%": "median"})
)

In [None]:
from tools.ag_datasets import generate_sparse_dataset
import missingno as msno
import string

# dist_type values passed to generate_sparse_dataset, with their panel captions.
dist = ["parent", "random", "skew-central", "skew-terminal"]
dname = ["Real-world", "Random", "Central", "Terminal"]

fig, axes = plt.subplots(2, 2, figsize=(11, 11))

# Figure margins and the blank space reserved between the subplots.
fig.subplots_adjust(left=0,
                    right=1,
                    bottom=0.1,
                    top=0.9,
                    wspace=0.35,
                    hspace=0.3)

abc_size = 20

# One nullity matrix per dist_type, each on its own axes.
for ax, dist_type, caption in zip(axes.ravel(), dist, dname):
    X_train, y_train, m_train = generate_sparse_dataset(
        parent=df,
        rate=4,
        dist_type=dist_type,
        drop_proportion=1,
        drop_max=None,
        size=500,
        prop=1,
    )
    msno.matrix(X_train, ax=ax, sparkline=False)
    ax.set_xlabel(caption, fontsize=20)

# Tag the panels "a)", "b)", ... just above their top-left corner.
for n, ax in enumerate(axes.flat):
    panel_tag = string.ascii_lowercase[n] + ")"
    ax.text(-0.07,
            1.05,
            panel_tag,
            transform=ax.transAxes,
            size=abc_size,
            weight="bold")

In [None]:
# Build a sparse dataset from the full audiogram frame using the "parent"
# dist_type (size and prop are left unset).
X_train, y_train, m_train = generate_sparse_dataset(
    parent=df,
    rate=3,
    dist_type="parent",
    drop_proportion=1,
    drop_max=None,
    size=None,
    prop=None,
)

## All features

In [None]:
import warnings


def reset_xticklabels(ax, **kwargs):
    """Re-apply the current x tick labels of ``ax`` with extra text options.

    Args:
        ax: Axes-like object exposing ``get_xticklabels`` and
            ``set_xticklabels``.
        **kwargs: Forwarded unchanged to ``ax.set_xticklabels`` (callers in
            this notebook pass e.g. ``rotation`` and ``fontsize``).

    Any warning raised while reading or re-setting the labels is silenced.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        current_labels = ax.get_xticklabels()
        ax.set_xticklabels(current_labels, **kwargs)

In [None]:
# Percentage of missing values per feature.
df_r = df_raw[[
    "Sex",
    "Race",
    "Age",
    "HL_duration",
    "Etiology",
    "125hz",
    "250hz",
    "500hz",
    "750hz",
    "1000hz",
    "1500hz",
    "2000hz",
    "3000hz",
    "4000hz",
    "6000hz",
    "8000hz",
    # Not included here: "CNC", "AzBio_Quiet", "AzBio_5dB", "AzBio_10dB"
]]
dataset_nullity = df_r.isnull()

# The mean of the boolean nullity mask is the fraction missing per column.
# (The earlier `.sum()` assignment was overwritten immediately and is gone.)
df_missing = (
    dataset_nullity.mean()
    .mul(100)
    .round(2)
    .rename_axis("Feature")
    .reset_index(name="% Missing")
)
df_missing.transpose()

In [None]:
# Summary statistics of the described columns (statistics as rows), with the
# "count" row removed and the 50% quantile labelled as the median.
(
    df_r.describe()
    .round(1)
    .drop("count")
    .rename(index={"50%": "median"})
)

In [None]:
def col_frequencies(df: pd.DataFrame,
                    col: str,
                    in_replace: dict = None,
                    out_rename: dict = None) -> pd.DataFrame:
    """Count how often each value occurs in one column of ``df``.

    Args:
        df: Source frame.
        col: Name of the column to tabulate.
        in_replace: Optional ``{old: new}`` mapping applied to the column
            values before counting, so several raw values can be merged.
        out_rename: Optional ``{old: new}`` mapping applied to the column
            names of the result.

    Returns:
        A frame with one row per distinct value, holding the value under
        ``col`` and its frequency under ``"count"`` (before ``out_rename``),
        ordered as ``value_counts`` returns them.
    """
    c = df[[col]]
    if in_replace is not None:
        c = c.replace(in_replace)
    # Name the counts explicitly: pandas < 2.0 leaves the value_counts result
    # unnamed (column 0 after reset_index), which would make callers'
    # {"count": ...} renames silently do nothing.
    df_c = c.value_counts().rename("count").reset_index()
    if out_rename is not None:
        df_c = df_c.rename(columns=out_rename)
    return df_c

### Sex

In [None]:
# Number of audiograms per value of the "Sex" column.
df_sex = col_frequencies(
    df_raw,
    col="Sex",
    out_rename={"count": "# Audiograms"},
)
df_sex.T

In [None]:
fig, axes = plt.subplots(nrows=1,
                         ncols=1,
                         figsize=(10, 8),
                         sharey=False,
                         squeeze=False)

# NOTE(review): the theme is configured after the figure has been created;
# confirm whether it is meant to style this figure (it would then need to
# precede plt.subplots). `palette` is computed but not passed to the plot.
sns.set_theme()
palette = sns.color_palette("bright", df_sex.shape[0])
sns.set_style("ticks", {"axes.grid": True})

for ax in axes.ravel():
    # Target the axes explicitly rather than the implicit current axes.
    sns.barplot(x="Sex", y="# Audiograms", data=df_sex, ax=ax)
    reset_xticklabels(ax, rotation=0)
    # Annotate every bar with its count.
    for container in ax.containers:
        _ = ax.bar_label(container, label_type="edge", fmt="%.0f")

### Etiology

In [None]:
# Short display labels for the etiology categories. Several raw categories
# map onto the same label and are therefore counted together below.
eti_shortform = {
    "Medication/Radiation": "Meds/Rad",
    "Hereditary/Familial": "Hereditary",
    "Meniere's Disease": "Meniere's",
    "Noise Exposure": "Noise",
    "Sudden Hearing Loss": "SSNHL",
    "Meningitis": "Infection",
    "Other": "  Other",
    "Otosclerosis": "Otosclerosis",
    "Head Trauma": "Trauma",
    "Genetic": "Hereditary",
    "Acoustic Neuroma": "Ac. Neuroma",
}

# Number of audiograms per (shortened) etiology label.
df_eti = col_frequencies(
    df_raw,
    col="Etiology",
    in_replace=eti_shortform,
    out_rename={"count": "# Audiograms"},
)
df_eti.T

In [None]:
fig, axes = plt.subplots(nrows=1,
                         ncols=1,
                         figsize=(20, 8),
                         sharey=False,
                         squeeze=False)

# NOTE(review): the theme is configured after the figure has been created;
# confirm whether it is meant to style this figure (it would then need to
# precede plt.subplots). `palette` is computed but not passed to the plot.
sns.set_theme()
palette = sns.color_palette("bright", df_eti.shape[0])
sns.set_style("ticks", {"axes.grid": True})

for ax in axes.ravel():
    # Target the axes explicitly rather than the implicit current axes.
    sns.barplot(x="Etiology", y="# Audiograms", data=df_eti, ax=ax)
    reset_xticklabels(ax, rotation=20)
    # Annotate every bar with its count.
    for container in ax.containers:
        _ = ax.bar_label(container, label_type="edge", fmt="%.0f")

### Audiogram

In [None]:
# Audiogram columns of the raw data, relabelled with the integer frequency
# (the trailing "hz" is stripped from each column name).
df_ag = df_raw[ag_cols].rename(
    columns={name: int(name[:-2]) for name in ag_cols})
df_ag.dropna(thresh=5).head()

In [None]:
from tools.ag_datasets import find_parent_frequency

num_drop_weights, col_drop_weights = find_parent_frequency(
    df_ag,
    min_col=1,
    verbose=0,
)

# One row per entry of num_drop_weights, expressed as a percentage rounded to
# one decimal. Presumably entry k is the share of audiograms with k
# frequencies missing, as the column labels suggest - confirm against
# find_parent_frequency.
df_drops = (
    pd.DataFrame([np.round(w * 100, 1) for w in num_drop_weights])
    .reset_index()
    .rename(columns={"index": "# Frequencies Missing", 0: "% Audiograms"})
)
df_drops.transpose()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(20, 12))

# The y limits are given high-to-low, which inverts the axis.
ax.set(ylim=(130, -10),
       ylabel="Hearing level (dB)",
       xlabel="Frequency")

# One box per frequency column.
_ = sns.boxplot(ax=ax, data=df_ag)

### Age

In [None]:
# Age and hearing-loss-duration columns of the raw data; the preview shows
# only rows where both values are present.
df_age = df_raw.loc[:, ["Age", "HL_duration"]]
df_age.dropna(thresh=2).head()

In [None]:
# The former `ax1 = sns.set_style(style=None, rc=None)` line was removed: with
# no style and no rc it changes nothing, and its None result was overwritten
# by the subplots call below.
fig, ax1 = plt.subplots(figsize=(6, 6))

# One box per column of df_age.
_ = sns.boxplot(data=df_age, ax=ax1)

### Plots

In [None]:
def set_theme(subplot_labels=False):
    """Apply the notebook's seaborn theme and create the 2x3 summary figure.

    Args:
        subplot_labels: When true, write a bold capital letter ("A", "B",
            ...) next to the top-left corner of every subplot.

    Returns:
        ``(fig, axes)`` where ``axes`` is a 2x3 array of matplotlib axes.
    """
    # A single set_theme call is enough: it resets context, style and palette,
    # so the earlier set_theme()/set_style("ticks", ...) calls were overridden
    # before any figure was created.
    sns.set_theme(rc={
        "figure.figsize": (11.7, 8.27),
        "font.size": 20,
        "axes.titlesize": 20,
        "axes.labelsize": 12
    },
                  style="white")

    fig, axes = plt.subplots(nrows=2,
                             ncols=3,
                             figsize=(27, 16),
                             sharey=False,
                             squeeze=False)
    # Figure margins and the blank space reserved between the subplots.
    fig.subplots_adjust(left=0.125,
                        right=0.9,
                        bottom=0.1,
                        top=0.9,
                        wspace=0.3,
                        hspace=0.3)
    if subplot_labels:
        for i, ax in enumerate(axes.flat):
            ax.text(-0.1,
                    1.15,
                    chr(ord("A") + i),
                    transform=ax.transAxes,
                    fontsize=16,
                    fontweight="bold",
                    va="top",
                    ha="right")
    return fig, axes

In [None]:
import warnings

# Shared defaults for the plot_* helpers: axis-label font, single-colour
# palette and tick-label size.
font_subheader = dict(size=30, weight="bold")
default_palette = sns.color_palette("pastel", 1)
default_ticksize = 20


def plot_audiogram_drops(ax,
                         palette=default_palette,
                         fontdict=font_subheader,
                         ticksize=default_ticksize):
    """Bar plot of "% Audiograms" against "# Frequencies Missing" on ``ax``.

    Reads the notebook-level ``df_drops`` frame, which must carry exactly
    those two column names.

    Args:
        ax: Target matplotlib axes.
        palette: Palette forwarded to ``sns.barplot``.
        fontdict: Font properties for both axis labels.
        ticksize: Font size for the tick labels on both axes.

    Returns:
        Whatever ``sns.barplot`` returns.
    """
    x_name, y_name = "# Frequencies Missing", "% Audiograms"

    # NOTE(review): every warning raised by seaborn here is silenced
    # (presumably the palette-without-hue deprecation - confirm).
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        bars = sns.barplot(data=df_drops,
                           x=df_drops[x_name],
                           y=df_drops[y_name],
                           palette=palette,
                           legend=False,
                           ax=ax)

    ax.set_ylim([0, 24])
    reset_xticklabels(ax, rotation=0, fontsize=ticksize)
    ax.tick_params(axis="y", labelsize=ticksize)

    ax.set_xlabel(x_name, fontdict=fontdict)
    ax.set_ylabel(y_name, fontdict=fontdict)

    return bars

In [None]:
def plot_age_hld(ax,
                 palette=default_palette,
                 fontdict=font_subheader,
                 ticksize=default_ticksize):
    """Box plot of the columns of the notebook-level ``df_age`` on ``ax``.

    Args:
        ax: Target matplotlib axes.
        palette: Palette forwarded to ``sns.boxplot``.
        fontdict: Font properties for both axis labels.
        ticksize: Font size for the tick labels on both axes.

    Returns:
        Whatever ``sns.boxplot`` returns.
    """
    # Warnings raised by seaborn while drawing are silenced.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        boxes = sns.boxplot(ax=ax, data=df_age, palette=palette, width=0.5)

    ax.set_xlabel("Feature", fontdict=fontdict)
    ax.set_ylabel("Years", fontdict=fontdict)
    ax.set_ylim([0, 109])

    reset_xticklabels(ax, rotation=0, fontsize=ticksize)
    ax.tick_params(axis="y", labelsize=ticksize)
    return boxes

In [None]:
def plot_gender(ax,
                palette=default_palette,
                fontdict=font_subheader,
                ticksize=default_ticksize):
    """Bar plot of "# Audiograms" per "Sex" on ``ax``.

    Reads the notebook-level ``df_sex`` frame.

    Args:
        ax: Target matplotlib axes.
        palette: Palette forwarded to ``sns.barplot``.
        fontdict: Font properties for both axis labels.
        ticksize: Font size for the tick labels on both axes.

    Returns:
        Whatever ``sns.barplot`` returns.
    """
    x_name, y_name = "Sex", "# Audiograms"

    # Warnings raised by seaborn while drawing are silenced.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        bars = sns.barplot(data=df_sex,
                           x=df_sex[x_name],
                           y=df_sex[y_name],
                           palette=palette,
                           ax=ax)

    reset_xticklabels(ax, rotation=0, fontsize=ticksize)
    ax.tick_params(axis="y", labelsize=ticksize)

    ax.set_xlabel(x_name, fontdict=fontdict)
    ax.set_ylabel(y_name, fontdict=fontdict)
    # Fixed y range for this panel.
    ax.set_ylim((0, 4800))
    return bars

In [None]:
def plot_missing(ax,
                 palette=default_palette,
                 fontdict=font_subheader,
                 ticksize=default_ticksize):
    """Bar plot of "% Missing" per "Feature" on ``ax``.

    Reads the notebook-level ``df_missing`` frame. Returns nothing.

    Args:
        ax: Target matplotlib axes.
        palette: Palette forwarded to ``sns.barplot``.
        fontdict: Font properties for both axis labels.
        ticksize: Font size for the tick labels on both axes.
    """
    x_name, y_name = "Feature", "% Missing"

    # Warnings raised by seaborn while drawing are silenced.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        sns.barplot(data=df_missing,
                    x=df_missing[x_name],
                    y=df_missing[y_name],
                    palette=palette,
                    ax=ax)

    ax.set_ylim([0, 99])
    # Feature names are rotated to vertical.
    reset_xticklabels(ax, rotation=90, fontsize=ticksize)
    ax.tick_params(axis="y", labelsize=ticksize)

    ax.set_xlabel(x_name, fontdict=fontdict, labelpad=2)
    ax.set_ylabel(y_name, fontdict=fontdict)

In [None]:
def plot_audiogram(ax,
                   palette=default_palette,
                   fontdict=font_subheader,
                   ticksize=default_ticksize):
    """Box plot of the notebook-level ``df_ag`` columns on ``ax``.

    Args:
        ax: Target matplotlib axes.
        palette: Palette forwarded to ``sns.boxplot``.
        fontdict: Font properties for both axis labels.
        ticksize: Font size for the tick labels on both axes.

    Returns:
        Whatever ``sns.boxplot`` returns.
    """
    # Limits and labels are set before drawing; the y limits are given
    # high-to-low, which inverts the axis.
    ax.set_ylim(130, -12)
    ax.set_ylabel("Hearing level (dB)", fontdict=fontdict)
    ax.set_xlabel("Frequency", fontdict=fontdict, labelpad=20)

    # Warnings raised by seaborn while drawing are silenced.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        boxes = sns.boxplot(ax=ax, data=df_ag, palette=palette)

    reset_xticklabels(ax, rotation=90, fontsize=ticksize)
    ax.tick_params(axis="y", labelsize=ticksize)
    return boxes

In [None]:
def plot_etiology(ax,
                  palette=default_palette,
                  fontdict=font_subheader,
                  ticksize=default_ticksize):
    """Bar plot of "# Audiograms" per "Etiology" on ``ax``.

    Reads the notebook-level ``df_eti`` frame.

    Args:
        ax: Target matplotlib axes.
        palette: Palette forwarded to ``sns.barplot``.
        fontdict: Font properties for both axis labels.
        ticksize: Font size for the tick labels on both axes.

    Returns:
        Whatever ``sns.barplot`` returns.
    """
    x_name, y_name = "Etiology", "# Audiograms"

    # Warnings raised by seaborn while drawing are silenced.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        bars = sns.barplot(data=df_eti,
                           x=df_eti[x_name],
                           y=df_eti[y_name],
                           palette=palette,
                           ax=ax)

    # Pin the lower y limit to zero and keep the automatically chosen top.
    _, upper = ax.get_ylim()
    ax.set_ylim((0, upper))

    reset_xticklabels(ax, rotation=90, fontsize=ticksize)
    ax.tick_params(axis="y", labelsize=ticksize)
    ax.set_xlabel(x_name, fontdict=fontdict, labelpad=-11)
    ax.set_ylabel(y_name, fontdict=fontdict)
    return bars

In [None]:
fig, axes = set_theme()

# One helper per panel, in row-major order (top row first, left to right).
panel_plotters = (
    plot_audiogram_drops,
    plot_age_hld,
    plot_gender,
    plot_missing,
    plot_audiogram,
    plot_etiology,
)
for draw_panel, ax in zip(panel_plotters, axes.flat):
    _ = draw_panel(ax)

# Tag the panels "a)", "b)", ... near their top-left corner.
for n, ax in enumerate(axes.flat):
    panel_tag = string.ascii_lowercase[n] + ")"
    _ = ax.text(-0.09,
                0.98,
                panel_tag,
                transform=ax.transAxes,
                size=35,
                weight="bold")

## Table of missing-value percentages

In [None]:
# Number of missing values per feature, shown as a single-row table.
dataset_nullity = df_r.isnull()
dataset_nullity.sum().to_frame(name="# Missing").T

As a percentage

In [None]:
# Percentage of missing values per feature; values are kept unrounded in the
# frame and rounded only for display.
missing_values_percent = (
    dataset_nullity.mean()
    .mul(100)
    .rename_axis("Feature")
    .reset_index(name="% Missing")
)
missing_values_percent.round(2).T

In [None]:
fig, axes = plt.subplots(nrows=1,
                         ncols=1,
                         figsize=(10, 8),
                         sharey=False,
                         squeeze=False)

# NOTE(review): the theme is configured after the figure has been created;
# confirm whether it is meant to style this figure (it would then need to
# precede plt.subplots). `palette` is computed but not passed to the plot.
sns.set_theme()
palette = sns.color_palette("bright", missing_values_percent.shape[0])
sns.set_style("ticks", {"axes.grid": True})

for ax in axes.ravel():
    # Target the axes explicitly rather than the implicit current axes.
    sns.barplot(x=missing_values_percent["Feature"],
                y=missing_values_percent["% Missing"],
                data=missing_values_percent,
                ax=ax)
    _ = ax.set(ylim=[0, 100], )
    reset_xticklabels(ax, rotation=30)
    # Annotate every bar with its percentage.
    for container in ax.containers:
        _ = ax.bar_label(container, label_type="edge", fmt="%.1f")

## Percentage of audiograms by number of frequencies dropped

In [None]:
num_drop_weights, col_drop_weights = find_parent_frequency(
    df_ag,
    min_col=1,
    verbose=0,
)

# NOTE(review): this rebinds `df_drops` with different column names than
# plot_audiogram_drops reads ("# Frequencies Missing" / "% Audiograms"), so
# that helper fails if it is called again after this cell has run.
df_drops = (
    pd.DataFrame([np.round(w * 100, 1) for w in num_drop_weights])
    .reset_index()
    .rename(columns={"index": "# dropped", 0: "% of audiograms"})
)
df_drops.transpose()

In [None]:
fig, axes = plt.subplots(nrows=1,
                         ncols=1,
                         figsize=(10, 8),
                         sharey=False,
                         squeeze=False)

# NOTE(review): the theme is configured after the figure has been created;
# confirm whether it is meant to style this figure (it would then need to
# precede plt.subplots). `palette` is computed but not passed to the plot.
sns.set_theme()
# Size the palette from the frame plotted here; it was previously sized from
# missing_values_percent, a leftover from the cell this one was copied from.
palette = sns.color_palette("bright", df_drops.shape[0])
sns.set_style("ticks", {"axes.grid": True})

for ax in axes.ravel():
    # Target the axes explicitly rather than the implicit current axes.
    sns.barplot(x="# dropped", y="% of audiograms", data=df_drops, ax=ax)
    _ = ax.set(ylim=[0, 25], )
    reset_xticklabels(ax, rotation=0)
    # Annotate every bar with its percentage.
    for container in ax.containers:
        _ = ax.bar_label(container, label_type="edge", fmt="%.1f")

### ...