In [1]:
import json
import random
import numpy as np
import pandas as pd
from pprint import pprint
#
import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots
#
from xgboost import XGBClassifier
from xgboost import XGBModel
from xgboost import Booster
#
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle

import PDSUtilities
from PDSUtilities.xgboost import plot_importance
from PDSUtilities.xgboost import plot_tree
from PDSUtilities.plotly import ColorblindSafeColormaps
# from PDSUtilities.pandas import plot_histograms
# print("Using PDSUtilities version ", PDSUtilities.__version__)

from plotly.offline import init_notebook_mode, iplot
# Initialise Plotly's offline notebook integration before any figure is shown.
# NOTE(review): connected=True presumably loads plotly.js from a CDN rather than
# embedding it in the notebook — confirm if the notebook must render offline.
init_notebook_mode(connected=True)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# # Colorblindness friendly colours...
# # It is important to make our work
# # as accessible as possible...
# COLORMAP = ["#005ab5", "#DC3220"]
# Labels for plotting...
# Maps each DataFrame column name to a shorter display label. The keys use
# the corrected names "OldPeak" and "STSlope", i.e. the names the columns
# carry after the rename performed in a later cell.
# NOTE(review): no visible cell reads LABELS yet — confirm it is still needed.
LABELS = {
    "Sex": "Sex",
    "Age": "Age",
    "MaxHR": "Max HR",
    "OldPeak": "Old Peak",
    "STSlope": "ST Slope",
    "RestingBP": "Rest. BP",
    "FastingBS": "Fast. BS",
    "RestingECG": "Rest. ECG",
    "Cholesterol": "Cholesterol",
    "HeartDisease": "Heart Disease",
    "ChestPainType": "Chest Pain",
    "ExerciseAngina": "Ex. Angina",
}
# Random seed for determinism...
# Used to seed the NumPy generator in set_column_value_to_normal_distribution.
SEED = 395147


In [2]:
# Load the dataset from the local CSV file into a DataFrame...
df = pd.read_csv("./data/heart.csv")
# Show the first rows as a quick check that the columns parsed as expected...
df.head()


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
# Fix the egregious column naming error...
df = df.rename(columns = {"ST_Slope": "STSlope", "Oldpeak": "OldPeak"})

# Always test these things...
# Test for the column labels themselves: indexing a missing column raises
# KeyError before an assert (and its message) would ever be evaluated, and
# a length check only says whether the frame has rows, not whether the
# rename worked.
assert "STSlope" in df.columns, "Ruh roh! ST_Slope is still terribly mistaken!"
assert "OldPeak" in df.columns, "Ruh roh! Oldpeak is still terribly mistaken!"

# Convert target to categorical and store its integer codes, so that
# HeartDisease holds one small integer per distinct class...
target = pd.Categorical(df["HeartDisease"])
df["HeartDisease"] = target.codes

print("Datatypes")
print("---------")
print(df.dtypes)


Datatypes
---------
Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
OldPeak           float64
STSlope            object
HeartDisease         int8
dtype: object


In [4]:
# Split the column names by dtype: object-typed columns are treated as
# categorical, every other column as numerical...
categorical_columns = df.columns[df.dtypes == object].tolist()
numerical_columns   = df.columns[df.dtypes != object].tolist()

# HeartDisease is the target, so it must not be listed among the features...
if numerical_columns.count("HeartDisease") > 0:
    numerical_columns.remove("HeartDisease")

assert "HeartDisease" not in numerical_columns, "Ruh roh! HeartDisease is still in numerical_columns!"

print("Categorical Columns: ", categorical_columns)
print("  Numerical Columns: ", numerical_columns)


Categorical Columns:  ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'STSlope']
  Numerical Columns:  ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'OldPeak']


In [5]:
def set_column_value_to_normal_distribution(df, column, value, seed = None):
    """Replace every entry of ``column`` equal to ``value`` with a random sample.

    The samples are drawn from a normal distribution whose mean and standard
    deviation are computed from the rows of ``column`` that do NOT equal
    ``value``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is left untouched; a modified copy is returned.
    column : str
        Name of the column to repair.
    value : scalar
        The value whose occurrences are replaced.
    seed : optional
        Seed passed to ``np.random.default_rng``. When None, the module-level
        ``SEED`` is used. A fresh generator is created on every call, so two
        calls with the same seed start from the same random state.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df``. When at least one entry was replaced, ``column`` is
        stored as float64; when nothing matched, the copy is unchanged.
    """
    # Build the match mask once and reuse it for both the statistics and the
    # replacement...
    matches = (df[column] == value).to_numpy()
    # Work on a copy so the caller's frame is never modified in place...
    result = df.copy()
    replacements = int(matches.sum())
    if replacements == 0:
        return result
    # Statistics come only from the rows that do not hold the value...
    observed = df[column][~matches]
    rng = np.random.default_rng(SEED if seed is None else seed)
    # One vectorised draw instead of one generator call per row...
    samples = rng.normal(observed.mean(), observed.std(), size = replacements)
    filled = df[column].to_numpy(dtype = "float64", copy = True)
    filled[matches] = samples
    result[column] = filled
    return result

# Replace the zero-valued entries of both columns, one column after the other...
df = (
    df
    .pipe(set_column_value_to_normal_distribution, "RestingBP", 0)
    .pipe(set_column_value_to_normal_distribution, "Cholesterol", 0)
)

# Always test: neither column may contain a zero any more...
assert not (df["RestingBP"] == 0).any(), "Ruh roh! One or more patients has crashed again!"
assert not (df["Cholesterol"] == 0).any(), "Ruh roh! One or more patients has crashed again!"


In [37]:
# Copyright 2022 by Contributors

import numpy as np
import pandas as pd
import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from PDSUtilities.plotly import ColorblindSafeColormaps

# Plotly is too smart and converts strings to numbers when
# possible but we're smarter: wrap numbers in <span></span>!
def to_string(value):
    """Return ``value`` as a label string.

    Strings are returned unchanged; any other value is formatted and wrapped
    in ``<span>`` tags.
    """
    if isinstance(value, str):
        return value
    return f"<span>{value}</span>"

def get_categories_and_counts(df, column, target = None, value = None):
    """Count how often each distinct value of ``column`` occurs.

    The categories are always the distinct values of ``column`` in the full
    frame, in order of first appearance. When ``target`` is given, only rows
    where ``df[target] == value`` are counted, so a category that is absent
    from that subset is still reported, with a count of 0.

    Parameters
    ----------
    df : pandas.DataFrame
    column : str
        Column whose values are counted.
    target : str, optional
        Column used to restrict the rows that are counted.
    value : scalar, optional
        Value ``target`` must equal for a row to be counted.

    Returns
    -------
    (list of str, list of int)
        Category labels (passed through ``to_string``) and their counts, in
        matching order.
    """
    categories = df[column].unique()
    if target is not None:
        df = df[df[target] == value]
    # Count every category in a single pass, then line the counts up with
    # the category order taken from the unfiltered frame...
    counts = df[column].value_counts().reindex(categories, fill_value = 0).tolist()
    categories = [to_string(category) for category in categories]
    return categories, counts

def get_width(index):
    """Look up a bar width for ``index``.

    ``index`` selects an entry of a fixed width table; any index that is not
    below the table length gets the last (largest) entry.
    """
    widths = (0.0, 0.1, 0.23, 0.34, 0.46, 0.65, 0.8)
    if not (index < len(widths)):
        return widths[-1]
    return widths[index]

def apply_default(parameter, default):
    """Overlay ``parameter`` on top of ``default``.

    A falsy ``parameter`` (None, empty dict) returns ``default`` itself;
    otherwise a new dict is returned in which the entries of ``parameter``
    take precedence over those of ``default``.
    """
    if not parameter:
        return default
    return { **default, **parameter }

def get_bins(bins, column):
    """Resolve the bin count for ``column``.

    ``bins`` is either an int that applies to every column, or a mapping
    from column name to bin count; columns missing from the mapping get 0.
    """
    if not isinstance(bins, int):
        return bins.get(column, 0)
    return bins

def get_histogram(x, color, show_legend, name, cumulative, bins):
    """Build a ``go.Histogram`` trace for the values in ``x``.

    ``name`` is used both as the trace name and as its legend group;
    ``cumulative`` is forwarded as ``cumulative_enabled`` and ``bins`` as
    ``nbinsx``.
    """
    options = {
        "x": x,
        "name": name,
        "legendgroup": name,
        "marker_color": color,
        "showlegend": show_legend,
        "cumulative_enabled": cumulative,
        "nbinsx": bins,
    }
    return go.Histogram(**options)

def get_bar(categories, counts, color, show_legend, name):
    """Build a ``go.Bar`` trace with one bar per category.

    ``name`` is used both as the trace name and as its legend group; the
    bar width is looked up from the number of categories via ``get_width``.
    """
    bar_width = get_width(len(categories))
    options = {
        "x": categories,
        "y": counts,
        "name": name,
        "legendgroup": name,
        "marker_color": color,
        "showlegend": show_legend,
        "width": bar_width,
    }
    return go.Bar(**options)

def get_rcwh(rows, cols, width, height, columns, values):
    """Fill in whichever of rows / cols / width / height are None.

    Grid shape:
      * neither given: cols is the rounded-up square root of the number of
        columns, clamped to the range 2..5, and rows follows from cols;
      * only one given: the other is the rounded-up quotient.

    Figure size (250 wide and 200 high per grid cell): when the grid has
    more than two columns, or exactly two columns and fewer than four
    target values, the height gets an extra 100; otherwise the width gets
    one extra cell instead.

    Returns the tuple ``(rows, cols, width, height)``.
    """
    cell_width, cell_height = 250, 200
    count = len(columns)
    if cols is None:
        if rows is None:
            cols = max(2, min(5, int(np.ceil(np.sqrt(count)))))
        else:
            cols = int(np.ceil(count/rows))
    if rows is None:
        rows = int(np.ceil(count/cols))
    # Only look at `values` when a size actually has to be derived...
    if width is None or height is None:
        roomy = cols > 2 or (cols == 2 and len(values) < 4)
        if width is None:
            width = cell_width*cols if roomy else cell_width*(cols + 1)
        if height is None:
            height = (100 + cell_height*rows) if roomy else cell_height*rows
    return rows, cols, width, height

def plot_histograms(df, target = None, rows = None, cols = None, width = None, height = None,
    title = None, cumulative = None, barmode = "stack", opacity = 0.65, bins = 0,
    hovermode = None, template = None, colors = 0, font = {}, title_font = {}, legend_font = {}):
    """Plot every column of ``df`` in its own subplot.

    A column is drawn as a bar chart of category counts when its dtype is
    ``object`` or when it has no more distinct values than the selected
    colormap has colors; otherwise it is drawn as a histogram. When
    ``target`` is given, the target column itself is not plotted and every
    subplot gets one trace per distinct target value; only the traces of the
    first subplot appear in the legend.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    target : str, optional
        Column whose distinct values split each subplot into several traces.
    rows, cols : int, optional
        Grid shape; whatever is None is derived by ``get_rcwh``.
    width, height : int, optional
        Figure size; whatever is None is derived by ``get_rcwh``.
    title : str or dict, optional
        Figure title. A plain string is centred horizontally.
    cumulative : bool, optional
        Forwarded to the histogram traces as ``cumulative_enabled``.
    barmode : str
        Layout ``barmode``; with "overlay", ``opacity`` is applied to all traces.
    opacity : float
        Trace opacity, used only when ``barmode == "overlay"``.
    bins : int or dict
        One bin count for every histogram, or a mapping from column name to
        bin count (columns missing from the mapping get 0).
    hovermode : str, optional
        Layout ``hovermode``; "x unified" when None.
    template : optional
        Layout template, applied only when not None.
    colors : optional
        Selector handed to ``ColorblindSafeColormaps.get_colors``; None is
        treated as 0.
    font, title_font, legend_font : dict
        Overrides merged over the defaults. ``legend_font`` defaults to the
        resolved ``font``; ``title_font`` defaults to the resolved ``font``
        with its size increased by 4. The default dicts are never mutated.

    Returns
    -------
    The figure created by ``make_subplots`` with all traces and layout applied.
    """
    DEFAULT_FONT = {
        'family': "Verdana, Helvetica, Verdana, Calibri, Garamond, Cambria, Arial",
        'size': 14,
        'color': "#000000"
    }
    font = apply_default(font, DEFAULT_FONT)
    legend_font = apply_default(legend_font, font)
    title_font = apply_default(title_font,
        apply_default({ 'size': font.get('size', 16) + 4 }, font)
    )
    colors = 0 if colors is None else colors
    colormaps = ColorblindSafeColormaps()
    colors = colormaps.get_colors(colors)
    if hovermode is None:
        hovermode = "x unified"
    #
    values = [] if target is None else list(df[target].unique())
    columns = [column for column in df.columns if column != target]
    rows, cols, width, height = get_rcwh(rows, cols, width, height, columns, values)
    fig = make_subplots(rows = rows, cols = cols,
        horizontal_spacing = 0.25/cols,
        vertical_spacing = 0.37/rows,
        subplot_titles = columns,
    )
    for index, column in enumerate(columns):
        row, col = 1 + index // cols, 1 + index % cols
        # Resolve the bin count once per column, BEFORE branching on target:
        # it used to be set only inside the per-value loop, which left it
        # undefined for histograms drawn without a target...
        max_bins = get_bins(bins, column)
        # Few distinct values (or text) means bars; anything else a histogram...
        is_categorical = df[column].dtypes == object or len(df[column].unique()) <= len(colors)
        if target is None:
            if is_categorical:
                categories, counts = get_categories_and_counts(df, column)
                trace = get_bar(categories, counts, colors[0], False, column)
            else:
                trace = get_histogram(df[column], colors[0], False, column, cumulative, max_bins)
            fig.append_trace(trace, row, col)
            continue
        for position, value in enumerate(values):
            name = f"{target} = {value}"
            # Cycle through the colormap when there are more values than colors...
            color = colors[position % len(colors)]
            # Only the first subplot contributes legend entries; the shared
            # legend group ties the other subplots' traces to them...
            if is_categorical:
                categories, counts = get_categories_and_counts(df, column, target, value)
                trace = get_bar(categories, counts, color, index == 0, name)
            else:
                x = df[df[target] == value][column]
                trace = get_histogram(x, color, index == 0, name, cumulative, max_bins)
            fig.append_trace(trace, row, col)
    # barmode = ['stack', 'group', 'overlay', 'relative']
    if barmode == "overlay":
        fig.update_traces(opacity = opacity)
    fig.update_annotations(font = font)
    fig.update_traces(marker_line_color = "#000000", marker_line_width = 0.5)
    if title is not None and isinstance(title, str):
        title = { 'text': title, 'x': 0.5, 'xanchor': "center" }
    if title is not None:
        fig.update_layout(title = title)
    if template is not None:
        fig.update_layout(template = template)
    # Same condition get_rcwh uses when it adds extra height: put the legend
    # horizontally above the grid and reserve top margin for it...
    if cols > 2 or cols == 2 and len(values) < 4:
        fig.update_layout(
            legend = dict(
                orientation = "h",
                yanchor = "bottom",
                y = 1.0 + 2.0*cols/100.0,
                xanchor = "center",
                x = 0.5
            ),
            margin = { 't': 160 },
        )
    fig.update_layout(hovermode = hovermode,
        width = width, height = height, barmode = barmode,
        font = font, title_font = title_font, legend_font = legend_font,
    )
    # A trailing space as tick suffix puts a little room between
    # the y-axis ticks and their labels...
    fig.update_yaxes(ticksuffix = " ")
    return fig


In [42]:
###

# fig = plot_histograms(df, target = None)
# fig.show()

# fig = plot_histograms(df, cols = 3, target = "HeartDisease", title = "Heart Disease Dataset Histograms", template = "presentation")
# fig.show()

# fig = plot_histograms(df, height = 800, cols = 4, target = "HeartDisease", template = "simple_white",
#     cumulative = True, barmode = "group")
# fig.show()

# fig = plot_histograms(df, width = 1100, height = 700, cols = 4, target = "HeartDisease", title = "Heart Disease Dataset Histograms", template = "presentation",
#                      barmode = "overlay")
# fig.show()

# Grouped bars, one shared bin count for every histogram...
fig = plot_histograms(
    df,
    target = "ChestPainType",
    bins = 25,
    colors = "Vibrant",
    barmode = "group",
    title = "Dataset Histograms",
    template = "presentation",
)
fig.show()

# Grouped bars, bin counts chosen per column...
fig = plot_histograms(
    df,
    target = "ChestPainType",
    bins = {"Age": 10, "MaxHR": 20, "Cholesterol": 25},
    colors = "Vibrant",
    barmode = "group",
    title = "Dataset Histograms",
    template = "presentation",
)
fig.show()

# Default colors and bar mode...
fig = plot_histograms(
    df,
    target = "ChestPainType",
    bins = 20,
    title = "Heart Disease Dataset Histograms Grouped by Chest Pain Type",
    template = "presentation",
)
fig.show()

# fig = plot_histograms(df, cols = 3, target = "HeartDisease", colors = -1, barmode = "overlay", title = "Heart Disease Dataset Grouped by Chest Pain Type", template = "simple_white")
# fig.show()
