In [1]:
import json
import random
import numpy as np
import pandas as pd
from pprint import pprint
#
import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots
#
from xgboost import XGBClassifier
from xgboost import XGBModel
from xgboost import Booster
#
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle

import PDSUtilities
from PDSUtilities.xgboost import plot_importance
from PDSUtilities.xgboost import plot_tree
from PDSUtilities.plotly import ColorblindSafeColormaps

# from PDSUtilities.pandas import plot_histograms
# print("Using PDSUtilities version ", PDSUtilities.__version__)

# from plotly.offline import init_notebook_mode, iplot
# init_notebook_mode(connected = True)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# # Colorblindness friendly colours...
# # It is important to make our work
# # as accessible as possible...
# COLORMAP = ["#005ab5", "#DC3220"]
# Display text for each dataframe column, keyed by
# the column names used after the renaming step...
LABELS = dict(
    Sex = "Sex",
    Age = "Age",
    MaxHR = "Max HR",
    OldPeak = "Old Peak",
    STSlope = "ST Slope",
    RestingBP = "Rest. BP",
    FastingBS = "Fast. BS",
    RestingECG = "Rest. ECG",
    Cholesterol = "Cholesterol",
    HeartDisease = "Heart Disease",
    ChestPainType = "Chest Pain",
    ExerciseAngina = "Ex. Angina",
)
# Fixed seed so that random draws are repeatable...
SEED = 395_147

# Axis styling shared by the x and y axes
# of the custom plotly template below...
layout_axis = {
    "mirror": True,
    "ticks": "outside",
    "showline": True,
    "title_standoff": 5,
    "showgrid": True,
}
# Register the custom template and layer it
# on top of plotly's "simple_white" default...
pio.templates["DrJohnWagner"] = go.layout.Template(
    layout = dict(
        xaxis = layout_axis,
        yaxis = layout_axis,
        title = dict(font = dict(size = 18)),
        font = dict(size = 16),
    ),
)
pio.templates.default = "+".join(["simple_white", "DrJohnWagner"])

In [2]:
# Read the heart disease records from the csv
# file and preview the first five rows...
df = pd.read_csv(filepath_or_buffer = "./data/heart.csv")
df.head(n = 5)


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
# Fix the egregious column naming error...
df = df.rename(columns = {"ST_Slope": "STSlope", "Oldpeak": "OldPeak"})

# Always test these things. Membership is checked before indexing because
# indexing a missing column raises KeyError before the assertion message
# could ever be reported...
assert "STSlope" in df.columns and len(df["STSlope"]) > 0, "Ruh roh! ST_Slope is still terribly mistaken!"
assert "OldPeak" in df.columns and len(df["OldPeak"]) > 0, "Ruh roh! Oldpeak is still terribly mistaken!"

# Convert target to categorical and store its integer codes
# (this is why HeartDisease is listed as int8 below)...
target = pd.Categorical(df["HeartDisease"])
df["HeartDisease"] = target.codes

print("Datatypes")
print("---------")
print(df.dtypes)


Datatypes
---------
Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
OldPeak           float64
STSlope            object
HeartDisease         int8
dtype: object


In [4]:
# Split the column names by dtype: object columns are treated
# as categorical and every other column as numerical...
categorical_columns = [name for name, dtype in df.dtypes.items() if dtype == object]
numerical_columns   = [name for name, dtype in df.dtypes.items() if dtype != object]

# The target is what we predict, so it must not be listed as a feature...
numerical_columns = [name for name in numerical_columns if name != "HeartDisease"]

assert "HeartDisease" not in numerical_columns, "Ruh roh! HeartDisease is still in numerical_columns!"

print("Categorical Columns: ", categorical_columns)
print("  Numerical Columns: ", numerical_columns)


Categorical Columns:  ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'STSlope']
  Numerical Columns:  ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'OldPeak']


In [5]:
def set_column_value_to_normal_distribution(df, column, value, rng = None):
    """
    Replace every entry of `column` equal to `value` with a random sample
    from a normal distribution fitted to the remaining entries.

    The mean and standard deviation are computed from the rows whose
    `column` entry differs from `value`. The input dataframe is left
    untouched; a modified copy is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    column : str
        Name of the column to repair.
    value : scalar
        Placeholder marking the entries to replace (e.g. 0).
    rng : numpy.random.Generator, optional
        Generator used for sampling. When omitted a new generator seeded
        with the module-level SEED is created, as in the original code.

    Returns
    -------
    pandas.DataFrame
        Copy of `df`. If at least one entry was replaced, `column` is
        stored as float; otherwise the column keeps its dtype.
    """
    # Build the row mask once and reuse it for the statistics and the update...
    is_placeholder = df[column] == value
    observed = df.loc[~is_placeholder, column]
    mean_value = observed.mean()
    std_value  = observed.std()
    # Create a random number generator unless the caller supplied one...
    if rng is None:
        rng = np.random.default_rng(SEED)
    # Work on a copy so the caller's dataframe is never changed in place...
    result = df.copy()
    count = int(is_placeholder.sum())
    if count > 0:
        # Samples are floats, so the column is converted before assignment
        # and all replacements are drawn in a single vectorised call...
        replaced = df[column].to_numpy(dtype = float, copy = True)
        replaced[is_placeholder.to_numpy()] = rng.normal(mean_value, std_value, size = count)
        result[column] = replaced
    return result

# Zero readings in these two columns are replaced by random
# samples drawn from the distribution of the other rows...
df = set_column_value_to_normal_distribution(df, column = "RestingBP", value = 0)
df = set_column_value_to_normal_distribution(df, column = "Cholesterol", value = 0)

# Always test: no zero readings may remain in either column...
assert (df["RestingBP"] != 0).all(), "Ruh roh! One or more patients has crashed again!"
assert (df["Cholesterol"] != 0).all(), "Ruh roh! One or more patients has crashed again!"


In [6]:
# Copyright 2022 by Contributors

import numpy as np
import pandas as pd
import plotly.io as pio
import plotly.graph_objects as go
from pandas.api.types import is_integer_dtype
from PDSUtilities.plotly import apply_default
from PDSUtilities.plotly import get_font
from PDSUtilities.plotly import ColorblindSafeColormaps

def get_line(df, target, colors):
    """
    Build the `line` settings dict used to colour a parallel plot trace.

    Without a target every line gets `colors[0]`. With a target the lines
    are coloured by the target column (object columns are converted to
    category codes first) and the colour scale holds one entry per
    distinct target value.

    Parameters
    ----------
    df : pandas.DataFrame
        Data being plotted.
    target : str or None
        Column whose values select the colour of each line.
    colors : sequence
        Available colours; must contain at least one entry.

    Returns
    -------
    dict
        Keys `color` and `showscale`, plus `colorscale` when a target
        is given.
    """
    line = dict(
        color = colors[0],
        showscale = False,
    )
    if target is not None:
        values = df[target]
        if df[target].dtypes == 'O':
            values = df[target].astype('category').cat.codes
        line['color'] = values
        # Wrap around the palette when the target has more distinct values
        # than there are colours; indexing directly raised IndexError...
        line['colorscale'] = [
            colors[index % len(colors)] for index in range(len(np.unique(values)))
        ]
    return line

def get_dimension(df, column, labels):
    """
    Describe one dataframe column as a parallel-coordinates dimension.

    Object columns are replaced by their category codes, with one tick
    per code labelled by the category. Integer columns with at most
    eight distinct values get one tick per distinct value. Every other
    column is passed through without explicit ticks.

    Returns a dict with `values`, `label` and `name`, plus `tickvals`
    and `ticktext` in the two cases above.
    """
    series = df[column]
    dimension = {
        'values': series,
        'label': labels.get(column, column),
        'name': column,
    }
    if series.dtypes == 'O':
        # Text column: plot the integer codes, label ticks with the text...
        accessor = series.astype('category').cat
        dimension.update(
            values = accessor.codes,
            tickvals = np.sort(np.unique(accessor.codes)),
            ticktext = accessor.categories,
        )
        return dimension
    if is_integer_dtype(series) and len(series.unique()) <= 8:
        # Small set of integers: show exactly those values as ticks...
        dimension['tickvals'] = np.sort(series.unique())
        dimension['ticktext'] = np.sort(series.unique())
    return dimension

# TODO: #8 add misc args, comments and update README.md for plot_parallel functions...
def plot_parallel_coordinates(df, target = None, columns = None, labels = {},
    width = None, height = None, title = None, colors = 0,
    font = {}, tick_font = {}, label_font = {}, title_font = {}, template = None):
    """
    Create a plotly parallel-coordinates figure for the columns of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    target : str, optional
        Column used to colour the lines; it is placed first among the
        dimensions when it is not already listed in `columns`.
    columns : iterable of str, optional
        Columns to show. Defaults to every non-object column of `df`.
    labels : dict, optional
        Maps column names to the text shown for each dimension.
    width, height : int, optional
        Figure size; left to plotly when None.
    title : str or dict, optional
        A string is centred above the plot; a dict is passed to plotly as is.
    colors : int or sequence, optional
        Either a list of colours or an integer that is handed to
        ColorblindSafeColormaps.get_colors to obtain one.
    font, tick_font, label_font, title_font : dict, optional
        Font overrides merged with the defaults via apply_default.
    template : optional
        Plotly template applied to the figure when given.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    #
    default_font = get_font()
    font = apply_default(default_font, font)
    tick_font = apply_default(font, tick_font)
    label_font = apply_default(font, label_font)
    # The title defaults to the base font enlarged by 4...
    title_font = apply_default(
        apply_default(font, { 'size': font.get('size', 16) + 4 }),
        title_font
    )
    #
    colors = 0 if colors is None else colors
    if isinstance(colors, int):
        colormaps = ColorblindSafeColormaps()
        colors = colormaps.get_colors(colors)
    #
    if columns is None:
        columns = [column for column in df.columns if df[column].dtypes != 'O']
    # Always work on our own list, whatever iterable was passed in...
    columns = list(columns)
    # The target check used to be written twice; once is enough...
    if target is not None and target not in columns:
        columns = [target] + columns
    fig = go.Figure(go.Parcoords(
        dimensions = list([
            get_dimension(df, column, labels)
            for column in columns
        ]),
        line = get_line(df, target, colors),
        labelfont = label_font,
        tickfont = tick_font,
        # This eliminates the range! Set color to background!
        rangefont = { 'size': 1, 'color': "#FFFFFF" }
    ))
    if title is not None and isinstance(title, str):
        title = { 'text': title, 'x': 0.5, 'xanchor': "center" }
    if title is not None:
        fig.update_layout(title = title, title_font = title_font)
    if width is not None:
        fig.update_layout(width = width)
    if height is not None:
        fig.update_layout(height = height)
    if template is not None:
        fig.update_layout(template = template)
    fig.update_layout(font = font)
    return fig

In [404]:
# Plot every fourth row, coloured by chest pain type...
fig = plot_parallel_coordinates(
    df.iloc[::4],
    target = "ChestPainType",
    # columns = df.columns,
    title = "Heart Disease Dataset Numerical Columns",
    colors = 1,
    font = { 'size': 14 },
)
fig.show()
# fig = plot_histograms(df, target = "ChestPainType", bins = 25, colors = "Vibrant", title = "Dataset Histograms", template = "presentation")
# fig.show()
# fig = plot_histograms(df, target = "ChestPainType", bins = {"Age": 10, "MaxHR": 20, "Cholesterol": 25}, colors = "Vibrant", barmode = "group", title = "Dataset Histograms", template = "presentation")
# fig.show()
# fig = plot_histograms(df, cols = 3, target = "HeartDisease", colors = -1, barmode = "overlay", title = "Heart Disease Dataset Grouped by Chest Pain Type", template = "simple_white")
# fig.show()


DTYPE  category
[0 1 2 3]
[0 1 2 3]
Index(['ASY', 'ATA', 'NAP', 'TA'], dtype='object')


In [365]:
# Copyright 2022 by Contributors

import numpy as np
import pandas as pd
import plotly.io as pio
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder
from PDSUtilities.plotly import apply_default
from PDSUtilities.plotly import get_font
from PDSUtilities.plotly import ColorblindSafeColormaps

def get_line(df, target, colors):
    """
    Return the `line` settings for a parallel plot trace.

    With no target all lines use `colors[0]`. Otherwise lines are coloured
    by the target column (object columns become category codes) and the
    colour scale has one entry per distinct value, cycling through
    `colors` when there are more values than colours.
    """
    line = { 'color': colors[0], 'showscale': False }
    if target is None:
        return line
    values = df[target]
    if values.dtypes == 'O':
        # Text targets are coloured by their integer category codes...
        values = values.astype('category').cat.codes
    distinct = len(np.unique(values))
    line['color'] = values
    line['colorscale'] = [colors[index % len(colors)] for index in range(distinct)]
    return line

def plot_parallel_categories(df, target = None, columns = None, labels = {},
    width = None, height = None, title = None, colors = 0,
    font = {}, tick_font = {}, label_font = {}, title_font = {}, template = None):
    """
    Create a plotly parallel-categories figure for the columns of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    target : str, optional
        Column used to colour the ribbons; it is always moved to the
        front of the dimensions.
    columns : iterable of str, optional
        Columns to show. Defaults to every object column of `df`.
        The sequence passed in is never modified.
    labels : dict, optional
        Maps column names to the text shown for each dimension.
    width, height : int, optional
        Figure size; left to plotly when None.
    title : str or dict, optional
        A string is centred above the plot; a dict is passed to plotly as is.
    colors : int or sequence, optional
        Either a list of colours or an integer that is handed to
        ColorblindSafeColormaps.get_colors to obtain one.
    font, tick_font, label_font, title_font : dict, optional
        Font overrides merged with the defaults via apply_default.
    template : optional
        Plotly template applied to the figure when given.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    default_font = get_font()
    font = apply_default(default_font, font)
    tick_font = apply_default(font, tick_font)
    label_font = apply_default(font, label_font)
    # The title defaults to the base font enlarged by 4...
    title_font = apply_default(
        apply_default(font, { 'size': font.get('size', 16) + 4 }),
        title_font
    )
    colors = 0 if colors is None else colors
    if isinstance(colors, int):
        colormaps = ColorblindSafeColormaps()
        colors = colormaps.get_colors(colors)
    #
    if columns is None:
        columns = [column for column in df.columns if df[column].dtypes == 'O']
    # Copy first: removing the target below used to
    # alter the list object owned by the caller...
    columns = list(columns)
    # Put the target first, whether or not it was listed...
    if target is not None:
        if target in columns:
            columns.remove(target)
        columns = [target] + columns
    fig = go.Figure(go.Parcats(
        dimensions = list([
            dict(
                values = df[column],
                label = labels.get(column, column),
                categoryorder = "category ascending",
            ) for column in columns
        ]),
        line = get_line(df, target, colors),
        labelfont = label_font,
        tickfont = tick_font,
    ))
    if title is not None and isinstance(title, str):
        title = { 'text': title, 'x': 0.5, 'xanchor': "center" }
    if title is not None:
        fig.update_layout(title = title, title_font = title_font)
    if width is not None:
        fig.update_layout(width = width)
    if height is not None:
        fig.update_layout(height = height)
    if template is not None:
        fig.update_layout(template = template)
    fig.update_layout(font = font)
    return fig

In [366]:
# Plot every fourth row, coloured by chest pain type...
fig = plot_parallel_categories(
    df.iloc[::4],
    target = "ChestPainType",
    #columns = df.columns,
    title = "Heart Disease Dataset Categorical Columns",
    colors = 1,
    font = { 'size': 12 },
)
fig.show()


In [141]:
# Rescale Age onto [0, 5] and drop the fractional part;
# the value 5 is reached only by the largest age...
age = np.trunc((df["Age"] - df["Age"].min())*5/(df["Age"].max() - df["Age"].min()))
print(int(age.max()))

5


In [431]:
import plotly.graph_objects as go
import pandas as pd

# Reference example from the plotly documentation. The iris data gets its
# own name: binding it to `df` replaced the heart disease dataframe that
# the cells further down still read (e.g. df["HeartDisease"])...
iris_df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/iris-data.csv')
iris_class_codes = iris_df['class'].astype('category').cat.codes

fig = go.Figure(data=go.Splom(
                dimensions=[dict(label='sepal length',
                                 values=iris_df['sepal length']),
                            dict(label='sepal width',
                                 values=iris_df['sepal width']),
                            dict(label='petal length',
                                 values=iris_df['petal length']),
                            dict(label='petal width',
                                 values=iris_df['petal width'])],
                showupperhalf=False, # hide the upper half of the scatter matrix
                text=iris_df['class'],
                marker=dict(color=iris_class_codes,
                            showscale=False, # colors encode categorical variables
                            line_color='white', line_width=0.5)
                ))


fig.update_layout(
    title='Iris Data set',
    width=800,
    height=800,
)
fig.update_xaxes(mirror=True)
fig.update_yaxes(mirror=True)
fig.show()

In [451]:
# Integer class codes used to colour each marker by heart disease status...
index_vals = df["HeartDisease"].astype('category').cat.codes

columns = numerical_columns
# columns.remove("FastingBS")
# One scatter-matrix dimension per numerical column...
dimensions = [{ 'label': column, 'values': df[column] } for column in columns]
fig = go.Figure(data = go.Splom(
    dimensions = dimensions,
    showupperhalf = False,
    diagonal_visible = False,
    text = df["HeartDisease"],
    marker = {
        'color': index_vals,
        'showscale': False, # colors encode categorical variables
        'line_color': 'white',
        'line_width': 0.5,
    },
))

# Title, size and base template in a single layout update...
fig.update_layout(
    title = 'Heart Disease Data Set',
    width = 800,
    height = 800,
    template = "plotly_white",
)
# Explicit styling for the first x and y axes...
fig.update_layout(
    xaxis = dict(
        automargin = True,
        gridcolor = 'black',
        linecolor = 'black',
        showgrid = True,
        ticks = '',
        title = dict(standoff = 15),
        zerolinecolor = 'red',
    ),
    yaxis = dict(
        automargin = True,
        gridcolor = 'black',
        linecolor = 'black',
        showgrid = True,
        ticks = '',
        title = dict(standoff = 15),
        zerolinecolor = 'red',
    ),
)
# print(pio.templates["seaborn"])
fig.show()

In [187]:
# Copyright 2022 by Contributors

import numpy as np
import pandas as pd
import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from PDSUtilities.plotly import apply_default
from PDSUtilities.plotly import get_font
from PDSUtilities.plotly import get_marker
from PDSUtilities.plotly import update_layout
from PDSUtilities.plotly import ColorblindSafeColormaps

def get_labels(labels):
    """
    Turn a sequence of labels into a dict keyed "F0", "F1", ... where the
    number in each key is the position of the label in the sequence.
    """
    named = {}
    for position in range(len(labels)):
        named["F" + str(position)] = labels[position]
    return named

def get_colors(colors, default = 0):
    """
    Resolve the `colors` argument of the plotting functions.

    None is replaced by `default`. An integer is handed to
    ColorblindSafeColormaps.get_colors to look up the colours;
    anything else is returned unchanged.
    """
    colors = default if colors is None else colors
    # The integer lookup used to be written twice; the repeat was dead code...
    if isinstance(colors, int):
        colors = ColorblindSafeColormaps().get_colors(colors)
    return colors

def get_numrical_columns(columns, target = None, df = None):
    """
    Return the list of columns to plot, without the target column.

    Parameters
    ----------
    columns : iterable of str or None
        Columns requested by the caller. When None, every non-object
        column of the dataframe is used.
    target : str, optional
        Column to leave out of the result.
    df : pandas.DataFrame, optional
        Dataframe used to infer the columns when `columns` is None. For
        backward compatibility a module-level `df` is used when this
        argument is omitted.

    Returns
    -------
    list
        A new list; the sequence passed in as `columns` is never modified.

    Raises
    ------
    ValueError
        If `columns` is None and no dataframe is available.
    """
    if columns is None:
        # Prefer the explicit dataframe over the notebook-level global...
        frame = df if df is not None else globals().get("df")
        if frame is None:
            raise ValueError("columns is None and no DataFrame was given to infer them from")
        columns = [column for column in frame.columns if frame[column].dtypes != 'O']
    # Copy so that removing the target cannot alter the caller's list...
    columns = list(columns)
    if target is not None and target in columns:
        columns.remove(target)
    return columns

def get_subtitile(correlations, columns, labels, row, col, precision, align = "middle"):
    """
    Build the HTML text shown for one pair of columns: the two column
    labels and their correlation (rounded to `precision` and set in bold),
    separated by line breaks.

    `align` selects where the correlation goes: "middle" puts it between
    the labels, "top" puts it first and any other value puts it last.
    """
    line_break = "<br />"
    col_name, row_name = columns[col], columns[row]
    col_text = labels.get(col_name, col_name)
    row_text = labels.get(row_name, row_name)
    rounded = np.round(correlations.iloc[row, col], precision)
    bold = f"<b>{rounded}</b>"
    if align == "middle":
        parts = [col_text, bold, row_text]
    elif align == "top":
        parts = [bold, col_text, row_text]
    else:
        parts = [col_text, row_text, bold]
    return "<span>" + line_break.join(parts) + "</span>"

def get_mean(values):
    """
    Return the midpoint between the smallest and the largest entry of
    `values` (the centre of their range, not the arithmetic mean).
    """
    lowest, highest = min(values), max(values)
    return 0.5*(lowest + highest)

def plot_correlations(df, target = None, columns = None, labels = {},
    width = None, height = None, title = None, precision = 4,
    template = None, colors = 0, marker = {},
    font = {}, tick_font = {}, label_font = {}, title_font = {}):
    """
    Create a grid of subplots showing pairwise scatter plots and
    correlations for the columns of `df`.

    Below the diagonal each cell holds a scatter plot of one pair of
    columns, split into one trace per target value when `target` is given.
    The mirrored cell above the diagonal holds a single marker at the
    centre of the data range, on top of which the labels of the pair and
    their correlation (from DataFrame.corr) are written as an annotation.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    target : str, optional
        Column used to group and colour the scatter points. It is removed
        from the plotted columns.
    columns : iterable of str, optional
        Columns to plot. Defaults to every non-object column of `df`.
        The sequence passed in is never modified.
    labels : dict or list, optional
        Column name to display text; a list is converted with get_labels.
    width, height, title, template : optional
        Forwarded to update_layout together with the fonts.
    precision : int, optional
        Number of decimals shown for each correlation.
    colors : int or sequence, optional
        Resolved by get_colors. Target values are assigned colours by
        their sorted rank, cycling through the palette if necessary.
    marker : dict, optional
        Marker overrides merged with the defaults from get_marker.
    font, tick_font, label_font, title_font : dict, optional
        Font overrides merged with the defaults via apply_default.

    Returns
    -------
    The figure as returned by update_layout.
    """
    #
    default_font = get_font()
    default_marker  = get_marker()
    font = apply_default(default_font, font)
    tick_font = apply_default(font, tick_font)
    label_font = apply_default(font, label_font)
    marker = apply_default(default_marker, marker)
    title_font = apply_default(
        apply_default(font, { 'size': font.get('size', 16) + 4 }),
        title_font
    )
    colors = get_colors(colors)
    # Infer the default columns from the dataframe that was passed in
    # (not from a notebook-level global) and hand the helper a copy so
    # that the caller's list is never altered...
    if columns is None:
        columns = [column for column in df.columns if df[column].dtypes != 'O']
    columns = get_numrical_columns(list(columns), target)
    if isinstance(labels, list):
        labels = get_labels(labels)
    rows, cols = len(columns), len(columns)
    correlations = df[columns].corr()
    # Traces are added in order of first appearance of each target value...
    values = [] if target is None else list(df[target].dropna().unique())
    # Colours are looked up by sorted rank rather than by using the value
    # itself as a list index, so text targets and codes outside the range
    # of the palette work too; for codes 0..k-1 the result is unchanged...
    palette = {
        value: colors[rank % len(colors)]
        for rank, value in enumerate(sorted(values))
    }
    #
    fig = make_subplots(rows = rows, cols = cols,
        horizontal_spacing = 0.1/cols,
        vertical_spacing = 0.1/rows,
        shared_xaxes = True,
        shared_yaxes = True,
        # print_grid = True,
    )
    for row in range(rows):
        for col in range(row):
            for value in values:
                fig.append_trace(
                    go.Scatter(
                        x = df[df[target] == value][columns[col]],
                        y = df[df[target] == value][columns[row]],
                        mode = 'markers',
                        marker = get_marker(marker, color = palette[value]),
                        name = labels.get(target, target) + " = " + str(value),
                        legendgroup = target + " = " + str(value),
                        # Only the first lower cell contributes legend entries...
                        showlegend = row == 1 and col == 0,
                    ),
                    row + 1, col + 1
                )
            if target is None:
                fig.append_trace(
                    go.Scatter(
                        x = df[columns[col]],
                        y = df[columns[row]],
                        mode = 'markers',
                        marker = get_marker(marker, color = colors[0]),
                        name = columns[row] + "/" + columns[col],
                        showlegend = False,
                    ),
                    row + 1, col + 1
                )
            # Used to center correlation text in
            # the plot as plotly annotations...
            fig.append_trace(
                go.Scatter(
                    x = [get_mean(df[columns[col]])],
                    y = [get_mean(df[columns[row]])],
                    mode = 'markers',
                    marker = get_marker(marker, color = colors[0]),
                    showlegend = False,
                    name = columns[row] + "/" + columns[col],
                ),
                col + 1, row + 1
            )
    # Point axes in upper plots to the axes
    # in the corresponding lower plots...
    for row in range(rows):
        for col in range(row):
            # (row, col) corresponds to who we are pointing at...
            x, y = (rows - 1)*cols + col, row*cols
            # So (col, row) is who we are...
            fig.update_xaxes(matches = f"x{x+1}", row = col + 1, col = row + 1)
            fig.update_yaxes(matches = f"y{y+1}", row = col + 1, col = row + 1)
    # Place correlation text centered in
    # the plot as plotly annotations...
    fig.update_layout(annotations = [
        dict(
            x = get_mean(df[columns[col]]),
            y = get_mean(df[columns[row]]),
            xref = "x" + str(col*rows + row + 1),
            yref = "y" + str(col*rows + row + 1),
            text = get_subtitile(correlations, columns, labels, row, col, precision),
            showarrow = False,
        ) for row in range(rows) for col in range(row)
    ])
    # Axis titles go on the first column and on the last row only...
    for row in range(rows):
        fig.update_yaxes(
            title_text = labels.get(columns[row], columns[row]), row = row + 1, col = 1
        )
    for col in range(cols):
        fig.update_xaxes(
            title_text = labels.get(columns[col], columns[col]), row = rows, col = col + 1
        )
    # No grid on or above the diagonal; tick marks
    # only on the first column and on the last row...
    for row in range(rows):
        for col in range(row, cols):
            fig.update_xaxes(showgrid = False, row = row + 1, col = col + 1)
            fig.update_yaxes(showgrid = False, row = row + 1, col = col + 1)
        for col in range(1, cols):
            fig.update_yaxes(ticks = "", row = row + 1, col = col + 1)
    for row in range(rows - 1):
        for col in range(cols):
            fig.update_xaxes(ticks = "", row = row + 1, col = col + 1)
    fig.update_xaxes(
        tickfont = tick_font, title_font = font, linecolor = "black",
        linewidth = 0.5, mirror = True, zeroline = False,
    )
    fig.update_yaxes(
        tickfont = tick_font, title_font = font, linecolor = "black",
        linewidth = 0.5, mirror = True, zeroline = False,
    )
    # #
    if target is not None:
        fig.update_layout(legend_itemsizing = 'constant')
        fig.update_layout(legend = dict(
            orientation = 'h', yanchor = 'top', xanchor = 'center', y = 1.07, x = 0.5
        ))
    fig = update_layout(fig, width = width, height = height, title = title, 
        title_font = title_font, font = font, template = template)
    return fig

In [188]:
# Build a new list without FastingBS. Assigning numerical_columns directly
# and calling remove() on it dropped FastingBS from numerical_columns
# itself, and so from every other cell that uses that list...
columns = [column for column in numerical_columns if column != "FastingBS"]
fig = plot_correlations(df, target = "HeartDisease", columns = columns, labels = LABELS, #template = "simple_white",
    title = "Heart Disease Dataset Correlations",
    font = {'size': 12}, width = 800, height = 800)
# A trailing space keeps the tick labels off the axis line...
fig.update_yaxes(ticksuffix = " ")
fig.update_layout(template = "presentation")
fig.show()
# print(fig.layout)