# Overview

In this example, we continue visualizing data with Bokeh, this time using histograms and density plots. Note that two different glyphs are used: `quad()` for the histograms and `patch()` for the density plots. See the links in the code comments for the details of each glyph.

In [1]:
from bokeh.io import output_notebook

# initialize BokehJS so that plots are rendered inline in the notebook
output_notebook()

In [2]:
import os
import pandas as pd

# Let the dataset location be overridden through the DATASET environment
# variable; only fall back to the Kaggle input path when it is not already set.
# (Assigning the variable unconditionally would make the lookup below pointless.)
os.environ.setdefault('DATASET', '/kaggle/input/titanic/train_and_test2.csv')

df = pd.read_csv(os.environ['DATASET'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 28 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Passengerid  1309 non-null   int64  
 1   Age          1309 non-null   float64
 2   Fare         1309 non-null   float64
 3   Sex          1309 non-null   int64  
 4   sibsp        1309 non-null   int64  
 5   zero         1309 non-null   int64  
 6   zero.1       1309 non-null   int64  
 7   zero.2       1309 non-null   int64  
 8   zero.3       1309 non-null   int64  
 9   zero.4       1309 non-null   int64  
 10  zero.5       1309 non-null   int64  
 11  zero.6       1309 non-null   int64  
 12  Parch        1309 non-null   int64  
 13  zero.7       1309 non-null   int64  
 14  zero.8       1309 non-null   int64  
 15  zero.9       1309 non-null   int64  
 16  zero.10      1309 non-null   int64  
 17  zero.11      1309 non-null   int64  
 18  zero.12      1309 non-null   int64  
 19  zero.1

In [3]:
# Keep a reproducible random subset of 300 rows (fixed seed) for plotting.
df = df.sample(n=300, random_state=420)
df.head(5)

Unnamed: 0,Passengerid,Age,Fare,Sex,sibsp,zero,zero.1,zero.2,zero.3,zero.4,...,zero.12,zero.13,zero.14,Pclass,zero.15,zero.16,Embarked,zero.17,zero.18,2urvived
554,555,22.0,7.775,1,0,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,1
402,403,21.0,9.825,1,1,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,0
461,462,34.0,8.05,0,0,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,0
1227,1228,32.0,13.0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,2.0,0,0,0
796,797,49.0,25.9292,1,0,0,0,0,0,0,...,0,0,0,1,0,0,2.0,0,0,1


In [4]:
# Age column of the sampled rows with missing entries removed.
ages = df["Age"].dropna()
ages

554     22.0
402     21.0
461     34.0
1227    32.0
796     49.0
        ... 
731     11.0
479      2.0
168     28.0
34      28.0
593     28.0
Name: Age, Length: 300, dtype: float64

# Plotting

In [5]:
from bokeh.plotting import figure, show
import numpy as np

def plot_hist(df, step, title):
    """
    Create a histogram plot using Bokeh.

    The data is counted in equal-width bins that start at 0 and extend far
    enough to include the value 80. Values outside the binned range are not
    counted.

    Args:
        df (array-like): One-dimensional numerical data to be plotted as a
            histogram (the data itself, e.g. a Series or list, not a whole
            DataFrame).
        step (float): The width of each bin in the histogram. Must be positive.
        title (str): The title of the plot.

    Returns:
        The Bokeh figure containing the histogram bars.

    Raises:
        ValueError: If `step` is not positive, or if `df` is not
            one-dimensional or contains non-numerical values.
    """

    if step <= 0:
        raise ValueError("Step must be a positive value.")

    data = np.asarray(df)
    if data.ndim != 1:
        raise ValueError("Input data must be one-dimensional.")

    if data.dtype.kind not in "biuf":
        # Not a numeric dtype (e.g. object or string): fall back to checking
        # each element. NumPy scalar types are listed explicitly because
        # np.int64 and friends are not subclasses of the built-in int.
        numeric_types = (int, float, np.integer, np.floating)
        if not all(isinstance(val, numeric_types) for val in data):
            raise ValueError("Input data contains invalid values. Expected numerical values.")
        data = data.astype(float)

    # np.arange excludes its stop value, so arange(0, 80, step) would end
    # below 80 and np.histogram would silently drop the largest values
    # (e.g. everything above 75 when step is 5). Extending the stop by one
    # step guarantees the last edge reaches at least 80.
    bins = np.arange(0, 80 + step, step)  # create bins
    hist, edges = np.histogram(data, bins=bins)  # create histogram and bin edges

    p = figure(
        title=title,
        height=300,
        width=400,
        toolbar_location=None,
        x_axis_label="age (years)",
        y_axis_label="count",
    )

    # one rectangle per bin, spanning [left edge, right edge] x [0, count]
    # https://docs.bokeh.org/en/latest/docs/reference/plotting/figure.html#bokeh.plotting.figure.quad
    p.quad(
        top=hist,
        bottom=0,
        left=edges[:-1],
        right=edges[1:],
        fill_color="#52A9EB",
        line_color="white",
    )

    # plot customization

    # customise x-axis
    p.x_range.start = 0
    p.xaxis.ticker = [0, 20, 40, 60]
    p.xgrid.grid_line_color = None
    p.xaxis.axis_line_color = None
    p.xaxis.major_tick_line_color = "gray"
    p.xaxis.major_tick_out = 2

    # customise y-axis
    p.y_range.start = 0
    p.yaxis.minor_tick_out = 0
    p.yaxis.axis_line_color = None
    p.yaxis.major_tick_line_color = "gray"
    p.yaxis.major_tick_out = 0
    p.yaxis.major_tick_in = 0

    return p

# Single histogram of the sampled ages with a bin width of 5.
single = plot_hist(df=ages, step=5, title="Demo Histograms")

show(single)

# Show Multiple Histograms

In [6]:
from bokeh.layouts import gridplot

# Build four histograms of the same data that differ only in bin width,
# then show them in a 2x2 grid.
one, three, five, fifteen = (
    plot_hist(ages, bin_width, label)
    for bin_width, label in [(1, "a"), (3, "b"), (5, "c"), (15, "d")]
)

top_row = [one, three]
bottom_row = [five, fifteen]
layout = gridplot([top_row, bottom_row])

show(layout)

# Density plots

The plots in this sub-section represent the kernel density estimate of the age distribution of passengers on the Titanic. The `patch()` glyph is used to create the density plots.

## Data Preparation

In [7]:
# Age observations as a NumPy array; the density estimator is fitted on these.
values = ages.to_numpy()
# 1000 evenly spaced points from -10 to 80 at which the density is evaluated.
positions = np.linspace(start=-10, stop=80, num=1000)

## Plotting

In [8]:
from sklearn.neighbors import KernelDensity

def plot_kde(kernel, bandwidth, title, data=None, grid=None):
    """
    Create a kernel density estimation (KDE) plot using Bokeh.

    Args:
        kernel (str): The type of kernel to be used in the KDE estimation.
        bandwidth (float): The bandwidth parameter for the KDE.
        title (str): The title of the plot.
        data (array-like, optional): Observations to fit the KDE on. When
            omitted, the notebook-level `values` array is used.
        grid (array-like, optional): Points at which the density is evaluated
            and drawn. When omitted, the notebook-level `positions` array is
            used.

    Returns:
        The Bokeh figure containing the filled density curve.

    Raises:
        ValueError: If the input values for `kernel` or `bandwidth` are
            invalid, or if the data or the evaluation grid is empty.
    """
    # data validation
    supported_kernels = [
        "gaussian",
        "tophat",
        "epanechnikov",
        "exponential",
        "linear",
        "cosine",
    ]

    if kernel not in supported_kernels:
        raise ValueError(
            f"Invalid kernel: '{kernel}'. Supported kernels are: {supported_kernels}"
        )

    if bandwidth <= 0:
        raise ValueError("Bandwidth must be a positive value.")

    # Explicit arguments win; otherwise fall back to the notebook globals so
    # existing three-argument calls keep working unchanged.
    sample = np.asarray(values if data is None else data, dtype=float).ravel()
    if sample.size == 0:
        raise ValueError("Cannot estimate a density from empty data.")

    xs = np.asarray(positions if grid is None else grid, dtype=float).ravel()
    if xs.size == 0:
        raise ValueError("The evaluation grid must not be empty.")

    # function implementation

    # fit the estimator; it expects a 2-D (n_samples, n_features) array,
    # hence the added axis
    kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(sample[:, np.newaxis])

    # `score_samples` returns the log-density at each grid point
    log_dens = kde.score_samples(xs[:, np.newaxis])
    density = np.exp(log_dens)

    # create figure object
    p = figure(
        title=title,  # plot title
        height=300,  # plot height
        width=600,  # plot width
        x_axis_label="age (years)",
        y_axis_label="density",
    )

    # `patch` closes the polygon by joining the last point to the first. Pin
    # both ends to the zero baseline so the filled area is the region under
    # the curve even when the density is non-zero at the ends of the grid.
    outline_x = np.concatenate(([xs[0]], xs, [xs[-1]]))
    outline_y = np.concatenate(([0.0], density, [0.0]))

    # draw filled area on the figure using `patch` glyph
    # https://docs.bokeh.org/en/latest/docs/reference/plotting/figure.html#bokeh.plotting.figure.patch
    p.patch(
        outline_x,  # x-axis coordinates
        outline_y,  # y-axis coordinates
        fill_alpha=0.9,  # fill transparency
        fill_color="#52A9EB",
        line_color="black",
    )

    # customize x-axis
    p.x_range.start = 0
    p.xaxis.ticker = [0, 20, 40, 60]
    p.xgrid.grid_line_color = None
    p.xaxis.axis_line_color = None
    p.xaxis.major_tick_line_color = "gray"
    p.xaxis.major_tick_out = 2

    # customize y-axis
    p.y_range.start = 0
    p.yaxis.ticker = [0, 0.01, 0.02, 0.03, 0.04]
    p.yaxis.minor_tick_out = 0
    p.yaxis.axis_line_color = None
    p.yaxis.major_tick_line_color = "gray"
    p.yaxis.major_tick_out = 0
    p.yaxis.major_tick_in = 0

    return p

# Single density plot: Gaussian kernel with a bandwidth of 2.
single = plot_kde(kernel="gaussian", bandwidth=2, title="Demo2 Density plot")

show(single)

In [9]:
# Build four density plots (three Gaussian bandwidths plus one tophat kernel)
# and show them in a 2x2 grid.
half, two, five, rect = (
    plot_kde(kernel_name, width, label)
    for kernel_name, width, label in [
        ("gaussian", 0.5, "a"),
        ("gaussian", 2, "b"),
        ("gaussian", 5, "c"),
        ("tophat", 2, "d"),
    ]
)

top_row = [half, two]
bottom_row = [five, rect]
layout = gridplot([top_row, bottom_row])

show(layout)

# Reference

* https://clauswilke.com/dataviz/histograms-density-plots.html#visualizing-a-single-distribution
* https://bokeh.github.io/dataviz-fundamentals/03-visualize-single-distribution-histograms.html