# Empirical Distributions

In many applications of simulation, theoretical distributions may not be a good fit to the data; for example, if the data are bimodal. A number of empirical distribution options are provided by `sim-tools` to support modelling in these situations.

* RawContinuousEmpirical

In [1]:
from sim_tools.distributions import (
    RawContinuousEmpirical,
    GroupedContinuousEmpirical,
    RawDiscreteEmpirical,
    DiscreteEmpirical,
    DistributionRegistry
)

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

## 1. RawContinuousEmpirical

If a user has access to the raw sample data they can pass this to `RawContinuousEmpirical`.  This will create a piecewise linear model where interpolation is used for gaps between data samples.  The maximum and minimum values of the sample are the bounds of the distribution.

### 1.1 A simple example

In [2]:
# Ten length-of-stay observations (in days). The values are deliberately
# spaced apart so the straight-line segments between points are easy to see.
simple_los = np.array(
    [1.0, 3.5, 5.0, 7.2, 10.0, 12.5, 15.0, 18.0, 22.0, 30.0]
)

# Wrap the sample in a single-column DataFrame for clarity
simple_hospital_data = pd.DataFrame(
    data={'Length of Stay (Days)': simple_los}
)

simple_hospital_data.describe()

Unnamed: 0,Length of Stay (Days)
count,10.0
mean,12.42
std,9.047873
min,1.0
25%,5.55
50%,11.25
75%,17.25
max,30.0


In [3]:
# Fit an empirical distribution to the ten raw observations (fixed seed)
simple_dist = RawContinuousEmpirical(data=simple_los, random_seed=42)

# Build the interactive figure that demonstrates sampling from the ECDF
fig = simple_dist.plot_ecdf_with_sampling(
    title="Hospital Length of Stay: Sampling Demonstration",
    xlabel="Length of Stay (Days)", ylabel="Probability",
    line_color='rgb(0, 128, 128)',
    width=800, height=550,
    u_steps=100,
)

# Render the figure
fig.show()


### 1.2 More complex data

We will generate four datasets to illustrate the distribution in use.  One is approximately uniform, one right-skewed (exponential), one bimodal (a mix of two normal distributions) and finally one with outliers.

In [14]:
# Configuration for every distribution we need, keyed by a descriptive name.
# Each entry names a distribution class and the parameters to build it with.
# NOTE: entry order is kept as-is in case seeding depends on it.
dist_configs = {
    # 1. Hospital A: basic uniform distribution
    'uniform_dist': {
        'class_name': 'Uniform',
        'params': {'low': 0.5, 'high': 45.5},
    },

    # 2. Hospital B: exponential distribution
    'exponential_dist': {
        'class_name': 'Exponential',
        'params': {'mean': 5},
    },

    # 3. Hospital C: two normal components that are mixed to give a bimodal shape
    'normal_short': {
        'class_name': 'Normal',
        'params': {'mean': 3, 'sigma': 1},
    },
    'normal_long': {
        'class_name': 'Normal',
        'params': {'mean': 20, 'sigma': 6},
    },

    # 4. Hospital D: a realistic main body plus a uniform source of outliers
    'lognormal_dist': {
        'class_name': 'Lognormal',
        'params': {'mean': 5.4, 'stdev': 3.5},
    },
    'uniform_outliers': {
        'class_name': 'Uniform',
        'params': {'low': 30, 'high': 45},
    },
}

# Build all distributions in one call via DistributionRegistry.create_batch.
# Only main_seed is supplied; per-distribution seeding is left to the registry.
distributions = DistributionRegistry.create_batch(dist_configs, main_seed=42)

# Draw 100 length-of-stay values per hospital from the batch-created
# distributions. Everything except Hospital A is clipped to [0.5, 45.5] days.

# 1. Hospital A - uniform distribution
los_uniform = distributions['uniform_dist'].sample(100)

# 2. Hospital B - exponential samples shifted by 0.5 days, then clipped
los_skewed = np.clip(
    distributions['exponential_dist'].sample(100) + 0.5, 0.5, 45.5
)

# 3. Hospital C - bimodal: 70 short stays (around 3 days) followed by
#    30 longer stays (around 20 days), clipped to the desired range
los_bimodal = np.clip(
    np.concatenate([
        distributions['normal_short'].sample(70),
        distributions['normal_long'].sample(30),
    ]),
    0.5, 45.5,
)

# 4. Hospital D - realistic: 95 values from the main lognormal body plus
#    5 outliers with very long stays, clipped to the desired range
los_realistic = np.clip(
    np.concatenate([
        distributions['lognormal_dist'].sample(95),
        distributions['uniform_outliers'].sample(5),
    ]),
    0.5, 45.5,
)

# Collect the four samples into one DataFrame, one column per hospital
hospital_data = pd.DataFrame(data={
    'Hospital A': los_uniform,
    'Hospital B': los_skewed,
    'Hospital C': los_bimodal,
    'Hospital D': los_realistic,
})

# Preview the first few rows
hospital_data.head()



Unnamed: 0,Hospital A,Hospital B,Hospital C,Hospital D
0,41.753487,1.471245,1.350601,3.69776
1,41.4944,0.939744,4.405999,1.694949
2,39.946663,2.91169,3.723471,3.606469
3,14.419328,1.255231,2.170934,9.838493
4,43.459523,9.797112,3.483869,5.028042


In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) for each hospital's sample
hospital_data.describe()

Unnamed: 0,Hospital A,Hospital B,Hospital C,Hospital D
count,100.0,100.0,100.0,100.0
mean,22.816191,6.210169,8.031861,6.72424
std,13.473663,5.504185,8.356952,6.686789
min,1.156456,0.524006,0.863552,1.558499
25%,10.534497,2.255529,2.709289,3.443588
50%,23.637873,4.358754,3.721741,4.683944
75%,33.63756,8.140663,13.543078,7.103679
max,45.348266,25.440116,32.305724,36.572435


In [12]:
# Fit one RawContinuousEmpirical per hospital column, all with the same seed.
# NOTE: this rebinds `distributions`, replacing any earlier object of that name.
distributions = {
    hospital: RawContinuousEmpirical(
        data=hospital_data[hospital],
        random_seed=42,
    )
    for hospital in hospital_data.columns
}

# Example: ECDF figure for Hospital A (uniform distribution)
fig_a = distributions['Hospital A'].plot_ecdf(
    title="Hospital A: Length of Stay ECDF (Uniform Distribution)",
    xlabel="Length of Stay (Days)", ylabel="Probability",
)

# Example: ECDF figure for Hospital C (bimodal distribution), custom colour
fig_c = distributions['Hospital C'].plot_ecdf(
    title="Hospital C: Length of Stay ECDF (Bimodal Distribution)",
    xlabel="Length of Stay (Days)", ylabel="Probability",
    line_color='rgb(214, 39, 40)',
)

# Side-by-side comparison of all four hospitals on a 2x2 grid
fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=(
        "Hospital A (Uniform)",
        "Hospital B (Right-skewed)",
        "Hospital C (Bimodal)",
        "Hospital D (Realistic with Outliers)",
    ),
)

hospitals = hospital_data.columns.tolist()
colors = [
    'rgb(0, 116, 217)',
    'rgb(255, 65, 54)',
    'rgb(46, 204, 64)',
    'rgb(255, 133, 27)',
]

for i, (hospital, color) in enumerate(zip(hospitals, colors)):
    # Map the running index onto 1-based (row, col) grid coordinates
    row, col = (axis + 1 for axis in divmod(i, 2))

    # Plot the data/probabilities held by this hospital's distribution object
    dist = distributions[hospital]

    fig.add_trace(
        go.Scatter(
            x=dist.data,
            y=dist.probabilities,
            mode='lines+markers',
            line={'color': color},
            name=hospital,
        ),
        row=row,
        col=col,
    )

fig.update_layout(
    height=700,
    width=900,
    title_text="Comparison of Length of Stay ECDFs",
)
fig.update_xaxes(title_text="Length of Stay (Days)")
fig.update_yaxes(title_text="Probability")

fig.show()
