In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import stan # python3 -m pip install pystan
import nest_asyncio
nest_asyncio.apply()
import warnings

In [2]:
# Relative locations of the three input CSV tables.
# NOTE(review): "precentages.csv" looks misspelled; presumably it matches the
# file name on disk, so it is kept as-is — confirm before renaming.
numbers_path, percentages_path, death_rates_path = (
    "dataframes/numbers.csv",
    "dataframes/precentages.csv",
    "dataframes/death_rates.csv",
)

In [3]:
# Load the semicolon-delimited percentages table (first line is the header),
# keep only its first 22 rows and discard the "[All]" column.
df = (
    pd.read_csv(percentages_path, delimiter=';', header=0)
    .head(22)
    .drop(columns="[All]")
)

# Display the column dtypes of the trimmed table.
df.dtypes

Year       object
[0]        object
[1-4]      object
[5-9]      object
[10-14]    object
[15-19]    object
[20-24]    object
[25-29]    object
[30-34]    object
[35-39]    object
[40-44]    object
[45-49]    object
[50-54]    object
[55-59]    object
[60-64]    object
[65-69]    object
[70-74]    object
[75-79]    object
[80-84]    object
[85+]      object
dtype: object

In [4]:
# Convert every column to floating point in a single vectorised pass.
# The values use a decimal comma, so ',' is swapped for '.' before parsing.
# Cells that still cannot be parsed become NaN (errors='coerce') instead of
# aborting the whole conversion; missing cells stay NaN as well.
df = df.apply(
    lambda col: pd.to_numeric(
        col.astype(str).str.replace(',', '.', regex=False),
        errors='coerce',
    ).astype(float)
)

# Years are whole numbers; this cast fails loudly if a year could not be parsed.
df["Year"] = df["Year"].astype(int)
df = df.set_index("Year")

In [5]:
# Swap rows and columns: the bracketed column labels become the rows and the
# years become the columns.
# NOTE: re-running this cell on its own flips the table back again.
df = df.transpose()

In [6]:
# Input dictionary handed to stan.build() for each model.
#   N            - number of rows of the data matrix
#   Y            - number of columns of the data matrix
#   accidentData - the matrix itself, as a NumPy array
#   xpred        - the year passed to the models for prediction
# N and Y are read from the frame so they cannot drift out of sync with the
# data if the table is trimmed differently.
# NOTE(review): the exact meaning of each field is defined in the .stan files,
# which are not shown here — confirm the (N, Y) orientation against them.
data = dict(
    N = df.shape[0],
    Y = df.shape[1],
    accidentData = df.values,
    xpred = 2022)

In [7]:
# Build the model defined in stan/separate.stan against the shared data dict.
file_path = "stan/separate.stan"

# Read the Stan code from the file
with open(file_path, "r") as file:
    stan_code = file.read()

# A fixed random_seed makes the later posterior1.sample() draws repeatable
# across kernel restarts; without it every run yields different numbers.
posterior1 = stan.build(stan_code, data=data, random_seed=42)

Building...



Building: found in cache, done.

In [8]:
# Run 4 chains and keep 1000 draws from each, using the separate model.
fit_separate = posterior1.sample(num_samples=1000, num_chains=4)

# Flatten the fit into a pandas DataFrame (requires pandas).
separate_df = fit_separate.to_frame()

Sampling:   0%
Sampling:   0% (1/8000)
Sampling:   0% (2/8000)
Sampling:   0% (3/8000)
Sampling:   0% (4/8000)
Sampling:   1% (103/8000)
Sampling:   3% (202/8000)
Sampling:   4% (301/8000)
Sampling:   5% (400/8000)
Sampling:   6% (500/8000)
Sampling:   8% (600/8000)
Sampling:   9% (700/8000)
Sampling:  10% (800/8000)
Sampling:  11% (900/8000)
Sampling:  12% (1000/8000)
Sampling:  14% (1100/8000)
Sampling:  15% (1200/8000)
Sampling:  16% (1300/8000)
Sampling:  18% (1400/8000)
Sampling:  19% (1500/8000)
Sampling:  20% (1600/8000)
Sampling:  21% (1700/8000)
Sampling:  22% (1800/8000)
Sampling:  24% (1900/8000)
Sampling:  25% (2000/8000)
Sampling:  26% (2100/8000)
Sampling:  28% (2200/8000)
Sampling:  29% (2300/8000)
Sampling:  30% (2400/8000)
Sampling:  31% (2500/8000)
Sampling:  32% (2600/8000)
Sampling:  34% (2700/8000)
Sampling:  35% (2800/8000)
Sampling:  36% (2900/8000)
Sampling:  38% (3000/8000)
Sampling:  39% (3100/8000)
Sampling:  40% (3200/8000)
Sampling:  41% (3300/8000)
Samplin

In [9]:
# Build the model defined in stan/pooled.stan against the shared data dict.
file_path = "stan/pooled.stan"

# Read the Stan code from the file
with open(file_path, "r") as file:
    stan_code = file.read()

# NOTE(review): this silences every warning for the rest of the kernel
# session, not only those raised by the build below.
warnings.filterwarnings("ignore")

# A fixed random_seed makes the later posterior2.sample() draws repeatable
# across kernel restarts; without it every run yields different numbers.
posterior2 = stan.build(stan_code, data=data, random_seed=42)

Building...



Building: found in cache, done.

In [10]:
# Run 4 chains and keep 1000 draws from each, using the pooled model.
fit_pooled = posterior2.sample(num_samples=1000, num_chains=4)

# Flatten the fit into a pandas DataFrame (requires pandas).
pooled_df = fit_pooled.to_frame()

Sampling:   0%
Sampling:   2% (200/8000)
Sampling:   8% (600/8000)
Sampling:  12% (1000/8000)
Sampling:  19% (1500/8000)
Sampling:  41% (3300/8000)
Sampling:  61% (4900/8000)
Sampling:  81% (6500/8000)
Sampling: 100% (8000/8000)
Sampling: 100% (8000/8000), done.
Messages received during sampling:
  Gradient evaluation took 0.00015 seconds
  1000 transitions using 10 leapfrog steps per transition would take 1.5 seconds.
  Adjust your expectations accordingly!
  Gradient evaluation took 0.000139 seconds
  1000 transitions using 10 leapfrog steps per transition would take 1.39 seconds.
  Adjust your expectations accordingly!
  Gradient evaluation took 0.000138 seconds
  1000 transitions using 10 leapfrog steps per transition would take 1.38 seconds.
  Adjust your expectations accordingly!
  Gradient evaluation took 0.000148 seconds
  1000 transitions using 10 leapfrog steps per transition would take 1.48 seconds.
  Adjust your expectations accordingly!


In [11]:
# Build the model defined in stan/hierarchical.stan against the shared data dict.
file_path = "stan/hierarchical.stan"

# Read the Stan code from the file
with open(file_path, "r") as file:
    stan_code = file.read()

# NOTE(review): this silences every warning for the rest of the kernel
# session, not only those raised by the build below.
warnings.filterwarnings("ignore")

# A fixed random_seed makes the later posterior3.sample() draws repeatable
# across kernel restarts; without it every run yields different numbers.
posterior3 = stan.build(stan_code, data=data, random_seed=42)

Building...



Building: found in cache, done.

In [12]:
# Run 4 chains and keep 1000 draws from each, using the hierarchical model.
fit_hier = posterior3.sample(num_samples=1000, num_chains=4)

# Flatten the fit into a pandas DataFrame (requires pandas).
hier_df = fit_hier.to_frame()

Sampling:   0%
Sampling:   0% (1/8000)
Sampling:   0% (2/8000)
Sampling:   0% (3/8000)
Sampling:   0% (4/8000)
Sampling:   1% (103/8000)
Sampling:   4% (302/8000)
Sampling:   5% (401/8000)
Sampling:   6% (500/8000)
Sampling:  10% (800/8000)
Sampling:  12% (1000/8000)
Sampling:  16% (1300/8000)
Sampling:  20% (1600/8000)
Sampling:  24% (1900/8000)
Sampling:  28% (2200/8000)
Sampling:  31% (2500/8000)
Sampling:  35% (2800/8000)
Sampling:  51% (4100/8000)
Sampling:  68% (5400/8000)
Sampling:  84% (6700/8000)
Sampling: 100% (8000/8000)
Sampling: 100% (8000/8000), done.
Messages received during sampling:
  Gradient evaluation took 0.000225 seconds
  1000 transitions using 10 leapfrog steps per transition would take 2.25 seconds.
  Adjust your expectations accordingly!
  Informational Message: The current Metropolis proposal is about to be rejected because of the following issue:
  Exception: normal_lpdf: Scale parameter is 0, but must be positive! (in '/tmp/httpstan_q1kx9v7a/model_v5p5ozjz.

In [13]:
# Summary statistics over all draws of the separate model, one column per
# quantity in the frame (sampler diagnostics such as lp__ are included).
separate_df.describe()

parameters,lp__,accept_stat__,stepsize__,treedepth__,n_leapfrog__,divergent__,energy__,alpha.1,alpha.2,alpha.3,...,pred.10,pred.11,pred.12,pred.13,pred.14,pred.15,pred.16,pred.17,pred.18,pred.19
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,...,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,385.210645,0.927874,0.057082,6.094,74.632,0.0,-356.678572,0.482389,10.720458,20.957645,...,5.106588,3.663167,2.660582,1.842037,1.216495,0.830099,0.558313,0.410403,0.288629,0.12783
std,5.641093,0.098504,0.005253,0.292721,24.972715,0.0,7.825428,0.03606,0.361658,0.676854,...,0.349823,0.307084,0.188452,0.133538,0.078808,0.063528,0.043995,0.03885,0.031278,0.012322
min,362.541492,0.164372,0.050245,5.0,63.0,0.0,-379.253185,0.323911,9.116037,16.85721,...,3.603366,2.352675,1.71368,1.33362,0.89025,0.565983,0.385704,0.24019,0.179224,0.077216
25%,381.565119,0.907453,0.053548,6.0,63.0,0.0,-362.211788,0.459784,10.475229,20.524419,...,4.875733,3.469489,2.538342,1.754677,1.165739,0.789648,0.529501,0.385116,0.268361,0.119846
50%,385.645042,0.964639,0.056828,6.0,63.0,0.0,-357.149417,0.483215,10.721627,20.958535,...,5.104165,3.663089,2.664507,1.839778,1.217068,0.831061,0.55873,0.41012,0.288621,0.12782
75%,389.1839,0.989217,0.060363,6.0,63.0,0.0,-351.531524,0.505941,10.957394,21.397508,...,5.332622,3.862773,2.781056,1.929375,1.268779,0.871103,0.586899,0.435741,0.308217,0.135835
max,401.560887,1.0,0.064428,7.0,191.0,0.0,-323.196353,0.645731,11.963475,23.998516,...,6.564183,4.755084,3.428428,2.719253,1.502799,1.07843,0.734683,0.571791,0.437529,0.178832


In [14]:
# Summary statistics over all draws of the pooled model, one column per
# quantity in the frame (sampler diagnostics such as lp__ are included).
pooled_df.describe()

parameters,lp__,accept_stat__,stepsize__,treedepth__,n_leapfrog__,divergent__,energy__,alpha,beta,sigma,...,pred.10,pred.11,pred.12,pred.13,pred.14,pred.15,pred.16,pred.17,pred.18,pred.19
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,...,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,-1127.102431,0.932586,0.369325,2.76625,8.819,0.0,1128.60961,9.962546,-0.18584,8.959656,...,5.975191,5.584871,5.701218,5.406076,5.862831,5.618709,5.325438,5.758233,5.597406,5.565929
std,1.291428,0.093517,0.010086,0.734674,4.559214,0.0,1.773489,0.864011,0.070756,0.318835,...,9.142543,8.97936,8.975895,9.107501,9.151741,8.951646,9.144771,8.913201,8.971697,9.056105
min,-1134.849844,0.202417,0.354075,1.0,1.0,0.0,1125.689168,6.558687,-0.433531,7.986856,...,-26.074056,-32.469954,-25.667041,-30.80414,-30.369721,-30.156028,-28.046481,-25.620237,-28.563876,-24.35884
25%,-1127.662005,0.908338,0.363409,2.0,7.0,0.0,1127.296714,9.376821,-0.235235,8.735561,...,-0.190752,-0.531058,-0.223389,-0.505477,-0.407431,-0.393282,-0.682497,-0.219803,-0.393715,-0.619883
50%,-1126.754036,0.970442,0.371791,3.0,7.0,0.0,1128.250419,9.947047,-0.185488,8.951517,...,6.127467,5.674895,5.797027,5.452972,5.981878,5.524956,5.294991,5.771738,5.482086,5.445361
75%,-1126.183939,0.994154,0.377707,3.0,15.0,0.0,1129.492182,10.562867,-0.137206,9.172201,...,11.958788,11.665853,11.672805,11.508887,12.109401,11.684132,11.39565,11.64781,11.696563,11.58684
max,-1125.524224,1.0,0.379644,4.0,31.0,0.0,1139.310264,13.066063,0.059182,10.109166,...,38.240628,36.511353,41.611674,38.551671,40.433073,34.65482,39.422936,36.882297,35.045707,38.603368


In [15]:
# Summary statistics over all draws of the hierarchical model, one column per
# quantity in the frame (sampler diagnostics such as lp__ are included).
hier_df.describe()

parameters,lp__,accept_stat__,stepsize__,treedepth__,n_leapfrog__,divergent__,energy__,mu_alpha,mu_beta,sigma_alpha,...,pred.10,pred.11,pred.12,pred.13,pred.14,pred.15,pred.16,pred.17,pred.18,pred.19
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,...,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,-144.548474,0.859121,0.250108,4.22275,20.538,0.0,166.008243,9.343712,-0.192814,11.895788,...,5.099442,3.661084,2.629029,1.831988,1.195704,0.827361,0.548748,0.39148,0.277296,0.132919
std,4.890235,0.127231,0.013627,0.423293,7.678626,0.0,6.809935,2.613581,0.068875,2.023057,...,0.835126,0.845446,0.852953,0.847597,0.840358,0.845466,0.859539,0.852879,0.860538,0.852873
min,-176.032005,0.32505,0.239784,3.0,7.0,0.0,146.341586,-0.353955,-0.519044,7.230708,...,2.076204,0.422775,-0.596683,-1.429094,-2.296108,-2.131474,-2.912351,-3.008293,-2.493222,-2.456645
25%,-147.712223,0.77829,0.240037,4.0,15.0,0.0,161.197727,7.671642,-0.237624,10.51045,...,4.555111,3.085359,2.07363,1.266018,0.628104,0.260322,-0.04607,-0.168307,-0.307888,-0.442564
50%,-144.115317,0.891558,0.243769,4.0,15.0,0.0,165.799563,9.350897,-0.193603,11.620388,...,5.096403,3.647823,2.62613,1.831632,1.191475,0.837625,0.545782,0.398216,0.281227,0.124506
75%,-141.116436,0.967579,0.25384,4.0,31.0,0.0,170.394582,11.052749,-0.147389,13.011867,...,5.653217,4.24445,3.189996,2.408929,1.759919,1.394149,1.139059,0.964497,0.860558,0.694316
max,-130.964661,1.0,0.273109,5.0,63.0,0.0,201.755341,19.30032,0.203118,23.281681,...,8.431969,6.335547,6.031428,5.026468,4.179467,4.070036,3.382601,3.29635,3.371238,3.915699


In [16]:
# Names of the separate-model columns holding the alpha and beta draws,
# selected by their name prefix.
alpha_columns = list(filter(lambda name: name.startswith('alpha'), separate_df.columns))
beta_columns = list(filter(lambda name: name.startswith('beta'), separate_df.columns))

# Slice those columns out as two DataFrames.
alpha_samples = separate_df.loc[:, alpha_columns]
beta_samples = separate_df.loc[:, beta_columns]

In [17]:
# Distance, in years, between the prediction year and the reference year.
# NOTE(review): base_year presumably matches the year origin used inside the
# Stan model — confirm against stan/separate.stan.
xpred = 2022
base_year = 2000
x_offset = xpred - base_year

# Scale the slope draws by the year offset. The scaling starts from the raw
# draws in separate_df rather than from beta_samples itself, so re-running
# this cell does not multiply by x_offset a second time.
beta_samples = separate_df[beta_columns].multiply(x_offset)



In [18]:
# Combine intercept and (already scaled) slope draws into one column per
# series: pred{i} = alpha.{i} + beta.{i}. The number of series is taken from
# the alpha columns instead of being hard-coded, and the frame is built in
# one step rather than grown column by column.
predictions = pd.DataFrame({
    f'pred{i}': alpha_samples[f'alpha.{i}'] + beta_samples[f'beta.{i}']
    for i in range(1, alpha_samples.shape[1] + 1)
})

# Summarize predictions
pred_mean = predictions.mean()  # Mean predictions
pred_cred = predictions.quantile([0.025, 0.975])  # 95% credible interval

In [19]:
# Mean over the draws of alpha.{i} + beta.{i} * (2022 - 2000), per series.
pred_mean # separate model

pred1      0.314357
pred2      7.208410
pred3     12.721467
pred4     11.113556
pred5     18.429114
pred6     16.806615
pred7     11.993738
pred8      8.652260
pred9      6.703774
pred10     5.172845
pred11     3.693499
pred12     2.666327
pred13     1.840062
pred14     1.217758
pred15     0.830181
pred16     0.562082
pred17     0.414797
pred18     0.293091
pred19     0.130723
dtype: float64

In [20]:
# 2.5% and 97.5% quantiles over the draws of the same quantity, per series.
pred_cred # separate model

Unnamed: 0,pred1,pred2,pred3,pred4,pred5,pred6,pred7,pred8,pred9,pred10,pred11,pred12,pred13,pred14,pred15,pred16,pred17,pred18,pred19
0.025,0.237203,6.466529,11.311782,9.615037,16.815932,15.867275,11.367754,7.980869,6.286267,4.893896,3.452619,2.519434,1.735076,1.152977,0.781302,0.52648,0.384882,0.268578,0.120469
0.975,0.391111,7.975619,14.190289,12.622937,20.023306,17.776443,12.644735,9.285081,7.122938,5.459793,3.934217,2.815188,1.943657,1.281932,0.878813,0.599254,0.445768,0.317278,0.140529


In [21]:
# Names of the pooled-model columns holding the alpha and beta draws,
# selected by their name prefix.
alpha_columns = list(filter(lambda name: name.startswith('alpha'), pooled_df.columns))
beta_columns = list(filter(lambda name: name.startswith('beta'), pooled_df.columns))

# Slice those columns out as two DataFrames.
alpha_samples = pooled_df.loc[:, alpha_columns]
beta_samples = pooled_df.loc[:, beta_columns]

In [22]:
# Distance, in years, between the prediction year and the reference year.
# NOTE(review): base_year presumably matches the year origin used inside the
# Stan model — confirm against stan/pooled.stan.
xpred = 2022
base_year = 2000
x_offset = xpred - base_year

# Scale the slope draws by the year offset. The scaling starts from the raw
# draws in pooled_df rather than from beta_samples itself, so re-running
# this cell does not multiply by x_offset a second time.
beta_samples = pooled_df[beta_columns].multiply(x_offset)

In [23]:
# The pooled model has a single intercept and a single slope, so there is only
# one combined column: pred = alpha + (already scaled) beta.
pooled_line = alpha_samples['alpha'] + beta_samples['beta']
predictions = pd.DataFrame({'pred': pooled_line})

# Mean and 95% interval (2.5% / 97.5% quantiles) over the draws.
pred_mean = predictions.mean()
pred_cred = predictions.quantile([0.025, 0.975])

In [24]:
# Mean over the draws of alpha + beta * (2022 - 2000).
pred_mean # pooled model

pred    5.874063
dtype: float64

In [25]:
# 2.5% and 97.5% quantiles over the draws of the same quantity.
pred_cred # pooled model

Unnamed: 0,pred
0.025,4.086727
0.975,7.718875


In [26]:
# Columns are selected by name prefix, so the hyper-parameter columns
# (mu_alpha, sigma_alpha, ...) are not picked up — only alpha.{i} / beta.{i}.
alpha_columns = [col for col in hier_df.columns if col.startswith('alpha')]
beta_columns = [col for col in hier_df.columns if col.startswith('beta')]

# Distance, in years, between the prediction year and the reference year.
# NOTE(review): base_year presumably matches the year origin used inside the
# Stan model — confirm against stan/hierarchical.stan.
xpred = 2022
base_year = 2000
x_offset = xpred - base_year

# Intercept draws, and slope draws scaled by the year offset.
alpha_samples = hier_df[alpha_columns]
beta_samples = hier_df[beta_columns].multiply(x_offset)

# One combined column per series: pred{i} = alpha.{i} + beta.{i}. The number
# of series is taken from the alpha columns instead of being hard-coded, and
# the frame is built in one step rather than grown column by column.
predictions = pd.DataFrame({
    f'pred{i}': alpha_samples[f'alpha.{i}'] + beta_samples[f'beta.{i}']
    for i in range(1, alpha_samples.shape[1] + 1)
})

# Summarize predictions
pred_mean = predictions.mean()  # Mean predictions
pred_cred = predictions.quantile([0.025, 0.975])  # 95% credible interval

In [27]:
# Mean over the draws of alpha.{i} + beta.{i} * (2022 - 2000), per series.
pred_mean # hierarchical model

pred1      0.294507
pred2      7.188147
pred3     12.704763
pred4     11.098465
pred5     18.418469
pred6     16.818643
pred7     12.005580
pred8      8.662926
pred9      6.693852
pred10     5.162261
pred11     3.677358
pred12     2.643092
pred13     1.816508
pred14     1.195727
pred15     0.813084
pred16     0.545336
pred17     0.393141
pred18     0.275329
pred19     0.111552
dtype: float64

In [28]:
# 2.5% and 97.5% quantiles over the draws of the same quantity, per series.
pred_cred # hierarchical model

Unnamed: 0,pred1,pred2,pred3,pred4,pred5,pred6,pred7,pred8,pred9,pred10,pred11,pred12,pred13,pred14,pred15,pred16,pred17,pred18,pred19
0.025,-0.36571,6.515769,12.055703,10.420827,17.745545,16.150397,11.327978,7.986342,6.054061,4.48872,3.02916,1.976877,1.168567,0.530794,0.133946,-0.126598,-0.276632,-0.384693,-0.542956
0.975,0.961287,7.840042,13.341418,11.757603,19.06657,17.471339,12.668312,9.319983,7.364587,5.815545,4.321705,3.32039,2.480963,1.849178,1.478378,1.214335,1.069379,0.939771,0.765987
