In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import stan # python3 -m pip install pystan
import nest_asyncio
nest_asyncio.apply()
import warnings
import arviz as az

In [2]:
# Define file paths (relative paths, resolved against the working directory)
numbers_path = "dataframes/numbers.csv"  # not read in the cells visible here
# NOTE(review): "precentages" looks like a misspelling of "percentages". The
# literal is left untouched because it must match the file name on disk;
# confirm before renaming either one.
percentages_path = "dataframes/precentages.csv"
death_rates_path = "dataframes/death_rates.csv"  # not read in the cells visible here

In [3]:
# Load the semicolon-separated percentages table, keep only its first 22 rows
# and discard the "[All]" column, all in a single chain.
df = (
    pd.read_csv(percentages_path, sep=';', header=0)
    .head(22)
    .drop(columns="[All]")
)
# Show the column dtypes as the cell output.
df.dtypes

Year       object
[0]        object
[1-4]      object
[5-9]      object
[10-14]    object
[15-19]    object
[20-24]    object
[25-29]    object
[30-34]    object
[35-39]    object
[40-44]    object
[45-49]    object
[50-54]    object
[55-59]    object
[60-64]    object
[65-69]    object
[70-74]    object
[75-79]    object
[80-84]    object
[85+]      object
dtype: object

In [4]:
# The cells were read as strings that use a decimal comma (hence the object
# dtypes), so parse them to floats in a single pass. Non-string cells (for
# example NaN) pass through unchanged, and a string that is not a number raises
# ValueError instead of silently turning into NaN.
# Series.map is used column by column so this also runs on pandas versions
# that predate DataFrame.map.
df = df.apply(
    lambda column: column.map(
        lambda cell: float(cell.replace(',', '.')) if isinstance(cell, str) else cell
    )
)

# Use the year, as an integer, as the row index.
df["Year"] = df["Year"].astype(int)
df = df.set_index("Year")

In [5]:
# Swap the axes: the former columns become rows and the Year index becomes the
# columns. Re-running this cell on its own flips the frame back again.
df = df.transpose()

In [6]:
# Inputs passed to every Stan model. N and Y are taken from the transposed
# frame (rows x columns) rather than hard-coded, so they always agree with the
# matrix that is actually sent; with the current table this yields 19 and 22.
n_rows, n_cols = df.shape
data = dict(
    N = n_rows,
    Y = n_cols,
    accidentData = df.values,
    xpred = 2022)  # value handed to the Stan models for their own prediction

In [7]:
file_path = "stan/separate.stan"

# Read the Stan code from the file
with open(file_path, "r") as file:
    stan_code = file.read()

# Build the separate model with a fixed RNG seed so that a fresh
# "Restart & Run All" reproduces the same posterior draws.
posterior1 = stan.build(stan_code, data=data, random_seed=42)

Building...



Building: found in cache, done.

In [8]:
# Run the sampler for the separate model: 4 chains, 1000 kept draws each.
fit_separate = posterior1.sample(
    num_samples=1000,
    num_chains=4,
)
# Flatten the fit into a pandas DataFrame (one row per draw).
separate_df = fit_separate.to_frame()

Sampling:   0%
Sampling:   0% (1/8000)
Sampling:   0% (2/8000)
Sampling:   0% (3/8000)
Sampling:   0% (4/8000)
Sampling:   1% (103/8000)
Sampling:   3% (202/8000)
Sampling:   4% (301/8000)
Sampling:   5% (400/8000)
Sampling:   6% (500/8000)
Sampling:   8% (600/8000)
Sampling:   9% (700/8000)
Sampling:  10% (800/8000)
Sampling:  11% (900/8000)
Sampling:  12% (1000/8000)
Sampling:  14% (1100/8000)
Sampling:  15% (1200/8000)
Sampling:  16% (1300/8000)
Sampling:  18% (1400/8000)
Sampling:  19% (1500/8000)
Sampling:  20% (1600/8000)
Sampling:  21% (1700/8000)
Sampling:  22% (1800/8000)
Sampling:  24% (1900/8000)
Sampling:  25% (2000/8000)
Sampling:  26% (2100/8000)
Sampling:  28% (2200/8000)
Sampling:  29% (2300/8000)
Sampling:  30% (2400/8000)
Sampling:  31% (2500/8000)
Sampling:  32% (2600/8000)
Sampling:  34% (2700/8000)
Sampling:  35% (2800/8000)
Sampling:  36% (2900/8000)
Sampling:  38% (3000/8000)
Sampling:  40% (3200/8000)
Sampling:  44% (3501/8000)
Sampling:  59% (4701/8000)
Samplin

In [9]:
file_path = "stan/pooled.stan"

# Read the Stan code from the file
with open(file_path, "r") as file:
    stan_code = file.read()

# Silence warnings only while the model is being built. A bare
# warnings.filterwarnings("ignore") would stay active for the rest of the
# kernel session and hide warnings raised by every later cell as well.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # Fixed RNG seed so that re-running the notebook reproduces the same draws.
    posterior2 = stan.build(stan_code, data=data, random_seed=42)

Building...



Building: found in cache, done.

In [10]:
# Run the sampler for the pooled model: 4 chains, 1000 kept draws each.
fit_pooled = posterior2.sample(
    num_samples=1000,
    num_chains=4,
)
# Flatten the fit into a pandas DataFrame (one row per draw).
pooled_df = fit_pooled.to_frame()

Sampling:   0%
Sampling:   6% (500/8000)
Sampling:  16% (1300/8000)
Sampling:  41% (3300/8000)
Sampling:  66% (5300/8000)
Sampling:  85% (6800/8000)
Sampling: 100% (8000/8000)
Sampling: 100% (8000/8000), done.
Messages received during sampling:
  Gradient evaluation took 0.000102 seconds
  1000 transitions using 10 leapfrog steps per transition would take 1.02 seconds.
  Adjust your expectations accordingly!
  Gradient evaluation took 0.000169 seconds
  1000 transitions using 10 leapfrog steps per transition would take 1.69 seconds.
  Adjust your expectations accordingly!
  Gradient evaluation took 0.000188 seconds
  1000 transitions using 10 leapfrog steps per transition would take 1.88 seconds.
  Adjust your expectations accordingly!
  Gradient evaluation took 0.000151 seconds
  1000 transitions using 10 leapfrog steps per transition would take 1.51 seconds.
  Adjust your expectations accordingly!


In [11]:
file_path = "stan/hierarchical.stan"

# Read the Stan code from the file
with open(file_path, "r") as file:
    stan_code = file.read()

# Silence warnings only while the model is being built. A bare
# warnings.filterwarnings("ignore") would stay active for the rest of the
# kernel session and hide warnings raised by every later cell as well.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # Fixed RNG seed so that re-running the notebook reproduces the same draws.
    posterior3 = stan.build(stan_code, data=data, random_seed=42)

Building...



Building: found in cache, done.

In [12]:
# Run the sampler for the hierarchical model: 4 chains, 1000 kept draws each.
fit_hier = posterior3.sample(
    num_samples=1000,
    num_chains=4,
)
# Flatten the fit into a pandas DataFrame (one row per draw).
hier_df = fit_hier.to_frame()

Sampling:   0%
Sampling:   0% (1/8000)
Sampling:   0% (2/8000)
Sampling:   0% (3/8000)
Sampling:   0% (4/8000)
Sampling:   1% (103/8000)
Sampling:   4% (302/8000)
Sampling:   6% (501/8000)
Sampling:  13% (1001/8000)
Sampling:  18% (1400/8000)
Sampling:  22% (1800/8000)
Sampling:  28% (2200/8000)
Sampling:  45% (3600/8000)
Sampling:  65% (5200/8000)
Sampling:  82% (6600/8000)
Sampling: 100% (8000/8000)
Sampling: 100% (8000/8000), done.
Messages received during sampling:
  Gradient evaluation took 0.000196 seconds
  1000 transitions using 10 leapfrog steps per transition would take 1.96 seconds.
  Adjust your expectations accordingly!
  Informational Message: The current Metropolis proposal is about to be rejected because of the following issue:
  Exception: normal_lpdf: Scale parameter is 0, but must be positive! (in '/tmp/httpstan_4piyte4b/model_r2cmmh2u.stan', line 37, column 2 to column 40)
  Gradient evaluation took 0.000231 seconds
  1000 transitions using 10 leapfrog steps per tra

In [13]:
# Summary statistics over all draws of the separate model. The table covers
# every column of the frame, i.e. the sampler diagnostics (lp__, stepsize__,
# divergent__, ...) as well as the alpha.* parameters and the pred.* columns.
separate_df.describe()

parameters,lp__,accept_stat__,stepsize__,treedepth__,n_leapfrog__,divergent__,energy__,alpha.1,alpha.2,alpha.3,...,pred.10,pred.11,pred.12,pred.13,pred.14,pred.15,pred.16,pred.17,pred.18,pred.19
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,...,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,387.760903,0.933836,0.056053,6.0735,75.688,0.0,-359.357798,0.483495,10.730928,20.96247,...,5.08713,3.668093,2.661414,1.838712,1.218038,0.830482,0.558347,0.410062,0.289143,0.128493
std,5.721409,0.089935,0.004784,0.263847,25.628946,0.0,7.808221,0.036784,0.370774,0.675006,...,0.353212,0.304631,0.184804,0.136457,0.076929,0.062807,0.044375,0.03886,0.030603,0.012197
min,364.036313,0.258184,0.051568,5.0,31.0,0.0,-384.669634,0.331118,8.967985,18.065758,...,3.445212,2.359363,1.97526,1.164705,0.905607,0.568272,0.386872,0.262164,0.168659,0.079736
25%,384.149584,0.914358,0.052634,6.0,63.0,0.0,-364.853121,0.459803,10.49244,20.522306,...,4.866003,3.47454,2.540117,1.749857,1.167538,0.789388,0.528896,0.384941,0.269142,0.120551
50%,388.145027,0.967189,0.05436,6.0,63.0,0.0,-359.771794,0.483316,10.732321,20.963379,...,5.094035,3.659778,2.663149,1.841435,1.218352,0.83007,0.55758,0.4094,0.289231,0.128503
75%,391.782621,0.990174,0.057779,6.0,63.0,0.0,-354.129597,0.507567,10.966043,21.416908,...,5.315511,3.869626,2.78078,1.927601,1.267258,0.87058,0.587948,0.435674,0.309294,0.136659
max,405.786394,1.0,0.063922,7.0,191.0,0.0,-327.600492,0.632926,12.302495,23.423007,...,6.424089,5.179233,3.44319,2.344306,1.522349,1.12647,0.753018,0.588376,0.415098,0.181322


In [14]:
# Summary statistics over all draws of the pooled model: sampler diagnostics,
# the shared alpha / beta / sigma parameters and the pred.* columns.
pooled_df.describe()

parameters,lp__,accept_stat__,stepsize__,treedepth__,n_leapfrog__,divergent__,energy__,alpha,beta,sigma,...,pred.10,pred.11,pred.12,pred.13,pred.14,pred.15,pred.16,pred.17,pred.18,pred.19
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,...,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,-1126.895346,0.92564,0.385383,2.6985,8.4125,0.0,1128.40842,9.994929,-0.189224,8.957664,...,5.701459,5.764107,5.488031,5.483444,5.756873,5.731703,5.688482,5.42838,5.293208,5.527151
std,1.272597,0.100945,0.005252,0.731253,4.408254,0.0,1.787982,0.867942,0.070459,0.308342,...,8.956127,8.922373,9.123283,9.106468,8.814626,8.857136,8.946144,9.145095,9.10103,8.964911
min,-1136.773356,0.21543,0.37949,1.0,1.0,0.0,1125.482684,7.127516,-0.447297,7.85965,...,-27.867746,-25.029257,-32.070693,-29.632152,-26.782236,-26.151705,-25.354796,-27.076778,-35.085273,-24.589864
25%,-1127.450815,0.894804,0.380899,2.0,7.0,0.0,1127.094115,9.414281,-0.237466,8.745578,...,-0.456414,-0.196287,-0.534709,-0.539261,-0.305619,-0.295028,-0.314727,-0.697335,-0.880575,-0.464105
50%,-1126.557008,0.967286,0.384713,3.0,7.0,0.0,1128.043459,10.017203,-0.190157,8.945168,...,5.636144,5.912122,5.597874,5.439774,5.822162,5.756587,5.809179,5.439114,5.253303,5.697345
75%,-1125.98023,0.993719,0.389197,3.0,15.0,0.0,1129.353642,10.579016,-0.14252,9.151749,...,11.856316,11.714741,11.407368,11.69948,11.670844,11.82181,11.78432,11.599064,11.354815,11.57397
max,-1125.384138,1.0,0.392616,4.0,15.0,0.0,1139.785906,13.164707,0.029321,10.277763,...,36.461099,39.343265,36.698032,38.726664,34.142016,35.626135,38.897692,42.602863,42.867777,38.109577


In [15]:
# Summary statistics over all draws of the hierarchical model: sampler
# diagnostics, the hyperparameters (mu_alpha, mu_beta, sigma_alpha, ...) and
# the pred.* columns.
hier_df.describe()

parameters,lp__,accept_stat__,stepsize__,treedepth__,n_leapfrog__,divergent__,energy__,mu_alpha,mu_beta,sigma_alpha,...,pred.10,pred.11,pred.12,pred.13,pred.14,pred.15,pred.16,pred.17,pred.18,pred.19
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,...,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,-144.668949,0.855815,0.257951,4.1425,18.588,0.0,166.183992,9.436416,-0.191268,11.974595,...,5.084416,3.62586,2.647895,1.795852,1.208035,0.816477,0.522467,0.395358,0.251258,0.099403
std,4.823368,0.129905,0.010964,0.352455,6.683818,0.0,6.650029,2.743554,0.071157,2.080245,...,0.867845,0.830467,0.859297,0.849057,0.859079,0.856161,0.855016,0.851227,0.843689,0.857826
min,-163.652162,0.267191,0.239013,3.0,7.0,0.0,146.116402,-1.725786,-0.537061,7.536967,...,1.934195,0.493558,-0.144169,-1.922986,-2.037025,-2.376182,-2.739222,-2.926084,-3.159872,-2.979542
25%,-147.748752,0.772332,0.257339,4.0,15.0,0.0,161.471345,7.696372,-0.236201,10.505468,...,4.505654,3.064886,2.076377,1.232167,0.627061,0.239808,-0.061243,-0.158441,-0.328136,-0.487153
50%,-144.384402,0.887284,0.263621,4.0,15.0,0.0,165.9647,9.512443,-0.190878,11.704417,...,5.092919,3.628735,2.642931,1.782182,1.204202,0.813875,0.533648,0.401898,0.245624,0.101734
75%,-141.173754,0.968952,0.264233,4.0,15.0,0.0,170.499201,11.280826,-0.146333,13.1535,...,5.679749,4.183193,3.211948,2.374407,1.772628,1.397679,1.102837,0.955523,0.817967,0.690662
max,-131.47634,1.0,0.265547,5.0,31.0,0.0,192.774297,21.300762,0.070308,23.946583,...,8.628996,6.594592,5.801034,4.897973,4.446235,3.912888,3.833992,3.439895,3.043234,3.093775


In [16]:
# Names of the intercept (alpha*) and slope (beta*) columns in the
# separate-model draws.
alpha_columns = list(filter(lambda name: name.startswith('alpha'), separate_df.columns))
beta_columns = list(filter(lambda name: name.startswith('beta'), separate_df.columns))

# Pull those columns out as their own DataFrames.
alpha_samples = separate_df.loc[:, alpha_columns]
beta_samples = separate_df.loc[:, beta_columns]

In [17]:
xpred = 2022
# NOTE(review): assumes the Stan model measures the year as an offset from
# 2000 -- confirm against stan/separate.stan.
base_year = 2000
x_offset = xpred - base_year

# Slope contribution at the prediction year. This is recomputed from the raw
# draws (rather than rescaling beta_samples in place) so that re-running the
# cell does not multiply by the offset a second time.
beta_samples = separate_df[beta_columns].multiply(x_offset)



In [18]:
# Per-draw linear predictor alpha.i + (beta.i * x_offset) for every group.
# beta_samples already holds the slope multiplied by the year offset. The
# number of groups is taken from the selected alpha columns instead of being
# hard-coded, and the frame is built in one step rather than column by column.
n_groups = alpha_samples.shape[1]
predictions = pd.DataFrame({
    f'pred{i}': alpha_samples[f'alpha.{i}'] + beta_samples[f'beta.{i}']
    for i in range(1, n_groups + 1)
})

# Summarize predictions. These describe the regression mean only; no
# observation noise is added here.
pred_mean = predictions.mean()  # Mean predictions
pred_cred = predictions.quantile([0.025, 0.975])  # 95% credible interval

In [19]:
pred_mean # separate model: mean over draws of alpha.i + beta.i * x_offset, one entry per group

pred1      0.315038
pred2      7.194692
pred3     12.718583
pred4     11.107383
pred5     18.446414
pred6     16.788521
pred7     11.987256
pred8      8.661813
pred9      6.703978
pred10     5.164567
pred11     3.696857
pred12     2.665049
pred13     1.838735
pred14     1.218144
pred15     0.829753
pred16     0.562052
pred17     0.414977
pred18     0.293135
pred19     0.130653
dtype: float64

In [20]:
pred_cred # separate model: 2.5% and 97.5% quantiles over draws, one column per group

Unnamed: 0,pred1,pred2,pred3,pred4,pred5,pred6,pred7,pred8,pred9,pred10,pred11,pred12,pred13,pred14,pred15,pred16,pred17,pred18,pred19
0.025,0.235499,6.425488,11.347314,9.651078,16.863564,15.825084,11.304877,7.982203,6.291961,4.882521,3.465664,2.51555,1.735762,1.155547,0.78196,0.528588,0.385211,0.268503,0.120972
0.975,0.390794,7.983963,14.115801,12.560395,20.124066,17.772976,12.614383,9.345719,7.135329,5.434888,3.939283,2.81969,1.944465,1.279453,0.877228,0.59637,0.445849,0.318591,0.14021


In [21]:
# Names of the intercept (alpha*) and slope (beta*) columns in the
# pooled-model draws.
alpha_columns = list(filter(lambda name: name.startswith('alpha'), pooled_df.columns))
beta_columns = list(filter(lambda name: name.startswith('beta'), pooled_df.columns))

# Pull those columns out as their own DataFrames.
alpha_samples = pooled_df.loc[:, alpha_columns]
beta_samples = pooled_df.loc[:, beta_columns]

In [22]:
xpred = 2022
# NOTE(review): assumes the Stan model measures the year as an offset from
# 2000 -- confirm against stan/pooled.stan.
base_year = 2000
x_offset = xpred - base_year

# Slope contribution at the prediction year. This is recomputed from the raw
# draws (rather than rescaling beta_samples in place) so that re-running the
# cell does not multiply by the offset a second time.
beta_samples = pooled_df[beta_columns].multiply(x_offset)

In [23]:
# Per-draw linear predictor of the pooled model, stored as a one-column frame.
# beta_samples already holds the slope multiplied by the year offset.
pooled_line = alpha_samples['alpha'] + beta_samples['beta']
predictions = pooled_line.to_frame('pred')

# Posterior mean and central 95% interval of that predictor.
pred_mean = predictions.mean()
pred_cred = predictions.quantile([0.025, 0.975])

In [24]:
pred_mean # pooled model: mean over draws of alpha + beta * x_offset (single shared line)

pred    5.831993
dtype: float64

In [25]:
pred_cred # pooled model: 2.5% and 97.5% quantiles over draws

Unnamed: 0,pred
0.025,3.981998
0.975,7.620365


In [26]:
# Per-group intercept and slope columns of the hierarchical draws. The
# startswith filter leaves out the hyperparameter columns (mu_alpha,
# sigma_alpha, mu_beta, ...), whose names do not begin with alpha / beta.
alpha_columns = [col for col in hier_df.columns if col.startswith('alpha')]
beta_columns = [col for col in hier_df.columns if col.startswith('beta')]

# Extract alpha as a DataFrame
alpha_samples = hier_df[alpha_columns]

xpred = 2022
# NOTE(review): assumes the Stan model measures the year as an offset from
# 2000 -- confirm against stan/hierarchical.stan.
base_year = 2000
x_offset = xpred - base_year

# Slope contribution at the prediction year, computed directly from the draws.
beta_samples = hier_df[beta_columns].multiply(x_offset)

# Per-draw linear predictor alpha.i + (beta.i * x_offset) for every group. The
# group count comes from the selected columns instead of a hard-coded 19, and
# the frame is built in one step rather than column by column.
n_groups = len(alpha_columns)
predictions = pd.DataFrame({
    f'pred{i}': alpha_samples[f'alpha.{i}'] + beta_samples[f'beta.{i}']
    for i in range(1, n_groups + 1)
})

# Summarize predictions. These describe the regression mean only; no
# observation noise is added here.
pred_mean = predictions.mean()  # Mean predictions
pred_cred = predictions.quantile([0.025, 0.975])  # 95% credible interval

In [27]:
pred_mean # hierarchical model: mean over draws of alpha.i + beta.i * x_offset, one entry per group

pred1      0.297630
pred2      7.187304
pred3     12.692573
pred4     11.088578
pred5     18.410927
pred6     16.821387
pred7     12.001555
pred8      8.663718
pred9      6.709884
pred10     5.154111
pred11     3.682826
pred12     2.648372
pred13     1.802119
pred14     1.198930
pred15     0.807731
pred16     0.535088
pred17     0.389041
pred18     0.258452
pred19     0.107755
dtype: float64

In [28]:
pred_cred # hierarchical model: 2.5% and 97.5% quantiles over draws, one column per group

Unnamed: 0,pred1,pred2,pred3,pred4,pred5,pred6,pred7,pred8,pred9,pred10,pred11,pred12,pred13,pred14,pred15,pred16,pred17,pred18,pred19
0.025,-0.376725,6.507519,12.024012,10.42179,17.731898,16.141664,11.345872,8.00494,6.043005,4.471468,3.032978,2.015571,1.150474,0.543628,0.14682,-0.12184,-0.258192,-0.38887,-0.569685
0.975,0.961916,7.877553,13.329275,11.765795,19.082204,17.483268,12.674516,9.318781,7.39003,5.83086,4.346625,3.298967,2.476754,1.875368,1.467644,1.182846,1.050611,0.904623,0.786214


In [29]:
# ArviZ summary table for the hierarchical fit: posterior mean / sd, HDI
# bounds, Monte Carlo standard errors, effective sample sizes and r_hat for
# each parameter.
az.summary(fit_hier)

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
mu_alpha,9.436,2.744,4.390,14.717,0.036,0.026,5895.0,2827.0,1.0
mu_beta,-0.191,0.071,-0.326,-0.059,0.001,0.001,5448.0,2061.0,1.0
sigma_alpha,11.975,2.080,8.321,15.679,0.030,0.022,6029.0,2958.0,1.0
sigma_beta,0.295,0.053,0.206,0.394,0.001,0.001,6106.0,2777.0,1.0
sigma,0.770,0.028,0.717,0.822,0.000,0.000,6175.0,2612.0,1.0
...,...,...,...,...,...,...,...,...,...
pred[14],0.816,0.856,-0.778,2.367,0.013,0.010,4056.0,3971.0,1.0
pred[15],0.522,0.855,-1.005,2.190,0.013,0.010,4088.0,3950.0,1.0
pred[16],0.395,0.851,-1.210,1.999,0.014,0.010,3763.0,3478.0,1.0
pred[17],0.251,0.844,-1.335,1.790,0.013,0.010,4025.0,3952.0,1.0
