In [1]:
import os
# Configure PyTensor through its environment variable. This cell runs before
# the cell that imports pymc / pytensor, so the flags are already in place
# when those packages are first loaded.
#   linker=py -> select the Python linker
#   cxx=      -> empty C++ compiler setting
# NOTE(review): presumably chosen because no C++ toolchain is available; this
# is likely to make sampling much slower -- confirm it is intentional.
_pytensor_flags = ("linker=py", "cxx=")
os.environ["PYTENSOR_FLAGS"] = ",".join(_pytensor_flags)

In [2]:
import warnings
from collections import Counter

import arviz as az
import numpy as np
import pandas as pd
import pymc as pm
import pytensor.tensor as pt
from sklearn.preprocessing import StandardScaler
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)
warnings.simplefilter(action="ignore", category=RuntimeWarning)

In [3]:
# Load the dataset and force every column to a numeric dtype.
df = pd.read_csv('Final/pima.csv')

# Literal 'NA' strings are turned into NaN first; any remaining value that
# cannot be parsed as a number is coerced to NaN column by column.
df = df.replace('NA', np.nan)
df = df.apply(pd.to_numeric, errors='coerce')

# Report the resulting dtypes and how many values are missing per column.
print("Data types after conversion:", df.dtypes, sep="\n")
print("\nMissing values per column:", df.isna().sum(), sep="\n")

Data types after conversion:
pregnant       int64
glucose      float64
diastolic    float64
triceps      float64
insulin      float64
bmi          float64
diabetes     float64
age            int64
test           int64
dtype: object

Missing values per column:
pregnant       0
glucose        5
diastolic     35
triceps      227
insulin      374
bmi           11
diabetes       0
age            0
test           0
dtype: int64


In [4]:
# Response is the 'test' column; every other column is a predictor.
y = df['test']
predictors = df.columns.drop('test')
X = df[predictors]

# Standardize the predictors. The captured output below shows that missing
# entries end up as exactly 0 after the fill step, i.e. at the standardized
# column mean.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=predictors)

# On the standardized scale, filling NaN with 0 is mean imputation.
# X_scaled_df (with NaN kept) is reused by the later imputation models.
X_imputed_mean = X_scaled_df.fillna(value=0)

print("\nStandardized and Mean-Imputed Data Head:")
print(X_imputed_mean.head())
print("\nCheck for remaining NaNs:", X_imputed_mean.isna().sum().sum())


Standardized and Mean-Imputed Data Head:
   pregnant   glucose  diastolic   triceps   insulin       bmi  diabetes  \
0  0.639947  0.862287  -0.032746  0.558557  0.000000  0.165097  0.468492   
1 -0.844885 -1.202229  -0.517645 -0.014657  0.000000 -0.846404 -0.365061   
2  1.233880  2.009241  -0.679278  0.000000  0.000000 -1.323254  0.604397   
3 -0.844885 -1.071148  -0.517645 -0.587871 -0.518847 -0.629654 -0.920763   
4 -1.141852  0.501816  -2.618874  0.558557  0.104968  1.537847  5.484909   

        age  
0  1.425995  
1 -0.190672  
2 -0.105584  
3 -1.041549  
4 -0.020496  

Check for remaining NaNs: 0


In [5]:
# Part 1: Bayesian logistic regression on the standardized, mean-imputed
# predictors (no missing values reach the model).
coords_part1 = {"predictor": predictors}
with pm.Model(coords=coords_part1) as model_part1:
    # The `mutable=` keyword of pm.Data is deprecated in recent PyMC releases
    # (it is ignored with a FutureWarning and slated to become an error), and
    # this notebook silences FutureWarning, so it is simply omitted here.
    X_data = pm.Data("X_data", X_imputed_mean.values)
    y_data = pm.Data("y_data", y.values)

    # Priors: wide Normal on the intercept, Normal(0, 2.5) on each coefficient
    # of the standardized predictors.
    intercept = pm.Normal("intercept", mu=0, sigma=10)
    beta = pm.Normal("beta", mu=0, sigma=2.5, dims="predictor")

    # Linear predictor on the logit scale and Bernoulli likelihood.
    mu = intercept + pt.dot(X_data, beta)
    likelihood = pm.Bernoulli("likelihood", logit_p=mu, observed=y_data)

In [6]:
with model_part1:
    # 2000 posterior draws after 1000 tuning steps per chain, run sequentially.
    # A fixed random_seed makes "Restart Kernel -> Run All" reproduce the same
    # posterior draws instead of a different sample on every run.
    idata_part1 = pm.sample(2000, tune=1000, cores=1, random_seed=42)

Initializing NUTS using jitter+adapt_diag...
Sequential sampling (2 chains in 1 job)
NUTS: [intercept, beta]


Output()

KeyboardInterrupt: 

In [None]:
# Posterior summary of the regression parameters with a 95% HDI.
summary_part1 = az.summary(
    idata_part1,
    var_names=["intercept", "beta"],
    hdi_prob=0.95,
)
print("Posterior Summary (Part 1 - Mean Imputation):")
print(summary_part1)

# A parameter is flagged when its 95% HDI lies entirely above or entirely
# below zero.
print("\nInterpretation of Significance (HDI excludes 0):")
hdi_excludes_zero_part1 = summary_part1['hdi_2.5%'].gt(0) | summary_part1['hdi_97.5%'].lt(0)
significant_vars_part1 = summary_part1[hdi_excludes_zero_part1]
print(significant_vars_part1)

In [7]:
# Part 2: same logistic regression, but the predictors are given their own
# Normal model so that missing entries are handled inside the model rather
# than being filled with the column mean beforehand.
n_predictors = X_scaled_df.shape[1]
coords_part2 = dict(predictor=predictors, obs_id=df.index)

with pm.Model(coords=coords_part2) as model_part2:
    # Per-predictor location and scale of the standardized covariates.
    mu_imp = pm.Normal("mu_imp", mu=0, sigma=1, dims="predictor")
    sigma_imp = pm.HalfNormal("sigma_imp", sigma=1, dims="predictor")

    # X_scaled_df still contains NaN. The sampler log for this model lists
    # `X_imputed_unobserved` among the sampled variables, i.e. the missing
    # entries become free parameters.
    X_imputed = pm.Normal(
        "X_imputed",
        mu=mu_imp,
        sigma=sigma_imp,
        observed=X_scaled_df.to_numpy(),
        dims=("obs_id", "predictor"),
    )

    # Regression priors, identical to Part 1.
    intercept = pm.Normal("intercept", mu=0, sigma=10)
    beta = pm.Normal("beta", mu=0, sigma=2.5, dims="predictor")

    # Logit-scale linear predictor built from the (partly imputed) design matrix.
    mu = intercept + pt.dot(X_imputed, beta)
    likelihood = pm.Bernoulli(
        "likelihood",
        logit_p=mu,
        observed=y.to_numpy(),
        dims="obs_id",
    )

In [8]:
with model_part2:
    # target_accept is raised to 0.9 for this model, which also samples the
    # missing predictor values.
    # A fixed random_seed makes "Restart Kernel -> Run All" reproduce the same
    # posterior draws instead of a different sample on every run.
    idata_part2 = pm.sample(
        2000, tune=1000, cores=1, target_accept=0.9, random_seed=42
    )

Initializing NUTS using jitter+adapt_diag...
Sequential sampling (2 chains in 1 job)
NUTS: [mu_imp, sigma_imp, X_imputed_unobserved, intercept, beta]


Output()

: 

: 

In [None]:
# Posterior summary of the regression parameters with a 95% HDI.
summary_part2 = az.summary(
    idata_part2,
    var_names=["intercept", "beta"],
    hdi_prob=0.95,
)
print("Posterior Summary (Part 2 - Bayesian Imputation):")
print(summary_part2)

# A parameter is flagged when its 95% HDI lies entirely above or entirely
# below zero.
print("\nInterpretation of Significance (HDI excludes 0):")
hdi_excludes_zero_part2 = summary_part2['hdi_2.5%'].gt(0) | summary_part2['hdi_97.5%'].lt(0)
significant_vars_part2 = summary_part2[hdi_excludes_zero_part2]
print(significant_vars_part2)

# Side-by-side posterior means and standard deviations from both approaches.
# Requires summary_part1 from the Part 1 summary cell.
comparison_columns = {
    'Mean_Part1': summary_part1['mean'],
    'Mean_Part2': summary_part2['mean'],
    'SD_Part1': summary_part1['sd'],
    'SD_Part2': summary_part2['sd'],
}
comparison = pd.DataFrame(comparison_columns)
print("\nComparison of Coefficients (Mean Imputation vs Bayesian Imputation):")
print(comparison)

In [None]:
# Part 3: variable selection with a two-component ("spike and slab" style)
# prior on each coefficient, on top of the same predictor model as Part 2.
tau_0 = 0.01  # prior sd of beta when the predictor is excluded (delta == 0)
tau_1 = 2.5   # prior sd of beta when the predictor is included (delta == 1)
n_predictors = X_scaled_df.shape[1]
coords_part3 = dict(predictor=predictors, obs_id=df.index)

with pm.Model(coords=coords_part3) as model_part3:
    # Per-predictor location and scale of the standardized covariates.
    mu_imp = pm.Normal("mu_imp", mu=0, sigma=1, dims="predictor")
    sigma_imp = pm.HalfNormal("sigma_imp", sigma=1, dims="predictor")

    # X_scaled_df still contains NaN; presumably the missing entries are
    # imputed by the model exactly as in Part 2.
    X_imputed = pm.Normal(
        "X_imputed",
        mu=mu_imp,
        sigma=sigma_imp,
        observed=X_scaled_df.to_numpy(),
        dims=("obs_id", "predictor"),
    )

    # Inclusion indicator per predictor with prior inclusion probability 0.5;
    # it selects which of the two prior scales applies to that coefficient.
    delta = pm.Bernoulli("delta", p=0.5, dims="predictor")
    sigma_beta = pt.switch(pt.eq(delta, 1), tau_1, tau_0)
    beta = pm.Normal("beta", mu=0, sigma=sigma_beta, dims="predictor")
    intercept = pm.Normal("intercept", mu=0, sigma=10)

    # Logit-scale linear predictor and Bernoulli likelihood.
    mu = intercept + pt.dot(X_imputed, beta)
    likelihood = pm.Bernoulli(
        "likelihood",
        logit_p=mu,
        observed=y.to_numpy(),
        dims="obs_id",
    )

In [None]:
with model_part3:
    # target_accept is raised to 0.99 for this model.
    # A fixed random_seed makes "Restart Kernel -> Run All" reproduce the same
    # posterior draws instead of a different sample on every run.
    idata_part3 = pm.sample(
        2000, tune=1000, cores=1, target_accept=0.99, random_seed=42
    )

In [None]:
# Posterior inclusion probability (PIP): for each predictor, the share of
# posterior draws in which its indicator `delta` equals 1.
pip = idata_part3.posterior["delta"].mean(dim=("chain", "draw"))
pip_df = pip.to_dataframe().rename(columns={'delta': 'Inclusion Probability'})
print("Posterior Inclusion Probabilities (PIP):")
print(pip_df.sort_values(by='Inclusion Probability', ascending=False))

# One row per posterior draw, one column per predictor. The explicit transpose
# pins the (sample, predictor) layout instead of relying on the axis order
# that stack() happens to produce.
delta_samples = (
    idata_part3.posterior["delta"]
    .stack(sample=("chain", "draw"))
    .transpose("sample", "predictor")
    .values
)

# tolist() converts NumPy scalars to plain Python ints, so a configuration
# prints as (1, 0, ...) rather than (np.int64(1), np.int64(0), ...) on NumPy >= 2.
model_configs = [tuple(row) for row in delta_samples.tolist()]

# Counter comes from the notebook's import cell.
model_counts = Counter(model_configs)
total_samples = len(model_configs)
model_probs = {config: count / total_samples for config, count in model_counts.items()}
sorted_models = sorted(model_probs.items(), key=lambda item: item[1], reverse=True)

# Each distinct indicator configuration is one "model"; its posterior
# probability is estimated by its relative frequency among the draws.
print("\nTop 5 Models by Posterior Probability:")
for rank, (config, prob) in enumerate(sorted_models[:5], start=1):
    included_vars = [name for name, included in zip(predictors, config) if included == 1]
    print(f"Model {rank} (Prob: {prob:.4f}):")
    print(f"  Included: {', '.join(included_vars) if included_vars else 'None (Intercept Only)'}")
    print(f"  Config: {config}")