In [2]:
import pymc3 as pm
import pandas as pd
import numpy as np
import theano.tensor as T
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle
from theano import shared
import theano

In [3]:
df = pd.read_csv("../data/dmc2001_learn.txt", sep=";")
df = df.fillna(df.mean(numeric_only=True))
# Preprocessing: let's assume all your columns are numerical for simplicity
df['WO'] = df['WO'].replace({'W': 1, 'O': 0, 'F': 2})
train, test = train_test_split(df, test_size=0.2)

In [4]:
def create_shared_data(data):
    shared_data = {}
    for column in data.columns:
        shared_data[column] = theano.shared(data[column].values)
    return shared_data

# Convert your DataFrame to shared variables
shared_train_data = create_shared_data(train.drop('AKTIV', axis=1))
shared_test_data = create_shared_data(test.drop('AKTIV', axis=1))

with pm.Model() as model:
    # Priors for each feature
    priors = {col: pm.Normal(col, mu=0, sd=1) for col in train.columns if col != "AKTIV"}

    # Expected value using logistic function
    mu = pm.math.sigmoid(sum([priors[col] * shared_train_data[col] for col in train.columns if col != "AKTIV"]))

    # Likelihood
    AKTIV = pm.Bernoulli('AKTIV', p=mu, observed=train['AKTIV'])

    # Sample
    trace = pm.sample(2000, tune=1000)

# Switch to test data and generate posterior predictive samples
for column in test.columns:
    if column in shared_test_data:
        shared_train_data[column].set_value(test[column].values)

with model:
    ppc = pm.sample_posterior_predictive(trace, samples=500)

  trace = pm.sample(2000, tune=1000)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [PHARM6, PHARM5, PHARM4, PHARM3, PHARM2, PHARM1, Abogr, Typ9, Typ8, Typ7, Typ6, Typ5, Typ4, Typ3, Typ2, Typ1, PKW_Gel, PKW_GW, PKW_KB, PKW_Lei, PKW_Di, AnzGew, AnzHH, AntDt, Altersgr, Famgr, Bonitaet, Strtyp, Bebautyp, Kaufkraft, Regiotyp, WO, jahrstart, ID]


Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 1337 seconds.
The acceptance probability does not match the target. It is 0.9537328193030185, but should be close to 0.8. Try to increase the number of tuning steps.
The acceptance probability does not match the target. It is 0.9896686210272237, but should be close to 0.8. Try to increase the number of tuning steps.
The acceptance probability does not match the target. It is 0.9886494212719547, but should be close to 0.8. Try to increase the number of tuning steps.
The acceptance probability does not match the target. It is 0.9535538753161391, but should be close to 0.8. Try to increase the number of tuning steps.


In [5]:
prediction =  np.mean(ppc["AKTIV"], axis=0)
prediction

array([0.542, 0.676, 0.604, ..., 0.448, 0.626, 0.582])

In [6]:
pred = [1 if x > 0.5 else 0 for x in prediction] 
lables_test = test["AKTIV"].to_list()

In [7]:
correct_predictions = sum(p == l for p, l in zip(pred, lables_test))
accuracy = correct_predictions / len(lables_test) * 100
accuracy

63.349999999999994

In [8]:
def cost_matrix(prediction, labels):
    true_positive = 1.100
    false_positive = -265
    true_negative = -25
    false_negative = 662
    total_cost = 0
    true_positive = 0
    false_positive = 0
    true_negative = 0
    false_negative = 0
    for pred, label in zip(prediction, labels):
        if pred == 1 and label == 1:
            total_cost += true_positive
            true_positive += 1
        elif pred == 1 and label == 0:
            total_cost += false_positive
            false_positive += 1
        elif pred == 0 and label == 0:
            total_cost += true_negative
            true_negative += 1
        elif pred == 0 and label == 1:
            total_cost += false_negative
            false_negative += 1
        else:
            raise ValueError(f"pred {pred} label {label}")
    
    print(f"true_positive {true_positive} false_positive {false_positive} true_negative {true_negative} false_negative {false_negative}")
    return total_cost, true_positive, false_positive, true_negative, false_negative

In [9]:
cost_matrix(pred, lables_test)

true_positive 743 false_positive 471 true_negative 524 false_negative 262


(557555, 743, 471, 524, 262)